From a2daff6803a384ce065e3681a2affea1da59c5f5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 31 May 2011 14:09:00 -0700 Subject: fuse: fix non-ANSI void function notation Fix void function parameter list sparse warning: fs/fuse/inode.c:74:44: warning: non-ANSI function declaration of function 'fuse_alloc_forget' Signed-off-by: Randy Dunlap Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc6ec4b2f0ff..5354906e797c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -71,7 +71,7 @@ struct fuse_mount_data { unsigned blksize; }; -struct fuse_forget_link *fuse_alloc_forget() +struct fuse_forget_link *fuse_alloc_forget(void) { return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); } -- cgit v1.2.3 From 6e6938b6d3130305a5960c86b1a9b21e58cf6144 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 6 Jun 2010 10:38:15 -0600 Subject: writeback: introduce .tagged_writepages for the WB_SYNC_NONE sync stage sync(2) is performed in two stages: the WB_SYNC_NONE sync and the WB_SYNC_ALL sync. Identify the first stage with .tagged_writepages and do livelock prevention for it, too. Jan's commit f446daaea9 ("mm: implement writeback livelock avoidance using page tagging") is a partial fix in that it only fixed the WB_SYNC_ALL phase livelock. Although ext4 is tested to no longer livelock with commit f446daaea9, it may due to some "redirty_tail() after pages_skipped" effect which is by no means a guarantee for _all_ the file systems. Note that writeback_inodes_sb() is called by not only sync(), they are treated the same because the other callers also need livelock prevention. Impact: It changes the order in which pages/inodes are synced to disk. Now in the WB_SYNC_NONE stage, it won't proceed to write the next inode until finished with the current inode. Acked-by: Jan Kara CC: Dave Chinner Signed-off-by: Wu Fengguang --- fs/ext4/inode.c | 4 ++-- fs/fs-writeback.c | 17 +++++++++-------- include/linux/writeback.h | 1 + mm/page-writeback.c | 4 ++-- 4 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a5763e3505ba..8558b6c3450a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct address_space *mapping, index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; @@ -2973,7 +2973,7 @@ static int ext4_da_writepages(struct address_space *mapping, } retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); while (!ret && wbc->nr_to_write > 0) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0f015a0468de..5ed2ce9a28d0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -36,6 +36,7 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; @@ -650,6 +651,7 @@ static long wb_writeback(struct bdi_writeback *wb, { struct writeback_control wbc = { .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, .older_than_this = NULL, .for_kupdate = work->for_kupdate, .for_background = work->for_background, @@ -657,7 +659,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; - long write_chunk; + long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; if (wbc.for_kupdate) { @@ -683,9 +685,7 @@ static long wb_writeback(struct bdi_writeback *wb, * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else + if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ @@ -1188,10 +1188,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .done = &done, - .nr_pages = nr, + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 17e7ccc322a5..3f6542ca6198 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -47,6 +47,7 @@ struct writeback_control { unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ + unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 31f698862420..955fe35d01e0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -892,12 +892,12 @@ int write_cache_pages(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { -- cgit v1.2.3 From 94c3dcbb0b0cdfd82cedd21705424d8044edc42c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 27 Apr 2011 19:05:21 -0600 Subject: writeback: update dirtied_when for synced inode to prevent livelock Explicitly update .dirtied_when on synced inodes, so that they are no longer considered for writeback in the next round. It can prevent both of the following livelock schemes: - while true; do echo data >> f; done - while true; do touch f; done (in theory) The exact livelock condition is, during sync(1): (1) no new inodes are dirtied (2) an inode being actively dirtied On (2), the inode will be tagged and synced with .nr_to_write=LONG_MAX. When finished, it will be redirty_tail()ed because it's still dirty and (.nr_to_write > 0). redirty_tail() won't update its ->dirtied_when on condition (1). The sync work will then revisit it on the next queue_io() and find it eligible again because its old ->dirtied_when predates the sync work start time. We'll do more aggressive "keep writeback as long as we wrote something" logic in wb_writeback(). The "use LONG_MAX .nr_to_write" trick in commit b9543dac5bbc ("writeback: avoid livelocking WB_SYNC_ALL writeback") will no longer be enough to stop sync livelock. Reviewed-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5ed2ce9a28d0..fe190a8b0bc8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -419,6 +419,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() -- cgit v1.2.3 From cb9bd1159c5fe8995e151fa7df10fa19f8c119cc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 22:50:57 -0600 Subject: writeback: introduce writeback_control.inodes_written The flusher works on dirty inodes in batches, and may quit prematurely if the batch of inodes happen to be metadata-only dirtied: in this case wbc->nr_to_write won't be decreased at all, which stands for "no pages written" but also mis-interpreted as "no progress". So introduce writeback_control.inodes_written to count the inodes get cleaned from VFS POV. A non-zero value means there are some progress on writeback, in which case more writeback can be tried. Acked-by: Jan Kara Acked-by: Mel Gorman Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 4 ++++ include/linux/writeback.h | 1 + 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fe190a8b0bc8..e4504299f4a5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -464,6 +464,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * No need to add it back to the LRU. */ list_del_init(&inode->i_wb_list); + wbc->inodes_written++; } } inode_sync_complete(inode); @@ -725,6 +726,7 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.more_io = 0; wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; + wbc.inodes_written = 0; trace_wbc_writeback_start(&wbc, wb->bdi); if (work->sb) @@ -741,6 +743,8 @@ static long wb_writeback(struct bdi_writeback *wb, */ if (wbc.nr_to_write <= 0) continue; + if (wbc.inodes_written) + continue; /* * Didn't write everything and we don't have more IO, bail */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f6542ca6198..7df9026f7129 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -34,6 +34,7 @@ struct writeback_control { long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ + long inodes_written; /* # of inodes written (at least) */ /* * For a_ops->writepages(): is start or end are non-zero then this is -- cgit v1.2.3 From e6fb6da2e10682d477f2fdb749451d9fe5d168e8 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 22 Jul 2010 10:23:44 -0600 Subject: writeback: try more writeback as long as something was written writeback_inodes_wb()/__writeback_inodes_sb() are not aggressive in that they only populate possibly a subset of eligible inodes into b_io at entrance time. When the queued set of inodes are all synced, they just return, possibly with all queued inode pages written but still wbc.nr_to_write > 0. For kupdate and background writeback, there may be more eligible inodes sitting in b_dirty when the current set of b_io inodes are completed. So it is necessary to try another round of writeback as long as we made some progress in this round. When there are no more eligible inodes, no more inodes will be enqueued in queue_io(), hence nothing could/will be synced and we may safely bail. For example, imagine 100 inodes i0, i1, i2, ..., i90, i91, i99 At queue_io() time, i90-i99 happen to be expired and moved to s_io for IO. When finished successfully, if their total size is less than MAX_WRITEBACK_PAGES, nr_to_write will be > 0. Then wb_writeback() will quit the background work (w/o this patch) while it's still over background threshold. This will be a fairly normal/frequent case I guess. Now that we do tagged sync and update inode->dirtied_when after the sync, this change won't livelock sync(1). I actually tried to write 1 page per 1ms with this command write-and-fsync -n10000 -S 1000 -c 4096 /fs/test and do sync(1) at the same time. The sync completes quickly on ext4, xfs, btrfs. Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e4504299f4a5..271cf2150ba0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -739,22 +739,22 @@ static long wb_writeback(struct bdi_writeback *wb, wrote += write_chunk - wbc.nr_to_write; /* - * If we consumed everything, see if we have more + * Did we write something? Try for more + * + * Dirty inodes are moved to b_io for writeback in batches. + * The completion of the current batch does not necessarily + * mean the overall work is done. So we keep looping as long + * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write <= 0) + if (wbc.nr_to_write < write_chunk) continue; if (wbc.inodes_written) continue; /* - * Didn't write everything and we don't have more IO, bail + * No more inodes for IO, bail */ if (!wbc.more_io) break; - /* - * Did we write something? Try for more - */ - if (wbc.nr_to_write < write_chunk) - continue; /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise -- cgit v1.2.3 From ba9aa8399fda48510d80c2fed1afb8fedbe1bb41 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 20:32:30 -0600 Subject: writeback: the kupdate expire timestamp should be a moving target Dynamically compute the dirty expire timestamp at queue_io() time. writeback_control.older_than_this used to be determined at entrance to the kupdate writeback work. This _static_ timestamp may go stale if the kupdate work runs on and on. The flusher may then stuck with some old busy inodes, never considering newly expired inodes thereafter. This has two possible problems: - It is unfair for a large dirty inode to delay (for a long time) the writeback of small dirty inodes. - As time goes by, the large and busy dirty inode may contain only _freshly_ dirtied pages. Ignoring newly expired dirty inodes risks delaying the expired dirty pages to the end of LRU lists, triggering the evil pageout(). Nevertheless this patch merely addresses part of the problem. v2: keep policy changes inside wb_writeback() and keep the wbc.older_than_this visibility as suggested by Dave. CC: Dave Chinner Acked-by: Jan Kara Acked-by: Mel Gorman Signed-off-by: Itaru Kitayama Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 271cf2150ba0..0adee7853b80 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -672,11 +672,6 @@ static long wb_writeback(struct bdi_writeback *wb, long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; - if (wbc.for_kupdate) { - wbc.older_than_this = &oldest_jif; - oldest_jif = jiffies - - msecs_to_jiffies(dirty_expire_interval * 10); - } if (!wbc.range_cyclic) { wbc.range_start = 0; wbc.range_end = LLONG_MAX; @@ -723,6 +718,12 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !over_bground_thresh()) break; + if (work->for_kupdate) { + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + wbc.older_than_this = &oldest_jif; + } + wbc.more_io = 0; wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; -- cgit v1.2.3 From 424b351fe1901fc909fd0ca4f21dab58f24c1aac Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 20:11:53 -0600 Subject: writeback: refill b_io iff empty There is no point to carry different refill policies between for_kupdate and other type of works. Use a consistent "refill b_io iff empty" policy which can guarantee fairness in an easy to understand way. A b_io refill will setup a _fixed_ work set with all currently eligible inodes and start a new round of walk through b_io. The "fixed" work set means no new inodes will be added to the work set during the walk. Only when a complete walk over b_io is done, new inodes that are eligible at the time will be enqueued and the walk be started over. This procedure provides fairness among the inodes because it guarantees each inode to be synced once and only once at each round. So all inodes will be free from starvations. This change relies on wb_writeback() to keep retrying as long as we made some progress on cleaning some pages and/or inodes. Without that ability, the old logic on background works relies on aggressively queuing all eligible inodes into b_io at every time. But that's not a guarantee. The below test script completes a slightly faster now: 2.6.39-rc3 2.6.39-rc3-dyn-expire+ ------------------------------------------------ all elapsed 256.043 252.367 stddev 24.381 12.530 tar elapsed 30.097 28.808 dd elapsed 13.214 11.782 #!/bin/zsh cp /c/linux-2.6.38.3.tar.bz2 /dev/shm/ umount /dev/sda7 mkfs.xfs -f /dev/sda7 mount /dev/sda7 /fs echo 3 > /proc/sys/vm/drop_caches tic=$(cat /proc/uptime|cut -d' ' -f2) cd /fs time tar jxf /dev/shm/linux-2.6.38.3.tar.bz2 & time dd if=/dev/zero of=/fs/zero bs=1M count=1000 & wait sync tac=$(cat /proc/uptime|cut -d' ' -f2) echo elapsed: $((tac - tic)) It maintains roughly the same small vs. large file writeout shares, and offers large files better chances to be written in nice 4M chunks. Analyzes from Dave Chinner in great details: Let's say we have lots of inodes with 100 dirty pages being created, and one large writeback going on. We expire 8 new inodes for every 1024 pages we write back. With the old code, we do: b_more_io (large inode) -> b_io (1l) 8 newly expired inodes -> b_io (1l, 8s) writeback large inode 1024 pages -> b_more_io b_more_io (large inode) -> b_io (8s, 1l) 8 newly expired inodes -> b_io (8s, 1l, 8s) writeback 8 small inodes 800 pages 1 large inode 224 pages -> b_more_io b_more_io (large inode) -> b_io (8s, 1l) 8 newly expired inodes -> b_io (8s, 1l, 8s) ..... Your new code: b_more_io (large inode) -> b_io (1l) 8 newly expired inodes -> b_io (1l, 8s) writeback large inode 1024 pages -> b_more_io (b_io == 8s) writeback 8 small inodes 800 pages b_io empty: (1800 pages written) b_more_io (large inode) -> b_io (1l) 14 newly expired inodes -> b_io (1l, 14s) writeback large inode 1024 pages -> b_more_io (b_io == 14s) writeback 10 small inodes 1000 pages 1 small inode 24 pages -> b_more_io (1l, 1s(24)) writeback 5 small inodes 500 pages b_io empty: (2548 pages written) b_more_io (large inode) -> b_io (1l, 1s(24)) 20 newly expired inodes -> b_io (1l, 1s(24), 20s) ...... Rough progression of pages written at b_io refill: Old code: total large file % of writeback 1024 224 21.9% (fixed) New code: total large file % of writeback 1800 1024 ~55% 2550 1024 ~40% 3050 1024 ~33% 3500 1024 ~29% 3950 1024 ~26% 4250 1024 ~24% 4500 1024 ~22.7% 4700 1024 ~21.7% 4800 1024 ~21.3% 4800 1024 ~21.3% (pretty much steady state from here) Ok, so the steady state is reached with a similar percentage of writeback to the large file as the existing code. Ok, that's good, but providing some evidence that is doesn't change the shared of writeback to the large should be in the commit message ;) The other advantage to this is that we always write 1024 page chunks to the large file, rather than smaller "whatever remains" chunks. CC: Jan Kara Acked-by: Mel Gorman Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0adee7853b80..664acdb2e7ef 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -589,7 +589,8 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) + + if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); while (!list_empty(&wb->b_io)) { @@ -616,7 +617,7 @@ static void __writeback_inodes_sb(struct super_block *sb, WARN_ON(!rwsem_is_locked(&sb->s_umount)); spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) + if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); spin_unlock(&inode_wb_list_lock); -- cgit v1.2.3 From f758eeabeb96f878c860e8f110f94ec8820822a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Apr 2011 18:19:44 -0600 Subject: writeback: split inode_wb_list_lock into bdi_writeback.list_lock Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, as it's currently the most contended lock in the system for metadata heavy workloads. It won't help for single-filesystem workloads for which we'll need the I/O-less balance_dirty_pages, but at least we can dedicate a cpu to spinning on each bdi now for larger systems. Based on earlier patches from Nick Piggin and Dave Chinner. It reduces lock contentions to 1/4 in this test case: 10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram lock_stat version 0.3 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- vanilla 2.6.39-rc3: inode_wb_list_lock: 42590 44433 0.12 147.74 144127.35 252274 886792 0.08 121.34 917211.23 ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 34 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 12893 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 10702 [] writeback_single_inode+0x16d/0x20a ------------------ inode_wb_list_lock 2 [] bdev_inode_switch_bdi+0x29/0x85 inode_wb_list_lock 19 [] inode_wb_list_del+0x22/0x49 inode_wb_list_lock 5550 [] __mark_inode_dirty+0x170/0x1d0 inode_wb_list_lock 8511 [] writeback_sb_inodes+0x10f/0x157 2.6.39-rc3 + patch: &(&wb->list_lock)->rlock: 11383 11657 0.14 151.69 40429.51 90825 527918 0.11 145.90 556843.37 ------------------------ &(&wb->list_lock)->rlock 10 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 1493 [] writeback_inodes_wb+0x3d/0x150 &(&wb->list_lock)->rlock 3652 [] writeback_sb_inodes+0x123/0x16f &(&wb->list_lock)->rlock 1412 [] writeback_single_inode+0x17f/0x223 ------------------------ &(&wb->list_lock)->rlock 3 [] bdi_lock_two+0x46/0x4b &(&wb->list_lock)->rlock 6 [] inode_wb_list_del+0x5f/0x86 &(&wb->list_lock)->rlock 2061 [] __mark_inode_dirty+0x173/0x1cf &(&wb->list_lock)->rlock 2629 [] writeback_sb_inodes+0x123/0x16f hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment Signed-off-by: Christoph Hellwig Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Wu Fengguang --- fs/block_dev.c | 16 +++++--- fs/fs-writeback.c | 97 +++++++++++++++++++++++---------------------- fs/inode.c | 5 +-- include/linux/backing-dev.h | 2 + include/linux/writeback.h | 2 - mm/backing-dev.c | 21 ++++++++-- mm/filemap.c | 6 +-- mm/rmap.c | 4 +- 8 files changed, 85 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a2421f908f0..3c9a03e51b62 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } - EXPORT_SYMBOL(I_BDEV); /* - * move the inode from it's current bdi to the a new bdi. if the inode is dirty - * we need to move it onto the dirty list of @dst so that the inode is always - * on the right list. + * Move the inode from its current bdi to a new bdi. If the inode is dirty we + * need to move it onto the dirty list of @dst so that the inode is always on + * the right list. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *old = inode->i_data.backing_dev_info; + + if (unlikely(dst == old)) /* deadlock avoidance */ + return; + bdi_lock_two(&old->wb, &dst->wb); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&old->wb.list_lock); + spin_unlock(&dst->wb.list_lock); } static sector_t max_block(struct block_device *bdev) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 664acdb2e7ef..36a30917e0dc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -181,12 +181,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); } - /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -196,11 +197,9 @@ void inode_wb_list_del(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode) +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -214,11 +213,9 @@ static void redirty_tail(struct inode *inode) /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ -static void requeue_io(struct inode *inode) +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } @@ -226,7 +223,7 @@ static void inode_sync_complete(struct inode *inode) { /* * Prevent speculative execution through - * spin_unlock(&inode_wb_list_lock); + * spin_unlock(&wb->list_lock); */ smp_mb(); @@ -302,7 +299,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -317,7 +314,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) /* * Wait for writeback on an inode to complete. */ -static void inode_wait_for_writeback(struct inode *inode) +static void inode_wait_for_writeback(struct inode *inode, + struct bdi_writeback *wb) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -325,15 +323,15 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * Write out an inode's dirty pages. Called under wb->list_lock and * inode->i_lock. Either the caller has an active reference on the inode or * the inode has I_WILL_FREE set. * @@ -344,13 +342,14 @@ static void inode_wait_for_writeback(struct inode *inode) * livelocks, etc. */ static int -writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; unsigned dirty; int ret; - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); if (!atomic_read(&inode->i_count)) @@ -368,14 +367,14 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { - requeue_io(inode); + requeue_io(inode, wb); return 0; } /* * It's a data-integrity sync. We must wait. */ - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); } BUG_ON(inode->i_state & I_SYNC); @@ -384,7 +383,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); ret = do_writepages(mapping, wbc); @@ -415,7 +414,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { @@ -438,7 +437,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * slice used up: queue for next turn */ - requeue_io(inode); + requeue_io(inode, wb); } else { /* * Writeback blocked by something other than @@ -447,7 +446,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode); + redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -456,7 +455,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * submission or metadata updates after data IO * completion. */ - redirty_tail(inode); + redirty_tail(inode, wb); } else { /* * The inode is clean. At this point we either have @@ -521,7 +520,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * superblock, move all inodes not belonging * to it back onto the dirty list. */ - redirty_tail(inode); + redirty_tail(inode, wb); continue; } @@ -541,7 +540,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); - requeue_io(inode); + requeue_io(inode, wb); continue; } @@ -557,19 +556,19 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, __iget(inode); pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wbc); + writeback_single_inode(inode, wb, wbc); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode); + redirty_tail(inode, wb); } spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); iput(inode); cond_resched(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -588,7 +587,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -598,7 +597,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, struct super_block *sb = inode->i_sb; if (!pin_sb_for_writeback(sb)) { - requeue_io(inode); + requeue_io(inode, wb); continue; } ret = writeback_sb_inodes(sb, wb, wbc, false); @@ -607,7 +606,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); /* Leave any unwritten inodes on b_io */ } @@ -616,11 +615,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); } /* @@ -762,15 +761,15 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); } return wrote; @@ -1104,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } spin_unlock(&inode->i_lock); - spin_lock(&inode_wb_list_lock); + spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); @@ -1309,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -1321,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, &wbc); + ret = writeback_single_inode(inode, wb, &wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1345,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wbc); + ret = writeback_single_inode(inode, wb, wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/inode.c b/fs/inode.c index 0f7e88a7803f..4be128cbc754 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -37,7 +37,7 @@ * inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list - * inode_wb_list_lock protects: + * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash @@ -48,7 +48,7 @@ * inode->i_lock * inode_lru_lock * - * inode_wb_list_lock + * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock @@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru); static DEFINE_SPINLOCK(inode_lru_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * iprune_sem provides exclusion between the icache shrinking and the diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 96f4094b706d..47feb2c4706a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -57,6 +57,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + spinlock_t list_lock; /* protects the b_* lists */ }; struct backing_dev_info { @@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 7df9026f7129..c2d957fb38d3 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -9,8 +9,6 @@ struct backing_dev_info; -extern spinlock_t inode_wb_list_lock; - /* * fs/fs-writeback.c */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f032e6e1e09a..5f6553ef1ba7 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); +void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) +{ + if (wb1 < wb2) { + spin_lock(&wb1->list_lock); + spin_lock_nested(&wb2->list_lock, 1); + } else { + spin_lock(&wb2->list_lock); + spin_lock_nested(&wb1->list_lock, 1); + } +} + #ifdef CONFIG_DEBUG_FS #include #include @@ -67,14 +78,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -628,6 +639,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + spin_lock_init(&wb->list_lock); setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } @@ -676,11 +688,12 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_wb_list_lock); + bdi_lock_two(&bdi->wb, dst); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); + spin_unlock(&dst->list_lock); } bdi_unregister(bdi); diff --git a/mm/filemap.c b/mm/filemap.c index d7b10578a64b..1e492c3dd6f8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -81,7 +81,7 @@ * ->i_mutex * ->i_alloc_sem (various) * - * inode_wb_list_lock + * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * @@ -99,9 +99,9 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * inode_wb_list_lock (page_remove_rmap->set_page_dirty) + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * inode_wb_list_lock (zap_pte_range->set_page_dirty) + * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * diff --git a/mm/rmap.c b/mm/rmap.c index 0eb463ea88dd..d04e36a7cc9f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,11 +32,11 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) - * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_wb_list_lock in __sync_single_inode) + * within bdi.wb->list_lock in __sync_single_inode) * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock -- cgit v1.2.3 From e8dfc30582995ae12454cda517b17d6294175b07 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 21 Apr 2011 12:06:32 -0600 Subject: writeback: elevate queue_io() into wb_writeback() Code refactor for more logical code layout. No behavior change. - remove the mis-named __writeback_inodes_sb() - wb_writeback()/writeback_inodes_wb() will decide when to queue_io() before calling __writeback_inodes_wb() Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 36a30917e0dc..565b1fd15be6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -580,17 +580,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, return 1; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static void __writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) { int ret = 0; if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&wb->list_lock); - - if (list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -606,19 +602,16 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&wb->list_lock); /* Leave any unwritten inodes on b_io */ } -static void __writeback_inodes_sb(struct super_block *sb, - struct bdi_writeback *wb, struct writeback_control *wbc) +void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) { - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); - writeback_sb_inodes(sb, wb, wbc, true); + __writeback_inodes_wb(wb, wbc); spin_unlock(&wb->list_lock); } @@ -685,7 +678,7 @@ static long wb_writeback(struct bdi_writeback *wb, * The intended call sequence for WB_SYNC_ALL writeback is: * * wb_writeback() - * __writeback_inodes_sb() <== called only once + * writeback_sb_inodes() <== called only once * write_cache_pages() <== called once for each inode * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages @@ -694,6 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb, write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ + spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -730,10 +724,12 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.inodes_written = 0; trace_wbc_writeback_start(&wbc, wb->bdi); + if (list_empty(&wb->b_io)) + queue_io(wb, wbc.older_than_this); if (work->sb) - __writeback_inodes_sb(work->sb, wb, &wbc); + writeback_sb_inodes(work->sb, wb, &wbc, true); else - writeback_inodes_wb(wb, &wbc); + __writeback_inodes_wb(wb, &wbc); trace_wbc_writeback_written(&wbc, wb->bdi); work->nr_pages -= write_chunk - wbc.nr_to_write; @@ -761,7 +757,6 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&wb->list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); @@ -769,8 +764,8 @@ static long wb_writeback(struct bdi_writeback *wb, inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } - spin_unlock(&wb->list_lock); } + spin_unlock(&wb->list_lock); return wrote; } -- cgit v1.2.3 From e185dda89d69cde142b48059413a03561f41f78a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 23 Apr 2011 11:26:07 -0600 Subject: writeback: avoid extra sync work at enqueue time This removes writeback_control.wb_start and does more straightforward sync livelock prevention by setting .older_than_this to prevent extra inodes from being enqueued in the first place. Acked-by: Jan Kara Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 16 +++------------- include/linux/writeback.h | 3 --- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 565b1fd15be6..d0553f33fb50 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -544,15 +544,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, continue; } - /* - * Was this inode dirtied after sync_sb_inodes was called? - * This keeps sync from extra jobs and livelock. - */ - if (inode_dirtied_after(inode, wbc->wb_start)) { - spin_unlock(&inode->i_lock); - return 1; - } - __iget(inode); pages_skipped = wbc->pages_skipped; @@ -585,9 +576,6 @@ static void __writeback_inodes_wb(struct bdi_writeback *wb, { int ret = 0; - if (!wbc->wb_start) - wbc->wb_start = jiffies; /* livelock avoidance */ - while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; @@ -686,7 +674,9 @@ static long wb_writeback(struct bdi_writeback *wb, if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) write_chunk = LONG_MAX; - wbc.wb_start = jiffies; /* livelock avoidance */ + oldest_jif = jiffies; + wbc.older_than_this = &oldest_jif; + spin_lock(&wb->list_lock); for (;;) { /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index c2d957fb38d3..d8e96a480850 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -26,9 +26,6 @@ struct writeback_control { enum writeback_sync_modes sync_mode; unsigned long *older_than_this; /* If !NULL, only write back inodes older than this */ - unsigned long wb_start; /* Time writeback_inodes_wb was - called. This is needed to avoid - extra jobs and livelock */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ -- cgit v1.2.3 From b7a2441f9966fe3e1be960a876ab52e6029ea005 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 21 Jul 2010 22:19:51 -0600 Subject: writeback: remove writeback_control.more_io When wbc.more_io was first introduced, it indicates whether there are at least one superblock whose s_more_io contains more IO work. Now with the per-bdi writeback, it can be replaced with a simple b_more_io test. Acked-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Minchan Kim Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 9 ++------- include/linux/writeback.h | 1 - include/trace/events/ext4.h | 6 ++---- include/trace/events/writeback.h | 5 +---- 4 files changed, 5 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d0553f33fb50..f43c479feee9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -560,12 +560,8 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, iput(inode); cond_resched(); spin_lock(&wb->list_lock); - if (wbc->nr_to_write <= 0) { - wbc->more_io = 1; + if (wbc->nr_to_write <= 0) return 1; - } - if (!list_empty(&wb->b_more_io)) - wbc->more_io = 1; } /* b_io is empty */ return 1; @@ -708,7 +704,6 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.older_than_this = &oldest_jif; } - wbc.more_io = 0; wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; wbc.inodes_written = 0; @@ -740,7 +735,7 @@ static long wb_writeback(struct bdi_writeback *wb, /* * No more inodes for IO, bail */ - if (!wbc.more_io) + if (list_empty(&wb->b_more_io)) break; /* * Nothing written. Wait for some inode to diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d8e96a480850..8797b20dd22b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -46,7 +46,6 @@ struct writeback_control { unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ - unsigned more_io:1; /* more io to be dispatched */ }; /* diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index e09592d2f916..b225d0d8c87f 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -404,7 +404,6 @@ TRACE_EVENT(ext4_da_writepages_result, __field( int, pages_written ) __field( long, pages_skipped ) __field( int, sync_mode ) - __field( char, more_io ) __field( pgoff_t, writeback_index ) ), @@ -415,16 +414,15 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; - __entry->more_io = wbc->more_io; __entry->writeback_index = inode->i_mapping->writeback_index; ), TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " - " more_io %d sync_mode %d writeback_index %lu", + "sync_mode %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, - __entry->more_io, __entry->sync_mode, + __entry->sync_mode, (unsigned long) __entry->writeback_index) ); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 4e249b927eaa..b2cfac5f3313 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -101,7 +101,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) - __field(int, more_io) __field(unsigned long, older_than_this) __field(long, range_start) __field(long, range_end) @@ -116,7 +115,6 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; - __entry->more_io = wbc->more_io; __entry->older_than_this = wbc->older_than_this ? *wbc->older_than_this : 0; __entry->range_start = (long)wbc->range_start; @@ -124,7 +122,7 @@ DECLARE_EVENT_CLASS(wbc_class, ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " - "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " + "bgrd=%d reclm=%d cyclic=%d older=0x%lx " "start=0x%lx end=0x%lx", __entry->name, __entry->nr_to_write, @@ -134,7 +132,6 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, - __entry->more_io, __entry->older_than_this, __entry->range_start, __entry->range_end) -- cgit v1.2.3 From 846d5a091b0506b75489577cde27f39b37a192a4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 5 May 2011 21:10:38 -0600 Subject: writeback: remove .nonblocking and .encountered_congestion Remove two unused struct writeback_control fields: .encountered_congestion (completely unused) .nonblocking (never set, checked/showed in XFS,NFS/btrfs) The .for_background check in nfs_write_inode() is also removed btw, as .for_background implies WB_SYNC_NONE. Reviewed-by: Jan Kara Proposed-by: Christoph Hellwig Signed-off-by: Wu Fengguang --- fs/nfs/write.c | 3 +-- fs/xfs/linux-2.6/xfs_aops.c | 2 +- include/linux/writeback.h | 2 -- include/trace/events/btrfs.h | 6 ++---- 4 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e268e3b23497..dd6a6cee39a7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1564,8 +1564,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int status; bool sync = true; - if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || - wbc->for_background) + if (wbc->sync_mode == WB_SYNC_NONE) sync = false; status = pnfs_layoutcommit_inode(inode, sync); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 79ce38be15a1..7559861481aa 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -970,7 +970,7 @@ xfs_vm_writepage( offset = page_offset(page); type = IO_OVERWRITE; - if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + if (wbc->sync_mode == WB_SYNC_NONE) nonblocking = 1; do { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8797b20dd22b..2f1b512bd6e0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -39,8 +39,6 @@ struct writeback_control { loff_t range_start; loff_t range_end; - unsigned nonblocking:1; /* Don't get stuck on request queues */ - unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 4114129f0794..b31702ac15be 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -284,7 +284,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) - __field( char, nonblocking ) __field( char, for_kupdate ) __field( char, for_reclaim ) __field( char, range_cyclic ) @@ -299,7 +298,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; - __entry->nonblocking = wbc->nonblocking; __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; @@ -310,13 +308,13 @@ DECLARE_EVENT_CLASS(btrfs__writepage, TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, " "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, " - "range_end = %llu, nonblocking = %d, for_kupdate = %d, " + "range_end = %llu, for_kupdate = %d, " "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu", show_root_type(__entry->root_objectid), (unsigned long)__entry->ino, __entry->index, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, - __entry->nonblocking, __entry->for_kupdate, + __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic, (unsigned long)__entry->writeback_index) ); -- cgit v1.2.3 From 251d6a471c831e22880b3c146bb4556ddfb1dc82 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 1 Dec 2010 17:33:37 -0600 Subject: writeback: trace event writeback_single_inode It is valuable to know how the dirty inodes are iterated and their IO size. "writeback_single_inode: bdi 8:0: ino=134246746 state=I_DIRTY_SYNC|I_SYNC age=414 index=0 to_write=1024 wrote=0" - "state" reflects inode->i_state at the end of writeback_single_inode() - "index" reflects mapping->writeback_index after the ->writepages() call - "to_write" is the wbc->nr_to_write at entrance of writeback_single_inode() - "wrote" is the number of pages actually written v2: add trace event writeback_single_inode_requeue as proposed by Dave. CC: Dave Chinner Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 4 +++ include/trace/events/writeback.h | 70 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f43c479feee9..5185fad48b62 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -346,6 +346,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; + long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; @@ -368,6 +369,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, */ if (wbc->sync_mode != WB_SYNC_ALL) { requeue_io(inode, wb); + trace_writeback_single_inode_requeue(inode, wbc, + nr_to_write); return 0; } @@ -467,6 +470,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, } } inode_sync_complete(inode); + trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index b2cfac5f3313..898277bc89b4 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -8,6 +8,19 @@ #include #include +#define show_inode_state(state) \ + __print_flags(state, "|", \ + {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ + {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ + {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ + {I_NEW, "I_NEW"}, \ + {I_WILL_FREE, "I_WILL_FREE"}, \ + {I_FREEING, "I_FREEING"}, \ + {I_CLEAR, "I_CLEAR"}, \ + {I_SYNC, "I_SYNC"}, \ + {I_REFERENCED, "I_REFERENCED"} \ + ) + struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_work_class, @@ -184,6 +197,63 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, TP_ARGS(usec_timeout, usec_delayed) ); +DECLARE_EVENT_CLASS(writeback_single_inode_template, + + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + unsigned long nr_to_write + ), + + TP_ARGS(inode, wbc, nr_to_write), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, ino) + __field(unsigned long, state) + __field(unsigned long, age) + __field(unsigned long, writeback_index) + __field(long, nr_to_write) + __field(unsigned long, wrote) + ), + + TP_fast_assign( + strncpy(__entry->name, + dev_name(inode->i_mapping->backing_dev_info->dev), 32); + __entry->ino = inode->i_ino; + __entry->state = inode->i_state; + __entry->age = (jiffies - inode->dirtied_when) * + 1000 / HZ; + __entry->writeback_index = inode->i_mapping->writeback_index; + __entry->nr_to_write = nr_to_write; + __entry->wrote = nr_to_write - wbc->nr_to_write; + ), + + TP_printk("bdi %s: ino=%lu state=%s age=%lu " + "index=%lu to_write=%ld wrote=%lu", + __entry->name, + __entry->ino, + show_inode_state(__entry->state), + __entry->age, + __entry->writeback_index, + __entry->nr_to_write, + __entry->wrote + ) +); + +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue, + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + unsigned long nr_to_write), + TP_ARGS(inode, wbc, nr_to_write) +); + +DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, + TP_PROTO(struct inode *inode, + struct writeback_control *wbc, + unsigned long nr_to_write), + TP_ARGS(inode, wbc, nr_to_write) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ -- cgit v1.2.3 From e84d0a4f8e39a73003a6ec9a11b07702745f4c1f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sat, 23 Apr 2011 12:27:27 -0600 Subject: writeback: trace event writeback_queue_io Note that it adds a little overheads to account the moved/enqueued inodes from b_dirty to b_io. The "moved" accounting may be later used to limit the number of inodes that can be moved in one shot, in order to keep spinlock hold time under control. Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 14 ++++++++++---- include/trace/events/writeback.h | 25 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5185fad48b62..6caa98247a5b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -248,15 +248,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) /* * Move expired dirty inodes from @delaying_queue to @dispatch_queue. */ -static void move_expired_inodes(struct list_head *delaying_queue, +static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + unsigned long *older_than_this) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; + int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); @@ -267,12 +268,13 @@ static void move_expired_inodes(struct list_head *delaying_queue, do_sb_sort = 1; sb = inode->i_sb; list_move(&inode->i_wb_list, &tmp); + moved++; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); - return; + goto out; } /* Move inodes from one superblock together */ @@ -284,6 +286,8 @@ static void move_expired_inodes(struct list_head *delaying_queue, list_move(&inode->i_wb_list, dispatch_queue); } } +out: + return moved; } /* @@ -299,9 +303,11 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { + int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + trace_writeback_queue_io(wb, older_than_this, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 898277bc89b4..205d14919ef2 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -162,6 +162,31 @@ DEFINE_WBC_EVENT(wbc_balance_dirty_written); DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +TRACE_EVENT(writeback_queue_io, + TP_PROTO(struct bdi_writeback *wb, + unsigned long *older_than_this, + int moved), + TP_ARGS(wb, older_than_this, moved), + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, older) + __field(long, age) + __field(int, moved) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + __entry->older = older_than_this ? *older_than_this : 0; + __entry->age = older_than_this ? + (jiffies - *older_than_this) * 1000 / HZ : -1; + __entry->moved = moved; + ), + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d", + __entry->name, + __entry->older, /* older_than_this in jiffies */ + __entry->age, /* older_than_this in relative milliseconds */ + __entry->moved) +); + DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), -- cgit v1.2.3 From 785c4bcc0d88ff006a0b2120815a71e86ecf21ce Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 23 May 2011 18:33:01 +0200 Subject: ext3: Add fixed tracepoints This commit adds fixed tracepoints to the ext3 code. It is based on ext4 tracepoints, however due to the differences of both file systems, there are some tracepoints missing (those for delaloc and for multi-block allocator) and there are some ext3 specific as well (for reservation windows). Here is a list: ext3_free_inode ext3_request_inode ext3_allocate_inode ext3_evict_inode ext3_drop_inode ext3_mark_inode_dirty ext3_write_begin ext3_ordered_write_end ext3_writeback_write_end ext3_journalled_write_end ext3_ordered_writepage ext3_writeback_writepage ext3_journalled_writepage ext3_readpage ext3_releasepage ext3_invalidatepage ext3_discard_blocks ext3_request_blocks ext3_allocate_blocks ext3_free_blocks ext3_sync_file_enter ext3_sync_file_exit ext3_sync_fs ext3_rsv_window_add ext3_discard_reservation ext3_alloc_new_reservation ext3_reserved ext3_forget ext3_read_block_bitmap ext3_direct_IO_enter ext3_direct_IO_exit ext3_unlink_enter ext3_unlink_exit ext3_truncate_enter ext3_truncate_exit ext3_get_blocks_enter ext3_get_blocks_exit ext3_load_inode Signed-off-by: Lukas Czerner Cc: Jan Kara Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 34 +- fs/ext3/fsync.c | 15 +- fs/ext3/ialloc.c | 4 + fs/ext3/inode.c | 29 ++ fs/ext3/namei.c | 3 + fs/ext3/super.c | 13 + include/trace/events/ext3.h | 864 ++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 946 insertions(+), 16 deletions(-) create mode 100644 include/trace/events/ext3.h (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index fe52297e31ad..f7d111e499ad 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * balloc.c contains the blocks allocation and deallocation routines @@ -161,6 +162,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) desc = ext3_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + trace_ext3_read_block_bitmap(sb, block_group); bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -351,6 +353,7 @@ void ext3_rsv_window_add(struct super_block *sb, struct rb_node * parent = NULL; struct ext3_reserve_window_node *this; + trace_ext3_rsv_window_add(sb, rsv); while (*p) { parent = *p; @@ -476,8 +479,10 @@ void ext3_discard_reservation(struct inode *inode) rsv = &block_i->rsv_window_node; if (!rsv_is_empty(&rsv->rsv_window)) { spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) + if (!rsv_is_empty(&rsv->rsv_window)) { + trace_ext3_discard_reservation(inode, rsv); rsv_window_remove(inode->i_sb, rsv); + } spin_unlock(rsv_lock); } } @@ -683,14 +688,10 @@ error_return: void ext3_free_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t block, unsigned long count) { - struct super_block * sb; + struct super_block *sb = inode->i_sb; unsigned long dquot_freed_blocks; - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } + trace_ext3_free_blocks(inode, block, count); ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) dquot_free_block(inode, dquot_freed_blocks); @@ -1136,6 +1137,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, else start_block = grp_goal + group_first_block; + trace_ext3_alloc_new_reservation(sb, start_block); size = my_rsv->rsv_goal_size; if (!rsv_is_empty(&my_rsv->rsv_window)) { @@ -1230,8 +1232,11 @@ retry: * check if the first free block is within the * free space we just reserved */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + if (start_block >= my_rsv->rsv_start && + start_block <= my_rsv->rsv_end) { + trace_ext3_reserved(sb, start_block, my_rsv); return 0; /* success */ + } /* * if the first free bit we found is out of the reservable space * continue search for next reservable space, @@ -1514,10 +1519,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext3_new_block: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. @@ -1528,8 +1529,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, return 0; } + trace_ext3_request_blocks(inode, goal, num); + sbi = EXT3_SB(sb); - es = EXT3_SB(sb)->s_es; + es = sbi->s_es; ext3_debug("goal=%lu.\n", goal); /* * Allocate a block from reservation only when @@ -1742,6 +1745,10 @@ allocated: brelse(bitmap_bh); dquot_free_block(inode, *count-num); *count = num; + + trace_ext3_allocate_blocks(inode, goal, num, + (unsigned long long)ret_block); + return ret_block; io_error: @@ -1996,6 +2003,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, if ((next - start) < minblocks) goto free_extent; + trace_ext3_discard_blocks(sb, discard_block, next - start); /* Send the TRIM command down to the device */ err = sb_issue_discard(sb, discard_block, next - start, GFP_NOFS, 0); diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 09b13bb34c94..06a4394d2bc3 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * akpm: A new design for ext3_sync_file(). @@ -51,10 +52,13 @@ int ext3_sync_file(struct file *file, int datasync) int ret, needs_barrier = 0; tid_t commit_tid; + J_ASSERT(ext3_journal_current_handle() == NULL); + + trace_ext3_sync_file_enter(file, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) return 0; - J_ASSERT(ext3_journal_current_handle() == NULL); /* * data=writeback,ordered: @@ -70,8 +74,10 @@ int ext3_sync_file(struct file *file, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext3_should_journal_data(inode)) - return ext3_force_commit(inode->i_sb); + if (ext3_should_journal_data(inode)) { + ret = ext3_force_commit(inode->i_sb); + goto out; + } if (datasync) commit_tid = atomic_read(&ei->i_datasync_tid); @@ -91,5 +97,8 @@ int ext3_sync_file(struct file *file, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + +out: + trace_ext3_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bfc2dc43681d..bf09cbf938cc 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -118,6 +119,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) ino = inode->i_ino; ext3_debug ("freeing inode %lu\n", ino); + trace_ext3_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); @@ -426,6 +428,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_ext3_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -601,6 +604,7 @@ got: } ext3_debug("allocating inode %lu\n", inode->i_ino); + trace_ext3_allocate_inode(inode, dir, mode); goto really_out; fail: ext3_std_error(sb, err); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3451d23c3bae..3aa05eebe0b8 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "xattr.h" #include "acl.h" @@ -70,6 +71,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -198,6 +200,7 @@ void ext3_evict_inode (struct inode *inode) handle_t *handle; int want_delete = 0; + trace_ext3_evict_inode(inode); if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); want_delete = 1; @@ -842,6 +845,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, ext3_fsblk_t first_block = 0; + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -970,6 +974,9 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } @@ -1217,6 +1224,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + trace_ext3_write_begin(inode, pos, len, flags); + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1332,6 +1341,7 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; + trace_ext3_ordered_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1367,6 +1377,7 @@ static int ext3_writeback_write_end(struct file *file, struct inode *inode = file->f_mapping->host; int ret; + trace_ext3_writeback_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); update_file_sizes(inode, pos, copied); /* @@ -1395,6 +1406,7 @@ static int ext3_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; + trace_ext3_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1577,6 +1589,7 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1647,6 +1660,7 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { @@ -1689,6 +1703,7 @@ static int ext3_journalled_writepage(struct page *page, if (ext3_journal_current_handle()) goto no_write; + trace_ext3_journalled_writepage(page); handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1739,6 +1754,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1753,6 +1769,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ @@ -1766,6 +1784,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1794,6 +1813,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_length(iov, nr_segs); int retries = 0; + trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + if (rw == WRITE) { loff_t final_size = offset + count; @@ -1868,6 +1889,8 @@ retry: ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); return ret; } @@ -2446,6 +2469,8 @@ void ext3_truncate(struct inode *inode) unsigned blocksize = inode->i_sb->s_blocksize; struct page *page; + trace_ext3_truncate_enter(inode); + if (!ext3_can_truncate(inode)) goto out_notrans; @@ -2597,6 +2622,7 @@ out_stop: ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); return; out_notrans: /* @@ -2605,6 +2631,7 @@ out_notrans: */ if (inode->i_nlink) ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -2746,6 +2773,7 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ_META, bh); @@ -3372,6 +3400,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 34b6d9bfc48a..51736a4ff0cd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "namei.h" #include "xattr.h" @@ -2144,6 +2145,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) struct ext3_dir_entry_2 * de; handle_t *handle; + trace_ext3_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2189,6 +2191,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext3_journal_stop(handle); brelse (bh); + trace_ext3_unlink_exit(dentry, retval); return retval; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index aad153ef6b78..662290fb6fff 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -44,6 +44,9 @@ #include "acl.h" #include "namei.h" +#define CREATE_TRACE_POINTS +#include + #ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA #else @@ -497,6 +500,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static int ext3_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext3_drop_inode(inode, drop); + return drop; +} + static void ext3_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -788,6 +799,7 @@ static const struct super_operations ext3_sops = { .destroy_inode = ext3_destroy_inode, .write_inode = ext3_write_inode, .dirty_inode = ext3_dirty_inode, + .drop_inode = ext3_drop_inode, .evict_inode = ext3_evict_inode, .put_super = ext3_put_super, .sync_fs = ext3_sync_fs, @@ -2507,6 +2519,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_ext3_sync_fs(sb, wait); if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h new file mode 100644 index 000000000000..7b53c0573dc9 --- /dev/null +++ b/include/trace/events/ext3.h @@ -0,0 +1,864 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ext3 + +#if !defined(_TRACE_EXT3_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EXT3_H + +#include + +TRACE_EVENT(ext3_free_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( uid_t, uid ) + __field( gid_t, gid ) + __field( blkcnt_t, blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->uid = inode->i_uid; + __entry->gid = inode->i_gid; + __entry->blocks = inode->i_blocks; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, __entry->uid, __entry->gid, + (unsigned long) __entry->blocks) +); + +TRACE_EVENT(ext3_request_inode, + TP_PROTO(struct inode *dir, int mode), + + TP_ARGS(dir, mode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, dir ) + __field( umode_t, mode ) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %d,%d dir %lu mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->dir, __entry->mode) +); + +TRACE_EVENT(ext3_allocate_inode, + TP_PROTO(struct inode *inode, struct inode *dir, int mode), + + TP_ARGS(inode, dir, mode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, dir ) + __field( umode_t, mode ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->dir, __entry->mode) +); + +TRACE_EVENT(ext3_evict_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( int, nlink ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nlink = inode->i_nlink; + ), + + TP_printk("dev %d,%d ino %lu nlink %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->nlink) +); + +TRACE_EVENT(ext3_drop_inode, + TP_PROTO(struct inode *inode, int drop), + + TP_ARGS(inode, drop), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( int, drop ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->drop = drop; + ), + + TP_printk("dev %d,%d ino %lu drop %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->drop) +); + +TRACE_EVENT(ext3_mark_inode_dirty, + TP_PROTO(struct inode *inode, unsigned long IP), + + TP_ARGS(inode, IP), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->ip = IP; + ), + + TP_printk("dev %d,%d ino %lu caller %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, (void *)__entry->ip) +); + +TRACE_EVENT(ext3_write_begin, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int flags), + + TP_ARGS(inode, pos, len, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->flags) +); + +DECLARE_EVENT_CLASS(ext3__write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->copied) +); + +DEFINE_EVENT(ext3__write_end, ext3_ordered_write_end, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied) +); + +DEFINE_EVENT(ext3__write_end, ext3_writeback_write_end, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied) +); + +DEFINE_EVENT(ext3__write_end, ext3_journalled_write_end, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied) +); + +DECLARE_EVENT_CLASS(ext3__page_op, + TP_PROTO(struct page *page), + + TP_ARGS(page), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) + + ), + + TP_fast_assign( + __entry->index = page->index; + __entry->ino = page->mapping->host->i_ino; + __entry->dev = page->mapping->host->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu page_index %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->index) +); + +DEFINE_EVENT(ext3__page_op, ext3_ordered_writepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext3__page_op, ext3_writeback_writepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext3__page_op, ext3_journalled_writepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext3__page_op, ext3_readpage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext3__page_op, ext3_releasepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +TRACE_EVENT(ext3_invalidatepage, + TP_PROTO(struct page *page, unsigned long offset), + + TP_ARGS(page, offset), + + TP_STRUCT__entry( + __field( pgoff_t, index ) + __field( unsigned long, offset ) + __field( ino_t, ino ) + __field( dev_t, dev ) + + ), + + TP_fast_assign( + __entry->index = page->index; + __entry->offset = offset; + __entry->ino = page->mapping->host->i_ino; + __entry->dev = page->mapping->host->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->index, __entry->offset) +); + +TRACE_EVENT(ext3_discard_blocks, + TP_PROTO(struct super_block *sb, unsigned long blk, + unsigned long count), + + TP_ARGS(sb, blk, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, blk ) + __field( unsigned long, count ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blk = blk; + __entry->count = count; + ), + + TP_printk("dev %d,%d blk %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blk, __entry->count) +); + +TRACE_EVENT(ext3_request_blocks, + TP_PROTO(struct inode *inode, unsigned long goal, + unsigned long count), + + TP_ARGS(inode, goal, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( unsigned long, count ) + __field( unsigned long, goal ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->count = count; + __entry->goal = goal; + ), + + TP_printk("dev %d,%d ino %lu count %lu goal %lu ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->count, __entry->goal) +); + +TRACE_EVENT(ext3_allocate_blocks, + TP_PROTO(struct inode *inode, unsigned long goal, + unsigned long count, unsigned long block), + + TP_ARGS(inode, goal, count, block), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( unsigned long, block ) + __field( unsigned long, count ) + __field( unsigned long, goal ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->block = block; + __entry->count = count; + __entry->goal = goal; + ), + + TP_printk("dev %d,%d ino %lu count %lu block %lu goal %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->count, __entry->block, + __entry->goal) +); + +TRACE_EVENT(ext3_free_blocks, + TP_PROTO(struct inode *inode, unsigned long block, + unsigned long count), + + TP_ARGS(inode, block, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( unsigned long, block ) + __field( unsigned long, count ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->block = block; + __entry->count = count; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o block %lu count %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, __entry->block, __entry->count) +); + +TRACE_EVENT(ext3_sync_file_enter, + TP_PROTO(struct file *file, int datasync), + + TP_ARGS(file, datasync), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, datasync ) + ), + + TP_fast_assign( + struct dentry *dentry = file->f_path.dentry; + + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = dentry->d_inode->i_ino; + __entry->datasync = datasync; + __entry->parent = dentry->d_parent->d_inode->i_ino; + ), + + TP_printk("dev %d,%d ino %lu parent %ld datasync %d ", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->datasync) +); + +TRACE_EVENT(ext3_sync_file_exit, + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret), + + TP_STRUCT__entry( + __field( int, ret ) + __field( ino_t, ino ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->ret = ret; + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->ret) +); + +TRACE_EVENT(ext3_sync_fs, + TP_PROTO(struct super_block *sb, int wait), + + TP_ARGS(sb, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait = wait; + ), + + TP_printk("dev %d,%d wait %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait) +); + +TRACE_EVENT(ext3_rsv_window_add, + TP_PROTO(struct super_block *sb, + struct ext3_reserve_window_node *rsv_node), + + TP_ARGS(sb, rsv_node), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->start = rsv_node->rsv_window._rsv_start; + __entry->end = rsv_node->rsv_window._rsv_end; + ), + + TP_printk("dev %d,%d start %lu end %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->start, __entry->end) +); + +TRACE_EVENT(ext3_discard_reservation, + TP_PROTO(struct inode *inode, + struct ext3_reserve_window_node *rsv_node), + + TP_ARGS(inode, rsv_node), + + TP_STRUCT__entry( + __field( unsigned long, start ) + __field( unsigned long, end ) + __field( ino_t, ino ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->start = rsv_node->rsv_window._rsv_start; + __entry->end = rsv_node->rsv_window._rsv_end; + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu start %lu end %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long)__entry->ino, __entry->start, + __entry->end) +); + +TRACE_EVENT(ext3_alloc_new_reservation, + TP_PROTO(struct super_block *sb, unsigned long goal), + + TP_ARGS(sb, goal), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, goal ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->goal = goal; + ), + + TP_printk("dev %d,%d goal %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->goal) +); + +TRACE_EVENT(ext3_reserved, + TP_PROTO(struct super_block *sb, unsigned long block, + struct ext3_reserve_window_node *rsv_node), + + TP_ARGS(sb, block, rsv_node), + + TP_STRUCT__entry( + __field( unsigned long, block ) + __field( unsigned long, start ) + __field( unsigned long, end ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->block = block; + __entry->start = rsv_node->rsv_window._rsv_start; + __entry->end = rsv_node->rsv_window._rsv_end; + __entry->dev = sb->s_dev; + ), + + TP_printk("dev %d,%d block %lu, start %lu end %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->block, __entry->start, __entry->end) +); + +TRACE_EVENT(ext3_forget, + TP_PROTO(struct inode *inode, int is_metadata, unsigned long block), + + TP_ARGS(inode, is_metadata, block), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( umode_t, mode ) + __field( int, is_metadata ) + __field( unsigned long, block ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->is_metadata = is_metadata; + __entry->block = block; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, __entry->is_metadata, __entry->block) +); + +TRACE_EVENT(ext3_read_block_bitmap, + TP_PROTO(struct super_block *sb, unsigned int group), + + TP_ARGS(sb, group), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( __u32, group ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->group = group; + ), + + TP_printk("dev %d,%d group %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->group) +); + +TRACE_EVENT(ext3_direct_IO_enter, + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), + + TP_ARGS(inode, offset, len, rw), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( loff_t, pos ) + __field( unsigned long, len ) + __field( int, rw ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pos = offset; + __entry->len = len; + __entry->rw = rw; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->rw) +); + +TRACE_EVENT(ext3_direct_IO_exit, + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, + int rw, int ret), + + TP_ARGS(inode, offset, len, rw, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( loff_t, pos ) + __field( unsigned long, len ) + __field( int, rw ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pos = offset; + __entry->len = len; + __entry->rw = rw; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->rw, __entry->ret) +); + +TRACE_EVENT(ext3_unlink_enter, + TP_PROTO(struct inode *parent, struct dentry *dentry), + + TP_ARGS(parent, dentry), + + TP_STRUCT__entry( + __field( ino_t, parent ) + __field( ino_t, ino ) + __field( loff_t, size ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->parent = parent->i_ino; + __entry->ino = dentry->d_inode->i_ino; + __entry->size = dentry->d_inode->i_size; + __entry->dev = dentry->d_inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu size %lld parent %ld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long)__entry->size, + (unsigned long) __entry->parent) +); + +TRACE_EVENT(ext3_unlink_exit, + TP_PROTO(struct dentry *dentry, int ret), + + TP_ARGS(dentry, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = dentry->d_inode->i_ino; + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->ret) +); + +DECLARE_EVENT_CLASS(ext3__truncate, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( blkcnt_t, blocks ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->blocks = inode->i_blocks; + ), + + TP_printk("dev %d,%d ino %lu blocks %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, (unsigned long) __entry->blocks) +); + +DEFINE_EVENT(ext3__truncate, ext3_truncate_enter, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(ext3__truncate, ext3_truncate_exit, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +TRACE_EVENT(ext3_get_blocks_enter, + TP_PROTO(struct inode *inode, unsigned long lblk, + unsigned long len, int create), + + TP_ARGS(inode, lblk, len, create), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( unsigned long, lblk ) + __field( unsigned long, len ) + __field( int, create ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->len = len; + __entry->create = create; + ), + + TP_printk("dev %d,%d ino %lu lblk %lu len %lu create %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->lblk, __entry->len, __entry->create) +); + +TRACE_EVENT(ext3_get_blocks_exit, + TP_PROTO(struct inode *inode, unsigned long lblk, + unsigned long pblk, unsigned long len, int ret), + + TP_ARGS(inode, lblk, pblk, len, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( unsigned long, lblk ) + __field( unsigned long, pblk ) + __field( unsigned long, len ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->pblk = pblk; + __entry->len = len; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu lblk %lu pblk %lu len %lu ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->lblk, __entry->pblk, + __entry->len, __entry->ret) +); + +TRACE_EVENT(ext3_load_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino) +); + +#endif /* _TRACE_EXT3_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 99cb1a318c37bf462c53d43f4dacb7b4896ce0c9 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 23 May 2011 18:33:02 +0200 Subject: jbd: Add fixed tracepoints This commit adds fixed tracepoint for jbd. It has been based on fixed tracepoints for jbd2, however there are missing those for collecting statistics, since I think that it will require more intrusive patch so I should have its own commit, if someone decide that it is needed. Also there are new tracepoints in __journal_drop_transaction() and journal_update_superblock(). The list of jbd tracepoints: jbd_checkpoint jbd_start_commit jbd_commit_locking jbd_commit_flushing jbd_commit_logging jbd_drop_transaction jbd_end_commit jbd_do_submit_data jbd_cleanup_journal_tail jbd_update_superblock_end Signed-off-by: Lukas Czerner Cc: Jan Kara Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 4 + fs/jbd/commit.c | 11 +++ fs/jbd/journal.c | 4 + include/trace/events/jbd.h | 203 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 222 insertions(+) create mode 100644 include/trace/events/jbd.h (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e4b87bc1fa56..dea7503b47e8 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Unlink a buffer from a transaction checkpoint list. @@ -358,6 +359,7 @@ int log_do_checkpoint(journal_t *journal) * journal straight away. */ result = cleanup_journal_tail(journal); + trace_jbd_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -503,6 +505,7 @@ int cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, "Cleaning journal tail from %d to %d (offset %u), " "freeing %u\n", @@ -752,6 +755,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd_drop_transaction(journal, transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); kfree(transaction); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 72ffa974b0b8..eedd201374a8 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -204,6 +205,8 @@ write_out_data: if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); /* Write out all data to prevent deadlocks */ journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; @@ -236,6 +239,8 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; @@ -266,6 +271,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); return err; @@ -316,12 +322,14 @@ void journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -392,6 +400,7 @@ void journal_commit_transaction(journal_t *journal) */ journal_switch_revoke_table(journal); + trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -493,6 +502,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); + trace_jbd_commit_logging(journal, commit_transaction); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -946,6 +956,7 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_jbd_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index e2d4285fbe90..ab019ee77888 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -38,6 +38,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include #include @@ -1065,6 +1068,7 @@ void journal_update_superblock(journal_t *journal, int wait) } else write_dirty_buffer(bh, WRITE); + trace_jbd_update_superblock_end(journal, wait); out: /* If we have just flushed the log (by marking s_start==0), then * any future commit will have to be careful to update the diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h new file mode 100644 index 000000000000..aff64d82d713 --- /dev/null +++ b/include/trace/events/jbd.h @@ -0,0 +1,203 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd + +#if !defined(_TRACE_JBD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_JBD_H + +#include +#include + +TRACE_EVENT(jbd_checkpoint, + + TP_PROTO(journal_t *journal, int result), + + TP_ARGS(journal, result), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, result ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->result = result; + ), + + TP_printk("dev %d,%d result %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->result) +); + +DECLARE_EVENT_CLASS(jbd_commit, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) +); + +DEFINE_EVENT(jbd_commit, jbd_start_commit, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction) +); + +DEFINE_EVENT(jbd_commit, jbd_commit_locking, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction) +); + +DEFINE_EVENT(jbd_commit, jbd_commit_flushing, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction) +); + +DEFINE_EVENT(jbd_commit, jbd_commit_logging, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction) +); + +TRACE_EVENT(jbd_drop_transaction, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) +); + +TRACE_EVENT(jbd_end_commit, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + __field( int, head ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + __entry->head = journal->j_tail_sequence; + ), + + TP_printk("dev %d,%d transaction %d sync %d head %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit, __entry->head) +); + +TRACE_EVENT(jbd_do_submit_data, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) +); + +TRACE_EVENT(jbd_cleanup_journal_tail, + + TP_PROTO(journal_t *journal, tid_t first_tid, + unsigned long block_nr, unsigned long freed), + + TP_ARGS(journal, first_tid, block_nr, freed), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( tid_t, tail_sequence ) + __field( tid_t, first_tid ) + __field(unsigned long, block_nr ) + __field(unsigned long, freed ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->tail_sequence = journal->j_tail_sequence; + __entry->first_tid = first_tid; + __entry->block_nr = block_nr; + __entry->freed = freed; + ), + + TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tail_sequence, __entry->first_tid, + __entry->block_nr, __entry->freed) +); + +TRACE_EVENT(jbd_update_superblock_end, + TP_PROTO(journal_t *journal, int wait), + + TP_ARGS(journal, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->wait = wait; + ), + + TP_printk("dev %d,%d wait %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait) +); + +#endif /* _TRACE_JBD_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 40680f2fa4670ab35ee554822a69dda1a118f966 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 May 2011 22:24:47 +0200 Subject: ext3: Convert ext3 to new truncate calling convention Mostly trivial conversion. We fix a bug that IS_IMMUTABLE and IS_APPEND files could not be truncated during failed writes as we change the code. In fact the test is not needed at all because both IS_IMMUTABLE and IS_APPEND is tested in upper layers in do_sys_[f]truncate(), may_write(), etc. Signed-off-by: Jan Kara --- fs/ext3/file.c | 1 - fs/ext3/inode.c | 27 +++++++++++---------------- include/linux/ext3_fs.h | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext3/file.c b/fs/ext3/file.c index f55df0e61cbd..86c8ab343f6f 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -71,7 +71,6 @@ const struct file_operations ext3_file_operations = { }; const struct inode_operations ext3_file_inode_operations = { - .truncate = ext3_truncate, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3aa05eebe0b8..b4051c9ac5f2 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -234,12 +234,10 @@ void ext3_evict_inode (struct inode *inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. */ ext3_orphan_del(handle, inode); EXT3_I(inode)->i_dtime = get_seconds(); @@ -890,6 +888,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!create || err == -EIO) goto cleanup; + /* + * Block out ext3_truncate while we alter the tree + */ mutex_lock(&ei->truncate_mutex); /* @@ -938,9 +939,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, */ count = ext3_blks_to_allocate(partial, indirect_blks, maxblocks, blocks_to_boundary); - /* - * Block out ext3_truncate while we alter the tree - */ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -1849,7 +1847,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + ext3_truncate_failed_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1863,7 +1861,7 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate(inode); + ext3_truncate_failed_write(inode); ret = PTR_ERR(handle); goto out; } @@ -2414,8 +2412,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, int ext3_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -3264,9 +3260,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - rc = vmtruncate(inode, attr->ia_size); - if (rc) - goto err_out; + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); } setattr_copy(inode, attr); diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 5e06acf95d0f..9aaa3a84d373 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -913,7 +913,7 @@ extern void ext3_dirty_inode(struct inode *, int); extern int ext3_change_inode_journal_flag(struct inode *, int); extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *); extern int ext3_can_truncate(struct inode *inode); -extern void ext3_truncate (struct inode *); +extern void ext3_truncate(struct inode *inode); extern void ext3_set_inode_flags(struct inode *); extern void ext3_get_inode_flags(struct ext3_inode_info *); extern void ext3_set_aops(struct inode *inode); -- cgit v1.2.3 From 05713082ab7690a2b22b044cfc867f346c39cd2d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 May 2011 17:17:18 +0200 Subject: jbd: remove dependency on __GFP_NOFAIL The callers of start_this_handle() (or better ext3_journal_start()) are not really prepared to handle allocation failures. Such failures can for example result in silent data loss when it happens in ext3_..._writepage(). OTOH __GFP_NOFAIL is going away so we just retry allocation in start_this_handle(). This loop is potentially dangerous because the oom killer cannot be invoked for GFP_NOFS allocation, so there is a potential for infinitely looping. But still this is better than silent data loss. Signed-off-by: Jan Kara --- fs/jbd/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index f7ee81a065da..83a661890868 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -26,6 +26,7 @@ #include #include #include +#include static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -99,11 +100,10 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS); if (!new_transaction) { - ret = -ENOMEM; - goto out; + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; } } -- cgit v1.2.3 From bd5c9e1854e13d0c62a3de29a5fbc15dd6a4d8c6 Mon Sep 17 00:00:00 2001 From: Ding Dinghua Date: Thu, 26 May 2011 10:29:01 +0800 Subject: jbd: fix a bug of leaking jh->b_jcount journal_get_create_access should drop jh->b_jcount in error handling path Signed-off-by: Ding Dinghua Signed-off-by: Jan Kara --- fs/jbd/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 83a661890868..dc39efd05d54 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -844,8 +844,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); journal_cancel_revoke(handle, jh); - journal_put_journal_head(jh); out: + journal_put_journal_head(jh); return err; } -- cgit v1.2.3 From ad95c5e9bc8b5885f94dce720137cac8fa8da4c9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 30 May 2011 13:29:20 +0200 Subject: ext3: Fix oops in ext3_try_to_allocate_with_rsv() Block allocation is called from two places: ext3_get_blocks_handle() and ext3_xattr_block_set(). These two callers are not necessarily synchronized because xattr code holds only xattr_sem and i_mutex, and ext3_get_blocks_handle() may hold only truncate_mutex when called from writepage() path. Block reservation code does not expect two concurrent allocations to happen to the same inode and thus assertions can be triggered or reservation structure corruption can occur. Fix the problem by taking truncate_mutex in xattr code to serialize allocations. CC: Sage Weil CC: stable@kernel.org Reported-by: Fyodor Ustinov Signed-off-by: Jan Kara --- fs/ext3/xattr.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 32e6cc23bd9a..d565759d82ee 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -803,8 +803,16 @@ inserted: /* We need to allocate a new block */ ext3_fsblk_t goal = ext3_group_first_block_no(sb, EXT3_I(inode)->i_block_group); - ext3_fsblk_t block = ext3_new_block(handle, inode, - goal, &error); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); -- cgit v1.2.3 From ee3e77f18010679a889b3831c2dd931238c12d09 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 3 Jun 2011 21:58:11 +0200 Subject: ext3: Improve truncate error handling New truncate calling convention allows us to handle errors from ext3_block_truncate_page(). So reorganize the code so that ext3_block_truncate_page() is called before we change inode size. This also removes unnecessary block zeroing from error recovery after failed buffered writes (zeroing isn't needed because we could have never written non-zero data to disk). We have to be careful and keep zeroing in direct IO write error recovery because there we might have already overwritten end of the last file block. Signed-off-by: Jan Kara --- fs/ext3/inode.c | 101 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b4051c9ac5f2..d2e4547c7806 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -43,6 +43,7 @@ #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. @@ -1207,6 +1208,16 @@ static void ext3_truncate_failed_write(struct inode *inode) ext3_truncate(inode); } +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1847,7 +1858,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - ext3_truncate_failed_write(inode); + ext3_truncate_failed_direct_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1861,7 +1872,7 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate_failed_write(inode); + ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); goto out; } @@ -1971,17 +1982,24 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -2026,11 +2044,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } zero_user(page, offset, length); @@ -2044,6 +2074,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -2455,7 +2488,6 @@ void ext3_truncate(struct inode *inode) struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2463,7 +2495,6 @@ void ext3_truncate(struct inode *inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; trace_ext3_truncate_enter(inode); @@ -2473,37 +2504,12 @@ void ext3_truncate(struct inode *inode) if (inode->i_size == 0 && ext3_should_writeback_data(inode)) ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - goto out_notrans; - } - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) goto out_notrans; - } last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -3251,11 +3257,30 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } if ((attr->ia_valid & ATTR_SIZE) && -- cgit v1.2.3 From 81fe8c62febade6b5d0915269b06a0c50448da27 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 10 Jun 2011 14:59:05 -0700 Subject: ext3/ioctl.c: silence sparse warnings about different address spaces The 'from' argument for copy_from_user and the 'to' argument for copy_to_user should both be tagged as __user address space. Signed-off-by: H Hartley Sweeten Cc: Andrew Morton Cc: Andreas Dilger Signed-off-by: Jan Kara --- fs/ext3/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index f4090bd2f345..c7f43944f160 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -285,7 +285,7 @@ group_add_out: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -293,7 +293,7 @@ group_add_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; -- cgit v1.2.3 From 2c2ea9451fc2a12ee57c8346f0da26969d07ee7f Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 22 Jun 2011 10:51:09 +0200 Subject: ext3: Return -EINVAL when start is beyond the end of fs in ext3_trim_fs() We should return -EINVAL when the FITRIM parameters are not sane, but currently we are exiting silently if start is beyond the end of the file system. This commit fixes this so we return -EINVAL as other file systems do. Signed-off-by: Lukas Czerner CC: Jan Kara Signed-off-by: Jan Kara --- fs/ext3/balloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index f7d111e499ad..6386d76f44a7 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -2108,7 +2108,7 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) return -EINVAL; if (start >= max_blks) - goto out; + return -EINVAL; if (start + len > max_blks) len = max_blks - start; @@ -2156,8 +2156,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (ret >= 0) ret = 0; - -out: range->len = trimmed * sb->s_blocksize; return ret; -- cgit v1.2.3 From bb189247f35688a3353545902c56290fb7d7754a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 24 Jun 2011 23:11:59 +0200 Subject: jbd: Fix oops in journal_remove_journal_head() journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 journal_commit_transaction() ... processing t_forget list __journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); journal_try_to_free_buffers() journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() journal_put_journal_head(jh) journal_remove_journal_head(bh); journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]journal_refile_buffer(), [__]journal_unfile_buffer(), and __journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 27 ++++++++------- fs/jbd/commit.c | 46 ++++++++++++------------- fs/jbd/journal.c | 95 +++++++++++++++++----------------------------------- fs/jbd/transaction.c | 73 ++++++++++++++++++++-------------------- include/linux/jbd.h | 1 - 5 files changed, 104 insertions(+), 138 deletions(-) (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index dea7503b47e8..61655a37c731 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -96,10 +96,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -221,8 +225,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -241,7 +245,6 @@ restart: */ released = __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } @@ -305,12 +308,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -526,9 +529,9 @@ int cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and release + * them. * - * Called with the journal locked. * Called with j_list_lock held. * Returns number of bufers reaped (for debug) */ @@ -635,8 +638,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -655,13 +658,14 @@ int __journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -672,10 +676,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of journal_commit_transaction(). */ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -687,7 +689,6 @@ int __journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -706,6 +707,8 @@ void __journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index eedd201374a8..8799207df058 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -258,10 +258,6 @@ write_out_data: jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); - journal_remove_journal_head(bh); - /* One for our safety reference, other for - * journal_remove_journal_head() */ - put_bh(bh); release_data_buffer(bh); } @@ -455,14 +451,9 @@ void journal_commit_transaction(journal_t *journal) } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && - jh->b_jlist == BJ_Locked) { + jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } + jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } @@ -807,10 +798,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); @@ -868,28 +865,27 @@ restart_loop: __journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index ab019ee77888..9fe061fb8779 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1803,10 +1803,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -1819,17 +1818,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = journal_add_journal_head(bh); * ... - * jh->b_transaction = xxx; - * journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. + * (Get another reference for transaction) + * journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * journal_put_journal_head(jh); */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *journal_add_journal_head(struct buffer_head *bh) @@ -1893,61 +1891,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void journal_put_journal_head(struct journal_head *jh) @@ -1957,11 +1923,12 @@ void journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index dc39efd05d54..7e59c6e66f9b 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -696,7 +696,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __journal_file_buffer(jh, transaction, BJ_Reserved); @@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committed and so it's safe to clear the dirty bit. */ clear_buffer_dirty(jh2bh(jh)); - jh->b_transaction = transaction; /* first access by this transaction */ jh->b_modified = 0; @@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) ret = -EIO; goto no_journal; } - - if (jh->b_transaction != NULL) { + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); /* It still points to the committing @@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); @@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) __journal_file_buffer(jh, transaction, BJ_Forget); } else { __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ void __journal_unfile_buffer(struct journal_head *jh) { __journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + journal_put_journal_head(jh); } void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __journal_remove_checkpoint(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * journal_remove_journal_head() and journal_put_journal_head(). + * journal_put_journal_head(). */ jh = journal_grab_journal_head(bh); if (!jh) @@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in @@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - journal_remove_journal_head(bh); - __brelse(bh); + __journal_unfile_buffer(jh); } return may_free; } @@ -2070,6 +2072,8 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2127,9 +2131,10 @@ void journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __journal_refile_buffer(struct journal_head *jh) { @@ -2153,6 +2158,11 @@ void __journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; if (buffer_freed(bh)) @@ -2169,30 +2179,21 @@ void __journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } diff --git a/include/linux/jbd.h b/include/linux/jbd.h index e06965081ba5..e6a5e34bed4f 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -940,7 +940,6 @@ extern int journal_force_commit(journal_t *); */ struct journal_head *journal_add_journal_head(struct buffer_head *bh); struct journal_head *journal_grab_journal_head(struct buffer_head *bh); -void journal_remove_journal_head(struct buffer_head *bh); void journal_put_journal_head(struct journal_head *jh); /* -- cgit v1.2.3 From f8f8527103a264b5e4ab2ce5c1743b28f3219d90 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Mon, 27 Jun 2011 13:45:43 +0200 Subject: eCryptfs: export global eCryptfs definitions to include/linux/ecryptfs.h Some eCryptfs specific definitions, such as the current version and the authentication token structure, are moved to the new include file 'include/linux/ecryptfs.h', in order to be available for all kernel subsystems. Signed-off-by: Roberto Sassu Acked-by: Gianluca Ramunno Acked-by: Tyler Hicks Acked-by: David Howells Signed-off-by: Mimi Zohar --- fs/ecryptfs/ecryptfs_kernel.h | 109 +--------------------------------------- include/linux/ecryptfs.h | 113 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 108 deletions(-) create mode 100644 include/linux/ecryptfs.h (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 43c7c43b06f5..bb8ec5d4301c 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -36,125 +36,18 @@ #include #include #include +#include -/* Version verification for shared data structures w/ userspace */ -#define ECRYPTFS_VERSION_MAJOR 0x00 -#define ECRYPTFS_VERSION_MINOR 0x04 -#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03 -/* These flags indicate which features are supported by the kernel - * module; userspace tools such as the mount helper read - * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine - * how to behave. */ -#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 -#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 -#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 -#define ECRYPTFS_VERSIONING_POLICY 0x00000008 -#define ECRYPTFS_VERSIONING_XATTR 0x00000010 -#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 -#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 -#define ECRYPTFS_VERSIONING_HMAC 0x00000080 -#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 -#define ECRYPTFS_VERSIONING_GCM 0x00000200 -#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ - | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ - | ECRYPTFS_VERSIONING_PUBKEY \ - | ECRYPTFS_VERSIONING_XATTR \ - | ECRYPTFS_VERSIONING_MULTKEY \ - | ECRYPTFS_VERSIONING_DEVMISC \ - | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) -#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 -#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH -#define ECRYPTFS_SALT_SIZE 8 -#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2) -/* The original signature size is only for what is stored on disk; all - * in-memory representations are expanded hex, so it better adapted to - * be passed around or referenced on the command line */ -#define ECRYPTFS_SIG_SIZE 8 -#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2) -#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX -#define ECRYPTFS_MAX_KEY_BYTES 64 -#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 #define ECRYPTFS_DEFAULT_IV_BYTES 16 -#define ECRYPTFS_FILE_VERSION 0x03 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 #define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 #define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 #define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ #define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) -#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 #define ECRYPTFS_XATTR_NAME "user.ecryptfs" -#define RFC2440_CIPHER_DES3_EDE 0x02 -#define RFC2440_CIPHER_CAST_5 0x03 -#define RFC2440_CIPHER_BLOWFISH 0x04 -#define RFC2440_CIPHER_AES_128 0x07 -#define RFC2440_CIPHER_AES_192 0x08 -#define RFC2440_CIPHER_AES_256 0x09 -#define RFC2440_CIPHER_TWOFISH 0x0a -#define RFC2440_CIPHER_CAST_6 0x0b - -#define RFC2440_CIPHER_RSA 0x01 - -/** - * For convenience, we may need to pass around the encrypted session - * key between kernel and userspace because the authentication token - * may not be extractable. For example, the TPM may not release the - * private key, instead requiring the encrypted data and returning the - * decrypted data. - */ -struct ecryptfs_session_key { -#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001 -#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002 -#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004 -#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008 - u32 flags; - u32 encrypted_key_size; - u32 decrypted_key_size; - u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; - u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES]; -}; - -struct ecryptfs_password { - u32 password_bytes; - s32 hash_algo; - u32 hash_iterations; - u32 session_key_encryption_key_bytes; -#define ECRYPTFS_PERSISTENT_PASSWORD 0x01 -#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02 - u32 flags; - /* Iterated-hash concatenation of salt and passphrase */ - u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; - u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; - /* Always in expanded hex */ - u8 salt[ECRYPTFS_SALT_SIZE]; -}; - -enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; - -struct ecryptfs_private_key { - u32 key_size; - u32 data_len; - u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; - char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1]; - u8 data[]; -}; - -/* May be a password or a private key */ -struct ecryptfs_auth_tok { - u16 version; /* 8-bit major and 8-bit minor */ - u16 token_type; -#define ECRYPTFS_ENCRYPT_ONLY 0x00000001 - u32 flags; - struct ecryptfs_session_key session_key; - u8 reserved[32]; - union { - struct ecryptfs_password password; - struct ecryptfs_private_key private_key; - } token; -} __attribute__ ((packed)); - void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); diff --git a/include/linux/ecryptfs.h b/include/linux/ecryptfs.h new file mode 100644 index 000000000000..2224a8c0cb64 --- /dev/null +++ b/include/linux/ecryptfs.h @@ -0,0 +1,113 @@ +#ifndef _LINUX_ECRYPTFS_H +#define _LINUX_ECRYPTFS_H + +/* Version verification for shared data structures w/ userspace */ +#define ECRYPTFS_VERSION_MAJOR 0x00 +#define ECRYPTFS_VERSION_MINOR 0x04 +#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03 +/* These flags indicate which features are supported by the kernel + * module; userspace tools such as the mount helper read + * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine + * how to behave. */ +#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 +#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 +#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 +#define ECRYPTFS_VERSIONING_POLICY 0x00000008 +#define ECRYPTFS_VERSIONING_XATTR 0x00000010 +#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 +#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 +#define ECRYPTFS_VERSIONING_HMAC 0x00000080 +#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 +#define ECRYPTFS_VERSIONING_GCM 0x00000200 +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ + | ECRYPTFS_VERSIONING_PUBKEY \ + | ECRYPTFS_VERSIONING_XATTR \ + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) +#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 +#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH +#define ECRYPTFS_SALT_SIZE 8 +#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2) +/* The original signature size is only for what is stored on disk; all + * in-memory representations are expanded hex, so it better adapted to + * be passed around or referenced on the command line */ +#define ECRYPTFS_SIG_SIZE 8 +#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2) +#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX +#define ECRYPTFS_MAX_KEY_BYTES 64 +#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 +#define ECRYPTFS_FILE_VERSION 0x03 +#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 + +#define RFC2440_CIPHER_DES3_EDE 0x02 +#define RFC2440_CIPHER_CAST_5 0x03 +#define RFC2440_CIPHER_BLOWFISH 0x04 +#define RFC2440_CIPHER_AES_128 0x07 +#define RFC2440_CIPHER_AES_192 0x08 +#define RFC2440_CIPHER_AES_256 0x09 +#define RFC2440_CIPHER_TWOFISH 0x0a +#define RFC2440_CIPHER_CAST_6 0x0b + +#define RFC2440_CIPHER_RSA 0x01 + +/** + * For convenience, we may need to pass around the encrypted session + * key between kernel and userspace because the authentication token + * may not be extractable. For example, the TPM may not release the + * private key, instead requiring the encrypted data and returning the + * decrypted data. + */ +struct ecryptfs_session_key { +#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001 +#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002 +#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004 +#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008 + u32 flags; + u32 encrypted_key_size; + u32 decrypted_key_size; + u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; + u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES]; +}; + +struct ecryptfs_password { + u32 password_bytes; + s32 hash_algo; + u32 hash_iterations; + u32 session_key_encryption_key_bytes; +#define ECRYPTFS_PERSISTENT_PASSWORD 0x01 +#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02 + u32 flags; + /* Iterated-hash concatenation of salt and passphrase */ + u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; + u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; + /* Always in expanded hex */ + u8 salt[ECRYPTFS_SALT_SIZE]; +}; + +enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; + +struct ecryptfs_private_key { + u32 key_size; + u32 data_len; + u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; + char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1]; + u8 data[]; +}; + +/* May be a password or a private key */ +struct ecryptfs_auth_tok { + u16 version; /* 8-bit major and 8-bit minor */ + u16 token_type; +#define ECRYPTFS_ENCRYPT_ONLY 0x00000001 + u32 flags; + struct ecryptfs_session_key session_key; + u8 reserved[32]; + union { + struct ecryptfs_password password; + struct ecryptfs_private_key private_key; + } token; +} __attribute__ ((packed)); + +#endif /* _LINUX_ECRYPTFS_H */ -- cgit v1.2.3 From 1252cc3b232e582e887623dc5f70979418caaaa2 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Mon, 27 Jun 2011 13:45:45 +0200 Subject: eCryptfs: added support for the encrypted key type The function ecryptfs_keyring_auth_tok_for_sig() has been modified in order to search keys of both 'user' and 'encrypted' types. Signed-off-by: Roberto Sassu Acked-by: Gianluca Ramunno Acked-by: Tyler Hicks Signed-off-by: Mimi Zohar --- fs/ecryptfs/ecryptfs_kernel.h | 41 +++++++++++++++++++++++++++++++++++++++-- fs/ecryptfs/keystore.c | 13 ++++++++----- 2 files changed, 47 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index bb8ec5d4301c..b36c5572b3f3 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -29,6 +29,7 @@ #define ECRYPTFS_KERNEL_H #include +#include #include #include #include @@ -78,11 +79,47 @@ struct ecryptfs_page_crypt_context { } param; }; +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + if (key->type == &key_type_encrypted) + return (struct ecryptfs_auth_tok *) + (&((struct encrypted_key_payload *)key->payload.data)->payload_data); + else + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return request_key(&key_type_encrypted, sig, NULL); +} + +#else +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return ERR_PTR(-ENOKEY); +} + +#endif /* CONFIG_ENCRYPTED_KEYS */ + static inline struct ecryptfs_auth_tok * ecryptfs_get_key_payload_data(struct key *key) { - return (struct ecryptfs_auth_tok *) - (((struct user_key_payload*)key->payload.data)->data); + struct ecryptfs_auth_tok *auth_tok; + + auth_tok = ecryptfs_get_encrypted_key_payload_data(key); + if (!auth_tok) + return (struct ecryptfs_auth_tok *) + (((struct user_key_payload *)key->payload.data)->data); + else + return auth_tok; } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 27a7fefb83eb..2cff13ac8937 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1635,11 +1635,14 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, (*auth_tok_key) = request_key(&key_type_user, sig, NULL); if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { - printk(KERN_ERR "Could not find key with description: [%s]\n", - sig); - rc = process_request_key_err(PTR_ERR(*auth_tok_key)); - (*auth_tok_key) = NULL; - goto out; + (*auth_tok_key) = ecryptfs_get_encrypted_key(sig); + if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + printk(KERN_ERR "Could not find key with description: [%s]\n", + sig); + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; + goto out; + } } down_write(&(*auth_tok_key)->sem); rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); -- cgit v1.2.3 From d3ad8434aa83ef7c88bc91edcfe012cdcbab9f3e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 27 Jun 2011 12:36:29 -0400 Subject: jbd2: use WRITE_SYNC in journal checkpoint In journal checkpoint, we write the buffer and wait for its finish. But in cfq, the async queue has a very low priority, and in our test, if there are too many sync queues and every queue is filled up with requests, the write request will be delayed for quite a long time and all the tasks which are waiting for journal space will end with errors like: INFO: task attr_set:3816 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. attr_set D ffff880028393480 0 3816 1 0x00000000 ffff8802073fbae8 0000000000000086 ffff8802140847c8 ffff8800283934e8 ffff8802073fb9d8 ffffffff8103e456 ffff8802140847b8 ffff8801ed728080 ffff8801db4bc080 ffff8801ed728450 ffff880028393480 0000000000000002 Call Trace: [] ? __dequeue_entity+0x33/0x38 [] ? need_resched+0x23/0x2d [] ? thread_return+0xa2/0xbc [] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [] __mutex_lock_common+0x14e/0x1a9 [] ? brelse+0x13/0x15 [ext4] [] __mutex_lock_slowpath+0x19/0x1b [] mutex_lock+0x1b/0x32 [] __jbd2_journal_insert_checkpoint+0xe3/0x20c [jbd2] [] start_this_handle+0x438/0x527 [jbd2] [] ? autoremove_wake_function+0x0/0x3e [] jbd2_journal_start+0xa1/0xcc [jbd2] [] ext4_journal_start_sb+0x57/0x81 [ext4] [] ext4_xattr_set+0x6c/0xe3 [ext4] [] ext4_xattr_user_set+0x42/0x4b [ext4] [] generic_setxattr+0x6b/0x76 [] __vfs_setxattr_noperm+0x47/0xc0 [] vfs_setxattr+0x7f/0x9a [] setxattr+0xb5/0xe8 [] ? do_filp_open+0x571/0xa6e [] sys_fsetxattr+0x6b/0x91 [] system_call_fastpath+0x16/0x1b So this patch tries to use WRITE_SYNC in __flush_batch so that the request will be moved into sync queue and handled by cfq timely. We also use the new plug, sot that all the WRITE_SYNC requests can be given as a whole when we unplug it. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Cc: Jan Kara Reported-by: Robin Dong --- fs/jbd2/checkpoint.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 2c62c5aae82f..16a698bd906d 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -257,9 +257,12 @@ static void __flush_batch(journal_t *journal, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; -- cgit v1.2.3 From ed7a7e16724a4123fce1fc0ff1f5131a0596f189 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 27 Jun 2011 15:35:53 -0400 Subject: ext4: fix incorrect error msg in ext4_ext_insert_index In function ext4_ext_insert_index when eh_entries of curp is bigger than eh_max, error messages will be printed out, but the content is about logical and ei_block, that's incorret. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f815cc81e7a2..eb63c7b8dfd2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -808,8 +808,9 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) > le16_to_cpu(curp->p_hdr->eh_max))) { EXT4_ERROR_INODE(inode, - "logical %d == ei_block %d!", - logical, le32_to_cpu(curp->p_idx->ei_block)); + "eh_entries %d > eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); return -EIO; } if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { -- cgit v1.2.3 From ff9893dc8aa622a4f122293a6861566a284edea5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Jun 2011 16:36:31 -0400 Subject: ext4: split ext4_ind_truncate from ext4_truncate We are about to move all indirect inode functions to a new file. Before we do that, let's split ext4_ind_truncate() out of ext4_truncate() leaving only generic code in the latter, so we will be able to move ext4_ind_truncate() to the new file. Signed-off-by: Amir Goldstein Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 36 ++++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1921392cd708..8532dd43d320 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1834,6 +1834,8 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern void ext4_ind_truncate(struct inode *inode); + /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3126c051006..a8f310b77f56 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4470,6 +4470,26 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) * ext4_truncate() run will find them and release them. */ void ext4_truncate(struct inode *inode) +{ + trace_ext4_truncate_enter(inode); + + if (!ext4_can_truncate(inode)) + return; + + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_ext_truncate(inode); + else + ext4_ind_truncate(inode); + + trace_ext4_truncate_exit(inode); +} + +void ext4_ind_truncate(struct inode *inode) { handle_t *handle; struct ext4_inode_info *ei = EXT4_I(inode); @@ -4484,22 +4504,6 @@ void ext4_truncate(struct inode *inode) ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; - trace_ext4_truncate_enter(inode); - - if (!ext4_can_truncate(inode)) - return; - - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ext4_ext_truncate(inode); - trace_ext4_truncate_exit(inode); - return; - } - handle = start_transaction(inode); if (IS_ERR(handle)) return; /* AKPM: return what? */ -- cgit v1.2.3 From 8bb2b247124ba6093455d4aef26743b1bef27bc5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Jun 2011 17:10:28 -0400 Subject: ext4: rename ext4_indirect_* funcs to ext4_ind_* We are going to move all ext4_ind_* functions to indirect.c. Before we do that, let's rename 2 functions called ext4_indirect_* to ext4_ind_*, to keep to the naming convention. Signed-off-by: Amir Goldstein Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a8f310b77f56..6c1d28e37235 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1075,8 +1075,7 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) * Calculate the number of metadata blocks need to reserve * to allocate a new block at @lblocks for non extent file based file */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, - sector_t lblock) +static int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) { struct ext4_inode_info *ei = EXT4_I(inode); sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); @@ -1107,7 +1106,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, lblock); + return ext4_ind_calc_metadata_amount(inode, lblock); } /* @@ -5456,8 +5455,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, - int chunk) +static int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int indirects; @@ -5483,7 +5481,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, nrblocks, chunk); return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } -- cgit v1.2.3 From a212d1a71deea10ba4f6de2aaac0221be34ddb29 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 7 Jun 2011 11:56:50 +0800 Subject: jbd: Use WRITE_SYNC in journal checkpoint. In journal checkpoint, we write the buffer and wait for its finish. But in cfq, the async queue has a very low priority, and in our test, if there are too many sync queues and every queue is filled up with requests, and the process will hang waiting for the log space. So this patch tries to use WRITE_SYNC in __flush_batch so that the request will be moved into sync queue and handled by cfq timely. We also use the new plug, sot that all the WRITE_SYNC requests can be given as a whole when we unplug it. Reported-by: Robin Dong Signed-off-by: Tao Ma Signed-off-by: Jan Kara --- fs/jbd/checkpoint.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 61655a37c731..f94fc48ff3a0 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* @@ -257,9 +258,12 @@ static void __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(bhs[i], WRITE); + write_dirty_buffer(bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; -- cgit v1.2.3 From 1f7d1e77419050831a905353683807fa69a26625 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 27 Jun 2011 19:16:02 -0400 Subject: ext4: move __ext4_check_blockref to block_validity.c In preparation for moving the indirect functions to a separate file, move __ext4_check_blockref() to block_validity.c and rename it to ext4_check_blockref() which is exported as globally visible function. Also, rename the cpp macro ext4_check_inode_blockref() to ext4_ind_check_inode(), to make it clear that it is only valid for use with non-extent mapped inodes. Signed-off-by: "Theodore Ts'o" --- fs/ext4/block_validity.c | 20 ++++++++++++++++++++ fs/ext4/ext4.h | 15 +++++++++++++++ fs/ext4/inode.c | 35 +---------------------------------- 3 files changed, 36 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index fac90f3fba80..af103be491b0 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -246,3 +246,23 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, return 1; } +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8532dd43d320..82ba7eb7c4a5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2124,6 +2124,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb) sb->s_dirt =1; } +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + /* * Inodes and files operations */ @@ -2153,6 +2166,8 @@ extern void ext4_exit_system_zone(void); extern int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6c1d28e37235..3dca5264ccff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -360,39 +360,6 @@ static int ext4_block_to_path(struct inode *inode, return n; } -static int __ext4_check_blockref(const char *function, unsigned int line, - struct inode *inode, - __le32 *p, unsigned int max) -{ - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - __le32 *bref = p; - unsigned int blk; - - while (bref < p+max) { - blk = le32_to_cpu(*bref++); - if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { - es->s_last_error_block = cpu_to_le64(blk); - ext4_error_inode(inode, function, line, blk, - "invalid block"); - return -EIO; - } - } - return 0; -} - - -#define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - (__le32 *)(bh)->b_data, \ - EXT4_ADDR_PER_BLOCK((inode)->i_sb)) - -#define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - EXT4_I(inode)->i_data, \ - EXT4_NDIR_BLOCKS) - /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question @@ -5010,7 +4977,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { /* Validate block references which are part of inode */ - ret = ext4_check_inode_blockref(inode); + ret = ext4_ind_check_inode(inode); } if (ret) goto bad_inode; -- cgit v1.2.3 From 9f125d641beb898f5bf2fe69583192c18043517a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 27 Jun 2011 19:16:04 -0400 Subject: ext4: move common truncate functions to header file Move two functions that will be needed by the indirect functions to be moved to indirect.c as well as inode.c to truncate.h as inline functions, so that we can avoid having duplicate copies of the function (which can be a maintenance problem) without having to expose them as globally functions. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 48 ++++++------------------------------------------ fs/ext4/truncate.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 42 deletions(-) create mode 100644 fs/ext4/truncate.h (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3dca5264ccff..9b82ac7b0f55 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,6 +47,7 @@ #include "xattr.h" #include "acl.h" #include "ext4_extents.h" +#include "truncate.h" #include @@ -88,33 +89,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } -/* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static unsigned long blocks_for_truncate(struct inode *inode) -{ - ext4_lblk_t needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); - - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext4 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. */ - if (needed > EXT4_MAX_TRANS_DATA) - needed = EXT4_MAX_TRANS_DATA; - - return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -} - /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a conventient checkpoint to make @@ -129,7 +103,7 @@ static handle_t *start_transaction(struct inode *inode) { handle_t *result; - result = ext4_journal_start(inode, blocks_for_truncate(inode)); + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); if (!IS_ERR(result)) return result; @@ -149,7 +123,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) return 0; if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; - if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) return 0; return 1; } @@ -204,7 +178,7 @@ void ext4_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); + handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -1555,16 +1529,6 @@ static int do_journal_get_write_access(handle_t *handle, return ret; } -/* - * Truncate blocks that were not used by write. We have to truncate the - * pagecache as well so that corresponding buffers get properly unmapped. - */ -static void ext4_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); -} - static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, @@ -4134,7 +4098,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (unlikely(err)) goto out_err; err = ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); + ext4_blocks_for_truncate(inode)); if (unlikely(err)) goto out_err; if (bh) { @@ -4329,7 +4293,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); + ext4_blocks_for_truncate(inode)); } /* diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 000000000000..011ba6670d99 --- /dev/null +++ b/fs/ext4/truncate.h @@ -0,0 +1,43 @@ +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + -- cgit v1.2.3 From dae1e52cb1267bf8f52e5e47a80fab566d7e8aa4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Jun 2011 19:40:50 -0400 Subject: ext4: move ext4_ind_* functions from inode.c to indirect.c This patch moves functions from inode.c to indirect.c. The moved functions are ext4_ind_* functions and their helpers. Functions called from inode.c are declared extern. Signed-off-by: Amir Goldstein Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/block_validity.c | 1 + fs/ext4/ext4.h | 9 + fs/ext4/indirect.c | 1510 ++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/inode.c | 1486 --------------------------------------------- 5 files changed, 1521 insertions(+), 1487 deletions(-) create mode 100644 fs/ext4/indirect.c (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 04109460ba9e..56fd8f865930 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o + mmp.o indirect.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index af103be491b0..8efb2f0a3447 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -266,3 +266,4 @@ int ext4_check_blockref(const char *function, unsigned int line, } return 0; } + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 82ba7eb7c4a5..ddaf5043fb38 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1834,6 +1834,15 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern void ext4_ind_truncate(struct inode *inode); /* ioctl.c */ diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 000000000000..c3e85a86e821 --- /dev/null +++ b/fs/ext4/indirect.c @@ -0,0 +1,1510 @@ +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + */ + +#include +#include "ext4_jbd2.h" +#include "truncate.h" + +#include + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) + goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of desired blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + struct ext4_allocation_request ar; + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); + if (*err) + goto failed_out; + + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += ar.len; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i = 1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); + for (i = 1; i <= n ; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); + } + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. + */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); + return err; +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL, NULL, 0); + else { + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. + */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + +out_unlock: + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); +} + diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b82ac7b0f55..de50b16a8f67 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -12,10 +12,6 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * @@ -89,45 +85,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } -/* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. - */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -{ - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) - return 0; - return 1; -} - /* * Restart the transaction associated with *handle. This does a commit, * so before we call here everything must be consistently dirtied against @@ -251,760 +208,6 @@ no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -/** - * ext4_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext4 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. - * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ - -/* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. - */ - -static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) -{ - int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT4_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ((i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT4_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT4_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT4_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", - i_block + direct_blocks + - indirect_blocks + double_blocks, inode->i_ino); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -/** - * ext4_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. - * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) - */ -static Indirect *ext4_get_branch(struct inode *inode, int depth, - ext4_lblk_t *offsets, - Indirect chain[4], int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) - goto failure; - - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto failure; - } - /* validate block references */ - if (ext4_check_indirect_blockref(inode, bh)) { - put_bh(bh); - goto failure; - } - } - - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -failure: - *err = -EIO; -no_block: - return p; -} - -/** - * ext4_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the preferred place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. - */ -static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; - __le32 *p; - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); - - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); - } - - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; -} - -/** - * ext4_find_goal - find a preferred place for allocation. - * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the preferred place for block allocation, - * returns it. - * Because this is only used for non-extent files, we limit the block nr - * to 32 bits. - */ -static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) -{ - ext4_fsblk_t goal; - - /* - * XXX need to get goal block from mballoc's data structures - */ - - goal = ext4_find_near(inode, partial); - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - return goal; -} - -/** - * ext4_blks_to_allocate - Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. - * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, - int blocks_to_boundary) -{ - unsigned int count = 0; - - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; - } - - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; - } - return count; -} - -/** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - WARN_ON(1); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** - * ext4_alloc_branch - allocate and set up a chain of blocks. - * @handle: handle for this transaction - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @goal: preferred place for allocation - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext4_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). Upon the exit we have the same - * picture as after the successful ext4_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain - * as described above and return 0. - */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -EIO; - goto failed; - } - - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); - if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ - unlock_buffer(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); - } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - - return err; -} - -/** - * ext4_splice_branch - splice the allocated branch onto inode. - * @handle: handle for this transaction - * @inode: owner - * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext4_alloc_branch) - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. - */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) -{ - int i; - int err = 0; - ext4_fsblk_t current_block; - - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. - */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i) = cpu_to_le32(current_block++); - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); - if (err) - goto err_out; - } else { - /* - * OK, we spliced it into the inode itself on a direct block. - */ - ext4_mark_inode_dirty(handle, inode); - jbd_debug(5, "splicing direct\n"); - } - return err; - -err_out: - for (i = 1; i <= num; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, - EXT4_FREE_BLOCKS_FORGET); - } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); - - return err; -} - -/* - * The ext4_ind_map_blocks() function handles non-extents inodes - * (i.e., using the traditional indirect/double-indirect i_blocks - * scheme) for ext4_map_blocks(). - * - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. - * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - * - * The ext4_ind_get_blocks() function should be called with - * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem - * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system - * blocks. - */ -static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - int flags) -{ - int err = -EIO; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext4_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - int count = 0; - ext4_fsblk_t first_block = 0; - - trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); - depth = ext4_block_to_path(inode, map->m_lblk, offsets, - &blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext4_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - count++; - /*map more blocks*/ - while (count < map->m_len && count <= blocks_to_boundary) { - ext4_fsblk_t blk; - - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) - goto cleanup; - - /* - * Okay, we need to do block allocation. - */ - goal = ext4_find_goal(inode, map->m_lblk, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. - */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); - /* - * Block out ext4_truncate while we alter the tree - */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, - offsets + (partial - chain), partial); - - /* - * The ext4_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct - */ - if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); - if (err) - goto cleanup; - - map->m_flags |= EXT4_MAP_NEW; - - ext4_update_inode_fsync_trans(handle, inode, 1); -got_it: - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = le32_to_cpu(chain[depth-1].key); - map->m_len = count; - if (count > blocks_to_boundary) - map->m_flags |= EXT4_MAP_BOUNDARY; - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); - return err; -} - #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { @@ -1012,32 +215,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) } #endif -/* - * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -static int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - /* * Calculate the number of metadata blocks need to reserve * to allocate a block located at @lblock @@ -3379,114 +2556,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -/* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_length(iov, nr_segs); - int retries = 0; - - if (rw == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL, NULL, 0); - else { - ret = blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL); - - if (unlikely((rw & WRITE) && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - /* * ext4_get_block used when preparing for a DIO write or buffer write. * We allocate an uinitialized extent if blocks haven't been allocated. @@ -3958,383 +3027,6 @@ unlock: return err; } -/* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? - */ -static inline int all_zeroes(__le32 *p, __le32 *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -/** - * ext4_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext4_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext4_truncate(). - * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is referred - * from it (and it is on the path to the first completely truncated - * data block, indeed). We have to free the top of that path along - * with everything to the right of the path. Since no allocation - * past the truncation point is possible until ext4_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], - __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offset + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext4_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext4. Must leave the tree intact */ -#if 0 - *p->p = 0; -#endif - } - /* Writer: end */ - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -/* - * Zero a number of block pointers in either an inode or an indirect block. - * If we restart the transaction we must again get write access to the - * indirect block for further modification. - * - * We release `count' blocks on disk, but (last - first) may be greater - * than `count' because there can be holes in there. - * - * Return 0 on success, 1 on invalid block range - * and < 0 on fatal error. - */ -static int ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) -{ - __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; - int err; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { - EXT4_ERROR_INODE(inode, "attempt to clear invalid " - "blocks %llu len %lu", - (unsigned long long) block_to_free, count); - return 1; - } - - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } - - for (p = first; p < last; p++) - *p = 0; - - ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); - return 0; -out_err: - ext4_std_error(inode->i_sb, err); - return err; -} - -/** - * ext4_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks referred from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext4_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext4_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err = 0; - - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. */ - if (err) - return; - } - - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - err = ext4_clear_blocks(handle, inode, this_bh, - block_to_free, count, - block_to_free_p, p); - if (err) - break; - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } - } - - if (!err && count > 0) - err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); - if (err < 0) - /* fatal error */ - return; - - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); - - /* - * The buffer head should have an attached journal head at this - * point. However, if the data is corrupted and an indirect - * block pointed to itself, it would have been detached when - * the block was cleared. Check for this instead of OOPSing. - */ - if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) - ext4_handle_dirty_metadata(handle, inode, this_bh); - else - EXT4_ERROR_INODE(inode, - "circular indirect block detected at " - "block %llu", - (unsigned long long) this_bh->b_blocknr); - } -} - -/** - * ext4_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks referred from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. - */ -static void ext4_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext4_fsblk_t nr; - __le32 *p; - - if (ext4_handle_is_aborted(handle)) - return; - - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { - EXT4_ERROR_INODE(inode, - "invalid indirect mapped " - "block %lu (level %d)", - (unsigned long) nr, depth); - break; - } - - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); - - /* - * A read failure? Report error and clear slot - * (should be rare). - */ - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, nr, - "Read failure"); - continue; - } - - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext4_free_branches(handle, inode, bh, - (__le32 *) bh->b_data, - (__le32 *) bh->b_data + addr_per_block, - depth); - brelse(bh); - - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. - */ - if (ext4_handle_is_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - } - - /* - * The forget flag here is critical because if - * we are journaling (and not doing data - * journaling), we have to make sure a revoke - * record is written to prevent the journal - * replay from overwriting the (former) - * indirect block if it gets reallocated as a - * data block. This must happen in the same - * transaction where the data blocks are - * actually freed. - */ - ext4_free_blocks(handle, inode, NULL, nr, 1, - EXT4_FREE_BLOCKS_METADATA| - EXT4_FREE_BLOCKS_FORGET); - - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext4_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, - inode, - parent_bh); - } - } - } - } else { - /* We have reached the bottom of the tree. */ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext4_free_data(handle, inode, parent_bh, first, last); - } -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) @@ -4419,161 +3111,6 @@ void ext4_truncate(struct inode *inode) trace_ext4_truncate_exit(inode); } -void ext4_ind_truncate(struct inode *inode) -{ - handle_t *handle; - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n = 0; - ext4_lblk_t last_block, max_block; - unsigned blocksize = inode->i_sb->s_blocksize; - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ - - last_block = (inode->i_size + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) - goto out_stop; - - if (last_block != max_block) { - n = ext4_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ - } - - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); - - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext4 *really* writes onto the disk inode. - */ - ei->i_disksize = inode->i_size; - - if (last_block == max_block) { - /* - * It is unnecessary to free any data blocks if last_block is - * equal to the indirect block limit. - */ - goto out_unlock; - } else if (n == 1) { /* direct blocks */ - ext4_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT4_NDIR_BLOCKS); - goto do_indirects; - } - - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext4_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); - ext4_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext4_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT4_IND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT4_IND_BLOCK] = 0; - } - case EXT4_IND_BLOCK: - nr = i_data[EXT4_DIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT4_DIND_BLOCK] = 0; - } - case EXT4_DIND_BLOCK: - nr = i_data[EXT4_TIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT4_TIND_BLOCK] = 0; - } - case EXT4_TIND_BLOCK: - ; - } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - ext4_journal_stop(handle); - trace_ext4_truncate_exit(inode); -} - /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If 'in_mem' is true, we have all @@ -5386,29 +3923,6 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) -{ - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } - /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block - */ - indirects = nrblocks * 2 + 1; - return indirects; -} - static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) -- cgit v1.2.3 From f86186b44b4164600cce03d0d93ad48ec21fa429 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 28 Jun 2011 10:01:31 -0400 Subject: ext4: refactor duplicated block placement code I found that ext4_ext_find_goal() and ext4_find_near() share the same code for returning a coloured start block based on i_block_group. We can refactor this into a common function so that they don't diverge in the future. Thanks to adilger for suggesting the new function name. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 37 +------------------------------------ fs/ext4/indirect.c | 28 +--------------------------- 4 files changed, 51 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f6949511e..f8224adf496e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } +/** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation + * + * Return the ideal location to start allocating blocks for a + * newly created inode. + */ +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_group_t block_group; + ext4_grpblk_t colour; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ddaf5043fb38..49d2cea47382 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1743,6 +1743,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc); #define ext4_free_blocks_after_init(sb, group, desc) \ ext4_init_block_bitmap(sb, NULL, group, desc) +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index eb63c7b8dfd2..f331e5010f68 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - /* - * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular - * files will start at the second block group. This - * tends to speed up directory access and improves - * fsck times. - */ - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour + block; + return ext4_inode_to_goal_block(inode); } /* diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c3e85a86e821..6c271115dbb6 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -207,11 +207,6 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) struct ext4_inode_info *ei = EXT4_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { @@ -227,28 +222,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; + return ext4_inode_to_goal_block(inode); } /** -- cgit v1.2.3 From 9331b6261058eb85ae7c57ab8ac279e7fdaa9f04 Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Tue, 28 Jun 2011 10:19:05 -0400 Subject: ext4: quiet 'unused variables' compile warnings Unused variables was deleted. Signed-off-by: Yongqiang Yang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 -- fs/ext4/mballoc.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f331e5010f68..31ae5fbe89e5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3073,12 +3073,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_ext_path *path) { struct ext4_extent *ex; - struct ext4_extent_header *eh; int depth; int err = 0; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d56850..389386b41c98 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4666,12 +4666,10 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, struct ext4_buddy e4b; int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); /* * Check to see if we are freeing blocks across a group * boundary. -- cgit v1.2.3 From 275d3ba6b40d0f098693b9089c6fee9bd4e55d74 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 29 Jun 2011 21:44:45 -0400 Subject: ext4: remove loop around bio_alloc() These days, bio_alloc() is guaranteed to never fail (as long as nvecs is less than BIO_MAX_PAGES), so we don't need the loop around the struct bio allocation. Signed-off-by: "Theodore Ts'o" --- fs/ext4/page-io.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76d470a..430c401d0895 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -285,11 +285,7 @@ static int io_submit_init(struct ext4_io_submit *io, io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) return -ENOMEM; - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (bio == NULL); - + bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_private = io->io_end = io_end; -- cgit v1.2.3 From d46db3d58233be4be980eb1e42eebe7808bcabab Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 4 May 2011 19:54:37 -0600 Subject: writeback: make writeback_control.nr_to_write straight Pass struct wb_writeback_work all the way down to writeback_sb_inodes(), and initialize the struct writeback_control there. struct writeback_control is basically designed to control writeback of a single file, but we keep abuse it for writing multiple files in writeback_sb_inodes() and its callers. It immediately clean things up, e.g. suddenly wbc.nr_to_write vs work->nr_pages starts to make sense, and instead of saving and restoring pages_skipped in writeback_sb_inodes it can always start with a clean zero value. It also makes a neat IO pattern change: large dirty files are now written in the full 4MB writeback chunk size, rather than whatever remained quota in wbc->nr_to_write. Acked-by: Jan Kara Proposed-by: Christoph Hellwig Signed-off-by: Wu Fengguang --- fs/btrfs/extent_io.c | 2 - fs/fs-writeback.c | 196 ++++++++++++++++++++++----------------- include/linux/writeback.h | 6 +- include/trace/events/writeback.h | 39 +++++--- mm/backing-dev.c | 17 +--- mm/page-writeback.c | 17 +--- 6 files changed, 148 insertions(+), 129 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11c1efd..561262d35689 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2551,7 +2551,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, }; struct writeback_control wbc_writepages = { .sync_mode = wbc->sync_mode, - .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, @@ -2584,7 +2583,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, }; struct writeback_control wbc_writepages = { .sync_mode = mode, - .older_than_this = NULL, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6caa98247a5b..2c947da39f6e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -29,12 +29,22 @@ #include #include "internal.h" +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024L + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { long nr_pages; struct super_block *sb; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; @@ -472,7 +482,6 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * No need to add it back to the LRU. */ list_del_init(&inode->i_wb_list); - wbc->inodes_written++; } } inode_sync_complete(inode); @@ -506,6 +515,31 @@ static bool pin_sb_for_writeback(struct super_block *sb) return false; } +static long writeback_chunk_size(struct wb_writeback_work *work) +{ + long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * writeback_sb_inodes() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) + pages = LONG_MAX; + else + pages = min(MAX_WRITEBACK_PAGES, work->nr_pages); + + return pages; +} + /* * Write a portion of b_io inodes which belong to @sb. * @@ -513,18 +547,30 @@ static bool pin_sb_for_writeback(struct super_block *sb) * inodes. Otherwise write only ones which go sequentially * in reverse order. * - * Return 1, if the caller writeback routine should be - * interrupted. Otherwise return 0. + * Return the number of pages and/or inodes written. */ -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, - struct writeback_control *wbc, bool only_this_sb) +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) { + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + .range_start = 0, + .range_end = LLONG_MAX, + }; + unsigned long start_time = jiffies; + long write_chunk; + long wrote = 0; /* count both pages and inodes */ + while (!list_empty(&wb->b_io)) { - long pages_skipped; struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { - if (only_this_sb) { + if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging @@ -539,7 +585,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Bounce back to the caller to unpin this and * pin the next superblock. */ - return 0; + break; } /* @@ -553,12 +599,18 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, requeue_io(inode, wb); continue; } - __iget(inode); + write_chunk = writeback_chunk_size(work); + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + writeback_single_inode(inode, wb, &wbc); - pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wb, wbc); - if (wbc->pages_skipped != pages_skipped) { + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + if (!(inode->i_state & I_DIRTY)) + wrote++; + if (wbc.pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. @@ -570,17 +622,25 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, iput(inode); cond_resched(); spin_lock(&wb->list_lock); - if (wbc->nr_to_write <= 0) - return 1; + /* + * bail out to wb_writeback() often enough to check + * background threshold and other termination conditions. + */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } - /* b_io is empty */ - return 1; + return wrote; } -static void __writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static long __writeback_inodes_wb(struct bdi_writeback *wb, + struct wb_writeback_work *work) { - int ret = 0; + unsigned long start_time = jiffies; + long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -590,33 +650,37 @@ static void __writeback_inodes_wb(struct bdi_writeback *wb, requeue_io(inode, wb); continue; } - ret = writeback_sb_inodes(sb, wb, wbc, false); + wrote += writeback_sb_inodes(sb, wb, work); drop_super(sb); - if (ret) - break; + /* refer to the same tests at the end of writeback_sb_inodes */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } /* Leave any unwritten inodes on b_io */ + return wrote; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + }; + spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - __writeback_inodes_wb(wb, wbc); + queue_io(wb, NULL); + __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); -} -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 + return nr_pages - work.nr_pages; +} static inline bool over_bground_thresh(void) { @@ -646,42 +710,13 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - struct writeback_control wbc = { - .sync_mode = work->sync_mode, - .tagged_writepages = work->tagged_writepages, - .older_than_this = NULL, - .for_kupdate = work->for_kupdate, - .for_background = work->for_background, - .range_cyclic = work->range_cyclic, - }; + long nr_pages = work->nr_pages; unsigned long oldest_jif; - long wrote = 0; - long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; - - if (!wbc.range_cyclic) { - wbc.range_start = 0; - wbc.range_end = LLONG_MAX; - } - - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. - * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * writeback_sb_inodes() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) - write_chunk = LONG_MAX; + long progress; oldest_jif = jiffies; - wbc.older_than_this = &oldest_jif; + work->older_than_this = &oldest_jif; spin_lock(&wb->list_lock); for (;;) { @@ -711,24 +746,17 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_kupdate) { oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - wbc.older_than_this = &oldest_jif; + work->older_than_this = &oldest_jif; } - wbc.nr_to_write = write_chunk; - wbc.pages_skipped = 0; - wbc.inodes_written = 0; - - trace_wbc_writeback_start(&wbc, wb->bdi); + trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) - queue_io(wb, wbc.older_than_this); + queue_io(wb, work->older_than_this); if (work->sb) - writeback_sb_inodes(work->sb, wb, &wbc, true); + progress = writeback_sb_inodes(work->sb, wb, work); else - __writeback_inodes_wb(wb, &wbc); - trace_wbc_writeback_written(&wbc, wb->bdi); - - work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + progress = __writeback_inodes_wb(wb, work); + trace_writeback_written(wb->bdi, work); /* * Did we write something? Try for more @@ -738,9 +766,7 @@ static long wb_writeback(struct bdi_writeback *wb, * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write < write_chunk) - continue; - if (wbc.inodes_written) + if (progress) continue; /* * No more inodes for IO, bail @@ -753,8 +779,8 @@ static long wb_writeback(struct bdi_writeback *wb, * we'll just busyloop. */ if (!list_empty(&wb->b_more_io)) { + trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); - trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); @@ -762,7 +788,7 @@ static long wb_writeback(struct bdi_writeback *wb, } spin_unlock(&wb->list_lock); - return wrote; + return nr_pages - work->nr_pages; } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2f1b512bd6e0..df1b7f18f100 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -24,12 +24,9 @@ enum writeback_sync_modes { */ struct writeback_control { enum writeback_sync_modes sync_mode; - unsigned long *older_than_this; /* If !NULL, only write back inodes - older than this */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ - long inodes_written; /* # of inodes written (at least) */ /* * For a_ops->writepages(): is start or end are non-zero then this is @@ -56,8 +53,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); int writeback_inodes_sb_if_idle(struct super_block *); int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); void sync_inodes_sb(struct super_block *); -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void wakeup_flusher_threads(long nr_pages); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 205d14919ef2..3e7662a0cfa3 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -62,6 +62,9 @@ DEFINE_EVENT(writeback_work_class, name, \ DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); +DEFINE_WRITEBACK_WORK_EVENT(writeback_start); +DEFINE_WRITEBACK_WORK_EVENT(writeback_written); +DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), @@ -101,6 +104,30 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); DEFINE_WRITEBACK_EVENT(writeback_thread_stop); +DEFINE_WRITEBACK_EVENT(balance_dirty_start); +DEFINE_WRITEBACK_EVENT(balance_dirty_wait); + +TRACE_EVENT(balance_dirty_written, + + TP_PROTO(struct backing_dev_info *bdi, int written), + + TP_ARGS(bdi, written), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(int, written) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->written = written; + ), + + TP_printk("bdi %s written %d", + __entry->name, + __entry->written + ) +); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), @@ -114,7 +141,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) - __field(unsigned long, older_than_this) __field(long, range_start) __field(long, range_end) ), @@ -128,14 +154,12 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; - __entry->older_than_this = wbc->older_than_this ? - *wbc->older_than_this : 0; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " - "bgrd=%d reclm=%d cyclic=%d older=0x%lx " + "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx", __entry->name, __entry->nr_to_write, @@ -145,7 +169,6 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, - __entry->older_than_this, __entry->range_start, __entry->range_end) ) @@ -154,12 +177,6 @@ DECLARE_EVENT_CLASS(wbc_class, DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) -DEFINE_WBC_EVENT(wbc_writeback_start); -DEFINE_WBC_EVENT(wbc_writeback_written); -DEFINE_WBC_EVENT(wbc_writeback_wait); -DEFINE_WBC_EVENT(wbc_balance_dirty_start); -DEFINE_WBC_EVENT(wbc_balance_dirty_written); -DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5f6553ef1ba7..7ba303be5e03 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -260,18 +260,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void bdi_flush_io(struct backing_dev_info *bdi) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .range_cyclic = 1, - .nr_to_write = 1024, - }; - - writeback_inodes_wb(&bdi->wb, &wbc); -} - /* * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be @@ -457,9 +445,10 @@ static int bdi_forker_thread(void *ptr) if (IS_ERR(task)) { /* * If thread creation fails, force writeout of - * the bdi from the thread. + * the bdi from the thread. Hopefully 1024 is + * large enough for efficient IO. */ - bdi_flush_io(bdi); + writeback_inodes_wb(&bdi->wb, 1024); } else { /* * The spinlock makes sure we do not lose diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1965d05a29cc..9d6ac2b6d942 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -491,13 +491,6 @@ static void balance_dirty_pages(struct address_space *mapping, struct backing_dev_info *bdi = mapping->backing_dev_info; for (;;) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = write_chunk, - .range_cyclic = 1, - }; - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); @@ -559,17 +552,17 @@ static void balance_dirty_pages(struct address_space *mapping, * threshold otherwise wait until the disk writes catch * up. */ - trace_wbc_balance_dirty_start(&wbc, bdi); + trace_balance_dirty_start(bdi); if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes_wb(&bdi->wb, &wbc); - pages_written += write_chunk - wbc.nr_to_write; - trace_wbc_balance_dirty_written(&wbc, bdi); + pages_written += writeback_inodes_wb(&bdi->wb, + write_chunk); + trace_balance_dirty_written(bdi, pages_written); if (pages_written >= write_chunk) break; /* We've done our duty */ } - trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); + trace_balance_dirty_wait(bdi); /* * Increase the delay for each loop, up to our previous -- cgit v1.2.3 From e98be2d599207c6b31e9bb340d52a231b2f3662d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 29 Aug 2010 11:22:30 -0600 Subject: writeback: bdi write bandwidth estimation The estimation value will start from 100MB/s and adapt to the real bandwidth in seconds. It tries to update the bandwidth only when disk is fully utilized. Any inactive period of more than one second will be skipped. The estimated bandwidth will be reflecting how fast the device can writeout when _fully utilized_, and won't drop to 0 when it goes idle. The value will remain constant at disk idle time. At busy write time, if not considering fluctuations, it will also remain high unless be knocked down by possible concurrent reads that compete for the disk time and bandwidth with async writes. The estimation is not done purely in the flusher because there is no guarantee for write_cache_pages() to return timely to update bandwidth. The bdi->avg_write_bandwidth smoothing is very effective for filtering out sudden spikes, however may be a little biased in long term. The overheads are low because the bdi bandwidth update only occurs at 200ms intervals. The 200ms update interval is suitable, because it's not possible to get the real bandwidth for the instance at all, due to large fluctuations. The NFS commits can be as large as seconds worth of data. One XFS completion may be as large as half second worth of data if we are going to increase the write chunk to half second worth of data. In ext4, fluctuations with time period of around 5 seconds is observed. And there is another pattern of irregular periods of up to 20 seconds on SSD tests. That's why we are not only doing the estimation at 200ms intervals, but also averaging them over a period of 3 seconds and then go further to do another level of smoothing in avg_write_bandwidth. CC: Li Shaohua CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 13 +++++++ include/linux/backing-dev.h | 5 +++ include/linux/writeback.h | 3 ++ mm/backing-dev.c | 12 +++++++ mm/page-writeback.c | 87 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 120 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2c947da39f6e..5826992910e9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -692,6 +692,16 @@ static inline bool over_bground_thresh(void) global_page_state(NR_UNSTABLE_NFS) > background_thresh); } +/* + * Called under wb->list_lock. If there are multiple wb per bdi, + * only the flusher working on the first wb should do it. + */ +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long start_time) +{ + __bdi_update_bandwidth(wb->bdi, start_time); +} + /* * Explicit flushing or periodic writeback of "old" data. * @@ -710,6 +720,7 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { + unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long oldest_jif; struct inode *inode; @@ -758,6 +769,8 @@ static long wb_writeback(struct bdi_writeback *wb, progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb->bdi, work); + wb_update_bandwidth(wb, wb_start); + /* * Did we write something? Try for more * diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 469d56443c63..a008982e7c08 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -73,6 +73,11 @@ struct backing_dev_info { struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; + unsigned long bw_time_stamp; /* last time write bw is updated */ + unsigned long written_stamp; /* pages written at bw_time_stamp */ + unsigned long write_bandwidth; /* the estimated write bandwidth */ + unsigned long avg_write_bandwidth; /* further smoothed write bw */ + struct prop_local_percpu completions; int dirty_exceeded; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index df1b7f18f100..66862f2d90c8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -118,6 +118,9 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time); + void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 83f18a1d9d10..a76cdd160277 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -638,6 +638,11 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); } +/* + * Initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -660,6 +665,13 @@ int bdi_init(struct backing_dev_info *bdi) } bdi->dirty_exceeded = 0; + + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->write_bandwidth = INIT_BW; + bdi->avg_write_bandwidth = INIT_BW; + err = prop_local_init_percpu(&bdi->completions); if (err) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8cd71376c63d..446bdf7b975b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,6 +36,11 @@ #include #include +/* + * Estimate write bandwidth at 200ms intervals. + */ +#define BANDWIDTH_INTERVAL max(HZ/5, 1) + /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. @@ -471,6 +476,85 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) return bdi_dirty; } +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_write_bandwidth; + unsigned long old = bdi->write_bandwidth; + u64 bw; + + /* + * bw = written * HZ / elapsed + * + * bw * elapsed + write_bandwidth * (period - elapsed) + * write_bandwidth = --------------------------------------------------- + * period + */ + bw = written - bdi->written_stamp; + bw *= HZ; + if (unlikely(elapsed > period)) { + do_div(bw, elapsed); + avg = bw; + goto out; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw >>= ilog2(period); + + /* + * one more level of smoothing, for filtering out sudden spikes + */ + if (avg > old && old >= (unsigned long)bw) + avg -= (avg - old) >> 3; + + if (avg < old && old <= (unsigned long)bw) + avg += (old - avg) >> 3; + +out: + bdi->write_bandwidth = bw; + bdi->avg_write_bandwidth = avg; +} + +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long written; + + /* + * rate-limit, only update once every 200ms. + */ + if (elapsed < BANDWIDTH_INTERVAL) + return; + + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* + * Skip quiet periods when disk bandwidth is under-utilized. + * (at least 1s idle time between two flusher runs) + */ + if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + goto snapshot; + + bdi_update_write_bandwidth(bdi, elapsed, written); + +snapshot: + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +} + +static void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long start_time) +{ + if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) + return; + spin_lock(&bdi->wb.list_lock); + __bdi_update_bandwidth(bdi, start_time); + spin_unlock(&bdi->wb.list_lock); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -490,6 +574,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pause = 1; bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; + unsigned long start_time = jiffies; for (;;) { nr_reclaimable = global_page_state(NR_FILE_DIRTY) + @@ -544,6 +629,8 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; + bdi_update_bandwidth(bdi, start_time); + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been -- cgit v1.2.3 From c42843f2f0bbc9d716a32caf667d18fc2bf3bc4c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 2 Mar 2011 15:54:09 -0600 Subject: writeback: introduce smoothed global dirty limit The start of a heavy weight application (ie. KVM) may instantly knock down determine_dirtyable_memory() if the swap is not enabled or full. global_dirty_limits() and bdi_dirty_limit() will in turn get global/bdi dirty thresholds that are _much_ lower than the global/bdi dirty pages. balance_dirty_pages() will then heavily throttle all dirtiers including the light ones, until the dirty pages drop below the new dirty thresholds. During this _deep_ dirty-exceeded state, the system may appear rather unresponsive to the users. About "deep" dirty-exceeded: task_dirty_limit() assigns 1/8 lower dirty threshold to heavy dirtiers than light ones, and the dirty pages will be throttled around the heavy dirtiers' dirty threshold and reasonably below the light dirtiers' dirty threshold. In this state, only the heavy dirtiers will be throttled and the dirty pages are carefully controlled to not exceed the light dirtiers' dirty threshold. However if the threshold itself suddenly drops below the number of dirty pages, the light dirtiers will get heavily throttled. So introduce global_dirty_limit for tracking the global dirty threshold with policies - follow downwards slowly - follow up in one shot global_dirty_limit can effectively mask out the impact of sudden drop of dirtyable memory. It will be used in the next patch for two new type of dirty limits. Note that the new dirty limits are not going to avoid throttling the light dirtiers, but could limit their sleep time to 200ms. Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 6 ++++ mm/page-writeback.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5826992910e9..227ff12257f3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -699,7 +699,7 @@ static inline bool over_bground_thresh(void) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, start_time); + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 66862f2d90c8..e9d371b6053b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -84,6 +84,8 @@ static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); +extern unsigned long global_dirty_limit; + /* These are exported to sysctl. */ extern int dirty_background_ratio; extern unsigned long dirty_background_bytes; @@ -119,6 +121,10 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty); void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time); void page_writeback_init(void); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5f3e1b46ace5..da959952b9f5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -116,6 +116,7 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ +unsigned long global_dirty_limit; /* * Scale the writeback cache size proportional to the relative writeout speeds. @@ -516,7 +517,67 @@ out: bdi->avg_write_bandwidth = avg; } +/* + * The global dirtyable memory and dirty threshold could be suddenly knocked + * down by a large amount (eg. on the startup of KVM in a swapless system). + * This may throw the system into deep dirty exceeded state and throttle + * heavy/light dirtiers alike. To retain good responsiveness, maintain + * global_dirty_limit for tracking slowly down to the knocked down dirty + * threshold. + */ +static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +{ + unsigned long limit = global_dirty_limit; + + /* + * Follow up in one step. + */ + if (limit < thresh) { + limit = thresh; + goto update; + } + + /* + * Follow down slowly. Use the higher one as the target, because thresh + * may drop below dirty. This is exactly the reason to introduce + * global_dirty_limit which is guaranteed to lie above the dirty pages. + */ + thresh = max(thresh, dirty); + if (limit > thresh) { + limit -= (limit - thresh) >> 5; + goto update; + } + return; +update: + global_dirty_limit = limit; +} + +static void global_update_bandwidth(unsigned long thresh, + unsigned long dirty, + unsigned long now) +{ + static DEFINE_SPINLOCK(dirty_lock); + static unsigned long update_time; + + /* + * check locklessly first to optimize away locking for the most time + */ + if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + return; + + spin_lock(&dirty_lock); + if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { + update_dirty_limit(thresh, dirty); + update_time = now; + } + spin_unlock(&dirty_lock); +} + void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time) { unsigned long now = jiffies; @@ -538,6 +599,9 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) goto snapshot; + if (thresh) + global_update_bandwidth(thresh, dirty, now); + bdi_update_write_bandwidth(bdi, elapsed, written); snapshot: @@ -546,12 +610,17 @@ snapshot: } static void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, unsigned long start_time) { if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) return; spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, start_time); + __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, + start_time); spin_unlock(&bdi->wb.list_lock); } @@ -630,7 +699,8 @@ static void balance_dirty_pages(struct address_space *mapping, if (!bdi->dirty_exceeded) bdi->dirty_exceeded = 1; - bdi_update_bandwidth(bdi, start_time); + bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, + bdi_thresh, bdi_dirty, start_time); /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked -- cgit v1.2.3 From 1a12d8bd7b2998be01ee55edb64e7473728abb9c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 29 Aug 2010 13:28:09 -0600 Subject: writeback: scale IO chunk size up to half device bandwidth Originally, MAX_WRITEBACK_PAGES was hard-coded to 1024 because of a concern of not holding I_SYNC for too long. (At least, that was the comment previously.) This doesn't make sense now because the only time we wait for I_SYNC is if we are calling sync or fsync, and in that case we need to write out all of the data anyway. Previously there may have been other code paths that waited on I_SYNC, but not any more. -- Theodore Ts'o So remove the MAX_WRITEBACK_PAGES constraint. The writeback pages will adapt to as large as the storage device can write within 500ms. XFS is observed to do IO completions in a batch, and the batch size is equal to the write chunk size. To avoid dirty pages to suddenly drop out of balance_dirty_pages()'s dirty control scope and create large fluctuations, the chunk size is also limited to half the control scope. The balance_dirty_pages() control scrope is [(background_thresh + dirty_thresh) / 2, dirty_thresh] which is by default [15%, 20%] of global dirty pages, whose range size is dirty_thresh / DIRTY_FULL_SCOPE. The adpative write chunk size will be rounded to the nearest 4MB boundary. http://bugzilla.kernel.org/show_bug.cgi?id=13930 CC: Theodore Ts'o CC: Dave Chinner CC: Chris Mason CC: Peter Zijlstra Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 23 ++++++++++------------- include/linux/writeback.h | 11 +++++++++++ 2 files changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 227ff12257f3..50445cf0b83a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -29,15 +29,6 @@ #include #include "internal.h" -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024L - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -515,7 +506,8 @@ static bool pin_sb_for_writeback(struct super_block *sb) return false; } -static long writeback_chunk_size(struct wb_writeback_work *work) +static long writeback_chunk_size(struct backing_dev_info *bdi, + struct wb_writeback_work *work) { long pages; @@ -534,8 +526,13 @@ static long writeback_chunk_size(struct wb_writeback_work *work) */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; - else - pages = min(MAX_WRITEBACK_PAGES, work->nr_pages); + else { + pages = min(bdi->avg_write_bandwidth / 2, + global_dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + pages = round_down(pages + MIN_WRITEBACK_PAGES, + MIN_WRITEBACK_PAGES); + } return pages; } @@ -600,7 +597,7 @@ static long writeback_sb_inodes(struct super_block *sb, continue; } __iget(inode); - write_chunk = writeback_chunk_size(work); + write_chunk = writeback_chunk_size(wb->bdi, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b625073b80c8..f1bfa12ea246 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -8,6 +8,10 @@ #include /* + * The 1/4 region under the global dirty thresh is for smooth dirty throttling: + * + * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) + * * The 1/16 region above the global dirty limit will be put to maximum pauses: * * (limit, limit + limit/DIRTY_MAXPAUSE_AREA) @@ -25,9 +29,16 @@ * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ +#define DIRTY_SCOPE 8 +#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) #define DIRTY_MAXPAUSE_AREA 16 #define DIRTY_PASSGOOD_AREA 8 +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) + struct backing_dev_info; /* -- cgit v1.2.3 From 7132de744ba76930d13033061018ddd7e3e8cd91 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Sun, 10 Jul 2011 19:37:48 -0400 Subject: ext4: fix i_blocks/quota accounting when extent insertion fails The current implementation of ext4_free_blocks() always calls dquot_free_block This looks quite sensible in the most cases: blocks to be freed are associated with inode and were accounted in quota and i_blocks some time ago. However, there is a case when blocks to free were not accounted by the time calling ext4_free_blocks() yet: 1. delalloc is on, write_begin pre-allocated some space in quota 2. write-back happens, ext4 allocates some blocks in ext4_ext_map_blocks() 3. then ext4_ext_map_blocks() gets an error (e.g. ENOSPC) from ext4_ext_insert_extent() and calls ext4_free_blocks(). In this scenario, ext4_free_blocks() calls dquot_free_block() who, in turn, decrements i_blocks for blocks which were not accounted yet (due to delalloc) After clean umount, e2fsck reports something like: > Inode 21, i_blocks is 5080, should be 5128. Fix? because i_blocks was erroneously decremented as explained above. The patch fixes the problem by passing the new flag EXT4_FREE_BLOCKS_NO_QUOT_UPDATE to ext4_free_blocks(), to request that the dquot_free_block() call be skipped. Signed-off-by: Maxim Patlasov Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 4 +++- fs/ext4/mballoc.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 49d2cea47382..d13f3b509886 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -526,6 +526,7 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 /* * ioctl commands diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 31ae5fbe89e5..a86213882655 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3565,12 +3565,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_ext_get_actual_len(&newex), fb_flags); goto out2; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 389386b41c98..1900ec7a1579 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4637,7 +4637,7 @@ do_more: } ext4_mark_super_dirty(sb); error_return: - if (freed) + if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); -- cgit v1.2.3 From 575a1d4bdfa2ea9fc10733013136145b497e1be0 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Sun, 10 Jul 2011 20:07:25 -0400 Subject: ext4: free allocated and pre-allocated blocks when check_eofblocks_fl fails Upon corrupted inode or disk failures, we may fail after we already allocate some blocks from the inode or take some blocks from the inode's preallocation list, but before we successfully insert the corresponding extent to the extent tree. In this case, we should free any allocated blocks and discard the inode's preallocated blocks because the entries in the inode's preallocation list may be in an inconsistent state. Signed-off-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/extents.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a86213882655..c969ae23a535 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3560,10 +3560,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); - if (err) - goto out2; - - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); if (err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; -- cgit v1.2.3 From 4862fd6047ed02e2726667c54d35f538eecc56aa Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 10 Jul 2011 22:05:08 -0400 Subject: jbd2: remove jbd2_dev_to_name() from jbd2 tracepoints Using function calls in TP_printk causes perf heartburn, so print the MAJOR/MINOR device numbers instead. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 67 --------------------------------------------- include/linux/jbd2.h | 6 ---- include/trace/events/jbd2.h | 36 ++++++++++++------------ 3 files changed, 19 insertions(+), 90 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0dfa5b598e68..f24df13adc4e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2390,73 +2390,6 @@ static void __exit journal_exit(void) jbd2_journal_destroy_caches(); } -/* - * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 - * tracing infrastructure to map a dev_t to a device name. - * - * The caller should use rcu_read_lock() in order to make sure the - * device name stays valid until its done with it. We use - * rcu_read_lock() as well to make sure we're safe in case the caller - * gets sloppy, and because rcu_read_lock() is cheap and can be safely - * nested. - */ -struct devname_cache { - struct rcu_head rcu; - dev_t device; - char devname[BDEVNAME_SIZE]; -}; -#define CACHE_SIZE_BITS 6 -static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; -static DEFINE_SPINLOCK(devname_cache_lock); - -static void free_devcache(struct rcu_head *rcu) -{ - kfree(rcu); -} - -const char *jbd2_dev_to_name(dev_t device) -{ - int i = hash_32(device, CACHE_SIZE_BITS); - char *ret; - struct block_device *bd; - static struct devname_cache *new_dev; - - rcu_read_lock(); - if (devcache[i] && devcache[i]->device == device) { - ret = devcache[i]->devname; - rcu_read_unlock(); - return ret; - } - rcu_read_unlock(); - - new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); - if (!new_dev) - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ - bd = bdget(device); - spin_lock(&devname_cache_lock); - if (devcache[i]) { - if (devcache[i]->device == device) { - kfree(new_dev); - bdput(bd); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; - } - call_rcu(&devcache[i]->rcu, free_devcache); - } - devcache[i] = new_dev; - devcache[i]->device = device; - if (bd) { - bdevname(bd, devcache[i]->devname); - bdput(bd); - } else - __bdevname(device, devcache[i]->devname); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; -} -EXPORT_SYMBOL(jbd2_dev_to_name); - MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d087c2e7b2aa..38f307b8c334 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1329,12 +1329,6 @@ extern int jbd_blocks_per_page(struct inode *inode); #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) #define JBUFFER_TRACE(jh, info) do {} while (0) -/* - * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 - * tracing infrastructure to map a dev_t to a device name. - */ -extern const char *jbd2_dev_to_name(dev_t device); - #endif /* __KERNEL__ */ #endif /* _LINUX_JBD2_H */ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index bf16545cc977..75964412ddbb 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -26,8 +26,8 @@ TRACE_EVENT(jbd2_checkpoint, __entry->result = result; ), - TP_printk("dev %s result %d", - jbd2_dev_to_name(__entry->dev), __entry->result) + TP_printk("dev %d,%d result %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result) ); DECLARE_EVENT_CLASS(jbd2_commit, @@ -48,9 +48,9 @@ DECLARE_EVENT_CLASS(jbd2_commit, __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %s transaction %d sync %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit) + TP_printk("dev %d,%d transaction %d sync %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit) ); DEFINE_EVENT(jbd2_commit, jbd2_start_commit, @@ -100,9 +100,9 @@ TRACE_EVENT(jbd2_end_commit, __entry->head = journal->j_tail_sequence; ), - TP_printk("dev %s transaction %d sync %d head %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit, __entry->head) + TP_printk("dev %d,%d transaction %d sync %d head %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->transaction, __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, @@ -120,8 +120,9 @@ TRACE_EVENT(jbd2_submit_inode_data, __entry->ino = inode->i_ino; ), - TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) + TP_printk("dev %d,%d ino %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino) ); TRACE_EVENT(jbd2_run_stats, @@ -156,9 +157,9 @@ TRACE_EVENT(jbd2_run_stats, __entry->blocks_logged = stats->rs_blocks_logged; ), - TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u " + TP_printk("dev %d,%d tid %lu wait %u running %u locked %u flushing %u " "logging %u handle_count %u blocks %u blocks_logged %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), @@ -192,9 +193,9 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->dropped = stats->cs_dropped; ), - TP_printk("dev %s tid %lu chp_time %u forced_to_close %u " + TP_printk("dev %d,%d tid %lu chp_time %u forced_to_close %u " "written %u dropped %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); @@ -222,9 +223,10 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, __entry->freed = freed; ), - TP_printk("dev %s from %u to %u offset %lu freed %lu", - jbd2_dev_to_name(__entry->dev), __entry->tail_sequence, - __entry->first_tid, __entry->block_nr, __entry->freed) + TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tail_sequence, __entry->first_tid, + __entry->block_nr, __entry->freed) ); #endif /* _TRACE_JBD2_H */ -- cgit v1.2.3 From 12706394bcaa48e3d5e19c97d7b4e5683ebb12fb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 10 Jul 2011 22:37:50 -0400 Subject: ext4: add tracepoint for ext4_journal_start This will help debug who is responsible for starting a jbd2 transaction. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 + include/trace/events/ext4.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa864b3..7910e61809e7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -269,6 +269,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) journal_t *journal; handle_t *handle; + trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 5ce2b2f5f524..6f27a59fc90d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1520,6 +1520,28 @@ TRACE_EVENT(ext4_load_inode, (unsigned long) __entry->ino) ); +TRACE_EVENT(ext4_journal_start, + TP_PROTO(struct super_block *sb, int nblocks, unsigned long IP), + + TP_ARGS(sb, nblocks, IP), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nblocks ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nblocks = nblocks; + __entry->ip = IP; + ), + + TP_printk("dev %d,%d nblocks %d caller %pF", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nblocks, (void *)__entry->ip) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 22f10457432387615fa1ae6e0375d9cacc50819b Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sun, 10 Jul 2011 23:52:37 -0400 Subject: ext4: fix trim length underflow with small trim length In 0f0a25b, we adjust 'len' with s_first_data_block - start, but it could underflow in case blocksize=1K, fstrim_range.len=512 and fstrim_range.start = 0. In this case, when we run the code: len -= first_data_blk - start; len will be underflow to -1ULL. In the end, although we are safe that last_group check later will limit the trim to the whole volume, but that isn't what the user really want. So this patch fix it. It also adds the check for 'start' like ext3 so that we can break immediately if the start is invalid. Cc: Lukas Czerner Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1900ec7a1579..b189cb4ff20f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4902,6 +4902,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; + if (start + len <= first_data_blk) + goto out; if (start < first_data_blk) { len -= first_data_blk - start; start = first_data_blk; @@ -4950,5 +4952,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } range->len = trimmed * sb->s_blocksize; +out: return ret; } -- cgit v1.2.3 From 169ddc3ec83b5f732e51d975befb191d50795844 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 00:00:07 -0400 Subject: ext4: speed up group trim with the right free block count When we trim some free blocks in a group of ext4, we need to calculate the free blocks properly and check whether there are enough freed blocks left for us to trim. Current solution will only calculate free spaces if they are large for a trim which isn't appropriate. Let us see a small example: a group has 1.5M free which are 300k, 300k, 300k, 300k, 300k. And minblocks is 1M. With current solution, we have to iterate the whole group since these 300k will never be subtracted from 1.5M. But actually we should exit after we find the first 2 free spaces since the left 3 chunks only sum up to 900K if we subtract the first 600K although they can't be trimed. Reviewed-by: Andreas Dilger Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b189cb4ff20f..4a25725e9157 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4821,7 +4821,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t minblocks) { void *bitmap; - ext4_grpblk_t next, count = 0; + ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; int ret; @@ -4848,6 +4848,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next - start, group, &e4b); count += next - start; } + free_count += next - start; start = next + 1; if (fatal_signal_pending(current)) { @@ -4861,7 +4862,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); } - if ((e4b.bd_info->bb_free - count) < minblocks) + if ((e4b.bd_info->bb_free - free_count) < minblocks) break; } ext4_unlock_group(sb, group); -- cgit v1.2.3 From b3d4c2b10b68d205d3eb1b5c17dcb4649a502798 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 00:01:52 -0400 Subject: ext4: Add new ext4 trim tracepoints Add ext4_trim_extent and ext4_trim_all_free. Reviewed-by: Lukas Czerner Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 ++++ include/trace/events/ext4.h | 49 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4a25725e9157..7aa4c16caca1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4780,6 +4780,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, { struct ext4_free_extent ex; + trace_ext4_trim_extent(sb, group, start, count); + assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; @@ -4825,6 +4827,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, struct ext4_buddy e4b; int ret; + trace_ext4_trim_all_free(sb, group, start, max); + ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_error(sb, "Error in loading buddy " diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6f27a59fc90d..51d88139eb8c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1542,6 +1542,55 @@ TRACE_EVENT(ext4_journal_start, __entry->nblocks, (void *)__entry->ip) ); +DECLARE_EVENT_CLASS(ext4__trim, + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( __u32, group ) + __field( int, start ) + __field( int, len ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); + __entry->group = group; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d group %u, start %d, len %d", + __entry->dev_major, __entry->dev_minor, + __entry->group, __entry->start, __entry->len) +); + +DEFINE_EVENT(ext4__trim, ext4_trim_extent, + + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len) +); + +DEFINE_EVENT(ext4__trim, ext4_trim_all_free, + + TP_PROTO(struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), + + TP_ARGS(sb, group, start, len) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 3d56b8d2c74cc3f375ce332b3ac3519e009d79ee Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 00:03:38 -0400 Subject: ext4: Speed up FITRIM by recording flags in ext4_group_info In ext4, when FITRIM is called every time, we iterate all the groups and do trim one by one. It is a bit time wasting if the group has been trimmed and there is no change since the last trim. So this patch adds a new flag in ext4_group_info->bb_state to indicate that the group has been trimmed, and it will be cleared if some blocks is freed(in release_blocks_on_commit). Another trim_minlen is added in ext4_sb_info to record the last minlen we use to trim the volume, so that if the caller provide a small one, we will go on the trim regardless of the bb_state. A simple test with my intel x25m ssd: df -h shows: /dev/sdb1 40G 21G 17G 56% /mnt/ext4 Block size: 4096 run the FITRIM with the following parameter: range.start = 0; range.len = UINT64_MAX; range.minlen = 1048576; without the patch: [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m5.505s user 0m0.000s sys 0m1.224s [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m5.359s user 0m0.000s sys 0m1.178s [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m5.228s user 0m0.000s sys 0m1.151s with the patch: [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m5.625s user 0m0.000s sys 0m1.269s [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m0.002s user 0m0.000s sys 0m0.001s [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a real 0m0.002s user 0m0.000s sys 0m0.001s A big improvement for the 2nd and 3rd run. Even after I delete some big image files, it is still much faster than iterating the whole disk. [root@boyu-tm test]# time ./ftrim /mnt/ext4/a real 0m1.217s user 0m0.000s sys 0m0.196s Cc: Lukas Czerner Reviewed-by: Andreas Dilger Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 13 ++++++++++++- fs/ext4/mballoc.c | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d13f3b509886..62cee2b6fe79 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1215,6 +1215,9 @@ struct ext4_sb_info { /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2080,11 +2083,19 @@ struct ext4_group_info { * 5 free 8-block regions. */ }; -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7aa4c16caca1..73c254085a41 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2628,6 +2628,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) rb_erase(&entry->node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); + if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() @@ -4838,6 +4847,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); + if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && + minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) + goto out; + start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; @@ -4869,6 +4882,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, if ((e4b.bd_info->bb_free - free_count) < minblocks) break; } + + if (!ret) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); +out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4957,6 +4974,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } range->len = trimmed * sb->s_blocksize; + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + out: return ret; } -- cgit v1.2.3 From 22612283f7da1ce9849d9b3716010b07a0446fd9 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 00:04:34 -0400 Subject: ext4: Change the wrong param comment for ext4_trim_all_free at ext4_trim_all_free() comment, there is no longer an @e4b parameter, instead it is @group. Reported-by: Andreas Dilger Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 73c254085a41..04a3d92aafb4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4811,7 +4811,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system - * @e4b: ext4 buddy + * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count -- cgit v1.2.3 From fa09200b8334f9a6af3f656edae924a98d85630f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 27 May 2011 12:06:11 -0400 Subject: Btrfs: try to only do one btrfs_search_slot in do_setxattr I've been watching how many btrfs_search_slot()'s we do and I noticed that when we create a file with selinux enabled we were doing 2 each time we initialize the security context. That's because we lookup the xattr first so we can delete it if we're setting a new value to an existing xattr. But in the create case we don't have any xattrs, so it is completely useless to have the extra lookup. So re-arrange things so that we only lookup first if we specifically have XATTR_REPLACE. That way in the basic case we only do 1 search, and in the more complicated case we do the normal 2 lookups. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/dir-item.c | 9 ++------ fs/btrfs/xattr.c | 66 ++++++++++++++++++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 685f2593c4f0..c360a848d97f 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); - /* - * FIXME: at some point we should handle xattr's that are larger than - * what we can fit in our leaf. We set location to NULL b/c we arent - * pointing at anything else, that will change if we store the xattr - * data in a separate inode. - */ - BUG_ON(IS_ERR(dir_item)); + if (IS_ERR(dir_item)) + return PTR_ERR(dir_item); memset(&location, 0, sizeof(location)); leaf = path->nodes[0]; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5366fe452ab0..d733b9cfea34 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -102,43 +102,57 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - /* first lets see if we already have this xattr */ - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - strlen(name), -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - - /* ok we already have this xattr, lets remove it */ - if (di) { - /* if we want create only exit */ - if (flags & XATTR_CREATE) { - ret = -EEXIST; + if (flags & XATTR_REPLACE) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, + name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + ret = -ENODATA; goto out; } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); + if (ret) + goto out; btrfs_release_path(path); + } - /* if we don't have a value then we are removing the xattr */ - if (!value) +again: + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + if (ret == -EEXIST) { + if (flags & XATTR_CREATE) goto out; - } else { + /* + * We can't use the path we already have since we won't have the + * proper locking for a delete, so release the path and + * re-lookup to delete the thing. + */ btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + /* Shouldn't happen but just in case... */ + btrfs_release_path(path); + goto again; + } - if (flags & XATTR_REPLACE) { - /* we couldn't find the attr to replace */ - ret = -ENODATA; + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) goto out; + + /* + * We have a value to set, so go back and try to insert it now. + */ + if (value) { + btrfs_release_path(path); + goto again; } } - - /* ok we have to create a completely new xattr */ - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), - name, name_len, value, size); - BUG_ON(ret); out: btrfs_free_path(path); return ret; -- cgit v1.2.3 From b5009945be18023942ce28327893c7bc1e58fe54 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Jun 2011 15:07:51 -0400 Subject: Btrfs: do transaction space reservation before joining the transaction We have to do weird things when handling enospc in the transaction joining code. Because we've already joined the transaction we cannot commit the transaction within the reservation code since it will deadlock, so we have to return EAGAIN and then make sure we don't retry too many times. Instead of doing this, just do the reservation the normal way before we join the transaction, that way we can do whatever we want to try and reclaim space, and then if it fails we know for sure we are out of space and we can return ENOSPC. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 --- fs/btrfs/extent-tree.c | 20 -------------------- fs/btrfs/transaction.c | 36 +++++++++++++++++------------------- 3 files changed, 17 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 60e13ef23a5e..28e170bcdcb5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2223,9 +2223,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71cd456fdb60..0ed5fe03a06a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3874,26 +3874,6 @@ int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, return 0; } -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int num_items) -{ - u64 num_bytes; - int ret; - - if (num_items == 0 || root->fs_info->chunk_root == root) - return 0; - - num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes); - if (!ret) { - trans->bytes_reserved += num_bytes; - trans->block_rsv = &root->fs_info->trans_block_rsv; - } - return ret; -} - void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 51dcec86757f..654755b18951 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -260,7 +260,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; + u64 num_bytes = 0; int ret; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -274,6 +274,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_rsv = NULL; goto got_it; } + + /* + * Do the reservation before we join the transaction so we can do all + * the appropriate flushing if need be. + */ + if (num_items > 0 && root != root->fs_info->chunk_root) { + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(NULL, root, + &root->fs_info->trans_block_rsv, + num_bytes); + if (ret) + return ERR_PTR(ret); + } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -310,24 +323,9 @@ again: goto again; } - if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items); - if (ret == -EAGAIN && !retries) { - retries++; - btrfs_commit_transaction(h, root); - goto again; - } else if (ret == -EAGAIN) { - /* - * We have already retried and got EAGAIN, so really we - * don't have space, so set ret to -ENOSPC. - */ - ret = -ENOSPC; - } - - if (ret < 0) { - btrfs_end_transaction(h, root); - return ERR_PTR(ret); - } + if (num_bytes) { + h->block_rsv = &root->fs_info->trans_block_rsv; + h->bytes_reserved = num_bytes; } got_it: -- cgit v1.2.3 From fdb5effd5c2a7e01dc3a4217bb194e2d3a5b160f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Jun 2011 16:07:44 -0400 Subject: Btrfs: serialize flushers in reserve_metadata_bytes We keep having problems with early enospc, and that's because our method of making space is inherently racy. The problem is we can have one guy trying to make space for himself, and in the meantime people come in and steal his reservation. In order to stop this we make a waitqueue and put anybody who comes into reserve_metadata_bytes on that waitqueue if somebody is trying to make more space. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/extent-tree.c | 69 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 51 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 28e170bcdcb5..406c33876605 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -756,6 +756,8 @@ struct btrfs_space_info { chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + unsigned int flush:1; /* set if we are trying to make space */ + unsigned int force_alloc; /* set if we need to force a chunk alloc for this space */ @@ -766,6 +768,7 @@ struct btrfs_space_info { spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; + wait_queue_head_t wait; }; struct btrfs_block_rsv { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0ed5fe03a06a..340c14715091 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2932,6 +2932,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; + found->flush = 0; + init_waitqueue_head(&found->wait); *space_info = found; list_add_rcu(&found->list, &info->space_info); atomic_set(&found->caching_threads, 0); @@ -3314,6 +3316,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (reserved == 0) return 0; + smp_mb(); + if (root->fs_info->delalloc_bytes == 0) { + if (trans) + return 0; + btrfs_wait_ordered_extents(root, 0, 0); + return 0; + } + max_reclaim = min(reserved, to_reclaim); while (loops < 1024) { @@ -3356,6 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } + if (reclaimed >= to_reclaim && !trans) + btrfs_wait_ordered_extents(root, 0, 0); return reclaimed >= to_reclaim; } @@ -3380,15 +3392,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; - bool reserved = false; bool committed = false; + bool flushing = false; again: - ret = -ENOSPC; - if (reserved) - num_bytes = 0; - + ret = 0; spin_lock(&space_info->lock); + /* + * We only want to wait if somebody other than us is flushing and we are + * actually alloed to flush. + */ + while (flush && !flushing && space_info->flush) { + spin_unlock(&space_info->lock); + /* + * If we have a trans handle we can't wait because the flusher + * may have to commit the transaction, which would mean we would + * deadlock since we are waiting for the flusher to finish, but + * hold the current transaction open. + */ + if (trans) + return -EAGAIN; + ret = wait_event_interruptible(space_info->wait, + !space_info->flush); + /* Must have been interrupted, return */ + if (ret) + return -EINTR; + + spin_lock(&space_info->lock); + } + + ret = -ENOSPC; unused = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -3403,8 +3436,7 @@ again: if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; if (unused >= num_bytes) { - if (!reserved) - space_info->bytes_reserved += orig_bytes; + space_info->bytes_reserved += orig_bytes; ret = 0; } else { /* @@ -3429,17 +3461,14 @@ again: * to reclaim space we can actually use it instead of somebody else * stealing it from us. */ - if (ret && !reserved) { - space_info->bytes_reserved += orig_bytes; - reserved = true; + if (ret && flush) { + flushing = true; + space_info->flush = 1; } spin_unlock(&space_info->lock); - if (!ret) - return 0; - - if (!flush) + if (!ret || !flush) goto out; /* @@ -3447,9 +3476,7 @@ again: * metadata until after the IO is completed. */ ret = shrink_delalloc(trans, root, num_bytes, 1); - if (ret > 0) - return 0; - else if (ret < 0) + if (ret < 0) goto out; /* @@ -3462,11 +3489,11 @@ again: goto again; } - spin_lock(&space_info->lock); /* * Not enough space to be reclaimed, don't bother committing the * transaction. */ + spin_lock(&space_info->lock); if (space_info->bytes_pinned < orig_bytes) ret = -ENOSPC; spin_unlock(&space_info->lock); @@ -3489,12 +3516,12 @@ again: } out: - if (reserved) { + if (flushing) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= orig_bytes; + space_info->flush = 0; + wake_up_all(&space_info->wait); spin_unlock(&space_info->lock); } - return ret; } -- cgit v1.2.3 From 2f356126c589d562f98e2287f9c7b983388dc62f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 10 Jun 2011 15:31:13 -0400 Subject: Btrfs: use the normal checksumming infrastructure for free space cache We used to store the checksums of the space cache directly in the space cache, however that doesn't work out too well if we have more space than we can fit the checksums into the first page. So instead use the normal checksumming infrastructure. There were problems with doing this originally but those problems don't exist now so this works out fine. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 169 ++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 110 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bf0d61567f3d..fd7fa2a74f06 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { + printk(KERN_INFO "Old style space inode found, converting.\n"); + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; + block_group->disk_cache_state = BTRFS_DC_CLEAR; + } + if (!btrfs_fs_closing(root->fs_info)) { block_group->inode = igrab(inode); block_group->iref = 1; @@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + BTRFS_INODE_PREALLOC); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -239,17 +245,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct page *page; - u32 *checksums = NULL, *crc; - char *disk_crcs = NULL; struct btrfs_key key; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - u32 cur_crc = ~(u32)0; pgoff_t index = 0; - unsigned long first_page_offset; - int num_checksums; int ret = 0; INIT_LIST_HEAD(&bitmaps); @@ -292,16 +293,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) goto out; - /* Setup everything for doing checksumming */ - num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; - checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); - if (!checksums) - goto out; - first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); - disk_crcs = kzalloc(first_page_offset, GFP_NOFS); - if (!disk_crcs) - goto out; - ret = readahead_cache(inode); if (ret) goto out; @@ -311,17 +302,11 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space *e; void *addr; unsigned long offset = 0; - unsigned long start_offset = 0; int need_loop = 0; if (!num_entries && !num_bitmaps) break; - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } - page = grab_cache_page(inode->i_mapping, index); if (!page) goto free_cache; @@ -342,8 +327,15 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (index == 0) { u64 *gen; - memcpy(disk_crcs, addr, first_page_offset); - gen = addr + (sizeof(u32) * num_checksums); + /* + * We put a bogus crc in the front of the first page in + * case old kernels try to mount a fs with the new + * format to make sure they discard the cache. + */ + addr += sizeof(u64); + offset += sizeof(u64); + + gen = addr; if (*gen != BTRFS_I(inode)->generation) { printk(KERN_ERR "btrfs: space cache generation" " (%llu) does not match inode (%llu)\n", @@ -355,24 +347,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, page_cache_release(page); goto free_cache; } - crc = (u32 *)disk_crcs; - } - entry = addr + start_offset; - - /* First lets check our crc before we do anything fun */ - cur_crc = ~(u32)0; - cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, - PAGE_CACHE_SIZE - start_offset); - btrfs_csum_final(cur_crc, (char *)&cur_crc); - if (cur_crc != *crc) { - printk(KERN_ERR "btrfs: crc mismatch for page %lu\n", - index); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; + addr += sizeof(u64); + offset += sizeof(u64); } - crc++; + entry = addr; while (1) { if (!num_entries) @@ -470,8 +448,6 @@ next: ret = 1; out: - kfree(checksums); - kfree(disk_crcs); return ret; free_cache: __btrfs_remove_free_space_cache(ctl); @@ -569,8 +545,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_key key; u64 start, end, len; u64 bytes = 0; - u32 *crc, *checksums; - unsigned long first_page_offset; + u32 crc = ~(u32)0; int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; @@ -590,34 +565,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* Since the first page has all of our checksums and our generation we - * need to calculate the offset into the page that we can start writing - * our entries. - */ - first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); - filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); - /* make sure we don't overflow that first page */ - if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { - /* this is really the same as running out of space, where we also return 0 */ - printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); - ret = 0; - goto out_update; - } - - /* We need a checksum per page. */ - crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); - if (!crc) - return -1; - pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); - if (!pages) { - kfree(crc); + if (!pages) return -1; - } /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -648,7 +602,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_page(pages[i]); page_cache_release(pages[i]); } - goto out_free; + goto out; } pages[index] = page; index++; @@ -668,17 +622,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Write out the extent entries */ do { struct btrfs_free_space_entry *entry; - void *addr; + void *addr, *orig; unsigned long offset = 0; - unsigned long start_offset = 0; next_page = false; - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } - if (index >= num_pages) { out_of_space = true; break; @@ -686,10 +634,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, page = pages[index]; - addr = kmap(page); - entry = addr + start_offset; + orig = addr = kmap(page); + if (index == 0) { + u64 *gen; - memset(addr, 0, PAGE_CACHE_SIZE); + /* + * We're going to put in a bogus crc for this page to + * make sure that old kernels who aren't aware of this + * format will be sure to discard the cache. + */ + addr += sizeof(u64); + offset += sizeof(u64); + + gen = addr; + *gen = trans->transid; + addr += sizeof(u64); + offset += sizeof(u64); + } + entry = addr; + + memset(addr, 0, PAGE_CACHE_SIZE - offset); while (node && !next_page) { struct btrfs_free_space *e; @@ -752,13 +716,19 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, next_page = true; entry++; } - *crc = ~(u32)0; - *crc = btrfs_csum_data(root, addr + start_offset, *crc, - PAGE_CACHE_SIZE - start_offset); - kunmap(page); - btrfs_csum_final(*crc, (char *)crc); - crc++; + /* Generate bogus crc value */ + if (index == 0) { + u32 *tmp; + crc = btrfs_csum_data(root, orig + sizeof(u64), crc, + PAGE_CACHE_SIZE - sizeof(u64)); + btrfs_csum_final(crc, (char *)&crc); + crc++; + tmp = orig; + *tmp = crc; + } + + kunmap(page); bytes += PAGE_CACHE_SIZE; @@ -779,11 +749,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, addr = kmap(page); memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); - *crc = ~(u32)0; - *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); kunmap(page); - btrfs_csum_final(*crc, (char *)crc); - crc++; bytes += PAGE_CACHE_SIZE; list_del_init(&entry->list); @@ -796,7 +762,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, i_size_read(inode) - 1, &cached_state, GFP_NOFS); ret = 0; - goto out_free; + goto out; } /* Zero out the rest of the pages just to make sure */ @@ -811,20 +777,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, index++; } - /* Write the checksums and trans id to the first page */ - { - void *addr; - u64 *gen; - - page = pages[0]; - - addr = kmap(page); - memcpy(addr, checksums, sizeof(u32) * num_pages); - gen = addr + (sizeof(u32) * num_pages); - *gen = trans->transid; - kunmap(page); - } - ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, bytes, &cached_state); btrfs_drop_pages(pages, num_pages); @@ -833,7 +785,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) { ret = 0; - goto out_free; + goto out; } BTRFS_I(inode)->generation = trans->transid; @@ -850,7 +802,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); - goto out_free; + goto out; } leaf = path->nodes[0]; if (ret > 0) { @@ -866,7 +818,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); btrfs_release_path(path); - goto out_free; + goto out; } } header = btrfs_item_ptr(leaf, path->slots[0], @@ -879,11 +831,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = 1; -out_free: - kfree(checksums); +out: kfree(pages); - -out_update: if (ret != 1) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; -- cgit v1.2.3 From df98b6e2c52f65665eaf0fc23e647fb64335b289 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 20 Jun 2011 14:53:48 -0400 Subject: Btrfs: fix how we merge extent states and deal with cached states First, we can sometimes free the state we're merging, which means anybody who calls merge_state() may have the state it passed in free'ed. This is problematic because we could end up caching the state, which makes caching useless as the state will no longer be part of the tree. So instead of free'ing the state we passed into merge_state(), set it's end to the other->end and free the other state. This way we are sure to cache the correct state. Also because we can merge states together, instead of only using the cache'd state if it's start == the start we are looking for, go ahead and use it if the start we are looking for is within the range of the cached state. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b181a94a7170..abb922daf1b6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -279,11 +279,10 @@ static int merge_state(struct extent_io_tree *tree, if (other->start == state->end + 1 && other->state == state->state) { merge_cb(tree, state, other); - other->start = state->start; - state->tree = NULL; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - state = NULL; + state->end = other->end; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); } } @@ -349,7 +348,6 @@ static int insert_state(struct extent_io_tree *tree, "%llu %llu\n", (unsigned long long)found->start, (unsigned long long)found->end, (unsigned long long)start, (unsigned long long)end); - free_extent_state(state); return -EEXIST; } state->tree = tree; @@ -498,7 +496,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start == start) { + if (cached && cached->tree && cached->start <= start && + cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -740,7 +739,8 @@ again: spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->start == start && state->tree) { + if (state->start <= start && state->end > start && + state->tree) { node = &state->rb_node; goto hit_next; } @@ -781,13 +781,13 @@ hit_next: if (err) goto out; - next_node = rb_next(node); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; + next_node = rb_next(&state->rb_node); if (next_node && start < end && prealloc && !need_resched()) { state = rb_entry(next_node, struct extent_state, rb_node); @@ -860,7 +860,6 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - atomic_inc(&prealloc->refs); err = insert_state(tree, prealloc, start, this_end, &bits); BUG_ON(err == -EEXIST); @@ -870,7 +869,6 @@ hit_next: goto out; } cache_state(prealloc, cached_state); - free_extent_state(prealloc); prealloc = NULL; start = this_end + 1; goto search_again; @@ -1562,7 +1560,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start == start) + if (cached && cached->tree && cached->start <= start && + cached->end > start) node = &cached->rb_node; else node = tree_search(tree, start); -- cgit v1.2.3 From ffb505ff0f7b52318dea46dd139107a8371b4ad7 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 11 Jul 2011 11:43:59 -0400 Subject: ext4: remove redundant goto in ext4_ext_insert_extent() If eh->eh_entries is smaller than eh->eh_max, the routine will go to the "repeat" and then go to "has_space" directlly , since argument "depth" and "eh" are not even changed. Therefore, goto "has_space" directly and remove redundant "repeat" tag. Signed-off-by: Robin Dong --- fs/ext4/extents.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c969ae23a535..9cbdcb2110f5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1723,7 +1723,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, goto merge; } -repeat: depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) @@ -1745,7 +1744,7 @@ repeat: ext_debug("next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; - goto repeat; + goto has_space; } ext_debug("next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); -- cgit v1.2.3 From 598dbdf2433cad55bd44d923f67a053871e3eabf Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Mon, 11 Jul 2011 18:24:01 -0400 Subject: ext4: avoid unneeded ext4_ext_next_leaf_block() while inserting extents Optimize ext4_ext_insert_extent() by avoiding ext4_ext_next_leaf_block() when the result is not used/needed. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9cbdcb2110f5..f1c538e5055c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1730,9 +1730,10 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); - next = ext4_ext_next_leaf_block(inode, path); - if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCKS) { + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(inode, path); + if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); -- cgit v1.2.3 From 823ba01fc07751200c43e45733925a98b73eac3a Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 18:26:01 -0400 Subject: ext4: fix a race which could leak memory in ext4_groupinfo_create_slab() In ext4_groupinfo_create_slab, we create ext4_groupinfo_caches within ext4_grpinfo_slab_create_mutex, but set it outside the lock, and there does exist some case that we may create it twice and causes a memory leak. So set it before we call mutex_unlock. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 04a3d92aafb4..2b9a71b99b2b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2404,14 +2404,14 @@ static int ext4_groupinfo_create_slab(size_t size) slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_groupinfo_caches[cache_index] = cachep; + mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); return -ENOMEM; } - ext4_groupinfo_caches[cache_index] = cachep; - return 0; } -- cgit v1.2.3 From caaf7a29d31da21bb8d8200d5e42d1c93d3c6e00 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 11 Jul 2011 18:42:42 -0400 Subject: ext4: Fix a double free of sbi->s_group_info in ext4_mb_init_backend If we meet with an error in ext4_mb_add_groupinfo, we kfree sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)], but fail to reset it to NULL. So the caller ext4_mb_init_backend will try to kfree it again and causes a double free. So fix it by resetting it to NULL. Some typo in comments of mballoc.c are also changed. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2b9a71b99b2b..b97a2d2f0fdf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -75,8 +75,8 @@ * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc - * space we will consume the particular prealloc space. This make sure that - * that the we have contiguous physical blocks representing the file blocks + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except @@ -84,7 +84,7 @@ * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group - * prealloc space. These are per CPU prealloc list repreasented as + * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * @@ -152,7 +152,7 @@ * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it - * can used for allocation. ext4_mb_good_group explains how the groups are + * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first @@ -2279,8 +2279,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, exit_group_info: /* If a meta_group_info table has been allocated, release it now */ - if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ -- cgit v1.2.3 From afb86178cb9b6a7329cf8709aa210fb0a245b606 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Mon, 11 Jul 2011 18:47:04 -0400 Subject: ext4: remove unnecessary comments in ext4_orphan_add() The comment from Al Viro about possible race in the ext4_orphan_add() is not justified. There is no race possible as we always have either i_mutex locked, or the inode can not be referenced from outside hence the J_ASSERS should not be hit from the reason described in comment. This commit replaces it with notion that we are holding i_mutex so it should not be possible for i_nlink to be changed while waiting for s_orphan_lock. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b754b7721f51..8dde5ab239cc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1989,18 +1989,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; - /* Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. */ - - /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on s_orphan_lock), so race with ext4_link() which might bump - * ->i_nlink. For, say it, character device. Not a regular file, - * not a directory, not a symlink and ->i_nlink > 0. - * - * tytso, 4/25/2009: I'm not sure how that could happen; - * shouldn't the fs core protect us from these sort of - * unlink()/link() races? + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); -- cgit v1.2.3 From 82c2c8b8616fa9e77264c53f0df483f74ac54613 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 1 Jun 2011 16:54:32 +0400 Subject: lockd: properly convert be32 values in debug messages lockd: server returns status 50331648 it's quite hard to understand that number in this message is 3 in big endian Signed-off-by: Vasily Averin Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index e374050a911c..8392cb85bd54 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -302,7 +302,8 @@ nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc) /* We appear to be out of the grace period */ wake_up_all(&host->h_gracewait); } - dprintk("lockd: server returns status %d\n", resp->status); + dprintk("lockd: server returns status %d\n", + ntohl(resp->status)); return 0; /* Okay, call complete */ } @@ -690,7 +691,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) goto out; if (resp->status != nlm_lck_denied_nolocks) - printk("lockd: unexpected unlock status: %d\n", resp->status); + printk("lockd: unexpected unlock status: %d\n", + ntohl(resp->status)); /* What to do now? I'm out of my depth... */ status = -ENOLCK; out: @@ -843,6 +845,7 @@ nlm_stat_to_errno(__be32 status) return -ENOLCK; #endif } - printk(KERN_NOTICE "lockd: unexpected server status %d\n", status); + printk(KERN_NOTICE "lockd: unexpected server status %d\n", + ntohl(status)); return -ENOLCK; } -- cgit v1.2.3 From c9895cb69b07a4b17d8fdae26667f9a9fba5183b Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 31 May 2011 18:48:56 -0400 Subject: NFS: pnfs IPv6 support Handle ipv6 remote addresses from GETDEVICEINFO - supports netid "tcp" for ipv4 and "tcp6" for ipv6 as rfc 5665 specifies - added ds_remotestr to avoid having to handle different AFs in every dprintk - tested against pynfs 4.1 server, submitting ipv6 support patch to pynfs - tested with IPv6 disabled, it compiles cleanly and relies on rpc_pton to refuse to accept IPv6 addresses Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 7 +- fs/nfs/nfs4filelayout.h | 5 +- fs/nfs/nfs4filelayoutdev.c | 255 ++++++++++++++++++++++++++++++++------------- 3 files changed, 191 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index f9d03abcd04c..b690bb5ee58d 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -344,8 +344,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); return PNFS_NOT_ATTEMPTED; } - dprintk("%s USE DS:ip %x %hu\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr); /* No multipath support. Use first DS */ data->ds_clp = ds->ds_clp; @@ -384,9 +383,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); return PNFS_NOT_ATTEMPTED; } - dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__, data->inode->i_ino, sync, (size_t) data->args.count, offset, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + ds->ds_remotestr); data->write_done_cb = filelayout_write_done_cb; data->ds_clp = ds->ds_clp; diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index cebe01e3795e..6c6a817710e5 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -49,8 +49,9 @@ enum stripetype4 { /* Individual ip address */ struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ - u32 ds_ip_addr; - u32 ds_port; + struct sockaddr_storage ds_addr; + size_t ds_addrlen; + char *ds_remotestr; /* human readable addr+port */ struct nfs_client *ds_clp; atomic_t ds_count; }; diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 3b7bf1377264..1a8308b90108 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -56,28 +56,56 @@ print_ds(struct nfs4_pnfs_ds *ds) printk("%s NULL device\n", __func__); return; } - printk(" ip_addr %x port %hu\n" + printk(" ds %s\n" " ref count %d\n" " client %p\n" " cl_exchange_flags %x\n", - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + ds->ds_remotestr, atomic_read(&ds->ds_count), ds->ds_clp, ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); } /* nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(u32 ip_addr, u32 port) +_data_server_lookup_locked(struct sockaddr *addr, size_t addrlen) { struct nfs4_pnfs_ds *ds; - - dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", - ntohl(ip_addr), ntohs(port)); + struct sockaddr_in *a, *b; + struct sockaddr_in6 *a6, *b6; list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { - if (ds->ds_ip_addr == ip_addr && - ds->ds_port == port) { - return ds; + if (addr->sa_family != ds->ds_addr.ss_family) + continue; + + switch (addr->sa_family) { + case AF_INET: + a = (struct sockaddr_in *)addr; + b = (struct sockaddr_in *)&ds->ds_addr; + + if (a->sin_addr.s_addr == b->sin_addr.s_addr && + a->sin_port == b->sin_port) + return ds; + break; + + case AF_INET6: + a6 = (struct sockaddr_in6 *)addr; + b6 = (struct sockaddr_in6 *)&ds->ds_addr; + + /* LINKLOCAL addresses must have matching scope_id */ + if (ipv6_addr_scope(&a6->sin6_addr) == + IPV6_ADDR_SCOPE_LINKLOCAL && + a6->sin6_scope_id != b6->sin6_scope_id) + continue; + + if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && + a6->sin6_port == b6->sin6_port) + return ds; + break; + + default: + dprintk("%s: unhandled address family: %u\n", + __func__, addr->sa_family); + return NULL; } } return NULL; @@ -91,19 +119,14 @@ static int nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) { struct nfs_client *clp; - struct sockaddr_in sin; int status = 0; - dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + dprintk("--> %s addr %s au_flavor %d\n", __func__, ds->ds_remotestr, mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = ds->ds_ip_addr; - sin.sin_port = ds->ds_port; - - clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, - sizeof(sin), IPPROTO_TCP); + clp = nfs4_set_ds_client(mds_srv->nfs_client, + (struct sockaddr *)&ds->ds_addr, + ds->ds_addrlen, IPPROTO_TCP); if (IS_ERR(clp)) { status = PTR_ERR(clp); goto out; @@ -115,8 +138,8 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) goto out_put; } ds->ds_clp = clp; - dprintk("%s [existing] ip=%x, port=%hu\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + dprintk("%s [existing] server=%s\n", __func__, + ds->ds_remotestr); goto out; } @@ -135,8 +158,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) goto out_put; ds->ds_clp = clp; - dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), - ntohs(ds->ds_port)); + dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; out_put: @@ -153,6 +175,7 @@ destroy_ds(struct nfs4_pnfs_ds *ds) if (ds->ds_clp) nfs_put_client(ds->ds_clp); + kfree(ds->ds_remotestr); kfree(ds); } @@ -179,31 +202,85 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) kfree(dsaddr); } +/* + * Create a string with a human readable address and port to avoid + * complicated setup around many dprinks. + */ +static char * +nfs4_pnfs_remotestr(struct sockaddr *ds_addr, gfp_t gfp_flags) +{ + char buf[INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN]; + char *remotestr; + char *startsep = ""; + char *endsep = ""; + size_t len; + uint16_t port; + + switch (ds_addr->sa_family) { + case AF_INET: + port = ((struct sockaddr_in *)ds_addr)->sin_port; + break; + case AF_INET6: + startsep = "["; + endsep = "]"; + port = ((struct sockaddr_in6 *)ds_addr)->sin6_port; + break; + default: + dprintk("%s: Unknown address family %u\n", + __func__, ds_addr->sa_family); + return NULL; + } + + if (!rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf))) { + dprintk("%s: error printing addr\n", __func__); + return NULL; + } + + len = strlen(buf) + strlen(startsep) + strlen(endsep) + 1 + 5 + 1; + remotestr = kzalloc(len, gfp_flags); + + if (unlikely(!remotestr)) { + dprintk("%s: couldn't alloc remotestr\n", __func__); + return NULL; + } + + snprintf(remotestr, len, "%s%s%s:%u", + startsep, buf, endsep, ntohs(port)); + + return remotestr; +} + static struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) +nfs4_pnfs_ds_add(struct sockaddr *addr, size_t addrlen, gfp_t gfp_flags) { - struct nfs4_pnfs_ds *tmp_ds, *ds; + struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; + char *remotestr; ds = kzalloc(sizeof(*tmp_ds), gfp_flags); if (!ds) goto out; + /* this is only used for debugging, so it's ok if its NULL */ + remotestr = nfs4_pnfs_remotestr(addr, gfp_flags); + spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(ip_addr, port); + tmp_ds = _data_server_lookup_locked(addr, addrlen); if (tmp_ds == NULL) { - ds->ds_ip_addr = ip_addr; - ds->ds_port = port; + memcpy(&ds->ds_addr, addr, addrlen); + ds->ds_addrlen = addrlen; + ds->ds_remotestr = remotestr; atomic_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); - dprintk("%s add new data server ip 0x%x\n", __func__, - ds->ds_ip_addr); + dprintk("%s add new data server %s\n", __func__, + ds->ds_remotestr); } else { + kfree(remotestr); kfree(ds); atomic_inc(&tmp_ds->ds_count); - dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", - __func__, tmp_ds->ds_ip_addr, + dprintk("%s data server %s found, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_remotestr, atomic_read(&tmp_ds->ds_count)); ds = tmp_ds; } @@ -213,18 +290,21 @@ out: } /* - * Currently only support ipv4, and one multi-path address. + * Currently only supports ipv4, ipv6 and one multi-path address. */ static struct nfs4_pnfs_ds * decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) { struct nfs4_pnfs_ds *ds = NULL; - char *buf; - const char *ipend, *pstr; - u32 ip_addr, port; - int nlen, rlen, i; + char *buf, *portstr; + struct sockaddr_storage ss; + size_t sslen; + u32 port; + int nlen, rlen; int tmp[2]; __be32 *p; + char *netid, *match_netid; + size_t match_netid_len; /* r_netid */ p = xdr_inline_decode(streamp, 4); @@ -236,62 +316,97 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_fla if (unlikely(!p)) goto out_err; - /* Check that netid is "tcp" */ - if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { - dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); + netid = kmalloc(nlen+1, gfp_flags); + if (unlikely(!netid)) goto out_err; - } - /* r_addr */ + netid[nlen] = '\0'; + memcpy(netid, p, nlen); + + /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ p = xdr_inline_decode(streamp, 4); if (unlikely(!p)) - goto out_err; + goto out_free_netid; rlen = be32_to_cpup(p); p = xdr_inline_decode(streamp, rlen); if (unlikely(!p)) - goto out_err; + goto out_free_netid; - /* ipv6 length plus port is legal */ - if (rlen > INET6_ADDRSTRLEN + 8) { + /* port is ".ABC.DEF", 8 chars max */ + if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { dprintk("%s: Invalid address, length %d\n", __func__, rlen); - goto out_err; + goto out_free_netid; } buf = kmalloc(rlen + 1, gfp_flags); if (!buf) { dprintk("%s: Not enough memory\n", __func__); - goto out_err; + goto out_free_netid; } buf[rlen] = '\0'; memcpy(buf, p, rlen); - /* replace the port dots with dashes for the in4_pton() delimiter*/ - for (i = 0; i < 2; i++) { - char *res = strrchr(buf, '.'); - if (!res) { - dprintk("%s: Failed finding expected dots in port\n", - __func__); - goto out_free; - } - *res = '-'; + /* replace port '.' with '-' */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot in port\n", + __func__); + goto out_free_buf; + } + *portstr = '-'; + + /* find '.' between address and port */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot between address and " + "port\n", __func__); + goto out_free_buf; } + *portstr = '\0'; - /* Currently only support ipv4 address */ - if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { - dprintk("%s: Only ipv4 addresses supported\n", __func__); - goto out_free; + if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&ss, sizeof(ss))) { + dprintk("%s: Error parsing address %s\n", __func__, buf); + goto out_free_buf; } - /* port */ - pstr = ipend; - sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); + portstr++; + sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); port = htons((tmp[0] << 8) | (tmp[1])); - ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); - dprintk("%s: Decoded address and port %s\n", __func__, buf); -out_free: + switch (ss.ss_family) { + case AF_INET: + ((struct sockaddr_in *)&ss)->sin_port = port; + sslen = sizeof(struct sockaddr_in); + match_netid = "tcp"; + match_netid_len = 3; + break; + + case AF_INET6: + ((struct sockaddr_in6 *)&ss)->sin6_port = port; + sslen = sizeof(struct sockaddr_in6); + match_netid = "tcp6"; + match_netid_len = 4; + break; + + default: + dprintk("%s: unsupported address family: %u\n", + __func__, ss.ss_family); + goto out_free_buf; + } + + if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { + dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", + __func__, netid, match_netid); + goto out_free_buf; + } + + ds = nfs4_pnfs_ds_add((struct sockaddr *)&ss, sslen, gfp_flags); + dprintk("%s: Added DS %s\n", __func__, ds->ds_remotestr); +out_free_buf: kfree(buf); +out_free_netid: + kfree(netid); out_err: return ds; } @@ -591,13 +706,13 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) static void filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, - int err, u32 ds_addr) + int err, const char *ds_remotestr) { u32 *p = (u32 *)&dsaddr->id_node.deviceid; - printk(KERN_ERR "NFS: data server %x connection error %d." + printk(KERN_ERR "NFS: data server %s connection error %d." " Deviceid [%x%x%x%x] marked out of use.\n", - ds_addr, err, p[0], p[1], p[2], p[3]); + ds_remotestr, err, p[0], p[1], p[2], p[3]); spin_lock(&nfs4_ds_cache_lock); dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; @@ -628,7 +743,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) err = nfs4_ds_connect(s, ds); if (err) { filelayout_mark_devid_negative(dsaddr, err, - ntohl(ds->ds_ip_addr)); + ds->ds_remotestr); return NULL; } } -- cgit v1.2.3 From 14f9a6076f5388f3fd6341ad4b841337b28fc825 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 31 May 2011 18:48:57 -0400 Subject: NFS: Parse and store all multipath DS addresses This parses and stores all addresses associated with each data server, laying the groundwork for supporting multipath to data servers. - Skips over addresses that cannot be parsed (ie IPv6 addrs if v6 is not enabled). Only fails if none of the addresses are recognizable - Currently only uses the first address that parsed cleanly - Tested against pynfs server (modified to support multipath) Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.h | 12 +- fs/nfs/nfs4filelayoutdev.c | 363 +++++++++++++++++++++++++++++---------------- 2 files changed, 243 insertions(+), 132 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 6c6a817710e5..68cce730b800 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -47,11 +47,17 @@ enum stripetype4 { }; /* Individual ip address */ +struct nfs4_pnfs_ds_addr { + struct sockaddr_storage da_addr; + size_t da_addrlen; + struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *da_remotestr; /* human readable addr+port */ +}; + struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ - struct sockaddr_storage ds_addr; - size_t ds_addrlen; - char *ds_remotestr; /* human readable addr+port */ + char *ds_remotestr; /* comma sep list of addrs */ + struct list_head ds_addrs; struct nfs_client *ds_clp; atomic_t ds_count; }; diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 1a8308b90108..610808a96f25 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -65,52 +65,103 @@ print_ds(struct nfs4_pnfs_ds *ds) ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); } -/* nfs4_ds_cache_lock is held */ -static struct nfs4_pnfs_ds * -_data_server_lookup_locked(struct sockaddr *addr, size_t addrlen) +static bool +same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) { - struct nfs4_pnfs_ds *ds; struct sockaddr_in *a, *b; struct sockaddr_in6 *a6, *b6; - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { - if (addr->sa_family != ds->ds_addr.ss_family) - continue; - - switch (addr->sa_family) { - case AF_INET: - a = (struct sockaddr_in *)addr; - b = (struct sockaddr_in *)&ds->ds_addr; - - if (a->sin_addr.s_addr == b->sin_addr.s_addr && - a->sin_port == b->sin_port) - return ds; - break; - - case AF_INET6: - a6 = (struct sockaddr_in6 *)addr; - b6 = (struct sockaddr_in6 *)&ds->ds_addr; - - /* LINKLOCAL addresses must have matching scope_id */ - if (ipv6_addr_scope(&a6->sin6_addr) == - IPV6_ADDR_SCOPE_LINKLOCAL && - a6->sin6_scope_id != b6->sin6_scope_id) - continue; - - if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && - a6->sin6_port == b6->sin6_port) - return ds; - break; - - default: - dprintk("%s: unhandled address family: %u\n", - __func__, addr->sa_family); - return NULL; + if (addr1->sa_family != addr2->sa_family) + return false; + + switch (addr1->sa_family) { + case AF_INET: + a = (struct sockaddr_in *)addr1; + b = (struct sockaddr_in *)addr2; + + if (a->sin_addr.s_addr == b->sin_addr.s_addr && + a->sin_port == b->sin_port) + return true; + break; + + case AF_INET6: + a6 = (struct sockaddr_in6 *)addr1; + b6 = (struct sockaddr_in6 *)addr2; + + /* LINKLOCAL addresses must have matching scope_id */ + if (ipv6_addr_scope(&a6->sin6_addr) == + IPV6_ADDR_SCOPE_LINKLOCAL && + a6->sin6_scope_id != b6->sin6_scope_id) + return false; + + if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && + a6->sin6_port == b6->sin6_port) + return true; + break; + + default: + dprintk("%s: unhandled address family: %u\n", + __func__, addr1->sa_family); + return false; + } + + return false; +} + +/* + * Lookup DS by addresses. The first matching address returns true. + * nfs4_ds_cache_lock is held + */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(struct list_head *dsaddrs) +{ + struct nfs4_pnfs_ds *ds; + struct nfs4_pnfs_ds_addr *da1, *da2; + + list_for_each_entry(da1, dsaddrs, da_node) { + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + list_for_each_entry(da2, &ds->ds_addrs, da_node) { + if (same_sockaddr( + (struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) + return ds; + } } } return NULL; } +/* + * Compare two lists of addresses. + */ +static bool +_data_server_match_all_addrs_locked(struct list_head *dsaddrs1, + struct list_head *dsaddrs2) +{ + struct nfs4_pnfs_ds_addr *da1, *da2; + size_t count1 = 0, + count2 = 0; + + list_for_each_entry(da1, dsaddrs1, da_node) + count1++; + + list_for_each_entry(da2, dsaddrs2, da_node) { + bool found = false; + count2++; + list_for_each_entry(da1, dsaddrs1, da_node) { + if (same_sockaddr((struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) { + found = true; + break; + } + } + if (!found) + return false; + } + + return (count1 == count2); +} + /* * Create an rpc connection to the nfs4_pnfs_ds data server * Currently only support IPv4 @@ -119,14 +170,21 @@ static int nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) { struct nfs_client *clp; + struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s addr %s au_flavor %d\n", __func__, ds->ds_remotestr, + dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); + BUG_ON(list_empty(&ds->ds_addrs)); + + da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); + dprintk("%s: using the first address for DS %s: %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + clp = nfs4_set_ds_client(mds_srv->nfs_client, - (struct sockaddr *)&ds->ds_addr, - ds->ds_addrlen, IPPROTO_TCP); + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP); if (IS_ERR(clp)) { status = PTR_ERR(clp); goto out; @@ -169,12 +227,24 @@ out_put: static void destroy_ds(struct nfs4_pnfs_ds *ds) { + struct nfs4_pnfs_ds_addr *da; + dprintk("--> %s\n", __func__); ifdebug(FACILITY) print_ds(ds); if (ds->ds_clp) nfs_put_client(ds->ds_clp); + + while (!list_empty(&ds->ds_addrs)) { + da = list_first_entry(&ds->ds_addrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + kfree(ds->ds_remotestr); kfree(ds); } @@ -207,67 +277,73 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) * complicated setup around many dprinks. */ static char * -nfs4_pnfs_remotestr(struct sockaddr *ds_addr, gfp_t gfp_flags) +nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) { - char buf[INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN]; + struct nfs4_pnfs_ds_addr *da; char *remotestr; - char *startsep = ""; - char *endsep = ""; size_t len; - uint16_t port; + char *p; - switch (ds_addr->sa_family) { - case AF_INET: - port = ((struct sockaddr_in *)ds_addr)->sin_port; - break; - case AF_INET6: - startsep = "["; - endsep = "]"; - port = ((struct sockaddr_in6 *)ds_addr)->sin6_port; - break; - default: - dprintk("%s: Unknown address family %u\n", - __func__, ds_addr->sa_family); - return NULL; + len = 3; /* '{', '}' and eol */ + list_for_each_entry(da, dsaddrs, da_node) { + len += strlen(da->da_remotestr) + 1; /* string plus comma */ } - if (!rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf))) { - dprintk("%s: error printing addr\n", __func__); + remotestr = kzalloc(len, gfp_flags); + if (!remotestr) return NULL; - } - len = strlen(buf) + strlen(startsep) + strlen(endsep) + 1 + 5 + 1; - remotestr = kzalloc(len, gfp_flags); + p = remotestr; + *(p++) = '{'; + len--; + list_for_each_entry(da, dsaddrs, da_node) { + size_t ll = strlen(da->da_remotestr); - if (unlikely(!remotestr)) { - dprintk("%s: couldn't alloc remotestr\n", __func__); - return NULL; - } + if (ll > len) + goto out_err; - snprintf(remotestr, len, "%s%s%s:%u", - startsep, buf, endsep, ntohs(port)); + memcpy(p, da->da_remotestr, ll); + p += ll; + len -= ll; + if (len < 1) + goto out_err; + (*p++) = ','; + len--; + } + if (len < 2) + goto out_err; + *(p++) = '}'; + *p = '\0'; return remotestr; +out_err: + kfree(remotestr); + return NULL; } static struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct sockaddr *addr, size_t addrlen, gfp_t gfp_flags) +nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) { struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; - ds = kzalloc(sizeof(*tmp_ds), gfp_flags); + if (list_empty(dsaddrs)) { + dprintk("%s: no addresses defined\n", __func__); + goto out; + } + + ds = kzalloc(sizeof(*ds), gfp_flags); if (!ds) goto out; /* this is only used for debugging, so it's ok if its NULL */ - remotestr = nfs4_pnfs_remotestr(addr, gfp_flags); + remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(addr, addrlen); + tmp_ds = _data_server_lookup_locked(dsaddrs); if (tmp_ds == NULL) { - memcpy(&ds->ds_addr, addr, addrlen); - ds->ds_addrlen = addrlen; + INIT_LIST_HEAD(&ds->ds_addrs); + list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; atomic_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); @@ -276,6 +352,11 @@ nfs4_pnfs_ds_add(struct sockaddr *addr, size_t addrlen, gfp_t gfp_flags) dprintk("%s add new data server %s\n", __func__, ds->ds_remotestr); } else { + if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs, + dsaddrs)) { + dprintk("%s: multipath address mismatch: %s != %s", + __func__, tmp_ds->ds_remotestr, remotestr); + } kfree(remotestr); kfree(ds); atomic_inc(&tmp_ds->ds_count); @@ -292,19 +373,20 @@ out: /* * Currently only supports ipv4, ipv6 and one multi-path address. */ -static struct nfs4_pnfs_ds * -decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) +static struct nfs4_pnfs_ds_addr * +decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) { - struct nfs4_pnfs_ds *ds = NULL; + struct nfs4_pnfs_ds_addr *da = NULL; char *buf, *portstr; - struct sockaddr_storage ss; - size_t sslen; u32 port; int nlen, rlen; int tmp[2]; __be32 *p; char *netid, *match_netid; - size_t match_netid_len; + size_t len, match_netid_len; + char *startsep = ""; + char *endsep = ""; + /* r_netid */ p = xdr_inline_decode(streamp, 4); @@ -365,50 +447,74 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_fla } *portstr = '\0'; - if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&ss, sizeof(ss))) { - dprintk("%s: Error parsing address %s\n", __func__, buf); + da = kzalloc(sizeof(*da), gfp_flags); + if (unlikely(!da)) goto out_free_buf; + + INIT_LIST_HEAD(&da->da_node); + + if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr, + sizeof(da->da_addr))) { + dprintk("%s: error parsing address %s\n", __func__, buf); + goto out_free_da; } portstr++; sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); port = htons((tmp[0] << 8) | (tmp[1])); - switch (ss.ss_family) { + switch (da->da_addr.ss_family) { case AF_INET: - ((struct sockaddr_in *)&ss)->sin_port = port; - sslen = sizeof(struct sockaddr_in); + ((struct sockaddr_in *)&da->da_addr)->sin_port = port; + da->da_addrlen = sizeof(struct sockaddr_in); match_netid = "tcp"; match_netid_len = 3; break; case AF_INET6: - ((struct sockaddr_in6 *)&ss)->sin6_port = port; - sslen = sizeof(struct sockaddr_in6); + ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; + da->da_addrlen = sizeof(struct sockaddr_in6); match_netid = "tcp6"; match_netid_len = 4; + startsep = "["; + endsep = "]"; break; default: dprintk("%s: unsupported address family: %u\n", - __func__, ss.ss_family); - goto out_free_buf; + __func__, da->da_addr.ss_family); + goto out_free_da; } if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", __func__, netid, match_netid); - goto out_free_buf; + goto out_free_da; } - ds = nfs4_pnfs_ds_add((struct sockaddr *)&ss, sslen, gfp_flags); - dprintk("%s: Added DS %s\n", __func__, ds->ds_remotestr); + /* save human readable address */ + len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; + da->da_remotestr = kzalloc(len, gfp_flags); + + /* NULL is ok, only used for dprintk */ + if (da->da_remotestr) + snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, + buf, endsep, ntohs(port)); + + dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); + kfree(buf); + kfree(netid); + return da; + +out_free_da: + kfree(da); out_free_buf: + dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); kfree(buf); out_free_netid: kfree(netid); out_err: - return ds; + return NULL; } /* Decode opaque device data and return the result */ @@ -425,6 +531,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) struct xdr_stream stream; struct xdr_buf buf; struct page *scratch; + struct list_head dsaddrs; + struct nfs4_pnfs_ds_addr *da; /* set up xdr stream */ scratch = alloc_page(gfp_flags); @@ -501,6 +609,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) NFS_SERVER(ino)->nfs_client, &pdev->dev_id); + INIT_LIST_HEAD(&dsaddrs); + for (i = 0; i < dsaddr->ds_num; i++) { int j; u32 mp_count; @@ -510,48 +620,43 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) goto out_err_free_deviceid; mp_count = be32_to_cpup(p); /* multipath count */ - if (mp_count > 1) { - printk(KERN_WARNING - "%s: Multipath count %d not supported, " - "skipping all greater than 1\n", __func__, - mp_count); - } for (j = 0; j < mp_count; j++) { - if (j == 0) { - dsaddr->ds_list[i] = decode_and_add_ds(&stream, - ino, gfp_flags); - if (dsaddr->ds_list[i] == NULL) - goto out_err_free_deviceid; - } else { - u32 len; - /* skip extra multipath */ - - /* read len, skip */ - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err_free_deviceid; - len = be32_to_cpup(p); - - p = xdr_inline_decode(&stream, len); - if (unlikely(!p)) - goto out_err_free_deviceid; - - /* read len, skip */ - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err_free_deviceid; - len = be32_to_cpup(p); - - p = xdr_inline_decode(&stream, len); - if (unlikely(!p)) - goto out_err_free_deviceid; - } + da = decode_ds_addr(&stream, gfp_flags); + if (da) + list_add_tail(&da->da_node, &dsaddrs); + } + if (list_empty(&dsaddrs)) { + dprintk("%s: no suitable DS addresses found\n", + __func__); + goto out_err_free_deviceid; + } + + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + if (!dsaddr->ds_list[i]) + goto out_err_drain_dsaddrs; + + /* If DS was already in cache, free ds addrs */ + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); } } __free_page(scratch); return dsaddr; +out_err_drain_dsaddrs: + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } out_err_free_deviceid: nfs4_fl_free_deviceid(dsaddr); /* stripe_indicies was part of dsaddr */ -- cgit v1.2.3 From 7e574f0d3911c5cc60d4d2b57fee975c462d6cd0 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 31 May 2011 18:48:58 -0400 Subject: NFS: pnfs: loop over multipath addrs on connect Don't just use the first addr in the multipath list - instead, loop over addresses when calling nfs4_set_ds_client() (which calls connect) until it is successful. Although this is not real multipath support, it's a quick fix to handle when an MDS sends a list of addresses for a DS and some of the addr families are unsupported or misconfigured (like no routable ipv6 addr assigned). This will attempt all paths to the DS before giving up, instead of immediately falling back to the MDS. As before, an error encountered after a successful connect() will cause all i/o to fall back to the MDS. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayoutdev.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 610808a96f25..61c173b0aab3 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -169,7 +169,7 @@ _data_server_match_all_addrs_locked(struct list_head *dsaddrs1, static int nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) { - struct nfs_client *clp; + struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; int status = 0; @@ -178,13 +178,17 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) BUG_ON(list_empty(&ds->ds_addrs)); - da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); - dprintk("%s: using the first address for DS %s: %s\n", - __func__, ds->ds_remotestr, da->da_remotestr); + list_for_each_entry(da, &ds->ds_addrs, da_node) { + dprintk("%s: DS %s: trying address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv->nfs_client, + clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP); + if (!IS_ERR(clp)) + break; + } + if (IS_ERR(clp)) { status = PTR_ERR(clp); goto out; -- cgit v1.2.3 From 78fe0f41d9937ee62817912ac8d627e06243c269 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 31 May 2011 19:05:47 -0400 Subject: NFS: use scope from exchange_id to skip reclaim can be skipped if the "eir_server_scope" from the exchange_id proc differs from previous calls. Also, in the future server_scope will be useful for determining whether client trunking is available Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 1 + fs/nfs/nfs4_fs.h | 3 +++ fs/nfs/nfs4proc.c | 32 ++++++++++++++++++++++++++++++++ fs/nfs/nfs4state.c | 9 ++++++++- fs/nfs/nfs4xdr.c | 8 +++++++- include/linux/nfs_fs_sb.h | 3 +++ include/linux/nfs_xdr.h | 1 + 7 files changed, 55 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b3dc2b88b65b..006f8ff0a3c0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -293,6 +293,7 @@ static void nfs_free_client(struct nfs_client *clp) nfs4_deviceid_purge_client(clp); kfree(clp->cl_hostname); + kfree(clp->server_scope); kfree(clp); dprintk("<-- nfs_free_client()\n"); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c4a69833dd0d..b47f0d4710fa 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -48,6 +48,7 @@ enum nfs4_client_state { NFS4CLNT_SESSION_RESET, NFS4CLNT_RECALL_SLOT, NFS4CLNT_LEASE_CONFIRM, + NFS4CLNT_SERVER_SCOPE_MISMATCH, }; enum nfs4_session_state { @@ -349,6 +350,8 @@ extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_recall_slot(struct nfs_client *clp); +extern void nfs41_handle_server_scope(struct nfs_client *, + struct server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5879b23e0c99..5f4912f72282 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4781,6 +4781,16 @@ out_inval: return -NFS4ERR_INVAL; } +static bool +nfs41_same_server_scope(struct server_scope *a, struct server_scope *b) +{ + if (a->server_scope_sz == b->server_scope_sz && + memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) + return true; + + return false; +} + /* * nfs4_proc_exchange_id() * @@ -4823,9 +4833,31 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) init_utsname()->domainname, clp->cl_rpcclient->cl_auth->au_flavor); + res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); + if (unlikely(!res.server_scope)) + return -ENOMEM; + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); + + if (!status) { + if (clp->server_scope && + !nfs41_same_server_scope(clp->server_scope, + res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->server_scope); + clp->server_scope = NULL; + } + + if (!clp->server_scope) + clp->server_scope = res.server_scope; + else + kfree(res.server_scope); + } + dprintk("<-- %s status= %d\n", __func__, status); return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e97dd219f84f..5d744a52b4e1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1643,7 +1643,14 @@ static void nfs4_state_manager(struct nfs_client *clp) goto out_error; } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); - set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + + if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, + &clp->cl_state)) + nfs4_state_start_reclaim_nograce(clp); + else + set_bit(NFS4CLNT_RECLAIM_REBOOT, + &clp->cl_state); + pnfs_destroy_all_layouts(clp); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e6e8f3b9a1de..1555c74dd336 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4977,11 +4977,17 @@ static int decode_exchange_id(struct xdr_stream *xdr, if (unlikely(status)) return status; - /* Throw away server_scope */ + /* Save server_scope */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + + memcpy(res->server_scope->server_scope, dummy_str, dummy); + res->server_scope->server_scope_sz = dummy; + /* Throw away Implementation id array */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 87694ca86914..f23b18831559 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -16,6 +16,7 @@ struct nfs4_sequence_args; struct nfs4_sequence_res; struct nfs_server; struct nfs4_minor_version_ops; +struct server_scope; /* * The nfs_client identifies our client state to the server. @@ -83,6 +84,8 @@ struct nfs_client { #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif + + struct server_scope *server_scope; /* from exchange_id */ }; /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 00848d86ffb2..c4d98df884f3 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1060,6 +1060,7 @@ struct server_scope { struct nfs41_exchange_id_res { struct nfs_client *client; u32 flags; + struct server_scope *server_scope; }; struct nfs41_create_session_args { -- cgit v1.2.3 From 35dbbc99e93e57680837c17f96efe370f0535064 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Wed, 1 Jun 2011 16:32:21 -0400 Subject: NFS: fix comment We support IPv4 and IPv6 now. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayoutdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 61c173b0aab3..ed388aae9689 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -164,7 +164,7 @@ _data_server_match_all_addrs_locked(struct list_head *dsaddrs1, /* * Create an rpc connection to the nfs4_pnfs_ds data server - * Currently only support IPv4 + * Currently only supports IPv4 and IPv6 addresses */ static int nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) -- cgit v1.2.3 From 6382a44138e7aa40bf52170e7afc014443a24806 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Wed, 1 Jun 2011 16:44:44 -0400 Subject: NFS: move pnfs layouts to nfs_server structure Layouts should be tracked per nfs_server (aka superblock) instead of per struct nfs_client, which may have multiple FSIDs associated with it. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 57 ++++++++++++++++++++++++++++++----------------- fs/nfs/client.c | 4 +--- fs/nfs/pnfs.c | 13 ++++++++--- include/linux/nfs_fs_sb.h | 2 +- 4 files changed, 48 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index d4d1954e9bb9..74780f9f852c 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -111,6 +111,7 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf static u32 initiate_file_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; struct inode *ino; bool found = false; @@ -118,21 +119,28 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); spin_lock(&clp->cl_lock); - list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { - if (nfs_compare_fh(&args->cbl_fh, - &NFS_I(lo->plh_inode)->fh)) - continue; - ino = igrab(lo->plh_inode); - if (!ino) - continue; - found = true; - /* Without this, layout can be freed as soon - * as we release cl_lock. - */ - get_layout_hdr(lo); - break; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (nfs_compare_fh(&args->cbl_fh, + &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + continue; + found = true; + /* Without this, layout can be freed as soon + * as we release cl_lock. + */ + get_layout_hdr(lo); + break; + } + if (found) + break; } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); + if (!found) return NFS4ERR_NOMATCHING_LAYOUT; @@ -154,6 +162,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, static u32 initiate_bulk_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; struct inode *ino; u32 rv = NFS4ERR_NOMATCHING_LAYOUT; @@ -167,18 +176,24 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, }; spin_lock(&clp->cl_lock); - list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { if ((args->cbl_recall_type == RETURN_FSID) && - memcmp(&NFS_SERVER(lo->plh_inode)->fsid, - &args->cbl_fsid, sizeof(struct nfs_fsid))) - continue; - if (!igrab(lo->plh_inode)) + memcmp(&server->fsid, &args->cbl_fsid, + sizeof(struct nfs_fsid))) continue; - get_layout_hdr(lo); - BUG_ON(!list_empty(&lo->plh_bulk_recall)); - list_add(&lo->plh_bulk_recall, &recall_list); + + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!igrab(lo->plh_inode)) + continue; + get_layout_hdr(lo); + BUG_ON(!list_empty(&lo->plh_bulk_recall)); + list_add(&lo->plh_bulk_recall, &recall_list); + } } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); + list_for_each_entry_safe(lo, tmp, &recall_list, plh_bulk_recall) { ino = lo->plh_inode; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 006f8ff0a3c0..5452ada59461 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -188,9 +188,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; -#if defined(CONFIG_NFS_V4_1) - INIT_LIST_HEAD(&clp->cl_layouts); -#endif nfs_fscache_get_client_cookie(clp); return clp; @@ -1063,6 +1060,7 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); + INIT_LIST_HEAD(&server->layouts); atomic_set(&server->active, 0); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 29c0ca7fc347..ff8200772377 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -448,11 +448,17 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) void pnfs_destroy_all_layouts(struct nfs_client *clp) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); spin_lock(&clp->cl_lock); - list_splice_init(&clp->cl_layouts, &tmp_list); + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!list_empty(&server->layouts)) + list_splice_init(&server->layouts, &tmp_list); + } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); while (!list_empty(&tmp_list)) { @@ -920,7 +926,8 @@ pnfs_update_layout(struct inode *ino, }; unsigned pg_offset; struct nfs_inode *nfsi = NFS_I(ino); - struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; bool first = false; @@ -964,7 +971,7 @@ pnfs_update_layout(struct inode *ino, */ spin_lock(&clp->cl_lock); BUG_ON(!list_empty(&lo->plh_layouts)); - list_add_tail(&lo->plh_layouts, &clp->cl_layouts); + list_add_tail(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f23b18831559..4faeac8f448a 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -78,7 +78,6 @@ struct nfs_client { /* The flags used for obtaining the clientid during EXCHANGE_ID */ u32 cl_exchange_flags; struct nfs4_session *cl_session; /* sharred session */ - struct list_head cl_layouts; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_FSCACHE @@ -152,6 +151,7 @@ struct nfs_server { struct rb_root openowner_id; struct rb_root lockowner_id; #endif + struct list_head layouts; struct list_head delegations; void (*destroy)(struct nfs_server *); -- cgit v1.2.3 From fca78d6d2c77f87d7dbee89bbe4836a44da881e2 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 2 Jun 2011 14:59:07 -0400 Subject: NFS: Add SECINFO_NO_NAME procedure If the client is using NFS v4.1, then we can use SECINFO_NO_NAME to find the secflavor for the initial mount. If the server doesn't support SECINFO_NO_NAME then I fall back on the "guess and check" method used for v4.0 mounts. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 3 ++ fs/nfs/namespace.c | 2 +- fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4proc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/nfs/nfs4xdr.c | 68 +++++++++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 8 +++++ 7 files changed, 166 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2a55347a2daa..fc017eadfe08 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -277,6 +277,9 @@ extern void nfs_sb_deactive(struct super_block *sb); extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen); extern struct vfsmount *nfs_d_automount(struct path *path); +#ifdef CONFIG_NFS_V4 +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); +#endif /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 1f063bacd285..8102391bb374 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -119,7 +119,7 @@ Elong: } #ifdef CONFIG_NFS_V4 -static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { struct gss_api_mech *mech; struct xdr_netobj oid; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index b47f0d4710fa..c30aed2c70f4 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -67,6 +67,8 @@ struct nfs4_minor_version_ops { int cache_reply); int (*validate_stateid)(struct nfs_delegation *, const nfs4_stateid *); + int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5f4912f72282..892bff53f61d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2251,13 +2251,14 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + int minor_version = server->nfs_client->cl_minorversion; int status = nfs4_lookup_root(server, fhandle, info); if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) /* * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM * by nfs4_map_errors() as this function exits. */ - status = nfs4_find_root_sec(server, fhandle, info); + status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info); if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) @@ -5935,6 +5936,85 @@ out: rpc_put_task(task); return status; } + +static int +_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs41_secinfo_no_name_args args = { + .style = SECINFO_STYLE_CURRENT_FH, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int +nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + case -NFS4ERR_NOTSUPP: + break; + default: + err = nfs4_handle_exception(server, err, &exception); + } + } while (exception.retry); + return err; +} + +static int +nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int err; + struct page *page; + rpc_authflavor_t flavor; + struct nfs4_secinfo_flavors *flavors; + + page = alloc_page(GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + flavors = page_address(page); + err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + + /* + * Fall back on "guess and check" method if + * the server doesn't support SECINFO_NO_NAME + */ + if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { + err = nfs4_find_root_sec(server, fhandle, info); + goto out_freepage; + } + if (err) + goto out_freepage; + + flavor = nfs_find_best_sec(flavors); + if (err == 0) + err = nfs4_lookup_root_sec(server, fhandle, info, flavor); + +out_freepage: + put_page(page); + if (err == -EACCES) + return -EPERM; +out: + return err; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -5996,6 +6076,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .call_sync = _nfs4_call_sync, .validate_stateid = nfs4_validate_delegation_stateid, + .find_root_sec = nfs4_find_root_sec, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, @@ -6006,6 +6087,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .call_sync = _nfs4_call_sync_session, .validate_stateid = nfs41_validate_delegation_stateid, + .find_root_sec = nfs41_find_root_sec, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1555c74dd336..c8c069a6319b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -343,6 +343,8 @@ static int nfs4_stat_to_errno(int); 1 /* FIXME: opaque lrf_body always empty at the moment */) #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 1 + decode_stateid_maxsz) +#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) +#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -772,6 +774,14 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_layoutreturn_maxsz) +#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz +\ + encode_secinfo_no_name_maxsz) +#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_secinfo_no_name_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1938,6 +1948,20 @@ encode_layoutreturn(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_layoutreturn_maxsz; } + +static int +encode_secinfo_no_name(struct xdr_stream *xdr, + const struct nfs41_secinfo_no_name_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_SECINFO_NO_NAME); + *p++ = cpu_to_be32(args->style); + hdr->nops++; + hdr->replen += decode_secinfo_no_name_maxsz; + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2790,6 +2814,25 @@ static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, encode_layoutreturn(xdr, args, &hdr); encode_nops(&hdr); } + +/* + * Encode SECINFO_NO_NAME request + */ +static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_secinfo_no_name_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_secinfo_no_name(xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -6467,6 +6510,30 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode SECINFO_NO_NAME response + */ +static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ /** @@ -6669,6 +6736,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(LAYOUTGET, enc_layoutget, dec_layoutget), PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 504b289ba680..7cc7d5745be5 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -563,6 +563,7 @@ enum { NFSPROC4_CLNT_GETDEVICEINFO, NFSPROC4_CLNT_LAYOUTCOMMIT, NFSPROC4_CLNT_LAYOUTRETURN, + NFSPROC4_CLNT_SECINFO_NO_NAME, }; /* nfs41 types */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c4d98df884f3..7159e1da63b6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1084,6 +1084,14 @@ struct nfs41_reclaim_complete_args { struct nfs41_reclaim_complete_res { struct nfs4_sequence_res seq_res; }; + +#define SECINFO_STYLE_CURRENT_FH 0 +#define SECINFO_STYLE_PARENT 1 +struct nfs41_secinfo_no_name_args { + int style; + struct nfs4_sequence_args seq_args; +}; + #endif /* CONFIG_NFS_V4_1 */ struct nfs_page; -- cgit v1.2.3 From 7d9747947ae66d8f6a9a9a023a3a5e28df6a536e Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 2 Jun 2011 14:59:08 -0400 Subject: NFS: Added TEST_STATEID call This patch adds in the xdr for doing a TEST_STATEID call with a single stateid. RFC 5661 allows multiple stateids to be tested in a single call, but only testing one keeps things simpler for now. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 28 +++++++++++++++ fs/nfs/nfs4xdr.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 10 ++++++ 4 files changed, 130 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 892bff53f61d..5612ba997db7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6015,6 +6015,34 @@ out_freepage: out: return err; } +static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + int status; + struct nfs41_test_stateid_args args = { + .stateid = &state->stateid, + }; + struct nfs41_test_stateid_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], + .rpc_argp = &args, + .rpc_resp = &res, + }; + args.seq_args.sa_session = res.seq_res.sr_session = NULL; + status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); + return status; +} + +static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs41_test_stateid(server, state), + &exception); + } while (exception.retry); + return err; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c8c069a6319b..7d914d9e7584 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -345,6 +345,9 @@ static int nfs4_stat_to_errno(int); 1 + decode_stateid_maxsz) #define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) #define decode_secinfo_no_name_maxsz decode_secinfo_maxsz +#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -782,6 +785,12 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putrootfh_maxsz + \ decode_secinfo_no_name_maxsz) +#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_test_stateid_maxsz) +#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_test_stateid_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1962,6 +1971,20 @@ encode_secinfo_no_name(struct xdr_stream *xdr, hdr->replen += decode_secinfo_no_name_maxsz; return 0; } + +static void encode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_TEST_STATEID); + *p++ = cpu_to_be32(1); + xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_test_stateid_maxsz; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2833,6 +2856,23 @@ static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req, encode_nops(&hdr); return 0; } + +/* + * Encode TEST_STATEID request + */ +static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_test_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -5371,6 +5411,35 @@ out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } + +static int decode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + __be32 *p; + int status; + int num_res; + + status = decode_op_hdr(xdr, OP_TEST_STATEID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num_res = be32_to_cpup(p++); + if (num_res != 1) + goto out; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->status = be32_to_cpup(p++); + return res->status; +out_overflow: + print_overflow_msg(__func__, xdr); +out: + return -EIO; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -6534,6 +6603,27 @@ static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode TEST_STATEID response + */ +static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_test_stateid(xdr, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ /** @@ -6737,6 +6827,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 7cc7d5745be5..efe07f7a8765 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -564,6 +564,7 @@ enum { NFSPROC4_CLNT_LAYOUTCOMMIT, NFSPROC4_CLNT_LAYOUTRETURN, NFSPROC4_CLNT_SECINFO_NO_NAME, + NFSPROC4_CLNT_TEST_STATEID, }; /* nfs41 types */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7159e1da63b6..e696c1fb139a 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1092,6 +1092,16 @@ struct nfs41_secinfo_no_name_args { struct nfs4_sequence_args seq_args; }; +struct nfs41_test_stateid_args { + nfs4_stateid *stateid; + struct nfs4_sequence_args seq_args; +}; + +struct nfs41_test_stateid_res { + unsigned int status; + struct nfs4_sequence_res seq_res; +}; + #endif /* CONFIG_NFS_V4_1 */ struct nfs_page; -- cgit v1.2.3 From 9aeda35fd643eba683fdb8dba8907fa796a85dda Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 2 Jun 2011 14:59:09 -0400 Subject: NFS: added FREE_STATEID call FREE_STATEID is used to tell the server that we want to free a stateid that no longer has any locks associated with it. This allows the client to reclaim locks without encountering edge conditions documented in section 8.4.3 of RFC 5661. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 30 +++++++++++++++++++ fs/nfs/nfs4xdr.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 10 +++++++ 4 files changed, 121 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5612ba997db7..197ad0a737f6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6043,6 +6043,36 @@ static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *stat } while (exception.retry); return err; } + +static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + int status; + struct nfs41_free_stateid_args args = { + .stateid = &state->stateid, + }; + struct nfs41_free_stateid_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + args.seq_args.sa_session = res.seq_res.sr_session = NULL; + status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); + return status; +} + +static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_free_stateid(server, state), + &exception); + } while (exception.retry); + return err; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7d914d9e7584..c191a9baa422 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -348,6 +348,9 @@ static int nfs4_stat_to_errno(int); #define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \ XDR_QUADLEN(NFS4_STATEID_SIZE)) #define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1) +#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_free_stateid_maxsz (op_decode_hdr_maxsz + 1) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -791,6 +794,12 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_test_stateid_maxsz) +#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_free_stateid_maxsz) +#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_free_stateid_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1985,6 +1994,18 @@ static void encode_test_stateid(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_test_stateid_maxsz; } + +static void encode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_FREE_STATEID); + xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_free_stateid_maxsz; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2873,6 +2894,23 @@ static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req, encode_test_stateid(xdr, args, &hdr); encode_nops(&hdr); } + +/* + * Encode FREE_STATEID request + */ +static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_free_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -5440,6 +5478,26 @@ out_overflow: out: return -EIO; } + +static int decode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_FREE_STATEID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->status = be32_to_cpup(p++); + return res->status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -6624,6 +6682,27 @@ static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode FREE_STATEID response + */ +static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_free_stateid(xdr, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ /** @@ -6828,6 +6907,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index efe07f7a8765..a3c4bc800dce 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -565,6 +565,7 @@ enum { NFSPROC4_CLNT_LAYOUTRETURN, NFSPROC4_CLNT_SECINFO_NO_NAME, NFSPROC4_CLNT_TEST_STATEID, + NFSPROC4_CLNT_FREE_STATEID, }; /* nfs41 types */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index e696c1fb139a..209455561749 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1102,6 +1102,16 @@ struct nfs41_test_stateid_res { struct nfs4_sequence_res seq_res; }; +struct nfs41_free_stateid_args { + nfs4_stateid *stateid; + struct nfs4_sequence_args seq_args; +}; + +struct nfs41_free_stateid_res { + unsigned int status; + struct nfs4_sequence_res seq_res; +}; + #endif /* CONFIG_NFS_V4_1 */ struct nfs_page; -- cgit v1.2.3 From f062eb6ced3b297277b94b4da3113b1d3782e539 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 2 Jun 2011 14:59:10 -0400 Subject: NFS: test and free stateids during recovery When recovering open files and locks, the stateid should be tested against the server and freed if it is invalid. This patch adds new recovery functions for NFS v4.1. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 197ad0a737f6..325706f73eea 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -80,7 +80,10 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, struct nfs4_state *state); - +#ifdef CONFIG_NFS_V4_1 +static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *); +static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *); +#endif /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) { @@ -1687,6 +1690,20 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } +#if defined(CONFIG_NFS_V4_1) +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + int status; + struct nfs_server *server = NFS_SERVER(state->inode); + + status = nfs41_test_stateid(server, state); + if (status == NFS_OK) + return 0; + nfs41_free_stateid(server, state); + return nfs4_open_expired(sp, state); +} +#endif + /* * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* * fields corresponding to attributes that were used to store the verifier. @@ -4444,6 +4461,20 @@ out: return err; } +#if defined(CONFIG_NFS_V4_1) +static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + int status; + struct nfs_server *server = NFS_SERVER(state->inode); + + status = nfs41_test_stateid(server, state); + if (status == NFS_OK) + return 0; + nfs41_free_stateid(server, state); + return nfs4_lock_expired(state, request); +} +#endif + static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs_inode *nfsi = NFS_I(state->inode); @@ -6109,8 +6140,8 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, - .recover_open = nfs4_open_expired, - .recover_lock = nfs4_lock_expired, + .recover_open = nfs41_open_expired, + .recover_lock = nfs41_lock_expired, .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, }; -- cgit v1.2.3 From 1751c3638f2a07a8c66a803a31791bab9bd3fced Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 10 Jun 2011 13:30:23 -0400 Subject: NFS: Cleanup of the nfs_pageio code in preparation for a pnfs bugfix We need to ensure that the layouts are set up before we can decide to coalesce requests. To do so, we want to further split up the struct nfs_pageio_descriptor operations into an initialisation callback, a coalescing test callback, and a 'do i/o' callback. This patch cleans up the existing callback methods before adding the 'initialisation' callback. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 15 +++++++++++++-- fs/nfs/objlayout/objio_osd.c | 13 ++++++++++++- fs/nfs/pagelist.c | 10 ++++------ fs/nfs/pnfs.c | 24 ++++++++++++++++++++++++ fs/nfs/pnfs.h | 25 +++++++++++++------------ fs/nfs/read.c | 43 ++++++++++++++++++++++++++++++++----------- fs/nfs/write.c | 28 ++++++++++++++++++++++------ include/linux/nfs_page.h | 14 +++++++++++--- 8 files changed, 131 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index b690bb5ee58d..46bb31b79683 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -658,7 +658,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, * return true : coalesce page * return false : don't coalesce page */ -bool +static bool filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { @@ -681,6 +681,16 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return (p_stripe == r_stripe); } +static const struct nfs_pageio_ops filelayout_pg_read_ops = { + .pg_test = filelayout_pg_test, + .pg_doio = nfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops filelayout_pg_write_ops = { + .pg_test = filelayout_pg_test, + .pg_doio = nfs_generic_pg_writepages, +}; + static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) { return !FILELAYOUT_LSEG(lseg)->commit_through_mds; @@ -878,7 +888,8 @@ static struct pnfs_layoutdriver_type filelayout_type = { .owner = THIS_MODULE, .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, - .pg_test = filelayout_pg_test, + .pg_read_ops = &filelayout_pg_read_ops, + .pg_write_ops = &filelayout_pg_write_ops, .mark_pnfs_commit = filelayout_mark_pnfs_commit, .choose_commit_list = filelayout_choose_commit_list, .commit_pagelist = filelayout_commit_pagelist, diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8ff2ea3f10ef..c203fab694af 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1007,6 +1007,16 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, OBJIO_LSEG(pgio->pg_lseg)->max_io_size; } +static const struct nfs_pageio_ops objio_pg_read_ops = { + .pg_test = objio_pg_test, + .pg_doio = nfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops objio_pg_write_ops = { + .pg_test = objio_pg_test, + .pg_doio = nfs_generic_pg_writepages, +}; + static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", @@ -1020,7 +1030,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .read_pagelist = objlayout_read_pagelist, .write_pagelist = objlayout_write_pagelist, - .pg_test = objio_pg_test, + .pg_read_ops = &objio_pg_read_ops, + .pg_write_ops = &objio_pg_write_ops, .free_deviceid_node = objio_free_deviceid_node, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 009855716286..9b8a4730f0bd 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test); */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct nfs_pageio_descriptor *), + const struct nfs_pageio_ops *pg_ops, size_t bsize, int io_flags) { @@ -241,12 +241,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_base = 0; desc->pg_moreio = 0; desc->pg_inode = inode; - desc->pg_doio = doio; + desc->pg_ops = pg_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; - desc->pg_test = nfs_generic_pg_test; - pnfs_pageio_init(desc, inode); } /** @@ -276,7 +274,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, return false; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return false; - return pgio->pg_test(pgio, prev, req); + return pgio->pg_ops->pg_test(pgio, prev, req); } /** @@ -311,7 +309,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { - int error = desc->pg_doio(desc); + int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; else diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ff8200772377..7ec46d5f05ab 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1055,6 +1055,30 @@ out_forget_reply: goto out; } +bool +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + + if (ld == NULL) + return false; + nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0); + return true; +} + +bool +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + + if (ld == NULL) + return false; + nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags); + return true; +} + bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 96bf4e6f45be..137a2bd5c8c7 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -87,7 +87,8 @@ struct pnfs_layoutdriver_type { void (*free_lseg) (struct pnfs_layout_segment *lseg); /* test for nfs page cache coalescing */ - bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + const struct nfs_pageio_ops *pg_read_ops; + const struct nfs_pageio_ops *pg_write_ops; /* Returns true if layoutdriver wants to divert this request to * driver's commit routine. @@ -152,6 +153,10 @@ struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, u64 count, enum pnfs_iomode access_type, gfp_t gfp_flags); + +bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); +bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int); + void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, @@ -293,15 +298,6 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } -static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, - struct inode *inode) -{ - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; - - if (ld) - pgio->pg_test = ld->pg_test; -} - #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -385,9 +381,14 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, - struct inode *inode) +static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { + return false; +} + +static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) +{ + return false; } static inline void diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 20a7f952e244..b6d9ec9a208b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -32,6 +32,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); +static const struct nfs_pageio_ops nfs_pageio_read_ops; static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; @@ -113,6 +114,20 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } +static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, + NFS_SERVER(inode)->rsize, 0); +} + +static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + if (!pnfs_pageio_init_read(pgio, inode)) + nfs_pageio_init_read_mds(pgio, inode); +} + int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -131,14 +146,11 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_pageio_init(&pgio, inode, NULL, 0, 0); + nfs_pageio_init_read(&pgio, inode); nfs_list_add_request(new, &pgio.pg_list); pgio.pg_count = len; - if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) - nfs_pagein_multi(&pgio); - else - nfs_pagein_one(&pgio); + nfs_pageio_complete(&pgio); return 0; } @@ -365,6 +377,20 @@ out: return ret; } +int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ + if (desc->pg_bsize < PAGE_CACHE_SIZE) + return nfs_pagein_multi(desc); + return nfs_pagein_one(desc); +} +EXPORT_SYMBOL_GPL(nfs_generic_pg_readpages); + + +static const struct nfs_pageio_ops nfs_pageio_read_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_readpages, +}; + /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). @@ -635,8 +661,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, .pgio = &pgio, }; struct inode *inode = mapping->host; - struct nfs_server *server = NFS_SERVER(inode); - size_t rsize = server->rsize; unsigned long npages; int ret = -ESTALE; @@ -664,10 +688,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - if (rsize < PAGE_CACHE_SIZE) - nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); - else - nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0); + nfs_pageio_init_read(&pgio, inode); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 727168059684..b144b5474546 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1033,15 +1033,31 @@ out: return ret; } -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, +int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ + if (desc->pg_bsize < PAGE_CACHE_SIZE) + return nfs_flush_multi(desc); + return nfs_flush_one(desc); +} +EXPORT_SYMBOL_GPL(nfs_generic_pg_writepages); + +static const struct nfs_pageio_ops nfs_pageio_write_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_writepages, +}; + +static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { - size_t wsize = NFS_SERVER(inode)->wsize; + nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, + NFS_SERVER(inode)->wsize, ioflags); +} - if (wsize < PAGE_CACHE_SIZE) - nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); - else - nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags) +{ + if (!pnfs_pageio_init_write(pgio, inode, ioflags)) + nfs_pageio_init_write_mds(pgio, inode, ioflags); } /* diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 25311b3bedf8..d378f08b905e 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -55,6 +55,12 @@ struct nfs_page { struct nfs_writeverf wb_verf; /* Commit cookie */ }; +struct nfs_pageio_descriptor; +struct nfs_pageio_ops { + bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + int (*pg_doio)(struct nfs_pageio_descriptor *); +}; + struct nfs_pageio_descriptor { struct list_head pg_list; unsigned long pg_bytes_written; @@ -64,11 +70,10 @@ struct nfs_pageio_descriptor { char pg_moreio; struct inode *pg_inode; - int (*pg_doio)(struct nfs_pageio_descriptor *); + const struct nfs_pageio_ops *pg_ops; int pg_ioflags; int pg_error; struct pnfs_layout_segment *pg_lseg; - bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); }; #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) @@ -85,7 +90,7 @@ extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int tag); extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct nfs_pageio_descriptor *desc), + const struct nfs_pageio_ops *pg_ops, size_t bsize, int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, @@ -100,6 +105,9 @@ extern void nfs_unlock_request(struct nfs_page *req); extern int nfs_set_page_tag_locked(struct nfs_page *req); extern void nfs_clear_page_tag_locked(struct nfs_page *req); +extern int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); +extern int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); + /* * Lock the page of an asynchronous request without getting a new reference -- cgit v1.2.3 From d8007d4dd6ff8749cc8a4063c3ec87442db76d82 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 10 Jun 2011 13:30:23 -0400 Subject: NFSv4.1: Add an initialisation callback for pNFS Ensure that we always get a layout before setting up the i/o request. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 2 ++ fs/nfs/objlayout/objio_osd.c | 2 ++ fs/nfs/pagelist.c | 2 ++ fs/nfs/pnfs.c | 57 ++++++++++++++++++++++++-------------------- fs/nfs/pnfs.h | 14 ++--------- fs/nfs/read.c | 16 ++++--------- fs/nfs/write.c | 12 +++------- include/linux/nfs_page.h | 1 + 8 files changed, 47 insertions(+), 59 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 46bb31b79683..faac87f24797 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -682,11 +682,13 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, } static const struct nfs_pageio_ops filelayout_pg_read_ops = { + .pg_init = pnfs_generic_pg_init_read, .pg_test = filelayout_pg_test, .pg_doio = nfs_generic_pg_readpages, }; static const struct nfs_pageio_ops filelayout_pg_write_ops = { + .pg_init = pnfs_generic_pg_init_write, .pg_test = filelayout_pg_test, .pg_doio = nfs_generic_pg_writepages, }; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c203fab694af..b759aeba57b9 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1008,11 +1008,13 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, } static const struct nfs_pageio_ops objio_pg_read_ops = { + .pg_init = pnfs_generic_pg_init_read, .pg_test = objio_pg_test, .pg_doio = nfs_generic_pg_readpages, }; static const struct nfs_pageio_ops objio_pg_write_ops = { + .pg_init = pnfs_generic_pg_init_write, .pg_test = objio_pg_test, .pg_doio = nfs_generic_pg_writepages, }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 9b8a4730f0bd..d421e19557a5 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -295,6 +295,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; } else { + if (desc->pg_ops->pg_init) + desc->pg_ops->pg_init(desc, req); desc->pg_base = req->wb_pgbase; } nfs_list_remove_request(req); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7ec46d5f05ab..69a0c3f1e462 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -911,7 +911,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, * Layout segment is retreived from the server if not cached. * The appropriate layout segment is referenced and returned to the caller. */ -struct pnfs_layout_segment * +static struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, @@ -1055,6 +1055,34 @@ out_forget_reply: goto out; } +void +pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + req->wb_bytes, + IOMODE_READ, + GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); + +void +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + req->wb_bytes, + IOMODE_RW, + GFP_NOFS); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); + bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { @@ -1083,31 +1111,8 @@ bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - enum pnfs_iomode access_type; - gfp_t gfp_flags; - - /* We assume that pg_ioflags == 0 iff we're reading a page */ - if (pgio->pg_ioflags == 0) { - access_type = IOMODE_READ; - gfp_flags = GFP_KERNEL; - } else { - access_type = IOMODE_RW; - gfp_flags = GFP_NOFS; - } - - if (pgio->pg_lseg == NULL) { - if (pgio->pg_count != prev->wb_bytes) - return true; - /* This is first coelesce call for a series of nfs_pages */ - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - prev->wb_context, - req_offset(prev), - pgio->pg_count, - access_type, - gfp_flags); - if (pgio->pg_lseg == NULL) - return true; - } + if (pgio->pg_lseg == NULL) + return nfs_generic_pg_test(pgio, prev, req); /* * Test if a nfs_page is fully contained in the pnfs_layout_range. diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 137a2bd5c8c7..dd54351ae6ac 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -149,10 +149,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ void get_layout_hdr(struct pnfs_layout_hdr *lo); void put_lseg(struct pnfs_layout_segment *lseg); -struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - loff_t pos, u64 count, enum pnfs_iomode access_type, - gfp_t gfp_flags); bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int); @@ -163,6 +159,8 @@ enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, const struct rpc_call_ops *, int); enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, const struct rpc_call_ops *); +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -318,14 +316,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) { } -static inline struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - loff_t pos, u64 count, enum pnfs_iomode access_type, - gfp_t gfp_flags) -{ - return NULL; -} - static inline enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *data, const struct rpc_call_ops *call_ops) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index b6d9ec9a208b..2cf728b832cd 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -147,9 +147,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, zero_user_segment(page, len, PAGE_CACHE_SIZE); nfs_pageio_init_read(&pgio, inode); - nfs_list_add_request(new, &pgio.pg_list); - pgio.pg_count = len; - + nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); return 0; } @@ -281,7 +279,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) unsigned int offset; int requests = 0; int ret = 0; - struct pnfs_layout_segment *lseg; + struct pnfs_layout_segment *lseg = desc->pg_lseg; LIST_HEAD(list); nfs_list_remove_request(req); @@ -299,10 +297,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - BUG_ON(desc->pg_lseg != NULL); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_READ, GFP_KERNEL); ClearPageError(page); offset = 0; nbytes = desc->pg_count; @@ -336,6 +330,8 @@ out_bad: } SetPageError(page); nfs_readpage_release(req); + put_lseg(lseg); + desc->pg_lseg = NULL; return -ENOMEM; } @@ -364,10 +360,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); - if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_READ, GFP_KERNEL); ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 0, lseg); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b144b5474546..d1ae7e45886f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -916,7 +916,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) unsigned int offset; int requests = 0; int ret = 0; - struct pnfs_layout_segment *lseg; + struct pnfs_layout_segment *lseg = desc->pg_lseg; LIST_HEAD(list); nfs_list_remove_request(req); @@ -940,10 +940,6 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) } while (nbytes != 0); atomic_set(&req->wb_complete, requests); - BUG_ON(desc->pg_lseg); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_RW, GFP_NOFS); ClearPageError(page); offset = 0; nbytes = desc->pg_count; @@ -976,6 +972,8 @@ out_bad: nfs_writedata_free(data); } nfs_redirty_request(req); + put_lseg(lseg); + desc->pg_lseg = NULL; return -ENOMEM; } @@ -1016,10 +1014,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); - if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_RW, GFP_NOFS); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index d378f08b905e..9ac2dd158d0b 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -57,6 +57,7 @@ struct nfs_page { struct nfs_pageio_descriptor; struct nfs_pageio_ops { + void (*pg_init)(struct nfs_pageio_descriptor *, struct nfs_page *); bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); int (*pg_doio)(struct nfs_pageio_descriptor *); }; -- cgit v1.2.3 From e885de1a5bc9f46ef8f934c5a7602c89d2d51e8d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 10 Jun 2011 13:30:23 -0400 Subject: NFSv4.1: Fall back to ordinary i/o through the mds if we have no layout segment Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 6 ++++++ fs/nfs/nfs4filelayout.c | 2 -- fs/nfs/objlayout/objio_osd.c | 3 --- fs/nfs/pnfs.c | 7 +++++++ fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- 6 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index fc017eadfe08..31e8b50011af 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -296,7 +296,13 @@ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +struct nfs_pageio_descriptor; +extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode); + /* write.c */ +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags); extern void nfs_commit_free(struct nfs_write_data *p); extern int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index faac87f24797..d1ae5bfa6680 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -669,8 +669,6 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, !nfs_generic_pg_test(pgio, prev, req)) return false; - if (!pgio->pg_lseg) - return 1; p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index b759aeba57b9..70272d5355b2 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1000,9 +1000,6 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, if (!pnfs_generic_pg_test(pgio, prev, req)) return false; - if (pgio->pg_lseg == NULL) - return true; - return pgio->pg_count + req->wb_bytes <= OBJIO_LSEG(pgio->pg_lseg)->max_io_size; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 69a0c3f1e462..cc807fe46953 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1066,6 +1066,10 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r req->wb_bytes, IOMODE_READ, GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_init_read_mds(pgio, pgio->pg_inode); + } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); @@ -1080,6 +1084,9 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page * req->wb_bytes, IOMODE_RW, GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_init_write_mds(pgio, pgio->pg_inode, pgio->pg_ioflags); } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 2cf728b832cd..9b2a1d7fb760 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -114,7 +114,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode) { nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d1ae7e45886f..00940dc9ab04 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1040,7 +1040,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = { .pg_doio = nfs_generic_pg_writepages, }; -static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, -- cgit v1.2.3 From 47cb498e9316314e7e681f417135589195ad78a7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Jun 2011 12:18:11 -0400 Subject: NFSv4.1: Clean ups for the device id cache The fact that the global device id cache holds a reference to the nfs4_deviceid_node until it is invisible to rcu lookups implies that we can always assume that the reference count is non-zero in _find_get_deviceid. Also clean up nfs4_put_deviceid_node and the removal of the device id from the cache. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.h | 1 - fs/nfs/pnfs_dev.c | 44 +++++++++++--------------------------------- 2 files changed, 11 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index dd54351ae6ac..1cfc96f8c45b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -198,7 +198,6 @@ struct nfs4_deviceid_node { void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); -struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, const struct pnfs_layoutdriver_type *, diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index f0f8e1e22f6c..fb9498d91f6a 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -100,8 +100,8 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld, rcu_read_lock(); d = _lookup_deviceid(ld, clp, id, hash); - if (d && !atomic_inc_not_zero(&d->ref)) - d = NULL; + if (d != NULL) + atomic_inc(&d->ref); rcu_read_unlock(); return d; } @@ -115,15 +115,15 @@ nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); /* - * Unhash and put deviceid + * Remove a deviceid from cache * * @clp nfs_client associated with deviceid * @id the deviceid to unhash * * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. */ -struct nfs4_deviceid_node * -nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, const struct nfs_client *clp, const struct nfs4_deviceid *id) { struct nfs4_deviceid_node *d; @@ -134,7 +134,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, rcu_read_unlock(); if (!d) { spin_unlock(&nfs4_deviceid_lock); - return NULL; + return; } hlist_del_init_rcu(&d->node); spin_unlock(&nfs4_deviceid_lock); @@ -142,28 +142,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, /* balance the initial ref set in pnfs_insert_deviceid */ if (atomic_dec_and_test(&d->ref)) - return d; - - return NULL; -} -EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid); - -/* - * Delete a deviceid from cache - * - * @clp struct nfs_client qualifying the deviceid - * @id deviceid to delete - */ -void -nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id) -{ - struct nfs4_deviceid_node *d; - - d = nfs4_unhash_put_deviceid(ld, clp, id); - if (!d) - return; - d->ld->free_deviceid_node(d); + d->ld->free_deviceid_node(d); } EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -221,16 +200,15 @@ EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); * * @d deviceid node to put * - * @ret true iff the node was deleted + * return true iff the node was deleted + * Note that since the test for d->ref == 0 is sufficient to establish + * that the node is no longer hashed in the global device id cache. */ bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) { - if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) + if (!atomic_dec_and_test(&d->ref)) return false; - hlist_del_init_rcu(&d->node); - spin_unlock(&nfs4_deviceid_lock); - synchronize_rcu(); d->ld->free_deviceid_node(d); return true; } -- cgit v1.2.3 From 7c24d9489fe57d67cb56c6bdad58d89806e7fd97 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 13 Jun 2011 18:22:38 -0400 Subject: NFSv4.1: File layout only supports whole file layouts Ask for whole file layouts. Until support for layout segments is fully supported in the file layout code, discard non-whole file layouts. Signed-off-by: Andy Adamson Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- fs/nfs/pnfs.c | 6 ++++-- fs/nfs/pnfs.h | 6 ++++++ fs/nfs/read.c | 1 + fs/nfs/write.c | 1 + 5 files changed, 57 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index d1ae5bfa6680..51c19093022e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -427,6 +427,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", + __func__); + goto out; + } + if (fl->pattern_offset > lgr->range.offset) { dprintk("%s pattern_offset %lld too large\n", __func__, fl->pattern_offset); @@ -679,14 +687,49 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return (p_stripe == r_stripe); } +void +filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_init_read_mds(pgio, pgio->pg_inode); +} + +void +filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_init_write_mds(pgio, pgio->pg_inode, + pgio->pg_ioflags); +} + static const struct nfs_pageio_ops filelayout_pg_read_ops = { - .pg_init = pnfs_generic_pg_init_read, + .pg_init = filelayout_pg_init_read, .pg_test = filelayout_pg_test, .pg_doio = nfs_generic_pg_readpages, }; static const struct nfs_pageio_ops filelayout_pg_write_ops = { - .pg_init = pnfs_generic_pg_init_write, + .pg_init = filelayout_pg_init_write, .pg_test = filelayout_pg_test, .pg_doio = nfs_generic_pg_writepages, }; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index cc807fe46953..a7dc3367a857 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -911,7 +911,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, * Layout segment is retreived from the server if not cached. * The appropriate layout segment is referenced and returned to the caller. */ -static struct pnfs_layout_segment * +struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, @@ -980,7 +980,8 @@ pnfs_update_layout(struct inode *ino, arg.offset -= pg_offset; arg.length += pg_offset; } - arg.length = PAGE_CACHE_ALIGN(arg.length); + if (arg.length != NFS4_MAX_UINT64) + arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); if (!lseg && first) { @@ -998,6 +999,7 @@ out_unlock: spin_unlock(&ino->i_lock); goto out; } +EXPORT_SYMBOL_GPL(pnfs_update_layout); int pnfs_layout_process(struct nfs4_layoutget *lgp) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1cfc96f8c45b..678c4c7b14d9 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -185,6 +185,12 @@ int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); int pnfs_ld_write_done(struct nfs_write_data *); int pnfs_ld_read_done(struct nfs_read_data *); +struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags); /* pnfs_dev.c */ struct nfs4_deviceid_node { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 9b2a1d7fb760..c394662c790e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -120,6 +120,7 @@ void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, NFS_SERVER(inode)->rsize, 0); } +EXPORT_SYMBOL_GPL(nfs_pageio_init_read_mds); static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 00940dc9ab04..7f8732e31982 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1046,6 +1046,7 @@ void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, NFS_SERVER(inode)->wsize, ioflags); } +EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds); static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) -- cgit v1.2.3 From a56aaa02b1f723e28b41d339ddff02e958d32d43 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 15 Jun 2011 11:59:10 -0400 Subject: NFSv4.1: Clean up layoutreturn Since we take a reference to it, we really ought to pass the a pointer to the layout header in the arguments instead of assuming that NFS_I(inode)->layout will forever point to the correct object. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ++-- fs/nfs/pnfs.c | 1 + include/linux/nfs_xdr.h | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 325706f73eea..93ef77666efc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5770,7 +5770,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; - struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; + struct pnfs_layout_hdr *lo = lrp->args.layout; dprintk("--> %s\n", __func__); @@ -5799,7 +5799,7 @@ static void nfs4_layoutreturn_release(void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - put_layout_hdr(NFS_I(lrp->args.inode)->layout); + put_layout_hdr(lrp->args.layout); kfree(calldata); dprintk("<-- %s\n", __func__); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a7dc3367a857..5fc2e5d755a5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -667,6 +667,7 @@ _pnfs_return_layout(struct inode *ino) lrp->args.stateid = stateid; lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; + lrp->args.layout = lo; lrp->clp = NFS_SERVER(ino)->nfs_client; status = nfs4_proc_layoutreturn(lrp); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 209455561749..956d3576df7f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -269,9 +269,10 @@ struct nfs4_layoutcommit_data { }; struct nfs4_layoutreturn_args { - __u32 layout_type; + struct pnfs_layout_hdr *layout; struct inode *inode; nfs4_stateid stateid; + __u32 layout_type; struct nfs4_sequence_args seq_args; }; -- cgit v1.2.3 From c47abcf8ff4d0c56d20ce541e80d3e1c975f54b5 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 15 Jun 2011 17:52:40 -0400 Subject: NFSv4.1: do not use deviceids after MDS clientid invalidation Mark all deviceids established under an expired MDS clientid as invalid. Stop all new i/o through DS and send through the MDS. Don't use any new LAYOUTGETs that use the invalid deviceid. Purge all layouts established under the expired MDS clientid. Remove the MDS clientid deviceid and data servers reference Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 10 ++++++++++ fs/nfs/nfs4filelayout.h | 6 ++++++ fs/nfs/pnfs.c | 3 +++ fs/nfs/pnfs.h | 8 ++++++++ fs/nfs/pnfs_dev.c | 20 ++++++++++++++++++++ 5 files changed, 47 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 51c19093022e..af9bf9eed4ca 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -334,6 +334,9 @@ filelayout_read_pagelist(struct nfs_read_data *data) __func__, data->inode->i_ino, data->args.pgbase, (size_t)data->args.count, offset); + if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) + return PNFS_NOT_ATTEMPTED; + /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); @@ -373,6 +376,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) struct nfs_fh *fh; int status; + if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) + return PNFS_NOT_ATTEMPTED; + /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); @@ -456,6 +462,10 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } else dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + /* Found deviceid is being reaped */ + if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) + goto out_put; + fl->dsaddr = dsaddr; if (fl->first_stripe_index < 0 || diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 68cce730b800..2e42284253fa 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -96,6 +96,12 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } +static inline struct nfs4_deviceid_node * +FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) +{ + return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; +} + extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5fc2e5d755a5..5b3cc3f4bb39 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -452,6 +452,9 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + spin_lock(&clp->cl_lock); rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 678c4c7b14d9..a59736eae6ec 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -192,12 +192,20 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, enum pnfs_iomode iomode, gfp_t gfp_flags); +void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); + +/* nfs4_deviceid_flags */ +enum { + NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ +}; + /* pnfs_dev.c */ struct nfs4_deviceid_node { struct hlist_node node; struct hlist_node tmpnode; const struct pnfs_layoutdriver_type *ld; const struct nfs_client *nfs_client; + unsigned long flags; struct nfs4_deviceid deviceid; atomic_t ref; }; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index fb9498d91f6a..6fda5228ef56 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -156,6 +156,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, INIT_HLIST_NODE(&d->tmpnode); d->ld = ld; d->nfs_client = nfs_client; + d->flags = 0; d->deviceid = *id; atomic_set(&d->ref, 1); } @@ -253,3 +254,22 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp) for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) _deviceid_purge_client(clp, h); } + +/* + * Stop use of all deviceids associated with an nfs_client + */ +void +nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n; + int i; + + rcu_read_lock(); + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node) + if (d->nfs_client == clp) + set_bit(NFS_DEVICEID_INVALID, &d->flags); + } + rcu_read_unlock(); +} -- cgit v1.2.3 From 87ed5eb44ad9338b1617a0e78dea81d681325298 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2011 13:42:01 -0400 Subject: NFS: Don't use DATA_SYNC writes If we're writing back data, and the FLUSH_STABLE flag is set, then we always want to use NFS_FILE_SYNC, since we're always in a situation where we're doing page reclaim, and so we want to free up the page as quickly as possible. If we're in the FLUSH_COND_STABLE case, then we either want to use another unstable write (if we have to do a commit anyway) or again, we want to use NFS_FILE_SYNC because we know that we have no more pages to write out. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7f8732e31982..1af4d82c4959 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -872,10 +872,14 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; - if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { - data->args.stable = NFS_DATA_SYNC; - if (!nfs_need_commit(NFS_I(inode))) - data->args.stable = NFS_FILE_SYNC; + switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + case 0: + break; + case FLUSH_COND_STABLE: + if (nfs_need_commit(NFS_I(inode))) + break; + default: + data->args.stable = NFS_FILE_SYNC; } data->res.fattr = &data->fattr; -- cgit v1.2.3 From 6e4efd568574221840ee8dd86f176dc977c1330c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2011 13:42:02 -0400 Subject: NFS: Clean up nfs_read_rpcsetup and nfs_write_rpcsetup Split them up into two parts: one which sets up the struct nfs_read/write_data, the other which sets up the actual RPC call or pNFS call. Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 42 +++++++++++++++++++++++++----------------- fs/nfs/write.c | 41 ++++++++++++++++++++++++++--------------- include/linux/nfs_xdr.h | 2 ++ 3 files changed, 53 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index c394662c790e..248a55425853 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -213,17 +213,14 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read); /* * Set up the NFS read request struct */ -static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - struct pnfs_layout_segment *lseg) +static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + unsigned int count, unsigned int offset) { struct inode *inode = req->wb_context->path.dentry->d_inode; data->req = req; data->inode = inode; data->cred = req->wb_context->cred; - data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -237,10 +234,21 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->res.count = count; data->res.eof = 0; nfs_fattr_init(&data->fattr); +} - if (data->lseg && - (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) - return 0; +static int nfs_do_read(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = data->args.context->path.dentry->d_inode; + + if (lseg) { + data->lseg = get_lseg(lseg); + if (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED) + return 0; + put_lseg(data->lseg); + data->lseg = NULL; + } return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } @@ -292,7 +300,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) data = nfs_readdata_alloc(1); if (!data) goto out_bad; - list_add(&data->pages, &list); + list_add(&data->list, &list); requests++; nbytes -= len; } while(nbytes != 0); @@ -304,15 +312,15 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) do { int ret2; - data = list_entry(list.next, struct nfs_read_data, pages); - list_del_init(&data->pages); + data = list_entry(list.next, struct nfs_read_data, list); + list_del_init(&data->list); data->pagevec[0] = page; if (nbytes < rsize) rsize = nbytes; - ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, - rsize, offset, lseg); + nfs_read_rpcsetup(req, data, rsize, offset); + ret2 = nfs_do_read(data, &nfs_read_partial_ops, lseg); if (ret == 0) ret = ret2; offset += rsize; @@ -325,8 +333,8 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) out_bad: while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_read_data, pages); - list_del(&data->pages); + data = list_entry(list.next, struct nfs_read_data, list); + list_del(&data->list); nfs_readdata_free(data); } SetPageError(page); @@ -362,8 +370,8 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) } req = nfs_list_entry(data->pages.next); - ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, - 0, lseg); + nfs_read_rpcsetup(req, data, desc->pg_count, 0); + ret = nfs_do_read(data, &nfs_read_full_ops, lseg); out: put_lseg(lseg); desc->pg_lseg = NULL; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1af4d82c4959..0aeb09b38e4b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -845,11 +845,9 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write); /* * Set up the argument/result storage required for the RPC call. */ -static int nfs_write_rpcsetup(struct nfs_page *req, +static void nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset, - struct pnfs_layout_segment *lseg, int how) { struct inode *inode = req->wb_context->path.dentry->d_inode; @@ -860,7 +858,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->req = req; data->inode = inode = req->wb_context->path.dentry->d_inode; data->cred = req->wb_context->cred; - data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -886,10 +883,22 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->res.count = count; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); +} - if (data->lseg && - (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) - return 0; +static int nfs_do_write(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg, + int how) +{ + struct inode *inode = data->args.context->path.dentry->d_inode; + + if (lseg != NULL) { + data->lseg = get_lseg(lseg); + if (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED) + return 0; + put_lseg(data->lseg); + data->lseg = NULL; + } return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } @@ -938,7 +947,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) data = nfs_writedata_alloc(1); if (!data) goto out_bad; - list_add(&data->pages, &list); + list_add(&data->list, &list); requests++; nbytes -= len; } while (nbytes != 0); @@ -950,15 +959,16 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) do { int ret2; - data = list_entry(list.next, struct nfs_write_data, pages); - list_del_init(&data->pages); + data = list_entry(list.next, struct nfs_write_data, list); + list_del_init(&data->list); data->pagevec[0] = page; if (nbytes < wsize) wsize = nbytes; - ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, lseg, desc->pg_ioflags); + nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags); + ret2 = nfs_do_write(data, &nfs_write_partial_ops, lseg, + desc->pg_ioflags); if (ret == 0) ret = ret2; offset += wsize; @@ -971,8 +981,8 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) out_bad: while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_write_data, pages); - list_del(&data->pages); + data = list_entry(list.next, struct nfs_write_data, list); + list_del(&data->list); nfs_writedata_free(data); } nfs_redirty_request(req); @@ -1024,7 +1034,8 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); + nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags); + ret = nfs_do_write(data, &nfs_write_full_ops, lseg, desc->pg_ioflags); out: put_lseg(lseg); /* Cleans any gotten in ->pg_test */ desc->pg_lseg = NULL; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 956d3576df7f..5b115956abac 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1126,6 +1126,7 @@ struct nfs_read_data { struct rpc_cred *cred; struct nfs_fattr fattr; /* fattr storage */ struct list_head pages; /* Coalesced read requests */ + struct list_head list; /* lists of struct nfs_read_data */ struct nfs_page *req; /* multi ops per nfs_page */ struct page **pagevec; unsigned int npages; /* Max length of pagevec */ @@ -1149,6 +1150,7 @@ struct nfs_write_data { struct nfs_fattr fattr; struct nfs_writeverf verf; struct list_head pages; /* Coalesced requests we wish to flush */ + struct list_head list; /* lists of struct nfs_write_data */ struct nfs_page *req; /* multi ops per nfs_page */ struct page **pagevec; unsigned int npages; /* Max length of pagevec */ -- cgit v1.2.3 From d8926bb3badd36670fecf2de4a062c78bc37430b Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 13 Jul 2011 10:38:47 -0700 Subject: btrfs: don't BUG_ON btrfs_alloc_path() errors This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation failure. All the sites that are fixed in this patch were checked by me to be fairly trivial to fix because of at least one of two criteria: - Callers of the function catch errors from it already so bubbling the error up will be handled. - Callers of the function might BUG_ON any nonzero return code in which case there is no behavior changed (but we still got to remove a BUG_ON) The following functions were updated: btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group, btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written, btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink, insert_reserved_file_extent, and run_delalloc_nocow Signed-off-by: Mark Fasheh --- fs/btrfs/extent-tree.c | 12 +++++++++--- fs/btrfs/file-item.c | 7 +++++-- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 18 +++++++++++++----- 4 files changed, 29 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71cd456fdb60..aa91773fe31b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -667,7 +667,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) struct btrfs_path *path; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + key.objectid = start; key.offset = len; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); @@ -5494,7 +5496,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, @@ -7162,7 +7165,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cluster->refill_lock); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto out; + } inode = lookup_free_space_inode(root, block_group, path); if (!IS_ERR(inode)) { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90d4ee52cd45..f92ff0ed6e03 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -282,7 +282,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; if (search_commit) { path->skip_locking = 1; @@ -672,7 +673,9 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, btrfs_super_csum_size(&root->fs_info->super_copy); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + sector_sum = sums->sums; again: next_offset = (u64)-1; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa4ef18b66b1..23d1d811e2b2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -855,7 +855,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; again: recow = 0; split = start; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3601f0aebddf..8be7d7a759ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1070,7 +1070,8 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; nolock = is_free_space_inode(root, inode); @@ -1644,7 +1645,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, int ret; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; @@ -3713,7 +3715,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, int ret = 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, namelen, 0); @@ -4438,7 +4441,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, int owner; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return ERR_PTR(-ENOMEM); inode = new_inode(root->fs_info->sb); if (!inode) { @@ -7194,7 +7198,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + drop_inode = 1; + goto out_unlock; + } key.objectid = btrfs_ino(inode); key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); -- cgit v1.2.3 From 1e5063d093b5f024ae35bf835ca07463de2c1a1f Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 12 Jul 2011 10:46:06 -0700 Subject: btrfs: Don't BUG_ON alloc_path errors in replay_one_buffer() The two ->process_func call sites in tree-log.c which were ignoring a return code have also been updated to gracefully exit as well. Signed-off-by: Mark Fasheh --- fs/btrfs/tree-log.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ce8a9f41d1e..f3cacc079102 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1617,7 +1617,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { @@ -1723,7 +1724,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return -ENOMEM; if (*level == 1) { - wc->process_func(root, next, wc, ptr_gen); + ret = wc->process_func(root, next, wc, ptr_gen); + if (ret) + return ret; path->slots[*level]++; if (wc->free) { @@ -1788,8 +1791,11 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level + 1]; root_owner = btrfs_header_owner(parent); - wc->process_func(root, path->nodes[*level], wc, + ret = wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level])); + if (ret) + return ret; + if (wc->free) { struct extent_buffer *next; -- cgit v1.2.3 From 0eb0e19cde6f01397ef8c0e094e44beb75c62a1e Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 12 Jul 2011 16:44:10 -0700 Subject: btrfs: Don't BUG_ON alloc_path errors in btrfs_truncate_inode_items I moved the path allocation up a few lines to the top of the function so that we couldn't get into the state where we've dropped delayed items and the extent cache but fail due to -ENOMEM. Signed-off-by: Mark Fasheh --- fs/btrfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8be7d7a759ce..a0faf7d7f0e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3172,6 +3172,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = -1; + if (root->ref_cows || root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); @@ -3184,10 +3189,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (min_type == 0 && root == BTRFS_I(inode)->root) btrfs_kill_delayed_inode_items(inode); - path = btrfs_alloc_path(); - BUG_ON(!path); - path->reada = -1; - key.objectid = ino; key.offset = (u64)-1; key.type = (u8)-1; -- cgit v1.2.3 From 1748f843a0190ef4332d03a64263f383af72682b Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 12 Jul 2011 11:25:31 -0700 Subject: btrfs: Don't BUG_ON alloc_path errors in btrfs_read_locked_inode btrfs_iget() also needed an update so that errors from btrfs_locked_inode() are caught and bubbled back up. Signed-off-by: Mark Fasheh --- fs/btrfs/inode.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0faf7d7f0e0..88829993db6c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2518,7 +2518,9 @@ static void btrfs_read_locked_inode(struct inode *inode) filled = true; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + goto make_bad; + path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); @@ -3973,6 +3975,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *new) { struct inode *inode; + int bad_inode = 0; inode = btrfs_iget_locked(s, location->objectid, root); if (!inode) @@ -3982,10 +3985,19 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - inode_tree_add(inode); - unlock_new_inode(inode); - if (new) - *new = 1; + if (!is_bad_inode(inode)) { + inode_tree_add(inode); + unlock_new_inode(inode); + if (new) + *new = 1; + } else { + bad_inode = 1; + } + } + + if (bad_inode) { + iput(inode); + inode = ERR_PTR(-ESTALE); } return inode; -- cgit v1.2.3 From 17e9f796bd92cddec17d781c459376f82340fa44 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 12 Jul 2011 11:10:23 -0700 Subject: btrfs: Don't BUG_ON alloc_path errors in btrfs_balance() Dealing with this seems trivial - the only caller of btrfs_balance() is btrfs_ioctl() which passes the error code directly back to userspace. There also isn't much state to unwind (if I'm wrong about this point, we can always safely move the allocation to the top of btrfs_balance() anyway). Signed-off-by: Mark Fasheh --- fs/btrfs/volumes.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19450bc53632..530a2fcea1ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2061,8 +2061,10 @@ int btrfs_balance(struct btrfs_root *dev_root) /* step two, relocate all the chunks */ path = btrfs_alloc_path(); - BUG_ON(!path); - + if (!path) { + ret = -ENOMEM; + goto error; + } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; -- cgit v1.2.3 From 3b6091846d5b6113d695c79caec7cc96b62d469b Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 15 Jul 2011 03:33:42 -0400 Subject: NFS: fix return value of nfs_pagein_one/nfs_flush_one Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 3 ++- fs/nfs/write.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 248a55425853..581534a4aed7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -351,12 +351,13 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) struct nfs_read_data *data; struct list_head *head = &desc->pg_list; struct pnfs_layout_segment *lseg = desc->pg_lseg; - int ret = -ENOMEM; + int ret = 0; data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, desc->pg_count)); if (!data) { nfs_async_read_error(head); + ret = -ENOMEM; goto out; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0aeb09b38e4b..d9dd744588d4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1006,7 +1006,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) struct nfs_write_data *data; struct list_head *head = &desc->pg_list; struct pnfs_layout_segment *lseg = desc->pg_lseg; - int ret; + int ret = 0; data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, desc->pg_count)); -- cgit v1.2.3 From 275acaafd45fbc8ecc3beabd6367e60b3049606a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2011 13:42:02 -0400 Subject: NFS: Clean up: split out the RPC transmission from nfs_pagein_multi/one ...and do the same for nfs_flush_multi/one. Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 92 +++++++++++++++++++++++++++++++-------------------------- fs/nfs/write.c | 93 ++++++++++++++++++++++++++++++++-------------------------- 2 files changed, 102 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 581534a4aed7..0215bac35fc6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -30,8 +30,6 @@ #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); static const struct nfs_pageio_ops nfs_pageio_read_ops; static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; @@ -253,6 +251,27 @@ static int nfs_do_read(struct nfs_read_data *data, return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } +static int +nfs_do_multiple_reads(struct list_head *head, + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg) +{ + struct nfs_read_data *data; + int ret = 0; + + while (!list_empty(head)) { + int ret2; + + data = list_entry(head->next, struct nfs_read_data, list); + list_del_init(&data->list); + + ret2 = nfs_do_read(data, call_ops, lseg); + if (ret == 0) + ret = ret2; + } + return ret; +} + static void nfs_async_read_error(struct list_head *head) { @@ -279,7 +298,7 @@ nfs_async_read_error(struct list_head *head) * won't see the new data until our attribute cache is updated. This is more * or less conventional NFS client behavior. */ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; @@ -288,11 +307,10 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) unsigned int offset; int requests = 0; int ret = 0; - struct pnfs_layout_segment *lseg = desc->pg_lseg; - LIST_HEAD(list); nfs_list_remove_request(req); + offset = 0; nbytes = desc->pg_count; do { size_t len = min(nbytes,rsize); @@ -300,57 +318,33 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) data = nfs_readdata_alloc(1); if (!data) goto out_bad; - list_add(&data->list, &list); + data->pagevec[0] = page; + nfs_read_rpcsetup(req, data, len, offset); + list_add(&data->list, res); requests++; nbytes -= len; + offset += len; } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - ClearPageError(page); - offset = 0; - nbytes = desc->pg_count; - do { - int ret2; - - data = list_entry(list.next, struct nfs_read_data, list); - list_del_init(&data->list); - - data->pagevec[0] = page; - - if (nbytes < rsize) - rsize = nbytes; - nfs_read_rpcsetup(req, data, rsize, offset); - ret2 = nfs_do_read(data, &nfs_read_partial_ops, lseg); - if (ret == 0) - ret = ret2; - offset += rsize; - nbytes -= rsize; - } while (nbytes != 0); - put_lseg(lseg); - desc->pg_lseg = NULL; - return ret; - out_bad: - while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_read_data, list); + while (!list_empty(res)) { + data = list_entry(res->next, struct nfs_read_data, list); list_del(&data->list); nfs_readdata_free(data); } SetPageError(page); nfs_readpage_release(req); - put_lseg(lseg); - desc->pg_lseg = NULL; return -ENOMEM; } -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req; struct page **pages; struct nfs_read_data *data; struct list_head *head = &desc->pg_list; - struct pnfs_layout_segment *lseg = desc->pg_lseg; int ret = 0; data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, @@ -372,18 +366,32 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) req = nfs_list_entry(data->pages.next); nfs_read_rpcsetup(req, data, desc->pg_count, 0); - ret = nfs_do_read(data, &nfs_read_full_ops, lseg); + list_add(&data->list, res); out: - put_lseg(lseg); - desc->pg_lseg = NULL; return ret; } int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_pagein_multi(desc); - return nfs_pagein_one(desc); + LIST_HEAD(head); + int ret; + + if (desc->pg_bsize < PAGE_CACHE_SIZE) { + ret = nfs_pagein_multi(desc, &head); + if (ret == 0) + ret = nfs_do_multiple_reads(&head, + &nfs_read_partial_ops, + desc->pg_lseg); + } else { + ret = nfs_pagein_one(desc, &head); + if (ret == 0) + ret = nfs_do_multiple_reads(&head, + &nfs_read_full_ops, + desc->pg_lseg); + } + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; } EXPORT_SYMBOL_GPL(nfs_generic_pg_readpages); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d9dd744588d4..bd4b34e5870b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -903,6 +903,27 @@ static int nfs_do_write(struct nfs_write_data *data, return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } +static int nfs_do_multiple_writes(struct list_head *head, + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg, + int how) +{ + struct nfs_write_data *data; + int ret = 0; + + while (!list_empty(head)) { + int ret2; + + data = list_entry(head->next, struct nfs_write_data, list); + list_del_init(&data->list); + + ret2 = nfs_do_write(data, call_ops, lseg, how); + if (ret == 0) + ret = ret2; + } + return ret; +} + /* If a nfs_flush_* function fails, it should remove reqs from @head and * call this on each, which will prepare them to be retried on next * writeback using standard nfs. @@ -920,7 +941,7 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; @@ -929,8 +950,6 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) unsigned int offset; int requests = 0; int ret = 0; - struct pnfs_layout_segment *lseg = desc->pg_lseg; - LIST_HEAD(list); nfs_list_remove_request(req); @@ -940,6 +959,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) desc->pg_ioflags &= ~FLUSH_COND_STABLE; + offset = 0; nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -947,47 +967,23 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) data = nfs_writedata_alloc(1); if (!data) goto out_bad; - list_add(&data->list, &list); + data->pagevec[0] = page; + nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags); + list_add(&data->list, res); requests++; nbytes -= len; + offset += len; } while (nbytes != 0); atomic_set(&req->wb_complete, requests); - - ClearPageError(page); - offset = 0; - nbytes = desc->pg_count; - do { - int ret2; - - data = list_entry(list.next, struct nfs_write_data, list); - list_del_init(&data->list); - - data->pagevec[0] = page; - - if (nbytes < wsize) - wsize = nbytes; - nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags); - ret2 = nfs_do_write(data, &nfs_write_partial_ops, lseg, - desc->pg_ioflags); - if (ret == 0) - ret = ret2; - offset += wsize; - nbytes -= wsize; - } while (nbytes != 0); - - put_lseg(lseg); - desc->pg_lseg = NULL; return ret; out_bad: - while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_write_data, list); + while (!list_empty(res)) { + data = list_entry(res->next, struct nfs_write_data, list); list_del(&data->list); nfs_writedata_free(data); } nfs_redirty_request(req); - put_lseg(lseg); - desc->pg_lseg = NULL; return -ENOMEM; } @@ -999,13 +995,12 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req; struct page **pages; struct nfs_write_data *data; struct list_head *head = &desc->pg_list; - struct pnfs_layout_segment *lseg = desc->pg_lseg; int ret = 0; data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, @@ -1035,18 +1030,34 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) /* Set up the argument struct */ nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags); - ret = nfs_do_write(data, &nfs_write_full_ops, lseg, desc->pg_ioflags); + list_add(&data->list, res); out: - put_lseg(lseg); /* Cleans any gotten in ->pg_test */ - desc->pg_lseg = NULL; return ret; } int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_flush_multi(desc); - return nfs_flush_one(desc); + LIST_HEAD(head); + int ret; + + if (desc->pg_bsize < PAGE_CACHE_SIZE) { + ret = nfs_flush_multi(desc, &head); + if (ret == 0) + ret = nfs_do_multiple_writes(&head, + &nfs_write_partial_ops, + desc->pg_lseg, + desc->pg_ioflags); + } else { + ret = nfs_flush_one(desc, &head); + if (ret == 0) + ret = nfs_do_multiple_writes(&head, + &nfs_write_full_ops, + desc->pg_lseg, + desc->pg_ioflags); + } + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; } EXPORT_SYMBOL_GPL(nfs_generic_pg_writepages); -- cgit v1.2.3 From 50828d7e6767a92726708bc0666e2b8b84575808 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2011 13:42:02 -0400 Subject: NFS: Cache rpc_ops in struct nfs_pageio_descriptor Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 19 ++++++++----------- fs/nfs/write.c | 20 +++++++------------- include/linux/nfs_page.h | 1 + 3 files changed, 16 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0215bac35fc6..c14362fbe65f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -327,6 +327,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head } while(nbytes != 0); atomic_set(&req->wb_complete, requests); ClearPageError(page); + desc->pg_rpc_callops = &nfs_read_partial_ops; return ret; out_bad: while (!list_empty(res)) { @@ -367,6 +368,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head * nfs_read_rpcsetup(req, data, desc->pg_count, 0); list_add(&data->list, res); + desc->pg_rpc_callops = &nfs_read_full_ops; out: return ret; } @@ -376,19 +378,14 @@ int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) LIST_HEAD(head); int ret; - if (desc->pg_bsize < PAGE_CACHE_SIZE) { + if (desc->pg_bsize < PAGE_CACHE_SIZE) ret = nfs_pagein_multi(desc, &head); - if (ret == 0) - ret = nfs_do_multiple_reads(&head, - &nfs_read_partial_ops, - desc->pg_lseg); - } else { + else ret = nfs_pagein_one(desc, &head); - if (ret == 0) - ret = nfs_do_multiple_reads(&head, - &nfs_read_full_ops, - desc->pg_lseg); - } + + if (ret == 0) + ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops, + desc->pg_lseg); put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return ret; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bd4b34e5870b..71fbba72ace3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -975,6 +975,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head offset += len; } while (nbytes != 0); atomic_set(&req->wb_complete, requests); + desc->pg_rpc_callops = &nfs_write_partial_ops; return ret; out_bad: @@ -1031,6 +1032,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r /* Set up the argument struct */ nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags); list_add(&data->list, res); + desc->pg_rpc_callops = &nfs_write_full_ops; out: return ret; } @@ -1040,21 +1042,13 @@ int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) LIST_HEAD(head); int ret; - if (desc->pg_bsize < PAGE_CACHE_SIZE) { + if (desc->pg_bsize < PAGE_CACHE_SIZE) ret = nfs_flush_multi(desc, &head); - if (ret == 0) - ret = nfs_do_multiple_writes(&head, - &nfs_write_partial_ops, - desc->pg_lseg, - desc->pg_ioflags); - } else { + else ret = nfs_flush_one(desc, &head); - if (ret == 0) - ret = nfs_do_multiple_writes(&head, - &nfs_write_full_ops, - desc->pg_lseg, - desc->pg_ioflags); - } + if (ret == 0) + ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops, + desc->pg_lseg, desc->pg_ioflags); put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return ret; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 9ac2dd158d0b..db3194f63479 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -74,6 +74,7 @@ struct nfs_pageio_descriptor { const struct nfs_pageio_ops *pg_ops; int pg_ioflags; int pg_error; + const struct rpc_call_ops *pg_rpc_callops; struct pnfs_layout_segment *pg_lseg; }; -- cgit v1.2.3 From d097971d8ab4042eaa4bff98698ae9cc55942327 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2011 13:42:02 -0400 Subject: NFS: Use the nfs_pageio_descriptor->pg_bsize in the read/write request Instead of looking up the rsize and wsize, the routines that generate the RPC requests should really be using the pg_bsize, since that is what we use when deciding whether or not to coalesce write requests... Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index c14362fbe65f..75088f58183b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -303,7 +303,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_read_data *data; - size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; + size_t rsize = desc->pg_bsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 71fbba72ace3..c88a1ab42c39 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -946,7 +946,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_write_data *data; - size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; + size_t wsize = desc->pg_bsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; -- cgit v1.2.3 From d9156f9f364897e93bdd98b4ad22138de18f7c24 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2011 13:42:02 -0400 Subject: NFS: Allow the nfs_pageio_descriptor to signal that a re-coalesce is needed If an attempt to do pNFS fails, and we have to fall back to writing through the MDS, then we may want to re-coalesce the requests that we already have since the block size for the MDS read/writes may be different to that of the DS read/writes. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 57 +++++++++++++++++++++++++++++++++++++++++++++--- include/linux/nfs_page.h | 3 ++- 2 files changed, 56 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d421e19557a5..7139dbf8784e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -240,6 +240,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_bsize = bsize; desc->pg_base = 0; desc->pg_moreio = 0; + desc->pg_recoalesce = 0; desc->pg_inode = inode; desc->pg_ops = pg_ops; desc->pg_ioflags = io_flags; @@ -331,7 +332,7 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ -int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { while (!nfs_pageio_do_add_request(desc, req)) { @@ -340,17 +341,67 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_error < 0) return 0; desc->pg_moreio = 0; + if (desc->pg_recoalesce) + return 0; } return 1; } +static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + + do { + list_splice_init(&desc->pg_list, &head); + desc->pg_bytes_written -= desc->pg_count; + desc->pg_count = 0; + desc->pg_base = 0; + desc->pg_recoalesce = 0; + + while (!list_empty(&head)) { + struct nfs_page *req; + + req = list_first_entry(&head, struct nfs_page, wb_list); + nfs_list_remove_request(req); + if (__nfs_pageio_add_request(desc, req)) + continue; + if (desc->pg_error < 0) + return 0; + break; + } + } while (desc->pg_recoalesce); + return 1; +} + +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + int ret; + + do { + ret = __nfs_pageio_add_request(desc, req); + if (ret) + break; + if (desc->pg_error < 0) + break; + ret = nfs_do_recoalesce(desc); + } while (ret); + return ret; +} + /** * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor * @desc: pointer to io descriptor */ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) { - nfs_pageio_doio(desc); + for (;;) { + nfs_pageio_doio(desc); + if (!desc->pg_recoalesce) + break; + if (!nfs_do_recoalesce(desc)) + break; + } } /** @@ -369,7 +420,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) if (!list_empty(&desc->pg_list)) { struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); if (index != prev->wb_index + 1) - nfs_pageio_doio(desc); + nfs_pageio_complete(desc); } } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index db3194f63479..7241b2a2a4d6 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -68,7 +68,8 @@ struct nfs_pageio_descriptor { size_t pg_count; size_t pg_bsize; unsigned int pg_base; - char pg_moreio; + unsigned char pg_moreio : 1, + pg_recoalesce : 1; struct inode *pg_inode; const struct nfs_pageio_ops *pg_ops; -- cgit v1.2.3 From 493292ddc78d18ee2ad2d5c24c2b7dd6a24641d2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 13 Jul 2011 15:58:28 -0400 Subject: NFS: Move the pnfs read code into pnfs.c ...and ensure that we recoalese to take into account differences in block sizes when falling back to read through the MDS. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 6 ++++- fs/nfs/nfs4filelayout.c | 2 +- fs/nfs/objlayout/objio_osd.c | 2 +- fs/nfs/pnfs.c | 57 ++++++++++++++++++++++++++++++++++++++++++-- fs/nfs/pnfs.h | 10 +------- fs/nfs/read.c | 46 ++++++++++++++++------------------- include/linux/nfs_page.h | 1 - 7 files changed, 83 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 31e8b50011af..d74d7dea9173 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -291,14 +291,18 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); #endif +struct nfs_pageio_descriptor; /* read.c */ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, + struct list_head *head); -struct nfs_pageio_descriptor; extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode); +extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index af9bf9eed4ca..fc556d6b90f1 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -735,7 +735,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, static const struct nfs_pageio_ops filelayout_pg_read_ops = { .pg_init = filelayout_pg_init_read, .pg_test = filelayout_pg_test, - .pg_doio = nfs_generic_pg_readpages, + .pg_doio = pnfs_generic_pg_readpages, }; static const struct nfs_pageio_ops filelayout_pg_write_ops = { diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 70272d5355b2..add62894f364 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1007,7 +1007,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, static const struct nfs_pageio_ops objio_pg_read_ops = { .pg_init = pnfs_generic_pg_init_read, .pg_test = objio_pg_test, - .pg_doio = nfs_generic_pg_readpages, + .pg_doio = pnfs_generic_pg_readpages, }; static const struct nfs_pageio_ops objio_pg_write_ops = { diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5b3cc3f4bb39..9eca5a8cdbdd 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -28,6 +28,7 @@ */ #include +#include #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -1216,18 +1217,32 @@ pnfs_ld_read_done(struct nfs_read_data *data) } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); +static void +pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_read_data *data) +{ + list_splice_tail_init(&data->pages, &desc->pg_list); + if (data->req && list_empty(&data->req->wb_list)) + nfs_list_add_request(data->req, &desc->pg_list); + nfs_pageio_reset_read_mds(desc); + desc->pg_recoalesce = 1; + nfs_readdata_release(data); +} + /* * Call the appropriate parallel I/O subsystem read function. */ -enum pnfs_try_status +static enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *rdata, - const struct rpc_call_ops *call_ops) + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg) { struct inode *inode = rdata->inode; struct nfs_server *nfss = NFS_SERVER(inode); enum pnfs_try_status trypnfs; rdata->mds_ops = call_ops; + rdata->lseg = get_lseg(lseg); dprintk("%s: Reading ino:%lu %u@%llu\n", __func__, inode->i_ino, rdata->args.count, rdata->args.offset); @@ -1243,6 +1258,44 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, return trypnfs; } +static void +pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + struct nfs_read_data *data; + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + + desc->pg_lseg = NULL; + while (!list_empty(head)) { + enum pnfs_try_status trypnfs; + + data = list_entry(head->next, struct nfs_read_data, list); + list_del_init(&data->list); + + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_read_through_mds(desc, data); + } + put_lseg(lseg); +} + +int +pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_pagein(desc, &head); + if (ret != 0) { + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; + } + pnfs_do_multiple_reads(desc, &head); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); + /* * Currently there is only one (whole file) write lseg. */ diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a59736eae6ec..c40ffa52c1ab 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -157,9 +157,8 @@ void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, const struct rpc_call_ops *, int); -enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, - const struct rpc_call_ops *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); int pnfs_layout_process(struct nfs4_layoutget *lgp); @@ -329,13 +328,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) { } -static inline enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *data, - const struct rpc_call_ops *call_ops) -{ - return PNFS_NOT_ATTEMPTED; -} - static inline enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *data, const struct rpc_call_ops *call_ops, int how) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 75088f58183b..d2f53ddd8260 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -67,7 +67,7 @@ void nfs_readdata_free(struct nfs_read_data *p) mempool_free(p, nfs_rdata_mempool); } -static void nfs_readdata_release(struct nfs_read_data *rdata) +void nfs_readdata_release(struct nfs_read_data *rdata) { put_lseg(rdata->lseg); put_nfs_open_context(rdata->args.context); @@ -120,6 +120,12 @@ void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read_mds); +void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_ops = &nfs_pageio_read_ops; + pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; +} + static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { @@ -235,26 +241,16 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, } static int nfs_do_read(struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, - struct pnfs_layout_segment *lseg) + const struct rpc_call_ops *call_ops) { struct inode *inode = data->args.context->path.dentry->d_inode; - if (lseg) { - data->lseg = get_lseg(lseg); - if (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED) - return 0; - put_lseg(data->lseg); - data->lseg = NULL; - } - return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } static int nfs_do_multiple_reads(struct list_head *head, - const struct rpc_call_ops *call_ops, - struct pnfs_layout_segment *lseg) + const struct rpc_call_ops *call_ops) { struct nfs_read_data *data; int ret = 0; @@ -265,7 +261,7 @@ nfs_do_multiple_reads(struct list_head *head, data = list_entry(head->next, struct nfs_read_data, list); list_del_init(&data->list); - ret2 = nfs_do_read(data, call_ops, lseg); + ret2 = nfs_do_read(data, call_ops); if (ret == 0) ret = ret2; } @@ -373,25 +369,23 @@ out: return ret; } -int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + if (desc->pg_bsize < PAGE_CACHE_SIZE) + return nfs_pagein_multi(desc, head); + return nfs_pagein_one(desc, head); +} + +static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { LIST_HEAD(head); int ret; - if (desc->pg_bsize < PAGE_CACHE_SIZE) - ret = nfs_pagein_multi(desc, &head); - else - ret = nfs_pagein_one(desc, &head); - + ret = nfs_generic_pagein(desc, &head); if (ret == 0) - ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops, - desc->pg_lseg); - put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; + ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops); return ret; } -EXPORT_SYMBOL_GPL(nfs_generic_pg_readpages); - static const struct nfs_pageio_ops nfs_pageio_read_ops = { .pg_test = nfs_generic_pg_test, diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 7241b2a2a4d6..0a48f842f83c 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -108,7 +108,6 @@ extern void nfs_unlock_request(struct nfs_page *req); extern int nfs_set_page_tag_locked(struct nfs_page *req); extern void nfs_clear_page_tag_locked(struct nfs_page *req); -extern int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); extern int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); -- cgit v1.2.3 From dce81290eed64d24493989bb7a08f9e20495e184 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 13 Jul 2011 15:59:19 -0400 Subject: NFS: Move the pnfs write code into pnfs.c ...and ensure that we recoalese to take into account differences in differences in block sizes when falling back to write through the MDS. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 4 ++++ fs/nfs/nfs4filelayout.c | 2 +- fs/nfs/objlayout/objio_osd.c | 2 +- fs/nfs/pnfs.c | 57 ++++++++++++++++++++++++++++++++++++++++++-- fs/nfs/pnfs.h | 10 +------- fs/nfs/write.c | 39 ++++++++++++++---------------- include/linux/nfs_page.h | 3 --- 7 files changed, 80 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d74d7dea9173..09e20de31901 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -305,8 +305,12 @@ extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ +extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, + struct list_head *head); extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags); +extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_write_data *p); extern int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index fc556d6b90f1..fbc5b42d1970 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -741,7 +741,7 @@ static const struct nfs_pageio_ops filelayout_pg_read_ops = { static const struct nfs_pageio_ops filelayout_pg_write_ops = { .pg_init = filelayout_pg_init_write, .pg_test = filelayout_pg_test, - .pg_doio = nfs_generic_pg_writepages, + .pg_doio = pnfs_generic_pg_writepages, }; static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index add62894f364..7d49bb160b46 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1013,7 +1013,7 @@ static const struct nfs_pageio_ops objio_pg_read_ops = { static const struct nfs_pageio_ops objio_pg_write_ops = { .pg_init = pnfs_generic_pg_init_write, .pg_test = objio_pg_test, - .pg_doio = nfs_generic_pg_writepages, + .pg_doio = pnfs_generic_pg_writepages, }; static struct pnfs_layoutdriver_type objlayout_type = { diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 9eca5a8cdbdd..93c73299588d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1170,15 +1170,30 @@ pnfs_ld_write_done(struct nfs_write_data *data) } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); -enum pnfs_try_status +static void +pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_write_data *data) +{ + list_splice_tail_init(&data->pages, &desc->pg_list); + if (data->req && list_empty(&data->req->wb_list)) + nfs_list_add_request(data->req, &desc->pg_list); + nfs_pageio_reset_write_mds(desc); + desc->pg_recoalesce = 1; + nfs_writedata_release(data); +} + +static enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *wdata, - const struct rpc_call_ops *call_ops, int how) + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg, + int how) { struct inode *inode = wdata->inode; enum pnfs_try_status trypnfs; struct nfs_server *nfss = NFS_SERVER(inode); wdata->mds_ops = call_ops; + wdata->lseg = get_lseg(lseg); dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, inode->i_ino, wdata->args.count, wdata->args.offset, how); @@ -1194,6 +1209,44 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, return trypnfs; } +static void +pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +{ + struct nfs_write_data *data; + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + + desc->pg_lseg = NULL; + while (!list_empty(head)) { + enum pnfs_try_status trypnfs; + + data = list_entry(head->next, struct nfs_write_data, list); + list_del_init(&data->list); + + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, data); + } + put_lseg(lseg); +} + +int +pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_flush(desc, &head); + if (ret != 0) { + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; + } + pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); + /* * Called by non rpc-based layout drivers */ diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index c40ffa52c1ab..078670dfbe04 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -155,11 +155,10 @@ bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int) void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); -enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, - const struct rpc_call_ops *, int); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -328,13 +327,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) { } -static inline enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, int how) -{ - return PNFS_NOT_ATTEMPTED; -} - static inline int pnfs_return_layout(struct inode *ino) { return 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c88a1ab42c39..cabe5f6611b9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -97,7 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p) mempool_free(p, nfs_wdata_mempool); } -static void nfs_writedata_release(struct nfs_write_data *wdata) +void nfs_writedata_release(struct nfs_write_data *wdata) { put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); @@ -887,25 +887,15 @@ static void nfs_write_rpcsetup(struct nfs_page *req, static int nfs_do_write(struct nfs_write_data *data, const struct rpc_call_ops *call_ops, - struct pnfs_layout_segment *lseg, int how) { struct inode *inode = data->args.context->path.dentry->d_inode; - if (lseg != NULL) { - data->lseg = get_lseg(lseg); - if (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED) - return 0; - put_lseg(data->lseg); - data->lseg = NULL; - } - return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } static int nfs_do_multiple_writes(struct list_head *head, const struct rpc_call_ops *call_ops, - struct pnfs_layout_segment *lseg, int how) { struct nfs_write_data *data; @@ -917,7 +907,7 @@ static int nfs_do_multiple_writes(struct list_head *head, data = list_entry(head->next, struct nfs_write_data, list); list_del_init(&data->list); - ret2 = nfs_do_write(data, call_ops, lseg, how); + ret2 = nfs_do_write(data, call_ops, how); if (ret == 0) ret = ret2; } @@ -1037,23 +1027,24 @@ out: return ret; } -int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + if (desc->pg_bsize < PAGE_CACHE_SIZE) + return nfs_flush_multi(desc, head); + return nfs_flush_one(desc, head); +} + +static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { LIST_HEAD(head); int ret; - if (desc->pg_bsize < PAGE_CACHE_SIZE) - ret = nfs_flush_multi(desc, &head); - else - ret = nfs_flush_one(desc, &head); + ret = nfs_generic_flush(desc, &head); if (ret == 0) ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops, - desc->pg_lseg, desc->pg_ioflags); - put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; + desc->pg_ioflags); return ret; } -EXPORT_SYMBOL_GPL(nfs_generic_pg_writepages); static const struct nfs_pageio_ops nfs_pageio_write_ops = { .pg_test = nfs_generic_pg_test, @@ -1068,6 +1059,12 @@ void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds); +void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_ops = &nfs_pageio_write_ops; + pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; +} + static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 0a48f842f83c..e2791a27a901 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -108,9 +108,6 @@ extern void nfs_unlock_request(struct nfs_page *req); extern int nfs_set_page_tag_locked(struct nfs_page *req); extern void nfs_clear_page_tag_locked(struct nfs_page *req); -extern int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); - - /* * Lock the page of an asynchronous request without getting a new reference */ -- cgit v1.2.3 From 1f9453578f059d2651aa6c6b16756627fc9f2a74 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 13 Jul 2011 15:59:57 -0400 Subject: NFS: Clean up - simplify the switch to read/write-through-MDS Use nfs_pageio_reset_read_mds and nfs_pageio_reset_write_mds instead of completely reinitialising the struct nfs_pageio_descriptor. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 4 ---- fs/nfs/nfs4filelayout.c | 5 ++--- fs/nfs/pnfs.c | 4 ++-- fs/nfs/read.c | 4 ++-- fs/nfs/write.c | 4 ++-- 5 files changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 09e20de31901..ab12913dd473 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -299,16 +299,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head); -extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, - struct inode *inode); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head); -extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_write_data *p); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index fbc5b42d1970..f0b37e155ce3 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -711,7 +711,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, GFP_KERNEL); /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) - nfs_pageio_init_read_mds(pgio, pgio->pg_inode); + nfs_pageio_reset_read_mds(pgio); } void @@ -728,8 +728,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, GFP_NOFS); /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) - nfs_pageio_init_write_mds(pgio, pgio->pg_inode, - pgio->pg_ioflags); + nfs_pageio_reset_write_mds(pgio); } static const struct nfs_pageio_ops filelayout_pg_read_ops = { diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 93c73299588d..38e5508555c6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1075,7 +1075,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r GFP_KERNEL); /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) - nfs_pageio_init_read_mds(pgio, pgio->pg_inode); + nfs_pageio_reset_read_mds(pgio); } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); @@ -1093,7 +1093,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page * GFP_NOFS); /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) - nfs_pageio_init_write_mds(pgio, pgio->pg_inode, pgio->pg_ioflags); + nfs_pageio_reset_write_mds(pgio); } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index d2f53ddd8260..7cba2280e2b8 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -112,19 +112,19 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, +static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode) { nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, NFS_SERVER(inode)->rsize, 0); } -EXPORT_SYMBOL_GPL(nfs_pageio_init_read_mds); void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { pgio->pg_ops = &nfs_pageio_read_ops; pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; } +EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cabe5f6611b9..9fba5270b1a8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1051,19 +1051,19 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = { .pg_doio = nfs_generic_pg_writepages, }; -void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, +static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, NFS_SERVER(inode)->wsize, ioflags); } -EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds); void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { pgio->pg_ops = &nfs_pageio_write_ops; pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; } +EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) -- cgit v1.2.3 From 9e00abc3c20904fd6a5d888bb7023925799ec8a5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 13 Jul 2011 19:20:49 -0400 Subject: SUNRPC: sunrpc should not explicitly depend on NFS config options Change explicit references to CONFIG_NFS_V4_1 to implicit ones Get rid of the unnecessary defines in backchannel_rqst.c and bc_svc.c: the Makefile takes care of those dependency. Signed-off-by: Trond Myklebust --- fs/nfs/Kconfig | 1 + include/linux/sunrpc/bc_xprt.h | 6 +++--- include/linux/sunrpc/svc.h | 4 ++-- include/linux/sunrpc/xprt.h | 16 ++++++++-------- net/sunrpc/Kconfig | 4 ++++ net/sunrpc/Makefile | 2 +- net/sunrpc/backchannel_rqst.c | 3 --- net/sunrpc/bc_svc.c | 3 --- net/sunrpc/clnt.c | 12 ++++++------ net/sunrpc/svc.c | 4 ++-- net/sunrpc/svcsock.c | 14 +++++++------- net/sunrpc/xprt.c | 4 ++-- net/sunrpc/xprtsock.c | 6 +++--- 13 files changed, 39 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 81515545ba75..2cde5d954750 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -77,6 +77,7 @@ config NFS_V4 config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select SUNRPC_BACKCHANNEL select PNFS_FILE_LAYOUT help This option enables support for minor version 1 of the NFSv4 protocol diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index 082884295f80..f7f3ce340c08 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#ifdef CONFIG_NFS_V4_1 +#ifdef CONFIG_SUNRPC_BACKCHANNEL struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt); void xprt_free_bc_request(struct rpc_rqst *req); int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); @@ -47,7 +47,7 @@ static inline int svc_is_backchannel(const struct svc_rqst *rqstp) return 1; return 0; } -#else /* CONFIG_NFS_V4_1 */ +#else /* CONFIG_SUNRPC_BACKCHANNEL */ static inline int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) { @@ -62,6 +62,6 @@ static inline int svc_is_backchannel(const struct svc_rqst *rqstp) static inline void xprt_free_bc_request(struct rpc_rqst *req) { } -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ #endif /* _LINUX_SUNRPC_BC_XPRT_H */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ea29330b78bd..9a3fa7baf4b8 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -92,7 +92,7 @@ struct svc_serv { struct module * sv_module; /* optional module to count when * adding threads */ svc_thread_fn sv_function; /* main function for threads */ -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) struct list_head sv_cb_list; /* queue for callback requests * that arrive over the same * connection */ @@ -100,7 +100,7 @@ struct svc_serv { wait_queue_head_t sv_cb_waitq; /* sleep here if there are no * entries in the svc_cb_list */ struct svc_xprt *sv_bc_xprt; /* callback on fore channel */ -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ }; /* diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 81cce3b3ee66..217b0206581b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -100,11 +100,11 @@ struct rpc_rqst { ktime_t rq_xtime; /* transmit time stamp */ int rq_ntrans; -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) struct list_head rq_bc_list; /* Callback service list */ unsigned long rq_bc_pa_state; /* Backchannel prealloc state */ struct list_head rq_bc_pa_list; /* Backchannel prealloc list */ -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANEL */ }; #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len @@ -200,7 +200,7 @@ struct rpc_xprt { u32 xid; /* Next XID value to use */ struct rpc_task * snd_task; /* Task blocked in send */ struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) struct svc_serv *bc_serv; /* The RPC service which will */ /* process the callback */ unsigned int bc_alloc_count; /* Total number of preallocs */ @@ -208,7 +208,7 @@ struct rpc_xprt { * items */ struct list_head bc_pa_list; /* List of preallocated * backchannel rpc_rqst's */ -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ struct list_head recv; struct { @@ -228,15 +228,15 @@ struct rpc_xprt { const char *address_strings[RPC_DISPLAY_MAX]; }; -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* * Backchannel flags */ #define RPC_BC_PA_IN_USE 0x0001 /* Preallocated backchannel */ /* buffer in use */ -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) static inline int bc_prealloc(struct rpc_rqst *req) { return test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); @@ -246,7 +246,7 @@ static inline int bc_prealloc(struct rpc_rqst *req) { return 0; } -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ struct xprt_create { int ident; /* XPRT_TRANSPORT identifier */ diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index b2198e65d8bb..ffd243d09188 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -4,6 +4,10 @@ config SUNRPC config SUNRPC_GSS tristate +config SUNRPC_BACKCHANNEL + bool + depends on SUNRPC + config SUNRPC_XPRT_RDMA tristate depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS && EXPERIMENTAL diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 9d2fca5ad14a..8209a0411bca 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -13,6 +13,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ addr.o rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o \ svc_xprt.o -sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o +sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index cf06af3b63c6..bb5593f467cb 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -29,8 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RPCDBG_FACILITY RPCDBG_TRANS #endif -#if defined(CONFIG_NFS_V4_1) - /* * Helper routines that track the number of preallocation elements * on the transport. @@ -279,4 +277,3 @@ void xprt_free_bc_request(struct rpc_rqst *req) spin_unlock_bh(&xprt->bc_pa_lock); } -#endif /* CONFIG_NFS_V4_1 */ diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c index 1dd1a6890007..0b2eb388cbda 100644 --- a/net/sunrpc/bc_svc.c +++ b/net/sunrpc/bc_svc.c @@ -27,8 +27,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * reply over an existing open connection previously established by the client. */ -#if defined(CONFIG_NFS_V4_1) - #include #include @@ -63,4 +61,3 @@ int bc_send(struct rpc_rqst *req) return ret; } -#endif /* CONFIG_NFS_V4_1 */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 64c3fe11aff6..d3fe866f57ac 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -64,9 +64,9 @@ static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); static void call_transmit(struct rpc_task *task); -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) static void call_bc_transmit(struct rpc_task *task); -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ static void call_status(struct rpc_task *task); static void call_transmit_status(struct rpc_task *task); static void call_refresh(struct rpc_task *task); @@ -716,7 +716,7 @@ rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, } EXPORT_SYMBOL_GPL(rpc_call_async); -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) /** * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run * rpc_execute against it @@ -759,7 +759,7 @@ out: dprintk("RPC: rpc_run_bc_task: task= %p\n", task); return task; } -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ void rpc_call_start(struct rpc_task *task) @@ -1362,7 +1362,7 @@ call_transmit_status(struct rpc_task *task) } } -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* * 5b. Send the backchannel RPC reply. On error, drop the reply. In * addition, disconnect on connectivity errors. @@ -1426,7 +1426,7 @@ call_bc_transmit(struct rpc_task *task) } rpc_wake_up_queued_task(&req->rq_xprt->pending, task); } -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ /* * 6. Sort out the RPC call status diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2b90292e9505..3c6fe222dec6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1252,7 +1252,7 @@ svc_process(struct svc_rqst *rqstp) } } -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* * Process a backchannel RPC request that arrived over an existing * outbound connection @@ -1301,7 +1301,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, } } EXPORT_SYMBOL(bc_svc_process); -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ /* * Return (transport-specific) limit on the rpc payload. diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index af04f779ce9f..a1812a295388 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -66,12 +66,12 @@ static void svc_sock_free(struct svc_xprt *); static struct svc_xprt *svc_create_socket(struct svc_serv *, int, struct net *, struct sockaddr *, int, int); -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, struct net *, struct sockaddr *, int, int); static void svc_bc_sock_free(struct svc_xprt *xprt); -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; @@ -1241,7 +1241,7 @@ static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); } -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, struct net *, struct sockaddr *, int, int); @@ -1282,7 +1282,7 @@ static void svc_cleanup_bc_xprt_sock(void) { svc_unreg_xprt_class(&svc_tcp_bc_class); } -#else /* CONFIG_NFS_V4_1 */ +#else /* CONFIG_SUNRPC_BACKCHANNEL */ static void svc_init_bc_xprt_sock(void) { } @@ -1290,7 +1290,7 @@ static void svc_init_bc_xprt_sock(void) static void svc_cleanup_bc_xprt_sock(void) { } -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ static struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, @@ -1621,7 +1621,7 @@ static void svc_sock_free(struct svc_xprt *xprt) kfree(svsk); } -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* * Create a back channel svc_xprt which shares the fore channel socket. */ @@ -1660,4 +1660,4 @@ static void svc_bc_sock_free(struct svc_xprt *xprt) if (xprt) kfree(container_of(xprt, struct svc_sock, sk_xprt)); } -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ce5eb68a9664..fbdbaf2cd58d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1111,10 +1111,10 @@ found: INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) spin_lock_init(&xprt->bc_pa_lock); INIT_LIST_HEAD(&xprt->bc_pa_list); -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ INIT_WORK(&xprt->task_cleanup, xprt_autoclose); if (xprt_has_timer(xprt)) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 72abb7358933..cd6c410fa8fa 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -37,7 +37,7 @@ #include #include #include -#ifdef CONFIG_NFS_V4_1 +#ifdef CONFIG_SUNRPC_BACKCHANNEL #include #endif @@ -1236,7 +1236,7 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, return 0; } -#if defined(CONFIG_NFS_V4_1) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* * Obtains an rpc_rqst previously allocated and invokes the common * tcp read code to read the data. The result is placed in the callback @@ -1299,7 +1299,7 @@ static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, { return xs_tcp_read_reply(xprt, desc); } -#endif /* CONFIG_NFS_V4_1 */ +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ /* * Read data off the transport. This can be either an RPC_CALL or an -- cgit v1.2.3 From 94b134ac8e9965309e70684b504c53bca36338b4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 13 Jul 2011 19:26:49 -0400 Subject: NFS: Convert nfs4_set_ds_client to EXPORT_SYMBOL_GPL This is not part of an external ABI... Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5452ada59461..19ea7d9c75e6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1463,7 +1463,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, dprintk("<-- %s %p\n", __func__, clp); return clp; } -EXPORT_SYMBOL(nfs4_set_ds_client); +EXPORT_SYMBOL_GPL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. -- cgit v1.2.3 From 674e405b8b3310702fd43d314f5f432ec2cb9980 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 15 Jul 2011 19:09:08 -0400 Subject: nfs: document nfsv4 sillyrename issues Somebody working on this code asked what the deal was with NFSv4, since this comment notes that it's v2/v3's statelessness that requires sillyrename. Shouldn't hurt to document the answer. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/unlink.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 8d6864c2a5fa..981298ce5124 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -501,6 +501,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, * and only performs the unlink once the last reference to it is put. * * The final cleanup is done during dentry_iput. + * + * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server + * could take responsibility for keeping open files referenced. The server + * would also need to ensure that opened-but-deleted files were kept over + * reboots. However, we may not assume a server does so. (RFC 5661 + * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can + * use to advertise that it does this; some day we may take advantage of + * it.)) */ int nfs_sillyrename(struct inode *dir, struct dentry *dentry) -- cgit v1.2.3 From f85ef69ce08bc2209858135328335f668ba35bdb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 15 Jul 2011 19:18:42 -0400 Subject: pnfs: simplify pnfs files module autoloading Embed the necessary alias into the module rather than waiting for someone to add it to /etc/modprobe.conf Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 2 ++ fs/nfs/objlayout/objio_osd.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index f0b37e155ce3..be93a622872c 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -964,5 +964,7 @@ static void __exit nfs4filelayout_exit(void) pnfs_unregister_layoutdriver(&filelayout_type); } +MODULE_ALIAS("nfs-layouttype4-1"); + module_init(nfs4filelayout_init); module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 7d49bb160b46..9383ca7245bc 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -1065,5 +1065,7 @@ objlayout_exit(void) __func__); } +MODULE_ALIAS("nfs-layouttype4-2"); + module_init(objlayout_init); module_exit(objlayout_exit); -- cgit v1.2.3 From 265c6a0f9290c8f470b839257dc6af3c46b24da1 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Sat, 16 Jul 2011 19:41:23 -0400 Subject: ext4: fix compilation with -DDX_DEBUG Compilation of ext4/namei.c brought up an error and warning messages when compiled with -DDX_DEBUG Signed-off-by: Bernd Schubert Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8dde5ab239cc..aaf313107c6c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT4_DIR_REC_LEN(de->name_len); names++; @@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q *err = -ENOENT; errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); dx_release (frames); return NULL; } -- cgit v1.2.3 From d7a1fee135771e6e5185642bdc17df19bbdbcc48 Mon Sep 17 00:00:00 2001 From: Dan Ehrenberg Date: Sun, 17 Jul 2011 21:11:30 -0400 Subject: ext4: make the preallocation size be a multiple of stripe size Previously, if a stripe width was provided, then it would be used as the preallocation granularity, with no santiy checking and no way to override this. Now, mb_prealloc_size defaults to the smallest multiple of stripe size that is greater than or equal to the old default mb_prealloc_size, and this can be overridden with the sysfs interface. Signed-off-by: Dan Ehrenberg Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b97a2d2f0fdf..037f680b76f9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -128,12 +128,13 @@ * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is * 512 blocks. This can be tuned via - * /sys/fs/ext4//mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe= option the group prealloc request is normalized to the - * stripe value (sbi->s_stripe) + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. * - * The regular allocator(using the buddy cache) supports few tunables. + * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4//mb_min_to_scan * /sys/fs/ext4//mb_max_to_scan @@ -2474,6 +2475,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { @@ -2841,8 +2854,9 @@ out_err: /* * here we normalize request for locality group - * Group request are normalized to s_strip size if we set the same via mount - * option. If not we set it to s_mb_group_prealloc which can be configured via + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via * /sys/fs/ext4//mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? @@ -2853,10 +2867,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); - if (EXT4_SB(sb)->s_stripe) - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; - else - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } -- cgit v1.2.3 From 3eb08658431abd65c0fe6855d1860859c2d416f7 Mon Sep 17 00:00:00 2001 From: Dan Ehrenberg Date: Sun, 17 Jul 2011 21:18:51 -0400 Subject: ext4: ignore a stripe width of 1 If the stripe width was set to 1, then this patch will ignore that stripe width and ext4 will act as if the stripe width were 0 with respect to optimizing allocations. Signed-off-by: Dan Ehrenberg Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7910e61809e7..143d763729b4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2384,17 +2384,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) - return sbi->s_stripe; - - if (stripe_width <= sbi->s_blocks_per_group) - return stripe_width; + ret = sbi->s_stripe; + else if (stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; - if (stride <= sbi->s_blocks_per_group) - return stride; + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. + */ + if (ret <= 1) + ret = 0; - return 0; + return ret; } /* sysfs supprt */ -- cgit v1.2.3 From f7d0d3797fac6cad24ad9f86dd9baf65c586b434 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sun, 17 Jul 2011 23:17:02 -0400 Subject: ext4: punch hole optimizations: skip un-needed extent lookup This patch optimizes the punch hole operation by skipping the tree walking code that is used by truncate. Since punch hole is done through map blocks, the path to the extent is already known in this function, so we do not need to look it up again. Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f1c538e5055c..06b30b61205f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3461,8 +3461,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_mark_uninitialized(ex); - err = ext4_ext_remove_space(inode, map->m_lblk, - map->m_lblk + punched_out); + ext4_ext_invalidate_cache(inode); + + err = ext4_ext_rm_leaf(handle, inode, path, + map->m_lblk, map->m_lblk + punched_out); + + if (!err && path->p_hdr->eh_entries == 0) { + /* + * Punch hole freed all of this sub tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root( + inode, 0)); + + err = ext4_ext_dirty( + handle, inode, path); + } + } goto out2; } -- cgit v1.2.3 From c6a0371cbefade85376bbc326d18451860632dce Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Sun, 17 Jul 2011 23:21:03 -0400 Subject: ext4: remove unneeded parameter to ext4_ext_remove_space() This patch removes the extra parameter in ext4_ext_remove_space() which is no longer needed. Signed-off-by: Allison Henderson Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 06b30b61205f..3d8c5f50ba33 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2500,8 +2500,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2541,7 +2540,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - start, end); + start, EXT_MAX_BLOCKS - 1); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3683,7 +3682,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + err = ext4_ext_remove_space(inode, last_block); /* In a multi-transaction truncate, we only make the final * transaction synchronous. -- cgit v1.2.3 From 015861badd0db43d025bbb538f8fc62dfaf3f18d Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sun, 17 Jul 2011 23:27:43 -0400 Subject: ext4: avoid wasted extent cache lookup if !PUNCH_OUT_EXT This patch avoids an extraneous lookup of the extent cache in ext4_ext_map_blocks() when the flag EXT4_GET_BLOCKS_PUNCH_OUT_EXT is absent. The existing logic was performing the lookup but not making use of the result. The patch simply reverses the order of evaluation in the condition. Since ext4_ext_in_cache() does not initialize newex on misses, bypassing its invocation does not introduce any new issue in this regard. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner Reviewed-by: Eric Gouriou --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3d8c5f50ba33..b8acfab00224 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3320,8 +3320,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && - ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && + ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* -- cgit v1.2.3 From d46203159ed376fdbe2b05aa57e58207bf27a8f9 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sun, 17 Jul 2011 23:43:42 -0400 Subject: ext4: avoid eh_entries overflow before insert extent_idx If eh_entries is equal to (or greater than) eh_max, the operation of inserting new extent_idx will make number of entries overflow. So check eh_entries before inserting the new extent_idx. Although there is no bug case according the code (function ext4_ext_insert_index is called by ext4_ext_split and ext4_ext_split is called only if the index block has free space), the right logic should be "lookup the capacity before insertion". Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b8acfab00224..9bec432e2d26 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -741,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, logical, le32_to_cpu(curp->p_idx->ei_block)); return -EIO; } + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EIO; + } + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ @@ -770,14 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); - if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max))) { - EXT4_ERROR_INODE(inode, - "eh_entries %d > eh_max %d!", - le16_to_cpu(curp->p_hdr->eh_entries), - le16_to_cpu(curp->p_hdr->eh_max)); - return -EIO; - } if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EIO; -- cgit v1.2.3 From c878c73f8dda48f64bcec24f78e80e154cbc5cf8 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Wed, 20 Jul 2011 23:16:33 +0200 Subject: ext3: Fix compilation with -DDX_DEBUG Compilation of ext3/namei.c brought up an error and warning messages when compiled with -DDX_DEBUG. Signed-off-by: Bernd Schubert Signed-off-by: Jan Kara --- fs/ext3/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 51736a4ff0cd..0465368598bd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -288,7 +288,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent while (len--) printk("%c", *name++); ext3fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT3_DIR_REC_LEN(de->name_len); names++; @@ -1014,7 +1014,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk("%s not found\n", entry->name)); dx_release (frames); return NULL; } -- cgit v1.2.3 From 03b5bb342978f99f75fb36d69cd29bab32109fd4 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Fri, 22 Jul 2011 08:50:13 -0500 Subject: ext2: check xattr name_len before acquiring xattr_sem in ext2_xattr_get In ext2_xattr_get(), the code will acquire xattr_sem first, later checks the length of xattr name_len > 255. It's unnecessarily time consuming and also ext2_xattr_set() checks the length before other checks. So move the check before acquiring xattr_sem to make these two functions consistent. Signed-off-by: Wang Sheng-Hui Signed-off-by: Jan Kara --- fs/ext2/xattr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 529970617a21..d27b71f1d183 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -161,6 +161,10 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; + name_len = strlen(name); + if (name_len > 255) + return -ERANGE; + down_read(&EXT2_I(inode)->xattr_sem); error = -ENODATA; if (!EXT2_I(inode)->i_file_acl) @@ -181,12 +185,8 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", error = -EIO; goto cleanup; } - /* find named attribute */ - name_len = strlen(name); - error = -ERANGE; - if (name_len > 255) - goto cleanup; + /* find named attribute */ entry = FIRST_ENTRY(bh); while (!IS_LAST_ENTRY(entry)) { struct ext2_xattr_entry *next = -- cgit v1.2.3 From 638c1fd3033c76778e6d9975ad8a4a9cdd5b96d9 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 21 Jul 2011 16:57:52 -0400 Subject: pstore: Extend API for more flexibility in new backends Some pstore implementations may not have a static context, so extend the API to pass the pstore_info struct to all calls and allow for a context pointer. Signed-off-by: Matthew Garrett Signed-off-by: Tony Luck --- drivers/acpi/apei/erst.c | 18 +++++++++++++----- fs/pstore/inode.c | 10 +++++----- fs/pstore/internal.h | 2 +- fs/pstore/platform.c | 13 +++++++------ include/linux/pstore.h | 8 +++++--- 5 files changed, 31 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index e6cef8e1b534..de3ae92adaa5 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -932,8 +932,10 @@ static int erst_check_table(struct acpi_table_erst *erst_tab) static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, - struct timespec *time); -static u64 erst_writer(enum pstore_type_id type, size_t size); + struct timespec *time, struct pstore_info *psi); +static u64 erst_writer(enum pstore_type_id type, size_t size, + struct pstore_info *psi); +static int erst_clearer(u64 id, struct pstore_info *psi); static struct pstore_info erst_info = { .owner = THIS_MODULE, @@ -942,7 +944,7 @@ static struct pstore_info erst_info = { .close = erst_close_pstore, .read = erst_reader, .write = erst_writer, - .erase = erst_clear + .erase = erst_clearer }; #define CPER_CREATOR_PSTORE \ @@ -983,7 +985,7 @@ static int erst_close_pstore(struct pstore_info *psi) } static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, - struct timespec *time) + struct timespec *time, struct pstore_info *psi) { int rc; ssize_t len = 0; @@ -1037,7 +1039,8 @@ out: return (rc < 0) ? rc : (len - sizeof(*rcd)); } -static u64 erst_writer(enum pstore_type_id type, size_t size) +static u64 erst_writer(enum pstore_type_id type, size_t size, + struct pstore_info *psi) { struct cper_pstore_record *rcd = (struct cper_pstore_record *) (erst_info.buf - sizeof(*rcd)); @@ -1080,6 +1083,11 @@ static u64 erst_writer(enum pstore_type_id type, size_t size) return rcd->hdr.record_id; } +static int erst_clearer(u64 id, struct pstore_info *psi) +{ + return erst_clear(id); +} + static int __init erst_init(void) { int rc = 0; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 977ed2723845..b19884a1ba77 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -40,7 +40,7 @@ struct pstore_private { u64 id; - int (*erase)(u64); + struct pstore_info *psi; ssize_t size; char data[]; }; @@ -73,7 +73,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = dentry->d_inode->i_private; - p->erase(p->id); + p->psi->erase(p->id, p->psi); return simple_unlink(dir, dentry); } @@ -175,8 +175,8 @@ int pstore_is_mounted(void) * Set the mtime & ctime to the date that this record was originally stored. */ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, - char *data, size_t size, - struct timespec time, int (*erase)(u64)) + char *data, size_t size, struct timespec time, + struct pstore_info *psi) { struct dentry *root = pstore_sb->s_root; struct dentry *dentry; @@ -193,7 +193,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, if (!private) goto fail_alloc; private->id = id; - private->erase = erase; + private->psi = psi; switch (type) { case PSTORE_TYPE_DMESG: diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 8c9f23eb1645..611c1b3c46fa 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -2,5 +2,5 @@ extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(void); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, - struct timespec time, int (*erase)(u64)); + struct timespec time, struct pstore_info *psi); extern int pstore_is_mounted(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f2c3ff20ea68..221c04e5e333 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -94,11 +94,12 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); + id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy, + psinfo); if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, hsize + l1_cpy + l2_cpy, - CURRENT_TIME, psinfo->erase); + CURRENT_TIME, psinfo); l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; @@ -166,9 +167,9 @@ void pstore_get_records(void) if (rc) goto out; - while ((size = psi->read(&id, &type, &time)) > 0) { + while ((size = psi->read(&id, &type, &time, psi)) > 0) { if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, - time, psi->erase)) + time, psi)) failed++; } psi->close(psi); @@ -196,10 +197,10 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) mutex_lock(&psinfo->buf_mutex); memcpy(psinfo->buf, buf, size); - id = psinfo->write(type, size); + id = psinfo->write(type, size, psinfo); if (pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, - size, CURRENT_TIME, psinfo->erase); + size, CURRENT_TIME, psinfo); mutex_unlock(&psinfo->buf_mutex); return 0; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 2455ef2683f0..b2f1d97f6909 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -38,9 +38,11 @@ struct pstore_info { int (*open)(struct pstore_info *psi); int (*close)(struct pstore_info *psi); ssize_t (*read)(u64 *id, enum pstore_type_id *type, - struct timespec *time); - u64 (*write)(enum pstore_type_id type, size_t size); - int (*erase)(u64 id); + struct timespec *time, struct pstore_info *psi); + u64 (*write)(enum pstore_type_id type, size_t size, + struct pstore_info *psi); + int (*erase)(u64 id, struct pstore_info *psi); + void *data; }; #ifdef CONFIG_PSTORE -- cgit v1.2.3 From 56280682ceeef74b692b3e21d1872049eea7c887 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 21 Jul 2011 16:57:53 -0400 Subject: pstore: Add extra context for writes and erases EFI only provides small amounts of individual storage, and conventionally puts metadata in the storage variable name. Rather than add a metadata header to the (already limited) variable storage, it's easier for us to modify pstore to pass all the information we need to construct a unique variable name to the appropriate functions. Signed-off-by: Matthew Garrett Signed-off-by: Tony Luck --- drivers/acpi/apei/erst.c | 10 ++++++---- fs/pstore/inode.c | 6 ++++-- fs/pstore/platform.c | 9 +++++---- include/linux/pstore.h | 5 +++-- 4 files changed, 18 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index de3ae92adaa5..d842ac4f8cfe 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -933,9 +933,10 @@ static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, struct timespec *time, struct pstore_info *psi); -static u64 erst_writer(enum pstore_type_id type, size_t size, +static u64 erst_writer(enum pstore_type_id type, int part, size_t size, struct pstore_info *psi); -static int erst_clearer(u64 id, struct pstore_info *psi); +static int erst_clearer(enum pstore_type_id type, u64 id, + struct pstore_info *psi); static struct pstore_info erst_info = { .owner = THIS_MODULE, @@ -1039,7 +1040,7 @@ out: return (rc < 0) ? rc : (len - sizeof(*rcd)); } -static u64 erst_writer(enum pstore_type_id type, size_t size, +static u64 erst_writer(enum pstore_type_id type, int part, size_t size, struct pstore_info *psi) { struct cper_pstore_record *rcd = (struct cper_pstore_record *) @@ -1083,7 +1084,8 @@ static u64 erst_writer(enum pstore_type_id type, size_t size, return rcd->hdr.record_id; } -static int erst_clearer(u64 id, struct pstore_info *psi) +static int erst_clearer(enum pstore_type_id type, u64 id, + struct pstore_info *psi) { return erst_clear(id); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index b19884a1ba77..893b961dcfd8 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -39,8 +39,9 @@ #define PSTORE_NAMELEN 64 struct pstore_private { - u64 id; struct pstore_info *psi; + enum pstore_type_id type; + u64 id; ssize_t size; char data[]; }; @@ -73,7 +74,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = dentry->d_inode->i_private; - p->psi->erase(p->id, p->psi); + p->psi->erase(p->type, p->id, p->psi); return simple_unlink(dir, dentry); } @@ -192,6 +193,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, private = kmalloc(sizeof *private + size, GFP_KERNEL); if (!private) goto fail_alloc; + private->type = type; private->id = id; private->psi = psi; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 221c04e5e333..163bb40511e7 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -78,7 +78,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, oopscount++; while (total < kmsg_bytes) { dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); + hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); size = psinfo->bufsize - hsize; dst += hsize; @@ -94,8 +94,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy, - psinfo); + id = psinfo->write(PSTORE_TYPE_DMESG, part, + hsize + l1_cpy + l2_cpy, psinfo); if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, hsize + l1_cpy + l2_cpy, @@ -103,6 +103,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; + part++; } mutex_unlock(&psinfo->buf_mutex); } @@ -197,7 +198,7 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) mutex_lock(&psinfo->buf_mutex); memcpy(psinfo->buf, buf, size); - id = psinfo->write(type, size, psinfo); + id = psinfo->write(type, 0, size, psinfo); if (pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, size, CURRENT_TIME, psinfo); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index b2f1d97f6909..12be8f193d09 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,9 +39,10 @@ struct pstore_info { int (*close)(struct pstore_info *psi); ssize_t (*read)(u64 *id, enum pstore_type_id *type, struct timespec *time, struct pstore_info *psi); - u64 (*write)(enum pstore_type_id type, size_t size, + u64 (*write)(enum pstore_type_id type, int part, + size_t size, struct pstore_info *psi); + int (*erase)(enum pstore_type_id type, u64 id, struct pstore_info *psi); - int (*erase)(u64 id, struct pstore_info *psi); void *data; }; -- cgit v1.2.3 From b94fdd077eef5e6cab56836bf62695b497946716 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 21 Jul 2011 16:57:54 -0400 Subject: pstore: Make "part" unsigned We'll never have a negative part, so just make this an unsigned int. Signed-off-by: Matthew Garrett Signed-off-by: Tony Luck --- drivers/acpi/apei/erst.c | 8 ++++---- fs/pstore/platform.c | 3 ++- include/linux/pstore.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index d842ac4f8cfe..6053f4780df9 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -933,8 +933,8 @@ static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, struct timespec *time, struct pstore_info *psi); -static u64 erst_writer(enum pstore_type_id type, int part, size_t size, - struct pstore_info *psi); +static u64 erst_writer(enum pstore_type_id type, unsigned int part, + size_t size, struct pstore_info *psi); static int erst_clearer(enum pstore_type_id type, u64 id, struct pstore_info *psi); @@ -1040,8 +1040,8 @@ out: return (rc < 0) ? rc : (len - sizeof(*rcd)); } -static u64 erst_writer(enum pstore_type_id type, int part, size_t size, - struct pstore_info *psi) +static u64 erst_writer(enum pstore_type_id type, unsigned int part, + size_t size, struct pstore_info *psi) { struct cper_pstore_record *rcd = (struct cper_pstore_record *) (erst_info.buf - sizeof(*rcd)); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 163bb40511e7..49ff1de2178a 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -67,7 +67,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long size, total = 0; char *dst, *why; u64 id; - int hsize, part = 1; + int hsize; + unsigned int part = 1; if (reason < ARRAY_SIZE(reason_str)) why = reason_str[reason]; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 12be8f193d09..cc03bbf5c4b8 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,7 +39,7 @@ struct pstore_info { int (*close)(struct pstore_info *psi); ssize_t (*read)(u64 *id, enum pstore_type_id *type, struct timespec *time, struct pstore_info *psi); - u64 (*write)(enum pstore_type_id type, int part, + u64 (*write)(enum pstore_type_id type, unsigned int part, size_t size, struct pstore_info *psi); int (*erase)(enum pstore_type_id type, u64 id, struct pstore_info *psi); -- cgit v1.2.3 From dee28e72b619b48ec80a9e5509db458dbe66f71f Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 21 Jul 2011 16:57:55 -0400 Subject: pstore: Allow the user to explicitly choose a backend pstore only allows one backend to be registered at present, but the system may provide several. Add a parameter to allow the user to choose which backend will be used rather than just relying on load order. Signed-off-by: Matthew Garrett Signed-off-by: Tony Luck --- Documentation/ABI/testing/pstore | 6 ++++++ Documentation/kernel-parameters.txt | 2 ++ fs/pstore/platform.c | 11 +++++++++++ 3 files changed, 19 insertions(+) (limited to 'fs') diff --git a/Documentation/ABI/testing/pstore b/Documentation/ABI/testing/pstore index ddf451ee2a08..ff1df4e3b059 100644 --- a/Documentation/ABI/testing/pstore +++ b/Documentation/ABI/testing/pstore @@ -39,3 +39,9 @@ Description: Generic interface to platform dependent persistent storage. multiple) files based on the record size of the underlying persistent storage until at least this amount is reached. Default is 10 Kbytes. + + Pstore only supports one backend at a time. If multiple + backends are available, the preferred backend may be + set by passing the pstore.backend= argument to the kernel at + boot time. + diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index aa47be71df4c..8789d0c9291a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2151,6 +2151,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [HW,MOUSE] Controls Logitech smartscroll autorepeat. 0 = disabled, 1 = enabled (default). + pstore.backend= Specify the name of the pstore backend to use + pt. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 49ff1de2178a..c5300ec31696 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -37,6 +37,8 @@ static DEFINE_SPINLOCK(pstore_lock); static struct pstore_info *psinfo; +static char *backend; + /* How much of the console log to snapshot */ static unsigned long kmsg_bytes = 10240; @@ -131,6 +133,12 @@ int pstore_register(struct pstore_info *psi) spin_unlock(&pstore_lock); return -EBUSY; } + + if (backend && strcmp(backend, psi->name)) { + spin_unlock(&pstore_lock); + return -EINVAL; + } + psinfo = psi; spin_unlock(&pstore_lock); @@ -208,3 +216,6 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) return 0; } EXPORT_SYMBOL_GPL(pstore_write); + +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "Pstore backend to use"); -- cgit v1.2.3 From b22570d9abb3d844e65c15c8bc0d57a78129e3b4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 23 Jul 2011 01:21:38 +0200 Subject: ext3: Fix data corruption in inodes with journalled data When journalling data for an inode (either because it is a symlink or because the filesystem is mounted in data=journal mode), ext3_evict_inode() can discard unwritten data by calling truncate_inode_pages(). This is because we don't mark the buffer / page dirty when journalling data but only add the buffer to the running transaction and thus mm does not know there are still unwritten data. Fix the problem by carefully tracking transaction containing inode's data, committing this transaction, and writing uncheckpointed buffers when inode should be reaped. Signed-off-by: Jan Kara --- fs/ext3/inode.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d2e4547c7806..f57c87b0cb83 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -197,6 +197,7 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode) */ void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_block_alloc_info *rsv; handle_t *handle; int want_delete = 0; @@ -207,11 +208,36 @@ void ext3_evict_inode (struct inode *inode) want_delete = 1; } + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); ext3_discard_reservation(inode); - rsv = EXT3_I(inode)->i_block_alloc_info; - EXT3_I(inode)->i_block_alloc_info = NULL; + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); @@ -241,7 +267,7 @@ void ext3_evict_inode (struct inode *inode) * have removed the record. */ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -1411,6 +1437,7 @@ static int ext3_journalled_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; unsigned from, to; @@ -1440,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file, if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_set_inode_state(inode, EXT3_STATE_JDATA); - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1739,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page, if (ret == 0) ret = err; ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); } else { /* -- cgit v1.2.3 From 529da704ad1ead755d9e40721f29446cb278e099 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 23 Jul 2011 16:07:26 -0400 Subject: ext4: remove unnecessary ext4_get_group_info in ext4_mb_load_buddy ext4_mb_load_buddy() calls ext4_get_group_info() for setting both "grp" and "e4b->bd_info", but it could do "e4b->bd_info = grp". Reported-by: Andreas Dilger Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 037f680b76f9..447c0f3384ab 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1126,7 +1126,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; -- cgit v1.2.3 From ced156e464e49b6b4153ede9aaa04d9a4ad24e0c Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 23 Jul 2011 16:18:05 -0400 Subject: ext4: don't increment s_mb_buddies_generated in ext4_mb_release In ext4_mb_release, we use s_mb_buddies_generated++. Although the output is OK, but I don't think we need this extra ++. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 447c0f3384ab..e16583032b6b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2581,7 +2581,7 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_mb_lost_chunks)); printk(KERN_INFO "EXT4-fs: mballoc: %lu generated and it took %Lu\n", - sbi->s_mb_buddies_generated++, + sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); printk(KERN_INFO "EXT4-fs: mballoc: %u preallocated, %u discarded\n", -- cgit v1.2.3 From 6a0fe49308dcac10ab510b3c45e00eb8d5ef440e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 23 Jul 2011 16:18:55 -0400 Subject: ext4: remove ac_repeats from ext4_allocation_context ac_repeats isn't referenced in the mballoc code. So remove it. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 20b5e7bfebd1..9d4a636b546c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -187,7 +187,6 @@ struct ext4_allocation_context { __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; - __u8 ac_repeats; __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ -- cgit v1.2.3 From 5718789da5b94bd4148cb7ea0f457089c26bc1c3 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sat, 23 Jul 2011 21:49:07 -0400 Subject: ext4: remove unused argument in ext4_ext_next_leaf_block The argument "inode" in function ext4_ext_next_allocated_block looks useless, so clean it. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9bec432e2d26..33bbe8467bd6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1414,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ -static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, - struct ext4_ext_path *path) +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; @@ -1734,7 +1733,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, fex = EXT_LAST_EXTENT(eh); next = EXT_MAX_BLOCKS; if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) - next = ext4_ext_next_leaf_block(inode, path); + next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); -- cgit v1.2.3 From 0737964bc98202776f4d10bc8e108f45b3115037 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sat, 23 Jul 2011 21:51:07 -0400 Subject: ext4: correct the debug message in ext4_ext_insert_extent The debug message in ext4_ext_insert_extent before moving extent is incorrect (the "from xx to xx"). Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 33bbe8467bd6..a637d837a895 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1806,7 +1806,7 @@ has_space: ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); + nearex, len, nearex, nearex + 1); memmove(nearex + 1, nearex, len); path[depth].p_ext = nearex; } -- cgit v1.2.3 From b7ca1e8ec53259359db5313f923a0a20fa04bdb6 Mon Sep 17 00:00:00 2001 From: Robin Dong Date: Sat, 23 Jul 2011 21:53:25 -0400 Subject: ext4: correct comment for ext4_ext_check_cache The comment for ext4_ext_check_cache has a litte mistake. Signed-off-by: Robin Dong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a637d837a895..4d73e11ae883 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2019,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_in_cache() + * ext4_ext_check_cache() * Checks to see if the given block is in the cache. * If it is, the cached extent is stored in the given * cache extent pointer. If the cached extent is a hole, -- cgit v1.2.3 From fcc5c22218a18509a7412bf074fc9a7a5d874a8a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 11 Jul 2011 23:08:50 -0700 Subject: writeback: don't busy retry writeback on new/freeing inodes Fix a system hang bug introduced by commit b7a2441f9966 ("writeback: remove writeback_control.more_io") and e8dfc3058 ("writeback: elevate queue_io() into wb_writeback()") easily reproducible with high memory pressure and lots of file creation/deletions, for example, a kernel build in limited memory. It hangs when some inode is in the I_NEW, I_FREEING or I_WILL_FREE state, the flusher will get stuck busy retrying that inode, never releasing wb->list_lock. The lock in turn blocks all kinds of other tasks when they are trying to grab it. As put by Jan, it's a safe change regarding data integrity. I_FREEING or I_WILL_FREE inodes are written back by iput_final() and it is reclaim code that is responsible for eventually removing them. So writeback code can safely ignore them. I_NEW inodes should move out of this state when they are fully set up and in the writeback round following that, we will consider them for writeback. So the change makes sense. CC: Jan Kara Reported-by: Hugh Dickins Tested-by: Hugh Dickins Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 50445cf0b83a..6d49439ca31d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -593,7 +593,7 @@ static long writeback_sb_inodes(struct super_block *sb, spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); - requeue_io(inode, wb); + redirty_tail(inode, wb); continue; } __iget(inode); -- cgit v1.2.3 From 3c2c22628599006047781946b317a16d9ff3883d Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 20 Jun 2011 13:00:27 -0500 Subject: jfs: clean up some compiler warnings jfs has a few variables being set but never used. Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 5 +---- fs/jfs/jfs_txnmgr.c | 6 ++---- fs/jfs/namei.c | 3 +-- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 4496872cf4e7..9cbd11a3f804 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -3161,7 +3161,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, { int rc; int dbitno, word, rembits, nb, nwords, wbitno, agno; - s8 oldroot, *leaf; + s8 oldroot; struct dmaptree *tp = (struct dmaptree *) & dp->tree; /* save the current value of the root (i.e. maximum free string) @@ -3169,9 +3169,6 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, */ oldroot = tp->stree[ROOT]; - /* pick up a pointer to the leaves of the dmap tree */ - leaf = tp->stree + LEAFIND; - /* determine the bit number and word within the dmap of the * starting block. */ diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f6cc0c09ec63..af9606057dde 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1143,7 +1143,6 @@ int txCommit(tid_t tid, /* transaction identifier */ struct jfs_log *log; struct tblock *tblk; struct lrd *lrd; - int lsn; struct inode *ip; struct jfs_inode_info *jfs_ip; int k, n; @@ -1310,7 +1309,7 @@ int txCommit(tid_t tid, /* transaction identifier */ */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; - lsn = lmLog(log, tblk, lrd, NULL); + lmLog(log, tblk, lrd, NULL); lmGroupCommit(log, tblk); @@ -2935,7 +2934,6 @@ int jfs_sync(void *arg) { struct inode *ip; struct jfs_inode_info *jfs_ip; - int rc; tid_t tid; do { @@ -2961,7 +2959,7 @@ int jfs_sync(void *arg) */ TXN_UNLOCK(); tid = txBegin(ip->i_sb, COMMIT_INODE); - rc = txCommit(tid, 1, &ip, 0); + txCommit(tid, 1, &ip, 0); txEnd(tid); mutex_unlock(&jfs_ip->commit_mutex); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 03787ef6a118..45e5c6f65b2c 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -893,7 +893,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unchar *i_fastsymlink; s64 xlen = 0; int bmask = 0, xsize; - s64 extent = 0, xaddr; + s64 xaddr; struct metapage *mp; struct super_block *sb; struct tblock *tblk; @@ -993,7 +993,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, txAbort(tid, 0); goto out3; } - extent = xaddr; ip->i_size = ssize - 1; while (ssize) { /* This is kind of silly since PATH_MAX == 4K */ -- cgit v1.2.3 From 73ca1001ed6881b476e8252adcd0eede1ea368ea Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 18 Jul 2011 11:26:30 -0400 Subject: nfs: don't use d_move in nfs_async_rename_done If the task that initiated the sillyrename ends up being killed by a fatal signal, then it will eventually return back to userspace and end up releasing the i_mutex. d_move however needs to be done while holding the i_mutex. Instead of using d_move here, just unhash the old and new dentries to prevent them from being found by lookups. With this change though, the dentries are now incorrect post-rename and do not reflect the actual name of the file on the server. I'm proceeding under the assumption that since they are unhashed that this isn't really a problem. In order for the sillydelete to still work though, the dname must be copied earlier when setting up the sillydelete info, and the name must be recopied if the sillydelete info has to be moved to a new dentry. Reported-by: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/unlink.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 981298ce5124..b2fbbde58e44 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -147,7 +147,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n alias = d_lookup(parent, &data->args.name); if (alias != NULL) { - int ret = 0; + int ret; void *devname_garbage = NULL; /* @@ -155,14 +155,16 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n * the sillyrename information to the aliased dentry. */ nfs_free_dname(data); + ret = nfs_copy_dname(alias, data); spin_lock(&alias->d_lock); - if (alias->d_inode != NULL && + if (ret == 0 && alias->d_inode != NULL && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; alias->d_flags |= DCACHE_NFSFS_RENAMED; ret = 1; - } + } else + ret = 0; spin_unlock(&alias->d_lock); nfs_dec_sillycount(dir); dput(alias); @@ -171,8 +173,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n * point dentry is definitely not a root, so we won't need * that anymore. */ - if (devname_garbage) - kfree(devname_garbage); + kfree(devname_garbage); return ret; } data->dir = igrab(dir); @@ -204,8 +205,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) if (parent == NULL) goto out_free; dir = parent->d_inode; - if (nfs_copy_dname(dentry, data) != 0) - goto out_dput; /* Non-exclusive lock protects against concurrent lookup() calls */ spin_lock(&dir->i_lock); if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -366,6 +365,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct nfs_renamedata *data = calldata; struct inode *old_dir = data->old_dir; struct inode *new_dir = data->new_dir; + struct dentry *old_dentry = data->old_dentry; + struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); @@ -373,12 +374,12 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) } if (task->tk_status != 0) { - nfs_cancel_async_unlink(data->old_dentry); + nfs_cancel_async_unlink(old_dentry); return; } - nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); - d_move(data->old_dentry, data->new_dentry); + d_drop(old_dentry); + d_drop(new_dentry); } /** @@ -568,6 +569,14 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (error) goto out_dput; + /* populate unlinkdata with the right dname */ + error = nfs_copy_dname(sdentry, + (struct nfs_unlinkdata *)dentry->d_fsdata); + if (error) { + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + /* run the rename task, undo unlink if it fails */ task = nfs_async_rename(dir, dir, dentry, sdentry); if (IS_ERR(task)) { -- cgit v1.2.3 From ed1e6211a0a134ff23592c6f057af982ad5dab52 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Jul 2011 15:37:29 -0400 Subject: NFSv4: Don't use the delegation->inode in nfs_mark_return_delegation() nfs_mark_return_delegation() is usually called without any locking, and so it is not safe to dereference delegation->inode. Since the inode is only used to discover the nfs_client anyway, it makes more sense to have the callers pass a valid pointer to the nfs_server as a parameter. Reported-by: Ian Kent Cc: stable@kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index dd25c2aec375..321a66bc3846 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -398,12 +398,11 @@ int nfs_inode_return_delegation(struct inode *inode) return err; } -static void nfs_mark_return_delegation(struct nfs_delegation *delegation) +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client; - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } /** @@ -441,7 +440,7 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server, if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); } } @@ -508,7 +507,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); } } @@ -539,7 +538,8 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; rcu_read_lock(); @@ -549,7 +549,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, rcu_read_unlock(); return -ENOENT; } - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); nfs_delegation_run_state_manager(clp); -- cgit v1.2.3 From ed43233be910bbc8b9da3d61aa1b931843d1b44e Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:39:39 +0000 Subject: xfs: Remove the macro XFS_BUF_BFLAGS Remove the definition of the macro XFS_BUF_BFLAGS and its usage. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- fs/xfs/linux-2.6/xfs_buf.h | 1 - fs/xfs/xfs_trans_buf.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index b2b411985591..969fd151024c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1094,7 +1094,7 @@ STATIC int xfs_bioerror_relse( struct xfs_buf *bp) { - int64_t fl = XFS_BUF_BFLAGS(bp); + int64_t fl = bp->b_flags; /* * No need to wait until the buffer is unpinned. * We aren't flushing it. diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 6a83b46b4bcf..6b6c25ff4ead 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -232,7 +232,6 @@ extern void xfs_buf_terminate(void); ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) -#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 15584fc3ed7d..1bc04d4e7eb1 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -430,7 +430,7 @@ shutdown_abort: if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); #endif - ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != + ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) != (XBF_STALE|XBF_DELWRI)); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); -- cgit v1.2.3 From 5a52c2a581cddcb676a54a95d99cd39f5577c33b Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:39:51 +0000 Subject: xfs: Remove the macro XFS_BUF_ERROR and family Remove the definitions and usage of the macros XFS_BUF_ERROR, XFS_BUF_GETERROR and XFS_BUF_ISERROR. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 6 +++--- fs/xfs/linux-2.6/xfs_buf.h | 4 ---- fs/xfs/quota/xfs_dquot.c | 2 +- fs/xfs/xfs_alloc.c | 7 +++---- fs/xfs/xfs_attr.c | 3 +-- fs/xfs/xfs_btree.c | 17 ++++++----------- fs/xfs/xfs_buf_item.c | 4 ++-- fs/xfs/xfs_da_btree.c | 2 +- fs/xfs/xfs_ialloc.c | 5 ++--- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 4 ++-- fs/xfs/xfs_log_recover.c | 12 +++++------- fs/xfs/xfs_rtalloc.c | 2 +- fs/xfs/xfs_rw.c | 6 +++--- fs/xfs/xfs_trans_buf.c | 15 +++++++-------- fs/xfs/xfs_vnodeops.c | 4 ++-- 16 files changed, 40 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 969fd151024c..704418a04991 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -596,7 +596,7 @@ _xfs_buf_read( bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); status = xfs_buf_iorequest(bp); - if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) + if (status || bp->b_error || (flags & XBF_ASYNC)) return status; return xfs_buf_iowait(bp); } @@ -1069,7 +1069,7 @@ xfs_bioerror( /* * No need to wait until the buffer is unpinned, we aren't flushing it. */ - XFS_BUF_ERROR(bp, EIO); + xfs_buf_ioerror(bp, EIO); /* * We're calling xfs_buf_ioend, so delete XBF_DONE flag. @@ -1115,7 +1115,7 @@ xfs_bioerror_relse( * There's no reason to mark error for * ASYNC buffers. */ - XFS_BUF_ERROR(bp, EIO); + xfs_buf_ioerror(bp, EIO); XFS_BUF_FINISH_IOWAIT(bp); } else { xfs_buf_relse(bp); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 6b6c25ff4ead..08a15c25484b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -250,10 +250,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) -#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) -#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) -#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) - #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 837f31158d43..784019d4a576 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -403,7 +403,7 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp || (error = XFS_BUF_GETERROR(bp))) + if (!bp || (error = xfs_buf_geterror(bp))) goto error1; /* * Make a chunk of dquots out of this buffer and log diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 1e00b3ef6274..bdd9cb54d63b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -451,8 +451,7 @@ xfs_alloc_read_agfl( XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) return error; - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); *bpp = bp; return 0; @@ -2116,7 +2115,7 @@ xfs_read_agf( if (!*bpp) return 0; - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!(*bpp)->b_error); agf = XFS_BUF_TO_AGF(*bpp); /* @@ -2168,7 +2167,7 @@ xfs_alloc_read_agf( return error; if (!*bpp) return 0; - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!(*bpp)->b_error); agf = XFS_BUF_TO_AGF(*bpp); pag = xfs_perag_get(mp, agno); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index cbae424fe1ba..160bcdc34a6e 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2121,8 +2121,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, XBF_LOCK | XBF_DONT_BLOCK); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index cabf4b5604aa..2b9fd385e27d 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -275,8 +275,7 @@ xfs_btree_dup_cursor( return error; } new->bc_bufs[i] = bp; - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); } else new->bc_bufs[i] = NULL; } @@ -467,8 +466,7 @@ xfs_btree_get_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); return bp; } @@ -491,8 +489,7 @@ xfs_btree_get_bufs( ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); return bp; } @@ -632,7 +629,7 @@ xfs_btree_read_bufl( mp->m_bsize, lock, &bp))) { return error; } - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); if (bp) XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); *bpp = bp; @@ -973,8 +970,7 @@ xfs_btree_get_buf_block( *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags); - ASSERT(*bpp); - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!xfs_buf_geterror(*bpp)); *block = XFS_BUF_TO_BLOCK(*bpp); return 0; @@ -1006,8 +1002,7 @@ xfs_btree_read_buf_block( if (error) return error; - ASSERT(*bpp != NULL); - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!xfs_buf_geterror(*bpp)); xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 88492916c3dc..38417ab46fde 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -960,7 +960,7 @@ xfs_buf_iodone_callbacks( static ulong lasttime; static xfs_buftarg_t *lasttarg; - if (likely(!XFS_BUF_GETERROR(bp))) + if (likely(!xfs_buf_geterror(bp))) goto do_callbacks; /* @@ -991,7 +991,7 @@ xfs_buf_iodone_callbacks( * around. */ if (XFS_BUF_ISASYNC(bp)) { - XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { XFS_BUF_DELAYWRITE(bp); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 2925726529f8..5d9290db3d6c 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2040,7 +2040,7 @@ xfs_da_do_buf( case 0: bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, mappedbno, nmapped, 0); - error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); + error = bp ? bp->b_error : XFS_ERROR(EIO); break; case 1: case 2: diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index dd5628bd8d0b..9f24ec28283b 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -202,8 +202,7 @@ xfs_ialloc_inode_init( fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, XBF_LOCK); - ASSERT(fbuf); - ASSERT(!XFS_BUF_GETERROR(fbuf)); + ASSERT(!xfs_buf_geterror(fbuf)); /* * Initialize all inodes in this buffer and then log them. @@ -1486,7 +1485,7 @@ xfs_read_agi( if (error) return error; - ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); + ASSERT(!xfs_buf_geterror(*bpp)); agi = XFS_BUF_TO_AGI(*bpp); /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3cc21ddf9f7e..bdb47b22c6bc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2473,7 +2473,7 @@ cluster_corrupt_out: if (bp->b_iodone) { XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); - XFS_BUF_ERROR(bp,EIO); + xfs_buf_ioerror(bp, EIO); xfs_buf_ioend(bp, 0); } else { XFS_BUF_STALE(bp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 06ff8437ed8e..d1595e7ffd0c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -878,7 +878,7 @@ xlog_iodone(xfs_buf_t *bp) /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, + if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); XFS_BUF_STALE(bp); @@ -1248,7 +1248,7 @@ xlog_bdstrat( struct xlog_in_core *iclog = bp->b_fspriv; if (iclog->ic_state & XLOG_STATE_IOERROR) { - XFS_BUF_ERROR(bp, EIO); + xfs_buf_ioerror(bp, EIO); XFS_BUF_STALE(bp); xfs_buf_ioend(bp, 0); /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8fe4206de057..a8e0827dbe74 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -360,7 +360,7 @@ STATIC void xlog_recover_iodone( struct xfs_buf *bp) { - if (XFS_BUF_GETERROR(bp)) { + if (bp->b_error) { /* * We're not going to bother about retrying * this during recovery. One strike! @@ -2135,15 +2135,14 @@ xlog_recover_buffer_pass2( bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags); - if (XFS_BUF_ISERROR(bp)) { + error = xfs_buf_geterror(bp); + if (error) { xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, bp, buf_f->blf_blkno); - error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); return error; } - error = 0; if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (buf_f->blf_flags & @@ -2227,14 +2226,13 @@ xlog_recover_inode_pass2( bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, XBF_LOCK); - if (XFS_BUF_ISERROR(bp)) { + error = xfs_buf_geterror(bp); + if (error) { xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, bp, in_f->ilf_blkno); - error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); goto error; } - error = 0; ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8f76fdff4f46..cb8132cb37ea 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -883,7 +883,7 @@ xfs_rtbuf_get( if (error) { return error; } - ASSERT(bp && !XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); *bpp = bp; return 0; } diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index d6d6fdfe9422..d1f76f8a4c7e 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -106,7 +106,7 @@ xfs_ioerror_alert( " (\"%s\") error %d buf count %zd", XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), (__uint64_t)blkno, func, - XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); + bp->b_error, XFS_BUF_COUNT(bp)); } /* @@ -137,8 +137,8 @@ xfs_read_buf( bp = xfs_buf_read(target, blkno, len, flags); if (!bp) return XFS_ERROR(EIO); - error = XFS_BUF_GETERROR(bp); - if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { + error = bp->b_error; + if (!error && !XFS_FORCED_SHUTDOWN(mp)) { *bpp = bp; } else { *bpp = NULL; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 1bc04d4e7eb1..f9f1bf6ab4b8 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -194,7 +194,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, return NULL; } - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_fspriv); @@ -293,10 +293,10 @@ xfs_trans_read_buf( return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); - if (XFS_BUF_GETERROR(bp) != 0) { + if (bp->b_error) { + error = bp->b_error; xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); - error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); return error; } @@ -330,7 +330,7 @@ xfs_trans_read_buf( ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); ASSERT(bp->b_fspriv != NULL); - ASSERT((XFS_BUF_ISERROR(bp)) == 0); + ASSERT(!bp->b_error); if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); @@ -386,10 +386,9 @@ xfs_trans_read_buf( return (flags & XBF_TRYLOCK) ? 0 : XFS_ERROR(ENOMEM); } - if (XFS_BUF_GETERROR(bp) != 0) { - XFS_BUF_SUPER_STALE(bp); - error = XFS_BUF_GETERROR(bp); - + if (bp->b_error) { + error = bp->b_error; + XFS_BUF_SUPER_STALE(bp); xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); if (tp->t_flags & XFS_TRANS_DIRTY) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 88d121486c52..3baebe280f10 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -83,7 +83,7 @@ xfs_readlink_bmap( bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); - error = XFS_BUF_GETERROR(bp); + error = xfs_buf_geterror(bp); if (error) { xfs_ioerror_alert("xfs_readlink", ip->i_mount, bp, XFS_BUF_ADDR(bp)); @@ -1648,7 +1648,7 @@ xfs_symlink( byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); - ASSERT(bp && !XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); if (pathlen < byte_cnt) { byte_cnt = pathlen; } -- cgit v1.2.3 From b75e40a4193ca027af7327ef30c31d45aa0a0e40 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:39:57 +0000 Subject: xfs: Remove macro XFS_BUF_BUSY and family Remove the definitions and uses of the macros XFS_BUF_BUSY, XFS_BUF_UNBUSY, and XFS_BUF_ISBUSY. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 1 - fs/xfs/linux-2.6/xfs_buf.h | 4 ---- fs/xfs/quota/xfs_dquot.c | 4 ---- fs/xfs/xfs_buf_item.c | 2 -- fs/xfs/xfs_log.c | 4 ---- fs/xfs/xfs_log_recover.c | 2 -- fs/xfs/xfs_trans_buf.c | 9 --------- 7 files changed, 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 704418a04991..ae2c2e719682 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -679,7 +679,6 @@ xfs_buf_read_uncached( /* set up the buffer for a read IO */ XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 08a15c25484b..05e744f9fe71 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -254,10 +254,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) -#define XFS_BUF_BUSY(bp) do { } while (0) -#define XFS_BUF_UNBUSY(bp) do { } while (0) -#define XFS_BUF_ISBUSY(bp) (1) - #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 784019d4a576..0e12861bd552 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -318,7 +318,6 @@ xfs_qm_init_dquot_blk( int curid, i; ASSERT(tp); - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); @@ -534,7 +533,6 @@ xfs_qm_dqtobp( return XFS_ERROR(error); } - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); /* @@ -553,7 +551,6 @@ xfs_qm_dqtobp( xfs_trans_brelse(tp, bp); return XFS_ERROR(EIO); } - XFS_BUF_BUSY(bp); /* We dirtied this */ } *O_bpp = bp; @@ -622,7 +619,6 @@ xfs_qm_dqread( * this particular dquot was repaired. We still aren't afraid to * brelse it because we have the changes incore. */ - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); xfs_trans_brelse(tp, bp); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 38417ab46fde..9e9b4a70360a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -371,7 +371,6 @@ xfs_buf_item_pin( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); @@ -895,7 +894,6 @@ xfs_buf_attach_iodone( { xfs_log_item_t *head_lip; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); lip->li_cb = cb; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d1595e7ffd0c..64682b6f9dcb 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1051,7 +1051,6 @@ xlog_alloc_log(xfs_mount_t *mp, if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); log->l_xbuf = bp; @@ -1108,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t *mp, iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1355,7 +1353,6 @@ xlog_sync(xlog_t *log, XFS_BUF_SET_COUNT(bp, count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_SYNCIO; @@ -1402,7 +1399,6 @@ xlog_sync(xlog_t *log, (__psint_t)count), split); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a8e0827dbe74..4c8a892c0951 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -178,7 +178,6 @@ xlog_bread_noalign( XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); @@ -266,7 +265,6 @@ xlog_bwrite( XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_HOLD(bp); xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index f9f1bf6ab4b8..7dd62e26ce92 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -80,7 +80,6 @@ _xfs_trans_bjoin( { struct xfs_buf_log_item *bip; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == NULL); /* @@ -580,7 +579,6 @@ xfs_trans_bhold(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -601,7 +599,6 @@ xfs_trans_bhold_release(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -630,7 +627,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); @@ -701,7 +697,6 @@ xfs_trans_binval( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -773,7 +768,6 @@ xfs_trans_inode_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -797,7 +791,6 @@ xfs_trans_stale_inode_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -822,7 +815,6 @@ xfs_trans_inode_alloc_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -850,7 +842,6 @@ xfs_trans_dquot_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || -- cgit v1.2.3 From 72790aa1192f46dedfc827c170365fd554981d15 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:40:04 +0000 Subject: xfs: Remove macro XFS_BUF_HOLD Remove the definition and usage of the macro XFS_BUF_HOLD Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.h | 1 - fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 05e744f9fe71..6691a0236708 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -258,7 +258,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) -#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 9e9b4a70360a..a6dd497510a3 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -484,7 +484,7 @@ xfs_buf_item_trylock( return XFS_ITEM_LOCKED; /* take a reference to the buffer. */ - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_trylock(bip); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4c8a892c0951..536eb0dd94d6 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -265,7 +265,7 @@ xlog_bwrite( XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7f25245da289..b00c808d7d3e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1938,7 +1938,7 @@ xfs_getsb( xfs_buf_lock(bp); } - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); ASSERT(XFS_BUF_ISDONE(bp)); return bp; } -- cgit v1.2.3 From 0095a21eb6ae8ac9f9860aa26029fe6ebbd3beeb Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:40:09 +0000 Subject: xfs: Remove macro XFS_BUF_SET_START Remove the definition and usage of the macro XFS_BUF_SET_START. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.h | 2 -- fs/xfs/xfs_buf_item.c | 1 - 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 6691a0236708..4e8a6ca7f77d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -266,8 +266,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_SET_START(bp) do { } while (0) - #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) #define XFS_BUF_ADDR(bp) ((bp)->b_bn) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a6dd497510a3..bd4c62b055be 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -994,7 +994,6 @@ xfs_buf_iodone_callbacks( if (!XFS_BUF_ISSTALE(bp)) { XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); } ASSERT(bp->b_iodone != NULL); trace_xfs_buf_item_iodone_async(bp, _RET_IP_); -- cgit v1.2.3 From 6292604447ade7d150f5eba3b1518e1a224fda15 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:40:15 +0000 Subject: xfs: Remove the macro XFS_BUF_PTR Remove the definition and usages of the macro XFS_BUF_PTR. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- fs/xfs/linux-2.6/xfs_buf.h | 1 - fs/xfs/quota/xfs_dquot.c | 6 +++--- fs/xfs/quota/xfs_qm.c | 2 +- fs/xfs/xfs_ag.h | 6 +++--- fs/xfs/xfs_bmap.c | 3 +-- fs/xfs/xfs_btree.h | 2 +- fs/xfs/xfs_buf_item.c | 6 +++--- fs/xfs/xfs_da_btree.c | 10 +++++----- fs/xfs/xfs_dinode.h | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 10 +++++----- fs/xfs/xfs_rtalloc.c | 30 +++++++++++++++--------------- fs/xfs/xfs_rtalloc.h | 2 +- fs/xfs/xfs_sb.h | 2 +- fs/xfs/xfs_vnodeops.c | 6 +++--- 16 files changed, 45 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ae2c2e719682..6a42f71d08aa 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1320,7 +1320,7 @@ xfs_buf_offset( struct page *page; if (bp->b_flags & XBF_MAPPED) - return XFS_BUF_PTR(bp) + offset; + return bp->b_addr + offset; offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 4e8a6ca7f77d..f0aa94703cf5 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -266,7 +266,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) #define XFS_BUF_ADDR(bp) ((bp)->b_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 0e12861bd552..2e0629265f67 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -320,7 +320,7 @@ xfs_qm_init_dquot_blk( ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); - d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); + d = bp->b_addr; /* * ID of the first dquot in the block - id's are zero based. @@ -538,7 +538,7 @@ xfs_qm_dqtobp( /* * calculate the location of the dquot inside the buffer. */ - ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + ddq = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot... @@ -1200,7 +1200,7 @@ xfs_qm_dqflush( /* * Calculate the location of the dquot inside the buffer. */ - ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + ddqp = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot.. diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 46e54ad9a2dc..9a0aa76facdf 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1240,7 +1240,7 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); + ddq = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { /* * Do a sanity check, and if needed, repair the dqblk. Don't diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 6530769a999b..4805f009f923 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -103,7 +103,7 @@ typedef struct xfs_agf { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); @@ -156,7 +156,7 @@ typedef struct xfs_agi { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); @@ -168,7 +168,7 @@ extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) #define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) typedef struct xfs_agfl { __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c51a3f903633..25cb2b2c427a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3384,8 +3384,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, - ifp->if_bytes); + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 8d05a6a46ce3..5b240de104c0 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -262,7 +262,7 @@ typedef struct xfs_btree_cur /* * Convert from buffer to btree block header. */ -#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) /* diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index bd4c62b055be..a16c24c3a3dd 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -124,9 +124,9 @@ xfs_buf_item_log_check( bp = bip->bli_buf; ASSERT(XFS_BUF_COUNT(bp) > 0); - ASSERT(XFS_BUF_PTR(bp) != NULL); + ASSERT(bp->b_addr != NULL); orig = bip->bli_orig; - buffer = XFS_BUF_PTR(bp); + buffer = bp->b_addr; for (x = 0; x < XFS_BUF_COUNT(bp); x++) { if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { xfs_emerg(bp->b_mount, @@ -725,7 +725,7 @@ xfs_buf_item_init( * to have logged. */ bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); + memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); #endif diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 5d9290db3d6c..d56ccb709655 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2258,7 +2258,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) dabuf->nbuf = 1; bp = bps[0]; dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); - dabuf->data = XFS_BUF_PTR(bp); + dabuf->data = bp->b_addr; dabuf->bps[0] = bp; } else { dabuf->nbuf = nbuf; @@ -2269,7 +2269,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { bp = bps[i]; - memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), + memcpy((char *)dabuf->data + off, bp->b_addr, XFS_BUF_COUNT(bp)); } } @@ -2292,8 +2292,8 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf) for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { bp = dabuf->bps[i]; - memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, - XFS_BUF_COUNT(bp)); + memcpy(bp->b_addr, dabuf->data + off, + XFS_BUF_COUNT(bp)); } } } @@ -2330,7 +2330,7 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); if (dabuf->nbuf == 1) { - ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); + ASSERT(dabuf->data == dabuf->bps[0]->b_addr); xfs_trans_log_buf(tp, dabuf->bps[0], first, last); return; } diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index dffba9ba0db6..a3721633abc8 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -148,7 +148,7 @@ typedef enum xfs_dinode_fmt { be32_to_cpu((dip)->di_nextents) : \ be16_to_cpu((dip)->di_anextents)) -#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr)) /* * For block and character special files the 32bit dev_t is stored at the diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 64682b6f9dcb..3f1fa7bb9e3d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1403,7 +1403,7 @@ xlog_sync(xlog_t *log, bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = XFS_BUF_PTR(bp); + dptr = bp->b_addr; /* * Bump the cycle numbers at the start of each block * since this part of the buffer is at the start of diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 536eb0dd94d6..1ac295da9f17 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -147,7 +147,7 @@ xlog_align( xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); - return XFS_BUF_PTR(bp) + BBTOB(offset); + return bp->b_addr + BBTOB(offset); } @@ -219,7 +219,7 @@ xlog_bread_offset( xfs_buf_t *bp, xfs_caddr_t offset) { - xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); + xfs_caddr_t orig_offset = bp->b_addr; int orig_len = bp->b_buffer_length; int error, error2; @@ -1260,7 +1260,7 @@ xlog_write_log_records( */ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); + offset = bp->b_addr + BBTOB(ealign - start_block); error = xlog_bread_offset(log, ealign, sectbb, bp, offset); if (error) @@ -3433,7 +3433,7 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = XFS_BUF_PTR(hbp); + offset = hbp->b_addr; split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { @@ -3493,7 +3493,7 @@ xlog_do_recovery_pass( } else { /* This log record is split across the * physical end of log */ - offset = XFS_BUF_PTR(dbp); + offset = dbp->b_addr; split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index cb8132cb37ea..35561a511b57 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -168,7 +168,7 @@ error_cancel: xfs_trans_cancel(tp, cancelflags); goto error; } - memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); + memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* * Commit the transaction. @@ -943,7 +943,7 @@ xfs_rtcheck_range( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Compute the starting word's address, and starting bit. */ @@ -994,7 +994,7 @@ xfs_rtcheck_range( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1040,7 +1040,7 @@ xfs_rtcheck_range( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1158,7 +1158,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Get the first word's index & point to it. */ @@ -1210,7 +1210,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; word = XFS_BLOCKWMASK(mp); b = &bufp[word]; } else { @@ -1256,7 +1256,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; word = XFS_BLOCKWMASK(mp); b = &bufp[word]; } else { @@ -1333,7 +1333,7 @@ xfs_rtfind_forw( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Get the first word's index & point to it. */ @@ -1384,7 +1384,7 @@ xfs_rtfind_forw( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1429,7 +1429,7 @@ xfs_rtfind_forw( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1649,7 +1649,7 @@ xfs_rtmodify_range( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Compute the starting word's address, and starting bit. */ @@ -1694,7 +1694,7 @@ xfs_rtmodify_range( if (error) { return error; } - first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + first = b = bufp = bp->b_addr; word = 0; } else { /* @@ -1734,7 +1734,7 @@ xfs_rtmodify_range( if (error) { return error; } - first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + first = b = bufp = bp->b_addr; word = 0; } else { /* @@ -1832,8 +1832,8 @@ xfs_rtmodify_summary( */ sp = XFS_SUMPTR(mp, bp, so); *sp += delta; - xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), - (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), + (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); return 0; } diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 09e1f4f35e97..f7f3a359c1c5 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -47,7 +47,7 @@ struct xfs_trans; #define XFS_SUMOFFSTOBLOCK(mp,s) \ (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) #define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ + ((xfs_suminfo_t *)((bp)->b_addr + \ (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) #define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 1eb2ba586814..cb6ae715814a 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -509,7 +509,7 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 3baebe280f10..3ee5f8a2858b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -94,7 +94,7 @@ xfs_readlink_bmap( byte_cnt = pathlen; pathlen -= byte_cnt; - memcpy(link, XFS_BUF_PTR(bp), byte_cnt); + memcpy(link, bp->b_addr, byte_cnt); xfs_buf_relse(bp); } @@ -1654,7 +1654,7 @@ xfs_symlink( } pathlen -= byte_cnt; - memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt); + memcpy(bp->b_addr, cur_chunk, byte_cnt); cur_chunk += byte_cnt; xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); @@ -1999,7 +1999,7 @@ xfs_zero_remaining_bytes( mp, bp, XFS_BUF_ADDR(bp)); break; } - memset(XFS_BUF_PTR(bp) + + memset(bp->b_addr + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), 0, lastoffset - offset + 1); XFS_BUF_UNDONE(bp); -- cgit v1.2.3 From 02fe03d909f3a5876d7b4775fdbc83c07c7c3842 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:40:22 +0000 Subject: xfs: Remove the macro XFS_BUF_SET_PTR Remove the definition and usages of the macro XFS_BUF_SET_PTR. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.h | 1 - fs/xfs/xfs_log.c | 4 ++-- fs/xfs/xfs_log_recover.c | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index f0aa94703cf5..c5601e1e0f46 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -266,7 +266,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) #define XFS_BUF_ADDR(bp) ((bp)->b_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3f1fa7bb9e3d..3a8d4f66d702 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1395,8 +1395,8 @@ xlog_sync(xlog_t *log, if (split) { bp = iclog->ic_log->l_xbuf; XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ - XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ - (__psint_t)count), split); + xfs_buf_associate_memory(bp, + (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); XFS_BUF_ASYNC(bp); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1ac295da9f17..aaf61d5eefb9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -223,14 +223,14 @@ xlog_bread_offset( int orig_len = bp->b_buffer_length; int error, error2; - error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); + error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); if (error) return error; error = xlog_bread_noalign(log, blk_no, nbblks, bp); /* must reset buffer pointer even on error */ - error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); + error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); if (error) return error; return error2; -- cgit v1.2.3 From 811e64c7169bb59229971c4aa3b1ed5093f44c84 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:40:27 +0000 Subject: Replace the macro XFS_BUF_ISPINNED with helper xfs_buf_ispinned Replace the macro XFS_BUF_ISPINNED with an inline helper function xfs_buf_ispinned, and change all its usages. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- fs/xfs/linux-2.6/xfs_buf.h | 5 ++++- fs/xfs/linux-2.6/xfs_sync.c | 2 +- fs/xfs/quota/xfs_dquot.c | 4 ++-- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_inode.c | 2 +- 6 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 6a42f71d08aa..5e929f0e57f7 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1677,7 +1677,7 @@ xfs_buf_delwri_split( list_for_each_entry_safe(bp, n, dwq, b_list) { ASSERT(bp->b_flags & XBF_DELWRI); - if (!XFS_BUF_ISPINNED(bp) && xfs_buf_trylock(bp)) { + if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { if (!force && time_before(jiffies, bp->b_queuetime + age)) { xfs_buf_unlock(bp); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index c5601e1e0f46..f4e3de65ebe7 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -285,7 +285,10 @@ xfs_buf_set_ref( #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) -#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) +static inline int xfs_buf_ispinned(struct xfs_buf *bp) +{ + return atomic_read(&bp->b_pin_count); +} #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 5cc158e52d4c..a8500e92ae72 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -330,7 +330,7 @@ xfs_sync_fsdata( * between there and here. */ bp = xfs_getsb(mp, 0); - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); return xfs_bwrite(mp, bp); diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 2e0629265f67..db62959bed13 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -1236,7 +1236,7 @@ xfs_qm_dqflush( * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. */ - if (XFS_BUF_ISPINNED(bp)) { + if (xfs_buf_ispinned(bp)) { trace_xfs_dqflush_force(dqp); xfs_log_force(mp, 0); } @@ -1443,7 +1443,7 @@ xfs_qm_dqflock_pushbuf_wait( goto out_lock; if (XFS_BUF_ISDELAYWRITE(bp)) { - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); xfs_buf_delwri_promote(bp); wake_up_process(bp->b_target->bt_task); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a16c24c3a3dd..a3d2bbca26c7 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -478,7 +478,7 @@ xfs_buf_item_trylock( struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bdb47b22c6bc..76ee2c5371ca 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2585,7 +2585,7 @@ xfs_iflush( * If the buffer is pinned then push on the log now so we won't * get stuck waiting in the write for too long. */ - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); /* -- cgit v1.2.3 From e38c9b87e5b428b3e2a2e48ab0ee2b6cdc8e6208 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:40:33 +0000 Subject: xfs: Remove the macro XFS_BUF_SET_TARGET Remove the macro XFS_BUF_SET_TARGET. hch: As all the buffer allocator already set ->b_target it should be safe to simply remove these calls. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.h | 1 - fs/xfs/xfs_log_recover.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index f4e3de65ebe7..6a38b2d21ea0 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -292,7 +292,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp) #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); -#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) #define XFS_BUF_TARGET(bp) ((bp)->b_target) #define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index aaf61d5eefb9..93786e518d87 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -179,7 +179,6 @@ xlog_bread_noalign( XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); @@ -268,7 +267,6 @@ xlog_bwrite( xfs_buf_hold(bp); xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); if ((error = xfs_bwrite(log->l_mp, bp))) xfs_ioerror_alert("xlog_bwrite", log->l_mp, -- cgit v1.2.3 From 49074c069cd3f0f683325d0c7f8b2765dbe2e294 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:40:40 +0000 Subject: xfs: Remove the macro XFS_BUF_TARGET Remove the definition and usages of the macro XFS_BUF_TARGET Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.h | 1 - fs/xfs/xfs_buf_item.c | 6 +++--- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_rw.c | 2 +- fs/xfs/xfs_trans_buf.c | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 6a38b2d21ea0..4b304990a799 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -292,7 +292,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp) #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); -#define XFS_BUF_TARGET(bp) ((bp)->b_target) #define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) static inline void xfs_buf_relse(xfs_buf_t *bp) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a3d2bbca26c7..5c2b5549e14e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -971,14 +971,14 @@ xfs_buf_iodone_callbacks( goto do_callbacks; } - if (XFS_BUF_TARGET(bp) != lasttarg || + if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; xfs_alert(mp, "Device %s: metadata write error block 0x%llx", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + XFS_BUFTARG_NAME(bp->b_target), (__uint64_t)XFS_BUF_ADDR(bp)); } - lasttarg = XFS_BUF_TARGET(bp); + lasttarg = bp->b_target; /* * If the write was asynchronous then no one will be looking for the diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b00c808d7d3e..49ecc170f749 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1615,7 +1615,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNDELAYWRITE(sbp); XFS_BUF_WRITE(sbp); XFS_BUF_UNASYNC(sbp); - ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); + ASSERT(sbp->b_target == mp->m_ddev_targp); xfsbdstrat(mp, sbp); error = xfs_buf_iowait(sbp); if (error) diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index d1f76f8a4c7e..7382bfe40fbd 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -104,7 +104,7 @@ xfs_ioerror_alert( xfs_alert(mp, "I/O error occurred: meta-data dev %s block 0x%llx" " (\"%s\") error %d buf count %zd", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + XFS_BUFTARG_NAME(bp->b_target), (__uint64_t)blkno, func, bp->b_error, XFS_BUF_COUNT(bp)); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 7dd62e26ce92..137e2b9e2948 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -54,7 +54,7 @@ xfs_trans_buf_item_match( list_for_each_entry(lidp, &tp->t_items, lid_trans) { blip = (struct xfs_buf_log_item *)lidp->lid_item; if (blip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_TARGET(blip->bli_buf) == target && + blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == blkno && XFS_BUF_COUNT(blip->bli_buf) == len) return blip->bli_buf; -- cgit v1.2.3 From c35a549c8b9e85bdff7e531a410d10e36b4b4f32 Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 22 Jul 2011 23:40:46 +0000 Subject: xfs: Remove the macro XFS_BUFTARG_NAME Remove the definition and usages of the macro XFS_BUFTARG_NAME. Signed-off-by: Chandra Seetharaman Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- fs/xfs/linux-2.6/xfs_buf.h | 11 +++++++---- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_rw.c | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 5e929f0e57f7..6bddce40de72 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1480,7 +1480,7 @@ xfs_setsize_buftarg_flags( if (set_blocksize(btp->bt_bdev, sectorsize)) { xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %s\n", - sectorsize, XFS_BUFTARG_NAME(btp)); + sectorsize, xfs_buf_target_name(btp)); return EINVAL; } diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 4b304990a799..620972b8094d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -228,8 +228,13 @@ extern void xfs_buf_delwri_promote(xfs_buf_t *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -#define xfs_buf_target_name(target) \ - ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) +static inline const char * +xfs_buf_target_name(struct xfs_buftarg *target) +{ + static char __b[BDEVNAME_SIZE]; + + return bdevname(target->bt_bdev, __b); +} #define XFS_BUF_ZEROFLAGS(bp) \ @@ -292,8 +297,6 @@ static inline int xfs_buf_ispinned(struct xfs_buf *bp) #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); -#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) - static inline void xfs_buf_relse(xfs_buf_t *bp) { xfs_buf_unlock(bp); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 5c2b5549e14e..0402173e79e9 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -975,7 +975,7 @@ xfs_buf_iodone_callbacks( time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; xfs_alert(mp, "Device %s: metadata write error block 0x%llx", - XFS_BUFTARG_NAME(bp->b_target), + xfs_buf_target_name(bp->b_target), (__uint64_t)XFS_BUF_ADDR(bp)); } lasttarg = bp->b_target; diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 7382bfe40fbd..c96a8a05ac03 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -104,7 +104,7 @@ xfs_ioerror_alert( xfs_alert(mp, "I/O error occurred: meta-data dev %s block 0x%llx" " (\"%s\") error %d buf count %zd", - XFS_BUFTARG_NAME(bp->b_target), + xfs_buf_target_name(bp->b_target), (__uint64_t)blkno, func, bp->b_error, XFS_BUF_COUNT(bp)); } -- cgit v1.2.3 From 9feed6f8fbab477b6339efb4f3119a3c22dc187e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 16 Jul 2011 15:23:49 -0400 Subject: cifs: cleanup cifs_filldir Use sensible variable names and formatting and remove some superflous checks on entry. Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/readdir.c | 80 +++++++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 6751e745bbc6..04c9b9fbebb4 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -681,57 +681,49 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, return rc; } -static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, - void *direntry, char *scratch_buf, unsigned int max_len) +static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, + void *dirent, char *scratch_buf, unsigned int max_len) { - int rc = 0; - struct qstr qstring; - struct cifsFileInfo *pCifsF; - u64 inum; - ino_t ino; - struct super_block *sb; - struct cifs_sb_info *cifs_sb; - struct dentry *tmp_dentry; + struct cifsFileInfo *file_info = file->private_data; + struct super_block *sb = file->f_path.dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_fattr fattr; + struct dentry *dentry; + struct qstr name; + int rc = 0; + u64 inum; + ino_t ino; - /* get filename and len into qstring */ - /* get dentry */ - /* decide whether to create and populate ionde */ - if ((direntry == NULL) || (file == NULL)) - return -EINVAL; - - pCifsF = file->private_data; - - if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL)) - return -ENOENT; - - rc = cifs_entry_is_dot(pfindEntry, pCifsF); /* skip . and .. since we added them first */ + rc = cifs_entry_is_dot(find_entry, file_info); if (rc != 0) return 0; - sb = file->f_path.dentry->d_sb; - cifs_sb = CIFS_SB(sb); - - qstring.name = scratch_buf; - rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, - pCifsF->srch_inf.info_level, - pCifsF->srch_inf.unicode, cifs_sb, - max_len, &inum /* returned */); - + name.name = scratch_buf; + rc = cifs_get_name_from_search_buf(&name, find_entry, + file_info->srch_inf.info_level, + file_info->srch_inf.unicode, + cifs_sb, max_len, &inum); if (rc) return rc; - if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) + switch (file_info->srch_inf.info_level) { + case SMB_FIND_FILE_UNIX: cifs_unix_basic_to_fattr(&fattr, - &((FILE_UNIX_INFO *) pfindEntry)->basic, - cifs_sb); - else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) - cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) - pfindEntry, cifs_sb); - else - cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) - pfindEntry, cifs_sb); + &((FILE_UNIX_INFO *)find_entry)->basic, + cifs_sb); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_std_info_to_fattr(&fattr, + (FIND_FILE_STANDARD_INFO *)find_entry, + cifs_sb); + break; + default: + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)find_entry, + cifs_sb); + break; + } if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { fattr.cf_uniqueid = inum; @@ -750,12 +742,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); + dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr); - rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, - ino, fattr.cf_dtype); + rc = filldir(dirent, name.name, name.len, file->f_pos, ino, + fattr.cf_dtype); - dput(tmp_dentry); + dput(dentry); return rc; } -- cgit v1.2.3 From 92b8e897f6e7ba4aa10037ebd8186f85d39330d0 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 12 Jul 2011 10:57:59 -0700 Subject: btrfs: Don't BUG_ON alloc_path errors in find_next_chunk I also removed the BUG_ON from error return of find_next_chunk in init_first_rw_device(). It turns out that the only caller of init_first_rw_device() also BUGS on any nonzero return so no actual behavior change has occurred here. do_chunk_alloc() also needed an update since it calls btrfs_alloc_chunk() which can now return -ENOMEM. Instead of setting space_info->full on any error from btrfs_alloc_chunk() I catch and return every error value _except_ -ENOSPC. Thanks goes to Tsutomu Itoh for pointing that issue out. Signed-off-by: Mark Fasheh --- fs/btrfs/extent-tree.c | 4 ++++ fs/btrfs/volumes.c | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aa91773fe31b..f6af4236e59b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3277,6 +3277,9 @@ again: } ret = btrfs_alloc_chunk(trans, extent_root, flags); + if (ret < 0 && ret != -ENOSPC) + goto out; + spin_lock(&space_info->lock); if (ret) space_info->full = 1; @@ -3286,6 +3289,7 @@ again: space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); +out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 530a2fcea1ef..90d956c17d92 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1037,7 +1037,8 @@ static noinline int find_next_chunk(struct btrfs_root *root, struct btrfs_key found_key; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; key.objectid = objectid; key.offset = (u64)-1; @@ -2663,7 +2664,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = find_next_chunk(fs_info->chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); - BUG_ON(ret); + if (ret) + return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | (fs_info->metadata_alloc_profile & -- cgit v1.2.3 From 38a1a919535742af677303271eb4ff731547b706 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 13 Jul 2011 10:59:59 -0700 Subject: btrfs: don't BUG_ON allocation errors in btrfs_drop_snapshot In addition to properly handling allocation failure from btrfs_alloc_path, I also fixed up the kzalloc error handling code immediately below it. Signed-off-by: Mark Fasheh --- fs/btrfs/extent-tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f6af4236e59b..6bce721e7bbc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6272,10 +6272,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - BUG_ON(!wc); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; + } trans = btrfs_start_transaction(tree_root, 0); BUG_ON(IS_ERR(trans)); -- cgit v1.2.3 From cda0ec6a86f18127d490048a46de954c03886d5e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 16 Jul 2011 15:24:05 -0400 Subject: cifs: introduce cifs_dirent Introduce a generic directory entry structure, and factor the parsing of the various on the wire structures that can represent one into a common helper. Switch cifs_entry_is_dot over to use it as a start. Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/readdir.c | 194 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 125 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 04c9b9fbebb4..67fc1199699f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -4,6 +4,7 @@ * Directory search handling * * Copyright (C) International Business Machines Corp., 2004, 2008 + * Copyright (C) Red Hat, Inc., 2011 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -290,10 +291,10 @@ error_exit: } /* return length of unicode string in bytes */ -static int cifs_unicode_bytelen(char *str) +static int cifs_unicode_bytelen(const char *str) { int len; - __le16 *ustr = (__le16 *)str; + const __le16 *ustr = (const __le16 *)str; for (len = 0; len <= PATH_MAX; len++) { if (ustr[len] == 0) @@ -334,78 +335,128 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) } +struct cifs_dirent { + const char *name; + size_t namelen; + u32 resume_key; + u64 ino; +}; + +static void cifs_fill_dirent_unix(struct cifs_dirent *de, + const FILE_UNIX_INFO *info, bool is_unicode) +{ + de->name = &info->FileName[0]; + if (is_unicode) + de->namelen = cifs_unicode_bytelen(de->name); + else + de->namelen = strnlen(de->name, PATH_MAX); + de->resume_key = info->ResumeKey; + de->ino = le64_to_cpu(info->basic.UniqueId); +} + +static void cifs_fill_dirent_dir(struct cifs_dirent *de, + const FILE_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_full(struct cifs_dirent *de, + const FILE_FULL_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_search(struct cifs_dirent *de, + const SEARCH_ID_FULL_DIR_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; + de->ino = le64_to_cpu(info->UniqueId); +} + +static void cifs_fill_dirent_both(struct cifs_dirent *de, + const FILE_BOTH_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_std(struct cifs_dirent *de, + const FIND_FILE_STANDARD_INFO *info) +{ + de->name = &info->FileName[0]; + /* one byte length, no endianess conversion */ + de->namelen = info->FileNameLength; + de->resume_key = info->ResumeKey; +} + +static int cifs_fill_dirent(struct cifs_dirent *de, const void *info, + u16 level, bool is_unicode) +{ + memset(de, 0, sizeof(*de)); + + switch (level) { + case SMB_FIND_FILE_UNIX: + cifs_fill_dirent_unix(de, info, is_unicode); + break; + case SMB_FIND_FILE_DIRECTORY_INFO: + cifs_fill_dirent_dir(de, info); + break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + cifs_fill_dirent_full(de, info); + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + cifs_fill_dirent_search(de, info); + break; + case SMB_FIND_FILE_BOTH_DIRECTORY_INFO: + cifs_fill_dirent_both(de, info); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_fill_dirent_std(de, info); + break; + default: + cFYI(1, "Unknown findfirst level %d", level); + return -EINVAL; + } + + return 0; +} + #define UNICODE_DOT cpu_to_le16(0x2e) /* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ -static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) +static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) { int rc = 0; - char *filename = NULL; - int len = 0; - if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - filename = &pFindData->FileName[0]; - if (cfile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, 5); - } - } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = pFindData->FileNameLength; - } else { - cFYI(1, "Unknown findfirst level %d", - cfile->srch_inf.info_level); - } + if (!de->name) + return 0; - if (filename) { - if (cfile->srch_inf.unicode) { - __le16 *ufilename = (__le16 *)filename; - if (len == 2) { - /* check for . */ - if (ufilename[0] == UNICODE_DOT) - rc = 1; - } else if (len == 4) { - /* check for .. */ - if ((ufilename[0] == UNICODE_DOT) - && (ufilename[1] == UNICODE_DOT)) - rc = 2; - } - } else /* ASCII */ { - if (len == 1) { - if (filename[0] == '.') - rc = 1; - } else if (len == 2) { - if ((filename[0] == '.') && (filename[1] == '.')) - rc = 2; - } + if (is_unicode) { + __le16 *ufilename = (__le16 *)de->name; + if (de->namelen == 2) { + /* check for . */ + if (ufilename[0] == UNICODE_DOT) + rc = 1; + } else if (de->namelen == 4) { + /* check for .. */ + if (ufilename[0] == UNICODE_DOT && + ufilename[1] == UNICODE_DOT) + rc = 2; + } + } else /* ASCII */ { + if (de->namelen == 1) { + if (de->name[0] == '.') + rc = 1; + } else if (de->namelen == 2) { + if (de->name[0] == '.' && de->name[1] == '.') + rc = 2; } } @@ -687,6 +738,7 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, struct cifsFileInfo *file_info = file->private_data; struct super_block *sb = file->f_path.dentry->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_dirent de = { NULL, }; struct cifs_fattr fattr; struct dentry *dentry; struct qstr name; @@ -694,9 +746,13 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, u64 inum; ino_t ino; + rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (rc) + return rc; + /* skip . and .. since we added them first */ - rc = cifs_entry_is_dot(find_entry, file_info); - if (rc != 0) + if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode)) return 0; name.name = scratch_buf; -- cgit v1.2.3 From f16d59b417d781eb7fe63a561272429b5e098c3a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 16 Jul 2011 15:24:22 -0400 Subject: cifs: use cifs_dirent to replace cifs_get_name_from_search_buf This allows us to parse the on the wire structures only once in cifs_filldir. Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/readdir.c | 109 ++++++++++++------------------------------------------ 1 file changed, 23 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 67fc1199699f..47bb1b8cd305 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -656,82 +656,6 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, return rc; } -/* inode num, inode type and filename returned */ -static int cifs_get_name_from_search_buf(struct qstr *pqst, - char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum) -{ - int rc = 0; - unsigned int len = 0; - char *filename; - struct nls_table *nlt = cifs_sb->local_nls; - - *pinum = 0; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - - filename = &pFindData->FileName[0]; - if (unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } - - *pinum = le64_to_cpu(pFindData->basic.UniqueId); - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - *pinum = le64_to_cpu(pFindData->UniqueId); - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - } else { - cFYI(1, "Unknown findfirst level %d", level); - return -EINVAL; - } - - if (len > max_len) { - cERROR(1, "bad search response length %d past smb end", len); - return -EINVAL; - } - - if (unicode) { - pqst->len = cifs_from_ucs2((char *) pqst->name, - (__le16 *) filename, - UNICODE_NAME_MAX, - min(len, max_len), nlt, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - pqst->len -= nls_nullsize(nlt); - } else { - pqst->name = filename; - pqst->len = len; - } - return rc; -} - static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, void *dirent, char *scratch_buf, unsigned int max_len) { @@ -743,7 +667,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, struct dentry *dentry; struct qstr name; int rc = 0; - u64 inum; ino_t ino; rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level, @@ -751,17 +674,31 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, if (rc) return rc; + if (de.namelen > max_len) { + cERROR(1, "bad search response length %zd past smb end", + de.namelen); + return -EINVAL; + } + /* skip . and .. since we added them first */ if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode)) return 0; - name.name = scratch_buf; - rc = cifs_get_name_from_search_buf(&name, find_entry, - file_info->srch_inf.info_level, - file_info->srch_inf.unicode, - cifs_sb, max_len, &inum); - if (rc) - return rc; + if (file_info->srch_inf.unicode) { + struct nls_table *nlt = cifs_sb->local_nls; + + name.name = scratch_buf; + name.len = + cifs_from_ucs2((char *)name.name, (__le16 *)de.name, + UNICODE_NAME_MAX, + min(de.namelen, (size_t)max_len), nlt, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + name.len -= nls_nullsize(nlt); + } else { + name.name = de.name; + name.len = de.namelen; + } switch (file_info->srch_inf.info_level) { case SMB_FIND_FILE_UNIX: @@ -781,8 +718,8 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, break; } - if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { - fattr.cf_uniqueid = inum; + if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + fattr.cf_uniqueid = de.ino; } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); cifs_autodisable_serverino(cifs_sb); -- cgit v1.2.3 From eaf35b1ea8c12edc5ba8299a8ecfe1efab85101b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 16 Jul 2011 15:24:37 -0400 Subject: cifs: use cifs_dirent in cifs_save_resume_key Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 +- fs/cifs/readdir.c | 66 ++++++++---------------------------------------------- 2 files changed, 10 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6255fa812c7a..1fcf4e5b3112 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -501,7 +501,7 @@ struct cifs_search_info { char *ntwrk_buf_start; char *srch_entries_start; char *last_entry; - char *presume_name; + const char *presume_name; unsigned int resume_name_len; bool endOfSearch:1; bool emptyDir:1; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 47bb1b8cd305..36885a368fa8 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -478,66 +478,18 @@ static int is_dir_changed(struct file *file) } static int cifs_save_resume_key(const char *current_entry, - struct cifsFileInfo *cifsFile) + struct cifsFileInfo *file_info) { - int rc = 0; - unsigned int len = 0; - __u16 level; - char *filename; - - if ((cifsFile == NULL) || (current_entry == NULL)) - return -EINVAL; - - level = cifsFile->srch_inf.info_level; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + struct cifs_dirent de; + int rc; - filename = &pFindData->FileName[0]; - if (cifsFile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else { - cFYI(1, "Unknown findfirst level %d", level); - return -EINVAL; + rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (!rc) { + file_info->srch_inf.presume_name = de.name; + file_info->srch_inf.resume_name_len = de.namelen; + file_info->srch_inf.resume_key = de.resume_key; } - cifsFile->srch_inf.resume_name_len = len; - cifsFile->srch_inf.presume_name = filename; return rc; } -- cgit v1.2.3 From e010a5ef95b8b6a12b74b548578f7dcf93564347 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 25 Jul 2011 22:04:32 +0000 Subject: [CIFS] Redundant null check after dereference Reviewed-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/dir.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index fa8c21d913bc..8e9d37d44e9d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -57,11 +57,6 @@ build_path_from_dentry(struct dentry *direntry) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); unsigned seq; - if (direntry == NULL) - return NULL; /* not much we can do if dentry is freed and - we need to reopen the file after it was closed implicitly - when the server crashed */ - dirsep = CIFS_DIR_SEP(cifs_sb); if (tcon->Flags & SMB_SHARE_IS_IN_DFS) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); -- cgit v1.2.3 From f5bc1e755d23d022bf948904386337fc3e5e29a8 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 25 Jul 2011 17:59:10 +0400 Subject: CIFS: Fix oops while mounting with prefixpath commit fec11dd9a0109fe52fd631e5c510778d6cbff6cc caused a regression when we have already mounted //server/share/a and want to mount //server/share/a/b. The problem is that lookup_one_len calls __lookup_hash with nd pointer as NULL. Then __lookup_hash calls do_revalidate in the case when dentry exists and we end up with NULL pointer deference in cifs_d_revalidate: if (nd->flags & LOOKUP_RCU) return -ECHILD; Fix this by checking nd for NULL. Signed-off-by: Pavel Shilovsky Reviewed-by: Shirish Pargaonkar CC: Stable Signed-off-by: Steve French --- fs/cifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 8e9d37d44e9d..c1bd0306d6c6 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -636,7 +636,7 @@ lookup_out: static int cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && (nd->flags & LOOKUP_RCU)) return -ECHILD; if (direntry->d_inode) { -- cgit v1.2.3 From 14cae3243b555afda69a57778069ddca65532c06 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Mon, 20 Jun 2011 16:14:03 -0500 Subject: cifs: Cleanup: check return codes of crypto api calls Check return codes of crypto api calls and either log an error or log an error and return from the calling function with error. Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 110 ++++++++++++++++++++++++++++++++++++++++---------- fs/cifs/link.c | 8 +++- fs/cifs/smbencrypt.c | 8 +++- 3 files changed, 103 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 5a0ee7f2af06..259991bd2112 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -52,19 +52,29 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Oould not init md5\n", __func__); + cERROR(1, "%s: Could not init md5\n", __func__); return rc; } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); + if (rc) { + cERROR(1, "%s: Could not update with payload\n", __func__); + return rc; + } rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); - return 0; + return rc; } /* must be called with server->srv_mutex held */ @@ -112,12 +122,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Oould not init md5\n", __func__); + cERROR(1, "%s: Could not init md5\n", __func__); return rc; } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) @@ -131,14 +145,24 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, iov[i].iov_base + 4, iov[i].iov_len - 4); - } else + } else { + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cERROR(1, "%s: Could not update with payload\n", + __func__); + return rc; + } } rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -463,8 +487,12 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash); - crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NT Hash as a key", __func__); + return rc; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -478,13 +506,18 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (user == NULL) { cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); rc = -ENOMEM; - goto calc_exit_2; + return rc; } len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); UniStrupr(user); - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)user, 2 * len); + kfree(user); + if (rc) { + cERROR(1, "%s: Could not update with user\n", __func__); + return rc; + } /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { @@ -494,13 +527,19 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (domain == NULL) { cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); rc = -ENOMEM; - goto calc_exit_1; + return rc; } len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, nls_cp); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)domain, 2 * len); kfree(domain); + if (rc) { + cERROR(1, "%s: Could not update with domain\n", + __func__); + return rc; + } } else if (ses->serverName) { len = strlen(ses->serverName); @@ -508,21 +547,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (server == NULL) { cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); rc = -ENOMEM; - goto calc_exit_1; + return rc; } len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, nls_cp); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)server, 2 * len); kfree(server); + if (rc) { + cERROR(1, "%s: Could not update with server\n", + __func__); + return rc; + } } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ntlmv2_hash); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); -calc_exit_1: - kfree(user); -calc_exit_2: return rc; } @@ -537,8 +581,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return -1; } - crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + return rc; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -552,11 +600,17 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) else memcpy(ses->auth_key.response + offset, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + offset, ses->auth_key.len - offset); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -626,8 +680,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } /* now calculate the session key for NTLMv2 */ - crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + goto setup_ntlmv2_rsp_ret; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -635,12 +693,18 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto setup_ntlmv2_rsp_ret; } - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + goto setup_ntlmv2_rsp_ret; + } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); setup_ntlmv2_rsp_ret: kfree(tiblob); @@ -668,8 +732,12 @@ calc_seckey(struct cifs_ses *ses) desc.tfm = tfm_arc4; - crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); + if (rc) { + cERROR(1, "%s: Could not set response as a key", __func__); + return rc; + } sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); @@ -688,7 +756,7 @@ calc_seckey(struct cifs_ses *ses) crypto_free_blkcipher(tfm_arc4); - return 0; + return rc; } void diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 556b1a0b54de..db3f18cdf024 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -74,8 +74,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) cERROR(1, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } - crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + if (rc) { + cERROR(1, "%s: Could not update iwth link_str\n", __func__); + goto symlink_hash_err; + } rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: crypto_free_shash(md5); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 1c5b770c3141..42b9fff48751 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -157,8 +157,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) cERROR(1, "%s: Could not init md4 shash\n", __func__); goto mdfour_err; } - crypto_shash_update(&sdescmd4->shash, link_str, link_len); + rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len); + if (rc) { + cERROR(1, "%s: Could not update with link_str\n", __func__); + goto mdfour_err; + } rc = crypto_shash_final(&sdescmd4->shash, md4_hash); + if (rc) + cERROR(1, "%s: Could not genereate md4 hash\n", __func__); mdfour_err: crypto_free_shash(md4); -- cgit v1.2.3 From 19237039919088781b4191a00bdc1284d8fea1dd Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 Jul 2011 09:15:45 +0100 Subject: GFS2: Fix mount hang caused by certain access pattern to sysfs files Depending upon the order of userspace/kernel during the mount process, this can result in a hang without the _all version of the completion. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_fstype.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 516516e0c2a2..3bc073a4cf82 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1018,13 +1018,13 @@ hostdata_error: fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); - complete(&sdp->sd_locking_init); + complete_all(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, fsname); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); - complete(&sdp->sd_locking_init); + complete_all(&sdp->sd_locking_init); return ret; } -- cgit v1.2.3 From 2d859db3e4a82a365572592d57624a5f996ed0ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 26 Jul 2011 09:07:11 -0400 Subject: ext4: fix data corruption in inodes with journalled data When journalling data for an inode (either because it is a symlink or because the filesystem is mounted in data=journal mode), ext4_evict_inode() can discard unwritten data by calling truncate_inode_pages(). This is because we don't mark the buffer / page dirty when journalling data but only add the buffer to the running transaction and thus mm does not know there are still unwritten data. Fix the problem by carefully tracking transaction containing inode's data, committing this transaction, and writing uncheckpointed buffers when inode should be reaped. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index de50b16a8f67..43e4abd67be7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -121,6 +121,33 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the + * journal. So although mm thinks everything is clean and + * ready for reaping the inode might still have some pages to + * write in the running transaction or waiting to be + * checkpointed. Thus calling jbd2_journal_invalidatepage() + * (via truncate_inode_pages()) to discard these buffers can + * cause data loss. Also even if we did not discard these + * buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to + * read them before the transaction is checkpointed. So be + * careful and force everything to disk here... We use + * ei->i_datasync_tid to store the newest transaction + * containing inode's data. + * + * Note that directories do not have this problem because they + * don't use page cache. + */ + if (ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; + + jbd2_log_start_commit(journal, commit_tid); + jbd2_log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); goto no_delete; } @@ -970,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file, if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1678,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page, write_end_fn); if (ret == 0) ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; err = ext4_journal_stop(handle); if (!ret) ret = err; -- cgit v1.2.3 From bacb2d816c77edefd464d6bcc04c07f92109bd7d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 26 Jul 2011 17:25:20 +0300 Subject: fs: add missing unlock in default_llseek() A recent change in linux-next, 982d816581 "fs: add SEEK_HOLE and SEEK_DATA flags" added some direct returns on error, but it should have been a goto out. Signed-off-by: Dan Carpenter Signed-off-by: Al Viro --- fs/read_write.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 5907b49e4d7e..179f1c33ea57 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -166,8 +166,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) * long as offset isn't at the end of the file then the * offset is data. */ - if (offset >= inode->i_size) - return -ENXIO; + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } break; case SEEK_HOLE: /* @@ -175,8 +177,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) * as long as offset isn't i_size or larger, return * i_size. */ - if (offset >= inode->i_size) - return -ENXIO; + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } offset = inode->i_size; break; } -- cgit v1.2.3 From a209dfc7b0d94bd6fa94553c097836a2e6d0f0ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Jul 2011 11:36:34 +0200 Subject: vfs: dont chain pipe/anon/socket on superblock s_inodes list Workloads using pipes and sockets hit inode_sb_list_lock contention. superblock s_inodes list is needed for quota, dirty, pagecache and fsnotify management. pipe/anon/socket fs are clearly not candidates for these. Signed-off-by: Eric Dumazet Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/anon_inodes.c | 2 +- fs/inode.c | 39 ++++++++++++++++++++++++++++++--------- fs/pipe.c | 2 +- include/linux/fs.h | 3 ++- net/socket.c | 2 +- 5 files changed, 35 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 4d433d34736f..f11e43ed907d 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); */ static struct inode *anon_inode_mkinode(void) { - struct inode *inode = new_inode(anon_inode_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb); if (!inode) return ERR_PTR(-ENOMEM); diff --git a/fs/inode.c b/fs/inode.c index 96c77b81167c..319b93b55570 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -362,9 +362,11 @@ EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { - spin_lock(&inode_sb_list_lock); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + if (!list_empty(&inode->i_sb_list)) { + spin_lock(&inode_sb_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); + } } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -796,6 +798,29 @@ unsigned int get_next_ino(void) } EXPORT_SYMBOL(get_next_ino); +/** + * new_inode_pseudo - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *new_inode_pseudo(struct super_block *sb) +{ + struct inode *inode = alloc_inode(sb); + + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + INIT_LIST_HEAD(&inode->i_sb_list); + } + return inode; +} + /** * new_inode - obtain an inode * @sb: superblock @@ -814,13 +839,9 @@ struct inode *new_inode(struct super_block *sb) spin_lock_prefetch(&inode_sb_list_lock); - inode = alloc_inode(sb); - if (inode) { - spin_lock(&inode->i_lock); - inode->i_state = 0; - spin_unlock(&inode->i_lock); + inode = new_inode_pseudo(sb); + if (inode) inode_sb_list_add(inode); - } return inode; } EXPORT_SYMBOL(new_inode); diff --git a/fs/pipe.c b/fs/pipe.c index 1b7f9af67ccf..0e0be1dc0f8e 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -948,7 +948,7 @@ static const struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { - struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) diff --git a/include/linux/fs.h b/include/linux/fs.h index a6658043258a..cc363fa7bb82 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2310,7 +2310,8 @@ extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void end_writeback(struct inode *); extern void __destroy_inode(struct inode *); -extern struct inode *new_inode(struct super_block *); +extern struct inode *new_inode_pseudo(struct super_block *sb); +extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); diff --git a/net/socket.c b/net/socket.c index 02dc82db3d23..26ed35c7751e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -467,7 +467,7 @@ static struct socket *sock_alloc(void) struct inode *inode; struct socket *sock; - inode = new_inode(sock_mnt->mnt_sb); + inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; -- cgit v1.2.3 From 24a01d4ee4ee9196c1acc58c64dc216ba2578f1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 12:57:42 -0400 Subject: v9fs_iop_get_acl: get rid of unused variable Signed-off-by: Al Viro --- fs/9p/acl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 814be079c185..e9cb57f07546 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -98,7 +98,6 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type) { - struct posix_acl *acl; struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); -- cgit v1.2.3 From 61effb519cbf0b2973c65070a890fabfcbf84756 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jul 2011 17:11:33 -0400 Subject: jffs2: S_ISLNK(mode & S_IFMT) is pointless it's S_ISLNK(mode), TYVM... Signed-off-by: Al Viro --- fs/jffs2/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index eeead33d8ef0..b81b35ddf4e4 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -80,7 +80,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) { jffs2_free_raw_inode(ri); - if (S_ISLNK(inode->i_mode & S_IFMT)) + if (S_ISLNK(inode->i_mode)) kfree(mdata); return ret; } -- cgit v1.2.3 From 569254b0cc4e125ffde48780b215ecaf5f72bbf4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jul 2011 17:08:40 -0400 Subject: btrfs: S_ISREG(mode) is not mode & S_IFREG... Signed-off-by: Al Viro --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e91b097e7252..caa26ab5ed68 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4467,7 +4467,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_generation = BTRFS_I(inode)->generation; btrfs_set_inode_space_info(root, inode); - if (mode & S_IFDIR) + if (S_ISDIR(mode)) owner = 0; else owner = 1; @@ -4512,7 +4512,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); - if ((mode & S_IFREG)) { + if (S_ISREG(mode)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW) || -- cgit v1.2.3 From 41c96486f238e6a545f52215f95fe69748abf603 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 02:34:33 -0400 Subject: omfs: fix (mode & S_IFDIR) abuse granted, on a filesystem that has only regular files and directories it happens to work, but really should be S_ISDIR(mode)... Signed-off-by: Al Viro --- fs/omfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 3b8d3979e03b..98e544274390 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -93,7 +93,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb) memset(bh->b_data, 0, sizeof(struct omfs_inode)); - if (inode->i_mode & S_IFDIR) { + if (S_ISDIR(inode->i_mode)) { memset(&bh->b_data[OMFS_DIR_START], 0xff, sbi->s_sys_blocksize - OMFS_DIR_START); } else -- cgit v1.2.3 From c46c887744b330795eba55fdb96343c36d481765 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 26 Jul 2011 13:33:16 -0400 Subject: vfs: document locking requirements for d_move, __d_move and d_materialise_unique Adding a comment to d_materialise_unique per Al's request... d_move and __d_move have some pretty substantial locking requirements, but they are not clearly documented. Add some comments spelling them out. Also, document the requirement for the i_mutex of the parent in d_materialise_unique. Cc: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/dcache.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index be18598c7fd7..b05aac3a8cfc 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2138,8 +2138,9 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. Caller hold - * rename_lock. + * dcache entries should not be moved in this way. Caller must hold + * rename_lock, the i_mutex of the source and target directories, + * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry * dentry, struct dentry * target) { @@ -2202,7 +2203,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target) * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. + * dcache entries should not be moved in this way. See the locking + * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { @@ -2320,7 +2322,8 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) * @inode: inode to bind to the dentry, to which aliases may be attached * * Introduces an dentry into the tree, substituting an extant disconnected - * root directory alias in its place if there is one + * root directory alias in its place if there is one. Caller must hold the + * i_mutex of the parent directory. */ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) { -- cgit v1.2.3 From 252c6728de604d6a897d85e212996811d5c8c46c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:25:27 -0700 Subject: ceph: add flags field to file_info Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/super.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f5cabefa98dc..8febe6fce2b1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -544,7 +544,8 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, * we keep buffered readdir results attached to file->private_data */ struct ceph_file_info { - int fmode; /* initialized on open */ + short fmode; /* initialized on open */ + short flags; /* CEPH_F_* */ /* readdir: position within the dir */ u32 frag; -- cgit v1.2.3 From 4918b6d140c4822201ebbe2f070875332aff337b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:26:07 -0700 Subject: ceph: add F_SYNC file flag to force sync (non-O_DIRECT) io This allows us to force IO through the sync path which you normally only get when multiple clients are reading/writing to the same file or by mounting with -o sync. Among other things, this lets test programs verify correctness with a single mount. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 6 ++++-- fs/ceph/ioctl.c | 11 +++++++++++ fs/ceph/ioctl.h | 1 + fs/ceph/super.h | 2 ++ 4 files changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4698a5c553dc..44e4fe9fba02 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -643,7 +643,8 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) /* hmm, this isn't really async... */ ret = ceph_sync_read(filp, base, len, ppos, &checkeof); else @@ -720,7 +721,8 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index ef0b5f48e13a..a757a5680578 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -231,6 +231,14 @@ static long ceph_ioctl_lazyio(struct file *file) return 0; } +static long ceph_ioctl_syncio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + + fi->flags |= CEPH_F_SYNC; + return 0; +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); @@ -249,6 +257,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_LAZYIO: return ceph_ioctl_lazyio(file); + + case CEPH_IOC_SYNCIO: + return ceph_ioctl_syncio(file); } return -ENOTTY; diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 52e8fd74d450..0c5167e43180 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -40,5 +40,6 @@ struct ceph_ioctl_dataloc { struct ceph_ioctl_dataloc) #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) +#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) #endif diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8febe6fce2b1..cdb17d36755c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -543,6 +543,8 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, /* * we keep buffered readdir results attached to file->private_data */ +#define CEPH_F_SYNC 1 + struct ceph_file_info { short fmode; /* initialized on open */ short flags; /* CEPH_F_* */ -- cgit v1.2.3 From 9cfa1098dcfb34f71c5f3b7bcdbbb435a0cecab2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:26:18 -0700 Subject: ceph: use flag bit for at_end readdir flag This saves us a word of memory per file. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 10 +++++----- fs/ceph/super.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ef8f08c343e8..53b441fe78f1 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -133,7 +133,7 @@ more: d_unhashed(dentry) ? "!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; goto out_unlock; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); @@ -234,7 +234,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); - if (fi->at_end) + if (fi->flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ @@ -403,7 +403,7 @@ more: dout("readdir next frag is %x\n", frag); goto more; } - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; /* * if dir_release_count still matches the dir, no dentries @@ -435,7 +435,7 @@ static void reset_readdir(struct ceph_file_info *fi) dput(fi->dentry); fi->dentry = NULL; } - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) @@ -458,7 +458,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } retval = offset; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cdb17d36755c..a8a273320241 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -544,6 +544,7 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, * we keep buffered readdir results attached to file->private_data */ #define CEPH_F_SYNC 1 +#define CEPH_F_ATEND 2 struct ceph_file_info { short fmode; /* initialized on open */ @@ -552,7 +553,6 @@ struct ceph_file_info { /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; - int at_end; /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ -- cgit v1.2.3 From af0ed569d7019f1b49e9e51e77b47092e656b00e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:26:31 -0700 Subject: ceph: fix snap writeback when racing with writes There are two problems that come up when we try to queue a capsnap while a write is in progress: - The FILE_WR cap is held, but not yet dirty, so we may queue a capsnap with dirty == 0. That will crash later in __ceph_flush_snaps(). Or on the FILE_WR cap if a write is in progress. - We may not have i_head_snapc set, which causes problems pretty quickly. Look to the snaprealm in this case. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/snap.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 54b14de2e729..ac030c9d9959 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -449,6 +449,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + + /* + * If there is a write in progress, treat that as a dirty Fw, + * even though it hasn't completed yet; by the time we finish + * up this capsnap it will be. + */ + if (used & CEPH_CAP_FILE_WR) + dirty |= CEPH_CAP_FILE_WR; + if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any @@ -456,14 +465,22 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || + } else if (ci->i_wrbuffer_ref_head || (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { struct ceph_snap_context *snapc = ci->i_head_snapc; - dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, - capsnap, snapc); + /* + * if we are a sync write, we may need to go to the snaprealm + * to get the current snapc. + */ + if (!snapc) + snapc = ci->i_snap_realm->cached_context; + + dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", + inode, capsnap, snapc, ceph_cap_string(dirty)); ihold(inode); + BUG_ON(dirty == 0); atomic_set(&capsnap->nref, 1); capsnap->ci = ci; -- cgit v1.2.3 From e77dc3e9c061e50c67e2e1a604d0a370f40db298 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:26:41 -0700 Subject: ceph: only queue capsnap if caps are dirty We used to go into this branch if i_wrbuffer_ref_head was non-zero. This was an ancient check from before we were careful about dealing with all kinds of caps (and not just dirty pages). It is cleaner to only queue a capsnap if there is an actual dirty cap. If we are racing with... something...we will end up here with ci->i_wrbuffer_refs but no dirty caps. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/snap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index ac030c9d9959..e26437191333 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -465,9 +465,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || - (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { + } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { struct ceph_snap_context *snapc = ci->i_head_snapc; /* @@ -480,7 +479,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", inode, capsnap, snapc, ceph_cap_string(dirty)); ihold(inode); - BUG_ON(dirty == 0); atomic_set(&capsnap->nref, 1); capsnap->ci = ci; -- cgit v1.2.3 From 8f04d42276048b3baff5a5d8fa769f433c62b63e Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 26 Jul 2011 11:26:54 -0700 Subject: ceph: report f_bfree based on kb_avail rather than diffing. Reviewed-by: Yehuda Sadeh Signed-off-by: Greg Farnum --- fs/ceph/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f2f77fd3c14c..d4757c98ec14 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -73,8 +73,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> - (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_files = le64_to_cpu(st.num_objects); -- cgit v1.2.3 From d8de9ab63a57326d21154c13c365f949f53ce8e1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:34 -0700 Subject: ceph: avoid carrying Fw cap during write into page cache The generic_file_aio_write call may block on balance_dirty_pages while we flush data to the OSDs. If we hold a reference to the FILE_WR cap during that interval revocation by the MDS (e.g., to do a stat(2)) may be very slow. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 44e4fe9fba02..6c90cf090601 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -713,7 +713,7 @@ retry_snap: want = CEPH_CAP_FILE_BUFFER; ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); if (ret < 0) - goto out; + goto out_put; dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, @@ -726,8 +726,18 @@ retry_snap: ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + /* + * buffered write; drop Fw early to avoid slow + * revocation if we get stuck on balance_dirty_pages + */ + int dirty; + + spin_lock(&inode->i_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + ceph_put_cap_refs(ci, got); + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { @@ -735,7 +745,12 @@ retry_snap: if (err < 0) ret = err; } + + if (dirty) + __mark_inode_dirty(inode, dirty); + goto out; } + if (ret >= 0) { int dirty; spin_lock(&inode->i_lock); @@ -745,12 +760,13 @@ retry_snap: __mark_inode_dirty(inode, dirty); } -out: +out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); +out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); -- cgit v1.2.3 From acda76578813ef893004ecad0e5ad2bb6039e5f7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:48 -0700 Subject: ceph: fix bad parent_inode calc in ceph_lookup_open We were always getting NULL here because the intent file f_dentry is always NULL at this point, which means we were always passing NULL to ceph_mdsc_do_request. In reality, this was fine, since this isn't currently ever a write operation that needs to get strung on the dir's unsafe list. Use the dir explicitly, and only pass it if this open has side-effects that a dir fsync should flush. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6c90cf090601..9b667e9abf4c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -223,7 +223,6 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct file *file = nd->intent.open.file; - struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); struct ceph_mds_request *req; int err; int flags = nd->intent.open.flags - 1; /* silly vfs! */ @@ -242,7 +241,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, + (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, + req); dentry = ceph_finish_lookup(req, dentry, err); if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); -- cgit v1.2.3 From 9bae113a085b790de384bf86f09e15b42a65a985 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:59 -0700 Subject: ceph: only link open operations to directory unsafe list if O_CREAT|O_TRUNC We only need to put these on the directory unsafe list if they have side effects that fsync(2) should flush out. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9b667e9abf4c..e0115eb4e9ba 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -194,7 +194,8 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? + parent_inode : NULL, req); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); -- cgit v1.2.3 From 468640e32c7f6bfdaaa011095cc388786755d159 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:28:11 -0700 Subject: ceph: fix ceph_lookup_open intent usage We weren't properly calling lookup_instantiate_filp when setting up the lookup intent, which could lead to file leakage on errors. So: - use separate helper for the hidden snapdir translation, immediately following the mds request - use ceph_finish_lookup for the final dentry/return value dance in the exit path - lookup_instantiate_filp on success Reported-by: Al Viro Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 31 ++++++++++++++++++++----------- fs/ceph/file.c | 23 +++++++++++++++-------- fs/ceph/super.h | 2 ++ 3 files changed, 37 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 53b441fe78f1..f39a409db0ea 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -482,18 +482,10 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) } /* - * Process result of a lookup/open request. - * - * Mainly, make sure we return the final req->r_dentry (if it already - * existed) in place of the original VFS-provided dentry when they - * differ. - * - * Gracefully handle the case where the MDS replies with -ENOENT and - * no trace (which it may do, at its discretion, e.g., if it doesn't - * care to issue a lease on the negative dentry). + * Handle lookups for the hidden .snap directory. */ -struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, - struct dentry *dentry, int err) +int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; @@ -510,7 +502,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, d_add(dentry, inode); err = 0; } + return err; +} +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ if (err == -ENOENT) { /* no trace? */ err = 0; @@ -605,6 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); + err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e0115eb4e9ba..f34d47d66e7c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -223,8 +223,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct file *file = nd->intent.open.file; + struct file *file; struct ceph_mds_request *req; + struct dentry *ret; int err; int flags = nd->intent.open.flags - 1; /* silly vfs! */ @@ -245,15 +246,21 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); - dentry = ceph_finish_lookup(req, dentry, err); - if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_handle_snapdir(req, dentry, err); + if (err) + goto out; + if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (!err) - err = ceph_init_file(req->r_dentry->d_inode, file, - req->r_fmode); + if (err) + goto out; + file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open); + if (IS_ERR(file)) + err = PTR_ERR(file); +out: + ret = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", dentry); - return dentry; + dout("ceph_lookup_open result=%p\n", ret); + return ret; } int ceph_release(struct inode *inode, struct file *file) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a8a273320241..c24891a5bec2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -791,6 +791,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); -- cgit v1.2.3 From 2f90b852e3ae73889d7f6de6ecf429b9b6a6b103 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:28:25 -0700 Subject: ceph: ignore lease mask The lease mask is no longer used (and it changed a while back). Instead, use a non-zero duration to indicate that there is a lease being issued. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/inode.c | 10 ++++------ fs/ceph/mds_client.c | 18 +++++++----------- fs/ceph/mds_client.h | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d8858e96ab18..f51e873e966f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -805,14 +805,14 @@ static void update_dentry_lease(struct dentry *dentry, return; spin_lock(&dentry->d_lock); - dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", - dentry, le16_to_cpu(lease->mask), duration, ttl); + dout("update_dentry_lease %p duration %lu ms ttl %lu\n", + dentry, duration, ttl); /* make lease_rdcache_gen match directory */ dir = dentry->d_parent->d_inode; di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; - if (lease->mask == 0) + if (duration == 0) goto out_unlock; if (di->lease_gen == session->s_cap_gen && @@ -1022,9 +1022,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* do we have a dn lease? */ have_lease = have_dir_cap || - (le16_to_cpu(rinfo->dlease->mask) & - CEPH_LOCK_DN); - + le32_to_cpu(rinfo->dlease->duration_ms); if (!have_lease) dout("fill_trace no dentry lease or dir cap\n"); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0c1d91756528..3b1e743b8c8d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2714,7 +2714,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_lease *h = msg->front.iov_base; u32 seq; struct ceph_vino vino; - int mask; struct qstr dname; int release = 0; @@ -2725,7 +2724,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, goto bad; vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; - mask = le16_to_cpu(h->mask); seq = le32_to_cpu(h->seq); dname.name = (void *)h + sizeof(*h) + sizeof(u32); dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); @@ -2737,8 +2735,8 @@ static void handle_lease(struct ceph_mds_client *mdsc, /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", - ceph_lease_op_name(h->action), mask, vino.ino, inode, + dout("handle_lease %s, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), vino.ino, inode, dname.len, dname.name); if (inode == NULL) { dout("handle_lease no inode %llx\n", vino.ino); @@ -2828,7 +2826,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, return; lease = msg->front.iov_base; lease->action = action; - lease->mask = cpu_to_le16(1); lease->ino = cpu_to_le64(ceph_vino(inode).ino); lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); @@ -2850,7 +2847,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, * Pass @inode always, @dentry is optional. */ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dentry, int mask) + struct dentry *dentry) { struct ceph_dentry_info *di; struct ceph_mds_session *session; @@ -2858,7 +2855,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, BUG_ON(inode == NULL); BUG_ON(dentry == NULL); - BUG_ON(mask == 0); /* is dentry lease valid? */ spin_lock(&dentry->d_lock); @@ -2868,8 +2864,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, di->lease_gen != di->lease_session->s_cap_gen || !time_before(jiffies, dentry->d_time)) { dout("lease_release inode %p dentry %p -- " - "no lease on %d\n", - inode, dentry, mask); + "no lease\n", + inode, dentry); spin_unlock(&dentry->d_lock); return; } @@ -2880,8 +2876,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, __ceph_mdsc_drop_dentry_lease(dentry); spin_unlock(&dentry->d_lock); - dout("lease_release inode %p dentry %p mask %d to mds%d\n", - inode, dentry, mask, session->s_mds); + dout("lease_release inode %p dentry %p to mds%d\n", + inode, dentry, session->s_mds); ceph_mdsc_lease_send_msg(session, inode, dentry, CEPH_MDS_LEASE_RELEASE, seq); ceph_put_mds_session(session); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 7d8a0d662d56..8a40f0636ba9 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -333,7 +333,7 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dn, int mask); + struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); -- cgit v1.2.3 From e9852227431a0ed6ceda064f33e4218757acab6c Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 22 Jul 2011 11:12:28 -0700 Subject: ceph: set up readahead size when rsize is not passed This should improve the default read performance, as without it readahead is practically disabled. Signed-off-by: Yehuda Sadeh --- fs/ceph/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d4757c98ec14..d47c5ec7fb1f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -779,6 +779,10 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; + else + fsc->backing_dev_info.ra_pages = + default_backing_dev_info.ra_pages; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", atomic_long_inc_return(&bdi_seq)); if (!err) -- cgit v1.2.3 From dfabbed6fdd509dc2beb89c954bc36014a1bc7cb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:30:02 -0700 Subject: ceph: set dir complete frag after adding capability Curretly ceph_add_cap clears the complete bit if we are newly issued the FILE_SHARED cap, which is normally the case for a newly issue cap on a new directory. That means we clear the just-set bit. Move the check that sets the flag to after the cap is added/updated. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/inode.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f51e873e966f..2717dc4e443c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -560,7 +560,8 @@ static int fill_inode(struct inode *inode, struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int i; - int issued, implemented; + int issued = 0, implemented; + int updating_inode = 0; struct timespec mtime, atime, ctime; u32 nsplits; struct ceph_buffer *xattr_blob = NULL; @@ -599,7 +600,8 @@ static int fill_inode(struct inode *inode, if (le64_to_cpu(info->version) > 0 && (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; - + + updating_inode = 1; issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); @@ -707,17 +709,6 @@ static int fill_inode(struct inode *inode, ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); ceph_decode_timespec(&ci->i_rctime, &info->rctime); - - /* set dir completion flag? */ - if (ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { - dout(" marking %p complete (empty)\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ - ci->i_max_offset = 2; - } break; default: pr_err("fill_inode %llx.%llx BAD mode 0%o\n", @@ -774,6 +765,19 @@ no_change: __ceph_get_fmode(ci, cap_fmode); } + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + updating_inode && /* didn't jump to no_change */ + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + dout(" marking %p complete (empty)\n", inode); + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ci->i_max_offset = 2; + } + /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); -- cgit v1.2.3 From 48d0cbd1242aac969560ef8b90f26ee3b09a6a5c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:30:15 -0700 Subject: ceph: handle racing calls to ceph_init_dentry The ->lookup() and prepopulate_readdir() callers are working with unhashed dentries, so we don't have to worry. The export.c callers, though, need to initialize something they got back from d_obtain_alias() and are potentially racing with other callers. Make sure we don't return unless the dentry is properly initialized (by us or someone else). Reported-by: Al Viro Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f39a409db0ea..883c9546111d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -40,14 +40,6 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ - ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) - d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) - d_set_d_op(dentry, &ceph_snapdir_dentry_ops); - else - d_set_d_op(dentry, &ceph_snap_dentry_ops); - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -58,10 +50,21 @@ int ceph_init_dentry(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; } + + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + d_set_d_op(dentry, &ceph_dentry_ops); + else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); + else + d_set_d_op(dentry, &ceph_snap_dentry_ops); + di->dentry = dentry; di->lease_session = NULL; - dentry->d_fsdata = di; dentry->d_time = jiffies; + /* avoid reordering d_fsdata setup so that the check above is safe */ + smp_mb(); + dentry->d_fsdata = di; ceph_dentry_lru_add(dentry); out_unlock: spin_unlock(&dentry->d_lock); -- cgit v1.2.3 From 5f21c96dd5c615341963036ae8f5e4f5227a818d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:30:29 -0700 Subject: ceph: protect access to d_parent d_parent is protected by d_lock: use it when looking up a dentry's parent directory inode. Also take a reference and drop it in the caller to avoid a use-after-free. Reported-by: Al Viro Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 15 +++++++++++++++ fs/ceph/file.c | 8 +++++--- fs/ceph/inode.c | 4 +++- fs/ceph/ioctl.c | 4 +++- fs/ceph/super.h | 9 +-------- fs/ceph/xattr.c | 8 ++++++-- 6 files changed, 33 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 883c9546111d..ed296ec121d1 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -71,6 +71,21 @@ out_unlock: return 0; } +struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +{ + struct inode *inode = NULL; + + if (!dentry) + return NULL; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent) { + inode = dentry->d_parent->d_inode; + ihold(inode); + } + spin_unlock(&dentry->d_lock); + return inode; +} /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f34d47d66e7c..45fbd69daabe 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; @@ -194,8 +194,10 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? - parent_inode : NULL, req); + if (flags & (O_CREAT|O_TRUNC)) + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2717dc4e443c..a7db56f1523b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1562,7 +1562,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1745,7 +1745,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index a757a5680578..3b256b50f7d8 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -38,7 +38,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file->f_dentry->d_inode; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; @@ -87,7 +87,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32(l.preferred_osd); + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c24891a5bec2..c1eb9a014b74 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -801,6 +801,7 @@ extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern unsigned ceph_dentry_hash(struct dentry *dn); +extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, @@ -823,14 +824,6 @@ extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, int p_locks, int f_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); -static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) -{ - if (dentry && dentry->d_parent) - return dentry->d_parent->d_inode; - - return NULL; -} - /* debugfs.c */ extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f42d730f1b66..96c6739a0280 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -629,7 +629,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -677,7 +677,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -788,7 +790,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -802,7 +804,9 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } -- cgit v1.2.3 From bf1c6aca96c9d2f117dc7e590c2bc2304e7febe1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:30:43 -0700 Subject: ceph: protect d_parent access in ceph_d_revalidate Protect d_parent with d_lock. Carry a reference. Simplify the flow so that there is a single exit point and cleanup. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ed296ec121d1..31d27f8f8261 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1024,36 +1024,38 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) */ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { + int valid = 0; struct inode *dir; if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; - dir = dentry->d_parent->d_inode; - dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode, ceph_dentry(dentry)->offset); + dir = ceph_get_dentry_parent_inode(dentry); + /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - goto out_touch; + valid = 1; + } else if (dentry->d_inode && + ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + valid = 1; + } else if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) { + valid = 1; } - if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) - goto out_touch; - if (dentry_lease_is_valid(dentry) || - dir_lease_is_valid(dir, dentry)) - goto out_touch; - - dout("d_revalidate %p invalid\n", dentry); - d_drop(dentry); - return 0; -out_touch: - ceph_dentry_lru_touch(dentry); - return 1; + dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); + if (valid) + ceph_dentry_lru_touch(dentry); + else + d_drop(dentry); + iput(dir); + return valid; } /* -- cgit v1.2.3 From e5f86dc377e7ff2b4195831153a85a3e76fefff2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:30:55 -0700 Subject: ceph: avoid d_parent in ceph_dentry_hash; fix ceph_encode_fh() hashing bug Have caller pass in a safely-obtained reference to the parent directory for calculating a dentry's hash valud. While we're here, simpify the flow through ceph_encode_fh() so that there is a single exit point and cleanup. Also fix a bug with the dentry hash calculation: calculate the hash for the dentry we were given, not its parent. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 3 +-- fs/ceph/export.c | 24 +++++++++++++++--------- fs/ceph/mds_client.c | 2 +- fs/ceph/super.h | 2 +- 4 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 31d27f8f8261..33a19df72288 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1244,9 +1244,8 @@ void ceph_dentry_lru_del(struct dentry *dn) * Return name hash for a given dentry. This is dependent on * the parent directory's hash function. */ -unsigned ceph_dentry_hash(struct dentry *dn) +unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { - struct inode *dir = dn->d_parent->d_inode; struct ceph_inode_info *dci = ceph_inode(dir); switch (dci->i_dir_layout.dl_dir_hash) { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f67b687550de..9fbcdecaaccd 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -46,7 +46,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; - struct dentry *parent = dentry->d_parent; + struct dentry *parent; struct inode *inode = dentry->d_inode; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; @@ -55,26 +55,33 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; + spin_lock(&dentry->d_lock); + parent = dget(dentry->d_parent); + spin_unlock(&dentry->d_lock); + if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent); + cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, + dentry); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { if (connectable) { *max_len = connected_handle_length; - return 255; + type = 255; + } else { + dout("encode_fh %p\n", dentry); + fh->ino = ceph_ino(dentry->d_inode); + *max_len = handle_length; + type = 1; } - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(dentry->d_inode); - *max_len = handle_length; - type = 1; } else { *max_len = handle_length; - return 255; + type = 255; } + dput(parent); return type; } @@ -123,7 +130,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, return dentry; } err = ceph_init_dentry(dentry); - if (err < 0) { iput(inode); return ERR_PTR(err); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3b1e743b8c8d..8a09cd5a659e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -670,7 +670,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else { /* dir + name */ inode = dir; - hash = ceph_dentry_hash(req->r_dentry); + hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; } } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c1eb9a014b74..35dc9656e499 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -800,7 +800,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); -extern unsigned ceph_dentry_hash(struct dentry *dn); +extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); /* -- cgit v1.2.3 From 4f1772645296a230e04f5c53e79cfb6f841ce634 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:31:08 -0700 Subject: ceph: document locking for ceph_set_dentry_offset Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a7db56f1523b..5f0013aafa4b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -843,11 +843,13 @@ out_unlock: /* * Set dentry's directory position based on the current dir's max, and * order it in d_subdirs, so that dcache_readdir behaves. + * + * Always called under directory's i_mutex. */ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; + struct inode *inode = dir->d_inode; struct ceph_dentry_info *di; BUG_ON(!inode); -- cgit v1.2.3 From 41b02e1f9bb87b07d792b64aaeb7af3d00d69cd2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:31:14 -0700 Subject: ceph: explicitly reference rename old_dentry parent dir in request We carry a pin on the parent directory for the rename source and dest dentries. For the source it's r_locked_dir; we need to explicitly reference the old_dentry parent as well, since the dentry's d_parent may change between when the request was created and pinned and when it is freed. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 2 +- fs/ceph/dir.c | 2 ++ fs/ceph/mds_client.c | 23 +++++++++++++---------- fs/ceph/mds_client.h | 1 + 4 files changed, 17 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 0dba6915712b..fb962efdacee 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -102,7 +102,7 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry->d_parent->d_inode), + ceph_ino(req->r_old_dentry_dir), req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 33a19df72288..7263f825d426 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -811,6 +811,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ + req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -909,6 +910,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); + req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8a09cd5a659e..66a8939cc518 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -483,22 +483,26 @@ void ceph_mdsc_release_request(struct kref *kref) destroy_reply_info(&req->r_reply_info); } if (req->r_inode) { - ceph_put_cap_refs(ceph_inode(req->r_inode), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_target_inode) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) { - ceph_put_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + /* + * track (and drop pins for) r_old_dentry_dir + * separately, since r_old_dentry's d_parent may have + * changed between the dir mutex being dropped and + * this request being freed. + */ + ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); dput(req->r_old_dentry); + iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); @@ -1931,9 +1935,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_old_dentry) - ceph_get_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); /* issue */ mutex_lock(&mdsc->mutex); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 8a40f0636ba9..4bb239921dbd 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -171,6 +171,7 @@ struct ceph_mds_request { struct inode *r_inode; /* arg1 */ struct dentry *r_dentry; /* arg1 */ struct dentry *r_old_dentry; /* arg2: rename from or link from */ + struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; -- cgit v1.2.3 From d79698da32b317e96216236f265a9b72b78ae568 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:31:26 -0700 Subject: ceph: document unlocked d_parent accesses For the most part we don't care about racing with rename when directing MDS requests; either the old or new parent is fine. Document that, and do some minor cleanup. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 2 +- fs/ceph/mds_client.c | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 7263f825d426..852ff8600ac9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -506,7 +506,7 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; + struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ /* .snap dir? */ if (err == -ENOENT && diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 66a8939cc518..fee028b5332e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -621,6 +621,12 @@ static void __unregister_request(struct ceph_mds_client *mdsc, */ struct dentry *get_nonsnap_parent(struct dentry *dentry) { + /* + * we don't need to worry about protecting the d_parent access + * here because we never renaming inside the snapped namespace + * except to resplice to another snapdir, and either the old or new + * result is a valid result. + */ while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) dentry = dentry->d_parent; return dentry; @@ -656,7 +662,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_inode) { inode = req->r_inode; } else if (req->r_dentry) { - struct inode *dir = req->r_dentry->d_parent->d_inode; + /* ignore race with rename; old or new d_parent is okay */ + struct dentry *parent = req->r_dentry->d_parent; + struct inode *dir = parent->d_inode; if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ @@ -664,8 +672,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ - struct dentry *dn = - get_nonsnap_parent(req->r_dentry->d_parent); + struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); } else if (req->r_dentry->d_inode) { -- cgit v1.2.3 From abbede1b3a680e108d61aaa415ce5153296e775d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 02:31:30 -0400 Subject: xfs: get rid of open-coded S_ISREG(), etc. Signed-off-by: Al Viro --- fs/xfs/linux-2.6/xfs_ioctl.c | 4 ++-- fs/xfs/xfs_bmap.c | 7 +++---- fs/xfs/xfs_dir2.c | 16 ++++++++-------- fs/xfs/xfs_filestream.c | 2 +- fs/xfs/xfs_inode.c | 16 ++++++++-------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_log_recover.c | 4 ++-- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_rename.c | 4 ++-- fs/xfs/xfs_vnodeops.c | 10 +++++----- 10 files changed, 33 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index acca2c5ca3fa..14fc00e32f6b 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -850,14 +850,14 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & XFS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (xflags & XFS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; if (xflags & XFS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + } else if (S_ISREG(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; if (xflags & XFS_XFLAG_EXTSIZE) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c51a3f903633..ab3e5c6c4642 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -414,7 +414,7 @@ xfs_bmap_add_attrfork_local( if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(ip->i_d.di_mode)) { mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); dargs.dp = ip; @@ -3344,8 +3344,7 @@ xfs_bmap_local_to_extents( * We don't want to deal with the case of keeping inode data inline yet. * So sending the data fork of a regular inode is invalid. */ - ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && - whichfork == XFS_DATA_FORK)); + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); flags = 0; @@ -4052,7 +4051,7 @@ xfs_bmap_one_block( #ifndef DEBUG if (whichfork == XFS_DATA_FORK) { - return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? + return S_ISREG(ip->i_d.di_mode) ? (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); } diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 4580ce00aeb4..a2e27010c7fb 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -121,7 +121,7 @@ xfs_dir_isempty( { xfs_dir2_sf_hdr_t *sfp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if (dp->i_d.di_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) @@ -179,7 +179,7 @@ xfs_dir_init( memset((char *)&args, 0, sizeof(args)); args.dp = dp; args.trans = tp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) return error; return xfs_dir2_sf_create(&args, pdp->i_ino); @@ -202,7 +202,7 @@ xfs_dir_createname( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; XFS_STATS_INC(xs_dir_create); @@ -278,7 +278,7 @@ xfs_dir_lookup( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); memset(&args, 0, sizeof(xfs_da_args_t)); @@ -333,7 +333,7 @@ xfs_dir_removename( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); memset(&args, 0, sizeof(xfs_da_args_t)); @@ -382,7 +382,7 @@ xfs_readdir( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) @@ -414,7 +414,7 @@ xfs_dir_replace( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; @@ -464,7 +464,7 @@ xfs_dir_canenter( if (resblks) return 0; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9124425b7f2f..87fceceee2db 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -344,7 +344,7 @@ _xfs_filestream_update_ag( * Either ip is a regular file and pip is a directory, or ip is a * directory and pip is NULL. */ - ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && + ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && (pip->i_d.di_mode & S_IFDIR)) || ((ip->i_d.di_mode & S_IFDIR) && !pip))); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3cc21ddf9f7e..2fcca4b03ed3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -368,7 +368,7 @@ xfs_iformat( /* * no local regular files yet */ - if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { xfs_warn(ip->i_mount, "corrupt inode %Lu (local format for regular file).", (unsigned long long) ip->i_ino); @@ -1040,7 +1040,7 @@ xfs_ialloc( if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; - if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { + if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { ip->i_d.di_mode |= S_ISGID; } } @@ -1097,14 +1097,14 @@ xfs_ialloc( if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; - if ((mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } - } else if ((mode & S_IFMT) == S_IFREG) { + } else if (S_ISREG(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { @@ -1188,7 +1188,7 @@ xfs_isize_check( int nimaps; xfs_bmbt_irec_t imaps[2]; - if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) + if (!S_ISREG(ip->i_d.di_mode)) return; if (XFS_IS_REALTIME_INODE(ip)) @@ -1828,7 +1828,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || - ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); + (!S_ISREG(ip->i_d.di_mode))); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -2671,7 +2671,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip, ip->i_d.di_magic); goto corrupt_out; } - if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (S_ISREG(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), @@ -2681,7 +2681,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip); goto corrupt_out; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + } else if (S_ISDIR(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a97644ab945a..2380a4bcbece 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -263,7 +263,7 @@ typedef struct xfs_inode { struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ +#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ (ip)->i_size : (ip)->i_d.di_size; /* Convert from vfs inode to xfs inode */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8fe4206de057..052a2c0ec5fb 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2283,7 +2283,7 @@ xlog_recover_inode_pass2( /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; - if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", @@ -2296,7 +2296,7 @@ xlog_recover_inode_pass2( error = EFSCORRUPTED; goto error; } - } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7f25245da289..092e16ae4d9d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1331,7 +1331,7 @@ xfs_mountfs( ASSERT(rip != NULL); - if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { + if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 77a59891734e..df78c297d1a1 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -116,7 +116,7 @@ xfs_rename( trace_xfs_rename(src_dp, target_dp, src_name, target_name); new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); + src_is_directory = S_ISDIR(src_ip->i_d.di_mode); if (src_is_directory) { /* @@ -226,7 +226,7 @@ xfs_rename( * target and source are directories and that target can be * destroyed, or that neither is a directory. */ - if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(target_ip->i_d.di_mode)) { /* * Make sure target dir is empty. */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 88d121486c52..9322e13f0c63 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -121,7 +121,7 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); + ASSERT(S_ISLNK(ip->i_d.di_mode)); ASSERT(ip->i_d.di_size <= MAXPATHLEN); pathlen = ip->i_d.di_size; @@ -529,7 +529,7 @@ xfs_release( if (ip->i_d.di_nlink == 0) return 0; - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + if ((S_ISREG(ip->i_d.di_mode) && ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && @@ -610,7 +610,7 @@ xfs_inactive( truncate = ((ip->i_d.di_nlink == 0) && ((ip->i_d.di_size != 0) || (ip->i_size != 0) || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && - ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); + S_ISREG(ip->i_d.di_mode)); mp = ip->i_mount; @@ -621,7 +621,7 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + if ((S_ISREG(ip->i_d.di_mode) && ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS) && @@ -669,7 +669,7 @@ xfs_inactive( xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { + } else if (S_ISLNK(ip->i_d.di_mode)) { /* * If we get an error while cleaning up a -- cgit v1.2.3 From 03209378b4b25775bf5e6a86e86f074a1057a439 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Jul 2011 20:54:24 -0400 Subject: xfs: fix misspelled S_IS...() mode_t is not a bitmap... Signed-off-by: Al Viro --- fs/xfs/linux-2.6/xfs_ioctl.c | 2 +- fs/xfs/xfs_filestream.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 14fc00e32f6b..f7ce7debe14c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -265,7 +265,7 @@ xfs_open_by_handle( return PTR_ERR(filp); } - if (inode->i_mode & S_IFREG) { + if (S_ISREG(inode->i_mode)) { filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 87fceceee2db..3ff3d9e23ded 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -345,8 +345,8 @@ _xfs_filestream_update_ag( * directory and pip is NULL. */ ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && - (pip->i_d.di_mode & S_IFDIR)) || - ((ip->i_d.di_mode & S_IFDIR) && !pip))); + S_ISDIR(pip->i_d.di_mode)) || + (S_ISDIR(ip->i_d.di_mode) && !pip))); mp = ip->i_mount; cache = mp->m_filestream; @@ -537,7 +537,7 @@ xfs_filestream_lookup_ag( xfs_agnumber_t ag; int ref; - if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { + if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { ASSERT(0); return NULLAGNUMBER; } @@ -579,9 +579,9 @@ xfs_filestream_associate( xfs_agnumber_t ag, rotorstep, startag; int err = 0; - ASSERT(pip->i_d.di_mode & S_IFDIR); - ASSERT(ip->i_d.di_mode & S_IFREG); - if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) + ASSERT(S_ISDIR(pip->i_d.di_mode)); + ASSERT(S_ISREG(ip->i_d.di_mode)); + if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) return -EINVAL; mp = pip->i_mount; -- cgit v1.2.3 From e57712ebebbb9db7d8dcef216437b3171ddcf115 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 04:15:54 -0400 Subject: merge fchmod() and fchmodat() guts, kill ancient broken kludge The kludge in question is undocumented and doesn't work for 32bit binaries on amd64, sparc64 and s390. Passing (mode_t)-1 as mode had (since 0.99.14v and contrary to behaviour of any other Unix, prescriptions of POSIX, SuS and our own manpages) was kinda-sorta no-op. Note that any software relying on that (and looking for examples shows none) would be visibly broken on sparc64, where practically all userland is built 32bit. No such complaints noticed... Signed-off-by: Al Viro --- fs/open.c | 78 +++++++++++++++++++++++---------------------------------------- 1 file changed, 28 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 739b751aa73e..f71192109457 100644 --- a/fs/open.c +++ b/fs/open.c @@ -446,74 +446,52 @@ out: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +static int chmod_common(struct path *path, umode_t mode) { - struct inode * inode; - struct dentry * dentry; - struct file * file; - int err = -EBADF; + struct inode *inode = path->dentry->d_inode; struct iattr newattrs; + int error; - file = fget(fd); - if (!file) - goto out; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - - audit_inode(NULL, dentry); - - err = mnt_want_write_file(file); - if (err) - goto out_putf; + error = mnt_want_write(path->mnt); + if (error) + return error; mutex_lock(&inode->i_mutex); - err = security_path_chmod(dentry, file->f_vfsmnt, mode); - if (err) + error = security_path_chmod(path->dentry, path->mnt, mode); + if (error) goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); + error = notify_change(path->dentry, &newattrs); out_unlock: mutex_unlock(&inode->i_mutex); - mnt_drop_write(file->f_path.mnt); -out_putf: - fput(file); -out: + mnt_drop_write(path->mnt); + return error; +} + +SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +{ + struct file * file; + int err = -EBADF; + + file = fget(fd); + if (file) { + audit_inode(NULL, file->f_path.dentry); + err = chmod_common(&file->f_path, mode); + fput(file); + } return err; } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) { struct path path; - struct inode *inode; int error; - struct iattr newattrs; error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); - if (error) - goto out; - inode = path.dentry->d_inode; - - error = mnt_want_write(path.mnt); - if (error) - goto dput_and_out; - mutex_lock(&inode->i_mutex); - error = security_path_chmod(path.dentry, path.mnt, mode); - if (error) - goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path.dentry, &newattrs); -out_unlock: - mutex_unlock(&inode->i_mutex); - mnt_drop_write(path.mnt); -dput_and_out: - path_put(&path); -out: + if (!error) { + error = chmod_common(&path, mode); + path_put(&path); + } return error; } -- cgit v1.2.3 From 3141c8b165644774eb0e83d8330fbe47e45b37bf Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 26 Jul 2011 16:08:32 -0700 Subject: coredump: use task comm instead of (unknown) If we don't know the file corresponding to the binary (i.e. exe_file is unknown), use "task->comm (path unknown)" instead of simple "(unknown)" as suggested by ak. The fallback is the same as %e except it will append "(path unknown)". Signed-off-by: Jiri Slaby Cc: Alan Cox Cc: Al Viro Cc: Andi Kleen Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 842d5700c155..a682624de572 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1657,7 +1657,7 @@ static int cn_print_exe_file(struct core_name *cn) exe_file = get_mm_exe_file(current->mm); if (!exe_file) - return cn_printf(cn, "(unknown)"); + return cn_printf(cn, "%s (path unknown)", current->comm); pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { -- cgit v1.2.3 From 2c563731fee0f625924f72e854957bc77601e8b3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 26 Jul 2011 16:08:33 -0700 Subject: coredump: escape / in hostname and comm Change every occurence of / in comm and hostname to !. If the process changes its name to contain /, the core is not dumped (if the directory tree doesn't exist like that). The same with hostname being something like myhost/3. Fix this behaviour by using the escape loop used in %E. (We extract it to a separate function.) Now both with comm == myprocess/1 and hostname == myhost/1, the core is dumped like (kernel.core_pattern='core.%p.%e.%h): core.2349.myprocess!1.myhost!1 Signed-off-by: Jiri Slaby Cc: Alan Cox Cc: Al Viro Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a682624de572..27d487f913d3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1649,15 +1649,26 @@ expand_fail: return ret; } +static void cn_escape(char *str) +{ + for (; *str; str++) + if (*str == '/') + *str = '!'; +} + static int cn_print_exe_file(struct core_name *cn) { struct file *exe_file; - char *pathbuf, *path, *p; + char *pathbuf, *path; int ret; exe_file = get_mm_exe_file(current->mm); - if (!exe_file) - return cn_printf(cn, "%s (path unknown)", current->comm); + if (!exe_file) { + char *commstart = cn->corename + cn->used; + ret = cn_printf(cn, "%s (path unknown)", current->comm); + cn_escape(commstart); + return ret; + } pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { @@ -1671,9 +1682,7 @@ static int cn_print_exe_file(struct core_name *cn) goto free_buf; } - for (p = path; *p; p++) - if (*p == '/') - *p = '!'; + cn_escape(path); ret = cn_printf(cn, "%s", path); @@ -1745,16 +1754,22 @@ static int format_corename(struct core_name *cn, long signr) break; } /* hostname */ - case 'h': + case 'h': { + char *namestart = cn->corename + cn->used; down_read(&uts_sem); err = cn_printf(cn, "%s", utsname()->nodename); up_read(&uts_sem); + cn_escape(namestart); break; + } /* executable */ - case 'e': + case 'e': { + char *commstart = cn->corename + cn->used; err = cn_printf(cn, "%s", current->comm); + cn_escape(commstart); break; + } case 'E': err = cn_print_exe_file(cn); break; -- cgit v1.2.3 From 99b64567486716d18b2156cad188d86478816e4f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 26 Jul 2011 16:08:34 -0700 Subject: do_coredump: fix the "ispipe" error check do_coredump() assumes that if format_corename() fails it should return -ENOMEM. This is not true, for example cn_print_exe_file() can propagate the error from d_path. Even if it was true, this is too fragile. Change the code to check "ispipe < 0". Signed-off-by: Oleg Nesterov Signed-off-by: Jiri Slaby Reviewed-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 27d487f913d3..f8fad7fc0e5f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2133,16 +2133,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(&cn, signr); - if (ispipe == -ENOMEM) { - printk(KERN_WARNING "format_corename failed\n"); - printk(KERN_WARNING "Aborting core\n"); - goto fail_corename; - } - if (ispipe) { int dump_count; char **helper_argv; + if (ispipe < 0) { + printk(KERN_WARNING "format_corename failed\n"); + printk(KERN_WARNING "Aborting core\n"); + goto fail_corename; + } + if (cprm.limit == 1) { /* * Normally core limits are irrelevant to pipes, since -- cgit v1.2.3 From d2857e79a2ba7c155eaa1a7d3581c8d26b31e54e Mon Sep 17 00:00:00 2001 From: Daisuke Ogino Date: Tue, 26 Jul 2011 16:08:37 -0700 Subject: procfs: return ENOENT on opening a being-removed proc entry Change the return value to ENOENT. This return value is then returned when opening the proc entry that have been removed. For example, open("/proc/bus/pci/XX/YY") when the corresponding device is being hot-removed. Signed-off-by: Daisuke Ogino Cc: Jesse Barnes Acked-by: Alexey Dobriyan Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 74b48cfa1bb2..7ed72d6c1c6f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -319,7 +319,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); kfree(pdeo); - return -EINVAL; + return -ENOENT; } pde->pde_users++; open = pde->proc_fops->open; -- cgit v1.2.3 From 293eb1e7772b25a93647c798c7b89bf26c2da2e0 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 26 Jul 2011 16:08:38 -0700 Subject: proc: fix a race in do_io_accounting() If an inode's mode permits opening /proc/PID/io and the resulting file descriptor is kept across execve() of a setuid or similar binary, the ptrace_may_access() check tries to prevent using this fd against the task with escalated privileges. Unfortunately, there is a race in the check against execve(). If execve() is processed after the ptrace check, but before the actual io information gathering, io statistics will be gathered from the privileged process. At least in theory this might lead to gathering sensible information (like ssh/ftp password length) that wouldn't be available otherwise. Holding task->signal->cred_guard_mutex while gathering the io information should protect against the race. The order of locking is similar to the one inside of ptrace_attach(): first goes cred_guard_mutex, then lock_task_sighand(). Signed-off-by: Vasiliy Kulikov Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index c9e3f650f23c..08e3eccf9a12 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2706,9 +2706,16 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; + int result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2719,7 +2726,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) unlock_task_sighand(task, &flags); } - return sprintf(buffer, + result = sprintf(buffer, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@ -2734,6 +2741,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) -- cgit v1.2.3 From aacb3d17a73f6447c04e4d769391238dcf85568d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 26 Jul 2011 16:08:40 -0700 Subject: fs/exec.c: use BUILD_BUG_ON for VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP Commit a8bef8ff6ea1 ("mm: migration: avoid race between shift_arg_pages() and rmap_walk() during migration by not migrating temporary stacks") introduced a BUG_ON() to ensure that VM_STACK_FLAGS and VM_STACK_INCOMPLETE_SETUP do not overlap. The check is a compile time one, so BUILD_BUG_ON is more appropriate. Signed-off-by: Michal Hocko Cc: Mel Gorman Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index f8fad7fc0e5f..01829a1cb766 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -277,7 +277,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; -- cgit v1.2.3 From 912193521b719fbfc2f16776febf5232fe8ba261 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 26 Jul 2011 16:08:41 -0700 Subject: exec: do not call request_module() twice from search_binary_handler() Currently, search_binary_handler() tries to load binary loader module using request_module() if a loader for the requested program is not yet loaded. But second attempt of request_module() does not affect the result of search_binary_handler(). If request_module() triggered recursion, calling request_module() twice causes 2 to the power of MAX_KMOD_CONCURRENT (= 50) repetitions. It is not an infinite loop but is sufficient for users to consider as a hang up. Therefore, this patch changes not to call request_module() twice, making 1 to the power of MAX_KMOD_CONCURRENT repetitions in case of recursion. Signed-off-by: Tetsuo Handa Reported-by: Richard Weinberger Tested-by: Richard Weinberger Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 01829a1cb766..e6770a526f34 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1440,6 +1440,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) printable(bprm->buf[2]) && printable(bprm->buf[3])) break; /* -ENOEXEC */ + if (try) + break; /* -ENOEXEC */ request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); #endif } -- cgit v1.2.3 From b4edf8bd06916645b57df23a720b17cae4051c43 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 26 Jul 2011 16:08:42 -0700 Subject: exec: do not retry load_binary method if CONFIG_MODULES=n If CONFIG_MODULES=n, it makes no sense to retry the list of binary formats handler because the list will not be modified by request_module(). Signed-off-by: Tetsuo Handa Cc: Richard Weinberger Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index e6770a526f34..0e8e59939d09 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1430,9 +1430,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } } read_unlock(&binfmt_lock); +#ifdef CONFIG_MODULES if (retval != -ENOEXEC || bprm->mm == NULL) { break; -#ifdef CONFIG_MODULES } else { #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) if (printable(bprm->buf[0]) && @@ -1443,8 +1443,10 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) if (try) break; /* -ENOEXEC */ request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); -#endif } +#else + break; +#endif } return retval; } -- cgit v1.2.3 From 32e107f71e4a993ac438f0049aa4019457911ffb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 26 Jul 2011 16:08:43 -0700 Subject: fs/exec.c:acct_arg_size(): ptl is no longer needed for add_mm_counter() acct_arg_size() takes ->page_table_lock around add_mm_counter() if !SPLIT_RSS_COUNTING. This is not needed after commit 172703b08cd0 ("mm: delete non-atomic mm counter implementation"). Signed-off-by: Oleg Nesterov Reviewed-by: Matt Fleming Cc: Dave Hansen Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0e8e59939d09..da80612a35f4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -181,14 +181,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) return; bprm->vma_pages = pages; - -#ifdef SPLIT_RSS_COUNTING - add_mm_counter(mm, MM_ANONPAGES, diff); -#else - spin_lock(&mm->page_table_lock); add_mm_counter(mm, MM_ANONPAGES, diff); - spin_unlock(&mm->page_table_lock); -#endif } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, -- cgit v1.2.3 From 60063497a95e716c9a689af3be2687d261f115b4 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Tue, 26 Jul 2011 16:09:06 -0700 Subject: atomic: use This allows us to move duplicated code in (atomic_inc_not_zero() for now) to Signed-off-by: Arun Sharma Reviewed-by: Eric Dumazet Cc: Ingo Molnar Cc: David Miller Cc: Eric Dumazet Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/atomic.h | 1 - arch/alpha/include/asm/local.h | 2 +- arch/alpha/kernel/perf_event.c | 2 +- arch/alpha/kernel/smp.c | 2 +- arch/alpha/lib/dec_and_lock.c | 2 +- arch/arm/include/asm/atomic.h | 1 - arch/arm/kernel/smp.c | 2 +- arch/arm/kernel/traps.c | 2 +- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-bcmring/dma.c | 2 +- arch/arm/mach-cns3xxx/include/mach/pm.h | 2 +- arch/arm/mach-cns3xxx/pm.c | 2 +- arch/arm/mach-omap1/pm.c | 2 +- arch/arm/mach-s3c2440/clock.c | 2 +- arch/arm/mach-s3c2440/s3c2442.c | 2 +- arch/arm/mach-s3c2440/s3c244x-clock.c | 2 +- arch/avr32/include/asm/atomic.h | 1 - arch/blackfin/include/asm/atomic.h | 1 - arch/blackfin/include/asm/dma.h | 2 +- arch/blackfin/include/asm/ipipe.h | 2 +- arch/blackfin/include/asm/spinlock.h | 2 +- arch/blackfin/kernel/ftrace.c | 2 +- arch/blackfin/kernel/ipipe.c | 2 +- arch/blackfin/kernel/nmi.c | 2 +- arch/blackfin/mach-common/smp.c | 2 +- arch/cris/arch-v32/drivers/cryptocop.c | 2 +- arch/cris/arch-v32/kernel/smp.c | 2 +- arch/cris/include/asm/atomic.h | 1 - arch/cris/include/asm/bitops.h | 2 +- arch/cris/kernel/process.c | 2 +- arch/frv/include/asm/atomic.h | 1 - arch/frv/include/asm/hardirq.h | 2 +- arch/frv/kernel/irq.c | 2 +- arch/h8300/include/asm/atomic.h | 1 - arch/ia64/include/asm/atomic.h | 1 - arch/ia64/include/asm/processor.h | 2 +- arch/ia64/include/asm/spinlock.h | 2 +- arch/ia64/kernel/smp.c | 2 +- arch/ia64/kernel/smpboot.c | 2 +- arch/ia64/kernel/uncached.c | 2 +- arch/m32r/include/asm/atomic.h | 1 - arch/m32r/include/asm/mmu_context.h | 2 +- arch/m32r/include/asm/spinlock.h | 2 +- arch/m32r/kernel/smp.c | 2 +- arch/m32r/kernel/traps.c | 2 +- arch/m68k/include/asm/atomic.h | 1 - arch/microblaze/include/asm/mmu_context_mm.h | 2 +- arch/microblaze/include/asm/prom.h | 2 +- arch/mips/include/asm/atomic.h | 1 - arch/mips/include/asm/hw_irq.h | 2 +- arch/mips/include/asm/local.h | 2 +- arch/mips/include/asm/smp.h | 2 +- arch/mips/kernel/irq.c | 2 +- arch/mips/kernel/mips-mt.c | 2 +- arch/mips/kernel/rtlx.c | 2 +- arch/mips/kernel/smp-cmp.c | 2 +- arch/mips/kernel/smp-mt.c | 2 +- arch/mips/kernel/smp.c | 2 +- arch/mips/kernel/smtc-proc.c | 2 +- arch/mips/kernel/smtc.c | 2 +- arch/mips/kernel/sync-r4k.c | 2 +- arch/mips/kernel/vpe.c | 2 +- arch/mips/mipssim/sim_smtc.c | 2 +- arch/mips/sgi-ip27/ip27-nmi.c | 2 +- arch/mn10300/include/asm/atomic.h | 1 - arch/mn10300/include/asm/mmu_context.h | 2 +- arch/mn10300/include/asm/spinlock.h | 2 +- arch/mn10300/include/asm/system.h | 2 +- arch/mn10300/kernel/mn10300-watchdog.c | 2 +- arch/mn10300/kernel/traps.c | 2 +- arch/mn10300/mm/misalignment.c | 2 +- arch/mn10300/proc-mn2ws0050/proc-init.c | 2 +- arch/parisc/include/asm/atomic.h | 1 - arch/parisc/include/asm/bitops.h | 2 +- arch/parisc/include/asm/mmu_context.h | 2 +- arch/parisc/kernel/parisc_ksyms.c | 2 +- arch/parisc/kernel/smp.c | 2 +- arch/parisc/kernel/traps.c | 2 +- arch/parisc/lib/bitops.c | 2 +- arch/powerpc/include/asm/atomic.h | 1 - arch/powerpc/include/asm/emulated_ops.h | 2 +- arch/powerpc/include/asm/irq.h | 2 +- arch/powerpc/include/asm/local.h | 2 +- arch/powerpc/include/asm/prom.h | 2 +- arch/powerpc/kernel/of_platform.c | 2 +- arch/powerpc/kernel/ppc_ksyms.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/rtasd.c | 2 +- arch/powerpc/kernel/smp-tbsync.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/platforms/83xx/km83xx.c | 2 +- arch/powerpc/platforms/83xx/mpc832x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc834x_itx.c | 2 +- arch/powerpc/platforms/83xx/mpc834x_mds.c | 2 +- arch/powerpc/platforms/83xx/mpc836x_mds.c | 2 +- arch/powerpc/platforms/83xx/sbc834x.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_cds.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_mds.c | 2 +- arch/powerpc/platforms/85xx/sbc8548.c | 2 +- arch/powerpc/platforms/cell/cpufreq_spudemand.c | 2 +- arch/powerpc/platforms/cell/smp.c | 2 +- arch/powerpc/platforms/cell/spufs/context.c | 2 +- arch/powerpc/platforms/chrp/smp.c | 2 +- arch/powerpc/platforms/iseries/smp.c | 2 +- arch/powerpc/platforms/powermac/backlight.c | 2 +- arch/powerpc/platforms/powermac/smp.c | 2 +- arch/powerpc/platforms/pseries/eeh.c | 2 +- arch/powerpc/platforms/pseries/eeh_cache.c | 2 +- arch/powerpc/platforms/pseries/smp.c | 2 +- arch/powerpc/sysdev/fsl_soc.c | 2 +- arch/powerpc/sysdev/tsi108_dev.c | 2 +- arch/s390/include/asm/atomic.h | 1 - arch/s390/kernel/dis.c | 2 +- arch/s390/kernel/traps.c | 2 +- arch/sh/include/asm/atomic.h | 1 - arch/sh/include/asm/hw_irq.h | 2 +- arch/sh/include/asm/smp.h | 2 +- arch/sh/kernel/idle.c | 2 +- arch/sh/kernel/smp.c | 2 +- arch/sh/kernel/traps_64.c | 2 +- arch/sh/kernel/unwinder.c | 2 +- arch/sparc/include/asm/atomic_32.h | 1 - arch/sparc/include/asm/atomic_64.h | 1 - arch/sparc/include/asm/prom.h | 2 +- arch/sparc/include/asm/smp_32.h | 2 +- arch/sparc/include/asm/smp_64.h | 2 +- arch/sparc/kernel/irq_64.c | 2 +- arch/sparc/kernel/leon_smp.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/sparc/kernel/smp_32.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/lib/atomic32.c | 2 +- arch/tile/include/asm/atomic.h | 9 --------- arch/tile/include/asm/atomic_32.h | 4 ++-- arch/tile/include/asm/atomic_64.h | 2 +- arch/tile/include/asm/bitops_32.h | 2 +- arch/tile/include/asm/bitops_64.h | 2 +- arch/tile/include/asm/spinlock_32.h | 2 +- arch/tile/kernel/intvec_32.S | 2 +- arch/tile/lib/atomic_32.c | 2 +- arch/tile/lib/atomic_asm_32.S | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/atomic.h | 1 - arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/local.h | 2 +- arch/x86/include/asm/mce.h | 2 +- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/prom.h | 2 +- arch/x86/include/asm/spinlock.h | 2 +- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/i8259.c | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/traps.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/lib/atomic64_32.c | 2 +- arch/x86/mm/mmio-mod.c | 2 +- arch/xtensa/include/asm/atomic.h | 1 - arch/xtensa/kernel/process.c | 2 +- crypto/af_alg.c | 2 +- crypto/proc.c | 2 +- crypto/rng.c | 2 +- drivers/atm/ambassador.c | 2 +- drivers/atm/atmtcp.c | 2 +- drivers/atm/eni.c | 2 +- drivers/atm/eni.h | 2 +- drivers/atm/firestream.c | 2 +- drivers/atm/fore200e.c | 2 +- drivers/atm/horizon.c | 2 +- drivers/atm/idt77252.c | 2 +- drivers/atm/iphase.c | 2 +- drivers/atm/nicstar.c | 2 +- drivers/atm/suni.c | 2 +- drivers/atm/uPD98402.c | 2 +- drivers/atm/zatm.c | 2 +- drivers/base/memory.c | 2 +- drivers/base/power/sysfs.c | 2 +- drivers/block/cciss_scsi.c | 2 +- drivers/char/ipmi/ipmi_watchdog.c | 2 +- drivers/char/mspec.c | 2 +- drivers/connector/cn_proc.c | 3 ++- drivers/edac/edac_stub.c | 2 +- drivers/firewire/core-card.c | 2 +- drivers/firewire/core-device.c | 2 +- drivers/firewire/core-topology.c | 2 +- drivers/firewire/core.h | 2 +- drivers/firewire/nosy.c | 2 +- drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_fence.c | 2 +- drivers/gpu/drm/ttm/ttm_bo.c | 2 +- drivers/gpu/drm/ttm/ttm_lock.c | 2 +- drivers/gpu/drm/ttm/ttm_object.c | 2 +- drivers/gpu/drm/ttm/ttm_page_alloc.c | 2 +- drivers/hwmon/sht15.c | 2 +- drivers/infiniband/hw/cxgb4/mem.c | 2 +- drivers/infiniband/hw/ehca/ehca_tools.h | 2 +- drivers/infiniband/hw/nes/nes_cm.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib.h | 2 +- drivers/infiniband/ulp/srp/ib_srp.c | 2 +- drivers/isdn/gigaset/gigaset.h | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-queue-length.c | 2 +- drivers/md/dm-table.c | 2 +- drivers/media/video/hdpvr/hdpvr-core.c | 2 +- drivers/media/video/tlg2300/pd-dvb.c | 2 +- drivers/media/video/uvc/uvc_ctrl.c | 2 +- drivers/media/video/uvc/uvc_queue.c | 2 +- drivers/media/video/uvc/uvc_v4l2.c | 2 +- drivers/media/video/uvc/uvc_video.c | 2 +- drivers/message/i2o/i2o_scsi.c | 2 +- drivers/misc/phantom.c | 2 +- drivers/net/atlx/atl1.c | 2 +- drivers/net/atlx/atl2.c | 2 +- drivers/net/atlx/atl2.h | 2 +- drivers/net/cassini.c | 2 +- drivers/net/cpmac.c | 2 +- drivers/net/cxgb3/cxgb3_offload.c | 2 +- drivers/net/cxgb3/l2t.h | 2 +- drivers/net/cxgb3/t3cdev.h | 2 +- drivers/net/cxgb4/cxgb4_uld.h | 2 +- drivers/net/cxgb4/l2t.h | 2 +- drivers/net/hamradio/6pack.c | 2 +- drivers/net/hamradio/dmascc.c | 2 +- drivers/net/ibmveth.c | 2 +- drivers/net/phy/phy.c | 2 +- drivers/net/ppp_generic.c | 2 +- drivers/net/wimax/i2400m/i2400m.h | 2 +- drivers/net/wireless/b43legacy/b43legacy.h | 2 +- drivers/net/wireless/b43legacy/dma.h | 2 +- drivers/oprofile/oprofile_stats.h | 2 +- drivers/pci/hotplug/cpci_hotplug_core.c | 2 +- drivers/pci/xen-pcifront.c | 2 +- drivers/s390/block/dasd_eer.c | 2 +- drivers/s390/char/sclp_quiesce.c | 2 +- drivers/s390/char/vmlogrdr.c | 2 +- drivers/s390/cio/device.h | 2 +- drivers/s390/cio/qdio_main.c | 2 +- drivers/s390/cio/qdio_thinint.c | 2 +- drivers/s390/crypto/ap_bus.c | 2 +- drivers/s390/crypto/zcrypt_api.c | 2 +- drivers/s390/crypto/zcrypt_cex2a.c | 2 +- drivers/s390/crypto/zcrypt_mono.c | 2 +- drivers/s390/crypto/zcrypt_pcica.c | 2 +- drivers/s390/crypto/zcrypt_pcicc.c | 2 +- drivers/s390/crypto/zcrypt_pcixcc.c | 2 +- drivers/s390/net/fsm.h | 2 +- drivers/s390/scsi/zfcp_scsi.c | 2 +- drivers/sbus/char/display7seg.c | 2 +- drivers/scsi/dpt/dpti_i2o.h | 2 +- drivers/scsi/hpsa.c | 2 +- drivers/scsi/pm8001/pm8001_sas.h | 2 +- drivers/staging/octeon/ethernet-rx.c | 2 +- drivers/staging/octeon/ethernet-tx.c | 2 +- drivers/staging/solo6x10/solo6x10.h | 2 +- drivers/staging/tidspbridge/include/dspbridge/host_os.h | 2 +- drivers/staging/winbond/mds_s.h | 2 +- drivers/staging/winbond/wb35reg_s.h | 2 +- drivers/tty/bfin_jtag_comm.c | 2 +- drivers/tty/rocket.c | 2 +- drivers/tty/serial/dz.c | 2 +- drivers/tty/serial/sb1250-duart.c | 2 +- drivers/tty/serial/zs.c | 2 +- drivers/usb/gadget/f_audio.c | 2 +- drivers/usb/gadget/f_rndis.c | 2 +- drivers/usb/gadget/uvc_queue.c | 2 +- drivers/usb/image/microtek.c | 2 +- drivers/usb/misc/appledisplay.c | 2 +- drivers/usb/serial/garmin_gps.c | 2 +- drivers/usb/wusbcore/wa-rpipe.c | 2 +- drivers/vhost/vhost.h | 2 +- drivers/video/sh_mobile_lcdcfb.c | 2 +- drivers/video/vermilion/vermilion.h | 2 +- drivers/w1/masters/matrox_w1.c | 2 +- drivers/w1/w1.c | 2 +- drivers/w1/w1_family.h | 2 +- drivers/watchdog/intel_scu_watchdog.c | 2 +- drivers/watchdog/sbc7240_wdt.c | 2 +- fs/btrfs/delayed-inode.h | 2 +- fs/direct-io.c | 2 +- fs/eventpoll.c | 2 +- fs/file_table.c | 2 +- fs/gfs2/main.c | 2 +- fs/nfs/cache_lib.h | 2 +- fs/nfs/direct.c | 2 +- fs/notify/group.c | 2 +- fs/notify/inode_mark.c | 2 +- fs/notify/mark.c | 2 +- fs/notify/notification.c | 2 +- fs/notify/vfsmount_mark.c | 2 +- fs/ntfs/inode.h | 2 +- fs/posix_acl.c | 2 +- fs/proc/meminfo.c | 2 +- include/acpi/platform/aclinux.h | 2 +- include/asm-generic/atomic.h | 2 -- include/asm-generic/local.h | 2 +- include/asm-generic/local64.h | 2 +- include/drm/ttm/ttm_lock.h | 2 +- include/linux/aio.h | 2 +- include/linux/atmdev.h | 2 +- include/linux/atomic.h | 9 +++++++++ include/linux/backing-dev.h | 2 +- include/linux/bit_spinlock.h | 2 +- include/linux/buffer_head.h | 2 +- include/linux/configfs.h | 2 +- include/linux/connector.h | 2 +- include/linux/cred.h | 2 +- include/linux/crypto.h | 2 +- include/linux/dcache.h | 2 +- include/linux/debug_locks.h | 2 +- include/linux/device.h | 2 +- include/linux/edac.h | 2 +- include/linux/fault-inject.h | 2 +- include/linux/fdtable.h | 2 +- include/linux/filter.h | 2 +- include/linux/firewire.h | 2 +- include/linux/fsnotify_backend.h | 2 +- include/linux/interrupt.h | 2 +- include/linux/jump_label.h | 2 +- include/linux/kdb.h | 2 +- include/linux/key.h | 2 +- include/linux/kgdb.h | 2 +- include/linux/kobject.h | 2 +- include/linux/mlx4/device.h | 2 +- include/linux/mman.h | 2 +- include/linux/mmzone.h | 2 +- include/linux/mount.h | 2 +- include/linux/mutex.h | 2 +- include/linux/netdevice.h | 2 +- include/linux/nfs_fs_sb.h | 2 +- include/linux/oprofile.h | 2 +- include/linux/pci.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/phy.h | 2 +- include/linux/proc_fs.h | 2 +- include/linux/quota.h | 2 +- include/linux/rwsem.h | 2 +- include/linux/sem.h | 2 +- include/linux/skbuff.h | 2 +- include/linux/sonet.h | 2 +- include/linux/spinlock.h | 2 +- include/linux/sunrpc/auth.h | 2 +- include/linux/sunrpc/cache.h | 2 +- include/linux/sunrpc/timer.h | 2 +- include/linux/swap.h | 2 +- include/linux/sysfs.h | 2 +- include/linux/vmstat.h | 2 +- include/linux/workqueue.h | 2 +- include/net/ax25.h | 2 +- include/net/cipso_ipv4.h | 2 +- include/net/flow.h | 2 +- include/net/inet_hashtables.h | 2 +- include/net/inet_timewait_sock.h | 2 +- include/net/inetpeer.h | 2 +- include/net/ip_vs.h | 2 +- include/net/lib80211.h | 2 +- include/net/llc.h | 2 +- include/net/neighbour.h | 2 +- include/net/net_namespace.h | 2 +- include/net/netfilter/nf_conntrack.h | 2 +- include/net/netlabel.h | 2 +- include/net/netns/conntrack.h | 2 +- include/net/sctp/structs.h | 2 +- include/pcmcia/ds.h | 2 +- include/rdma/ib_sa.h | 2 +- include/rdma/ib_verbs.h | 2 +- include/rxrpc/types.h | 2 +- include/scsi/scsi_device.h | 2 +- kernel/audit.c | 2 +- kernel/auditsc.c | 2 +- kernel/cgroup.c | 2 +- kernel/cpuset.c | 2 +- kernel/debug/debug_core.c | 2 +- kernel/rcupdate.c | 2 +- kernel/rcutorture.c | 2 +- kernel/rcutree_trace.c | 2 +- kernel/rwsem.c | 2 +- kernel/stop_machine.c | 2 +- kernel/taskstats.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_mmiotrace.c | 2 +- lib/atomic64.c | 2 +- lib/atomic64_test.c | 2 +- lib/crc32.c | 2 +- lib/dec_and_lock.c | 2 +- mm/init-mm.c | 2 +- mm/kmemleak.c | 2 +- mm/slob.c | 2 +- mm/vmalloc.c | 2 +- net/atm/atm_misc.c | 2 +- net/atm/clip.c | 2 +- net/atm/common.c | 2 +- net/atm/lec.c | 2 +- net/atm/proc.c | 2 +- net/bridge/br_fdb.c | 2 +- net/core/flow.c | 2 +- net/decnet/dn_fib.c | 2 +- net/decnet/dn_neigh.c | 2 +- net/decnet/dn_table.c | 2 +- net/decnet/dn_timer.c | 2 +- net/ipv4/cipso_ipv4.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/iucv/iucv.c | 2 +- net/l2tp/l2tp_core.c | 2 +- net/l2tp/l2tp_ppp.c | 2 +- net/netfilter/nfnetlink_log.c | 2 +- net/netfilter/nfnetlink_queue.c | 2 +- net/netlabel/netlabel_cipso_v4.c | 2 +- net/netlabel/netlabel_kapi.c | 2 +- net/netlabel/netlabel_mgmt.c | 2 +- net/netlabel/netlabel_mgmt.h | 2 +- net/netlabel/netlabel_unlabeled.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- net/tipc/core.h | 2 +- security/selinux/hooks.c | 2 +- security/selinux/xfrm.c | 2 +- sound/pci/echoaudio/darla20.c | 2 +- sound/pci/echoaudio/darla24.c | 2 +- sound/pci/echoaudio/echo3g.c | 2 +- sound/pci/echoaudio/gina20.c | 2 +- sound/pci/echoaudio/gina24.c | 2 +- sound/pci/echoaudio/indigo.c | 2 +- sound/pci/echoaudio/indigodj.c | 2 +- sound/pci/echoaudio/indigodjx.c | 2 +- sound/pci/echoaudio/indigoio.c | 2 +- sound/pci/echoaudio/indigoiox.c | 2 +- sound/pci/echoaudio/layla20.c | 2 +- sound/pci/echoaudio/layla24.c | 2 +- sound/pci/echoaudio/mia.c | 2 +- sound/pci/echoaudio/mona.c | 2 +- sound/pci/lx6464es/lx6464es.h | 2 +- sound/sparc/dbri.c | 2 +- 439 files changed, 427 insertions(+), 448 deletions(-) (limited to 'fs') diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index e756d04b6cd5..88b7491490bc 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -199,7 +199,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /** * atomic64_add_unless - add unless the number is a given value diff --git a/arch/alpha/include/asm/local.h b/arch/alpha/include/asm/local.h index b9e3e3318371..9c94b8456043 100644 --- a/arch/alpha/include/asm/local.h +++ b/arch/alpha/include/asm/local.h @@ -2,7 +2,7 @@ #define _ALPHA_LOCAL_H #include -#include +#include typedef struct { diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 8e47709160f8..8143cd7cdbfb 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index d739703608fc..4087a569b43b 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include diff --git a/arch/alpha/lib/dec_and_lock.c b/arch/alpha/lib/dec_and_lock.c index 0f5520d2f45f..f9f5fe830e9f 100644 --- a/arch/alpha/lib/dec_and_lock.c +++ b/arch/alpha/lib/dec_and_lock.c @@ -6,7 +6,7 @@ */ #include -#include +#include asm (".text \n\ .global _atomic_dec_and_lock \n\ diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 7e79503ab89b..4d501f1bdc9d 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -217,7 +217,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) c = old; return c != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_inc(v) atomic_add(1, v) #define atomic_dec(v) atomic_sub(1, v) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 167e3cbe1f2f..d88ff0230e82 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 2d3436e9f71f..bc9f9da782cb 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index ea53f4d9b283..4159eca78945 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c index 9f2a948e0e72..0ca00050666a 100644 --- a/arch/arm/mach-bcmring/dma.c +++ b/arch/arm/mach-bcmring/dma.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include /* I don't quite understand why dc4 fails when this is set to 1 and DMA is enabled */ diff --git a/arch/arm/mach-cns3xxx/include/mach/pm.h b/arch/arm/mach-cns3xxx/include/mach/pm.h index 6eae7f764d1d..c2588cc991d1 100644 --- a/arch/arm/mach-cns3xxx/include/mach/pm.h +++ b/arch/arm/mach-cns3xxx/include/mach/pm.h @@ -11,7 +11,7 @@ #ifndef __CNS3XXX_PM_H #define __CNS3XXX_PM_H -#include +#include void cns3xxx_pwr_clk_en(unsigned int block); void cns3xxx_pwr_clk_dis(unsigned int block); diff --git a/arch/arm/mach-cns3xxx/pm.c b/arch/arm/mach-cns3xxx/pm.c index 5e579552aa54..0c04678615ce 100644 --- a/arch/arm/mach-cns3xxx/pm.c +++ b/arch/arm/mach-cns3xxx/pm.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c index 98ba9784aa15..495b3987d461 100644 --- a/arch/arm/mach-omap1/pm.c +++ b/arch/arm/mach-omap1/pm.c @@ -44,7 +44,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c2440/clock.c b/arch/arm/mach-s3c2440/clock.c index 554e0d3ec70b..f9e6bdaf41d2 100644 --- a/arch/arm/mach-s3c2440/clock.c +++ b/arch/arm/mach-s3c2440/clock.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c2440/s3c2442.c b/arch/arm/mach-s3c2440/s3c2442.c index 6224bad4d604..9ad99f8016a1 100644 --- a/arch/arm/mach-s3c2440/s3c2442.c +++ b/arch/arm/mach-s3c2440/s3c2442.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c2440/s3c244x-clock.c b/arch/arm/mach-s3c2440/s3c244x-clock.c index f8d96130d1d1..7f5ea0a169a5 100644 --- a/arch/arm/mach-s3c2440/s3c244x-clock.c +++ b/arch/arm/mach-s3c2440/s3c244x-clock.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h index bbce6a1c6bb6..f229c3849f03 100644 --- a/arch/avr32/include/asm/atomic.h +++ b/arch/avr32/include/asm/atomic.h @@ -188,7 +188,6 @@ static inline int atomic_sub_if_positive(int i, atomic_t *v) #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #define atomic_add_negative(i, v) (atomic_add_return(i, v) < 0) -#define atomic_inc_not_zero(v) atomic_add_unless(v, 1, 0) #define atomic_dec_if_positive(v) atomic_sub_if_positive(1, v) #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/blackfin/include/asm/atomic.h b/arch/blackfin/include/asm/atomic.h index 4c707dbe1ff9..f2cf5b714ea4 100644 --- a/arch/blackfin/include/asm/atomic.h +++ b/arch/blackfin/include/asm/atomic.h @@ -97,7 +97,6 @@ static inline void atomic_set_mask(int mask, atomic_t *v) c = old; \ c != (u); \ }) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* * atomic_inc_and_test - increment and test diff --git a/arch/blackfin/include/asm/dma.h b/arch/blackfin/include/asm/dma.h index d9dbc1a53534..dac0c97242bb 100644 --- a/arch/blackfin/include/asm/dma.h +++ b/arch/blackfin/include/asm/dma.h @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/blackfin/include/asm/ipipe.h b/arch/blackfin/include/asm/ipipe.h index 9e0cc0e2534f..17b5e92e3bc6 100644 --- a/arch/blackfin/include/asm/ipipe.h +++ b/arch/blackfin/include/asm/ipipe.h @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index 2336093fca23..490c7caa02d9 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -11,7 +11,7 @@ # include #else -#include +#include asmlinkage int __raw_spin_is_locked_asm(volatile int *ptr); asmlinkage void __raw_spin_lock_asm(volatile int *ptr); diff --git a/arch/blackfin/kernel/ftrace.c b/arch/blackfin/kernel/ftrace.c index 48808a12b427..9277905b82cf 100644 --- a/arch/blackfin/kernel/ftrace.c +++ b/arch/blackfin/kernel/ftrace.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/blackfin/kernel/ipipe.c b/arch/blackfin/kernel/ipipe.c index 486426f8a0d7..dbe11220cc53 100644 --- a/arch/blackfin/kernel/ipipe.c +++ b/arch/blackfin/kernel/ipipe.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include DEFINE_PER_CPU(struct pt_regs, __ipipe_tick_regs); diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 679d0db35256..9919d29287dc 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 1c143a4de5f5..107622aacf6b 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index c03bc3bc30c2..642c6fed43d7 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index a0843a71aaee..0b99df72d2a4 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h index 88dc9b9c4ba0..ce9f67e4d977 100644 --- a/arch/cris/include/asm/atomic.h +++ b/arch/cris/include/asm/atomic.h @@ -150,7 +150,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) cris_atomic_restore(v, flags); return ret != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* Atomic operations are already serializing */ #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/cris/include/asm/bitops.h b/arch/cris/include/asm/bitops.h index c0092fc7d846..a78a2d70cd8b 100644 --- a/arch/cris/include/asm/bitops.h +++ b/arch/cris/include/asm/bitops.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include /* diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index c99aeab7cef7..aa585e4e979e 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -12,7 +12,7 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include +#include #include #include #include diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h index fae32c7fdcb6..b07b75f411f2 100644 --- a/arch/frv/include/asm/atomic.h +++ b/arch/frv/include/asm/atomic.h @@ -256,7 +256,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #include #endif /* _ASM_ATOMIC_H */ diff --git a/arch/frv/include/asm/hardirq.h b/arch/frv/include/asm/hardirq.h index 5fc8b6f5bc55..c62833d6ebbb 100644 --- a/arch/frv/include/asm/hardirq.h +++ b/arch/frv/include/asm/hardirq.h @@ -12,7 +12,7 @@ #ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H -#include +#include extern atomic_t irq_err_count; static inline void ack_bad_irq(int irq) diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c index a5f624a9f559..3facbc28cbbc 100644 --- a/arch/frv/kernel/irq.c +++ b/arch/frv/kernel/irq.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index 984221abb66d..b641714774ea 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -116,7 +116,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) local_irq_restore(flags); return ret != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ void atomic_clear_mask(unsigned long mask, unsigned long *v) { diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 446881439675..fdb887005dff 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -105,7 +105,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u) { diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 03afe7970748..d9f397fae03e 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -75,7 +75,7 @@ #include #include #include -#include +#include #ifdef CONFIG_NUMA #include #endif diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 1a91c9121d17..b77768d35f93 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index be450a3e9871..0bd537b4ea6b 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 14ec641003da..559097986672 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index c4696d217ce0..6a867dc45c05 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h index d44a51e5271b..d64d894dc549 100644 --- a/arch/m32r/include/asm/atomic.h +++ b/arch/m32r/include/asm/atomic.h @@ -262,7 +262,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static __inline__ void atomic_clear_mask(unsigned long mask, atomic_t *addr) { diff --git a/arch/m32r/include/asm/mmu_context.h b/arch/m32r/include/asm/mmu_context.h index a70a3df33635..a979a4198168 100644 --- a/arch/m32r/include/asm/mmu_context.h +++ b/arch/m32r/include/asm/mmu_context.h @@ -11,7 +11,7 @@ #ifndef __ASSEMBLY__ -#include +#include #include #include #include diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 179a06489b10..b0ea2f26da3b 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -10,7 +10,7 @@ */ #include -#include +#include #include /* diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 092d40a6708e..ce7aea34fdf4 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index fbd109031df3..ee6a9199561c 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h index 307a573881ad..e844a2d2ba23 100644 --- a/arch/m68k/include/asm/atomic.h +++ b/arch/m68k/include/asm/atomic.h @@ -198,7 +198,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* Atomic operations are already serializing */ #define smp_mb__before_atomic_dec() barrier() diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h index 3e5c254e8d1c..d68647746448 100644 --- a/arch/microblaze/include/asm/mmu_context_mm.h +++ b/arch/microblaze/include/asm/mmu_context_mm.h @@ -11,7 +11,7 @@ #ifndef _ASM_MICROBLAZE_MMU_CONTEXT_H #define _ASM_MICROBLAZE_MMU_CONTEXT_H -#include +#include #include #include #include diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h index 9bd01ecb00d6..9ad567e2d425 100644 --- a/arch/microblaze/include/asm/prom.h +++ b/arch/microblaze/include/asm/prom.h @@ -21,7 +21,7 @@ #include #include -#include +#include #define HAVE_ARCH_DEVTREE_FIXUPS diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 4a02fe891ab6..833a4023648a 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -325,7 +325,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) } return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_dec_return(v) atomic_sub_return(1, (v)) #define atomic_inc_return(v) atomic_add_return(1, (v)) diff --git a/arch/mips/include/asm/hw_irq.h b/arch/mips/include/asm/hw_irq.h index 77adda297ad9..9e8ef5994c9c 100644 --- a/arch/mips/include/asm/hw_irq.h +++ b/arch/mips/include/asm/hw_irq.h @@ -8,7 +8,7 @@ #ifndef __ASM_HW_IRQ_H #define __ASM_HW_IRQ_H -#include +#include extern atomic_t irq_err_count; diff --git a/arch/mips/include/asm/local.h b/arch/mips/include/asm/local.h index fffc8307a80a..94fde8d0fac1 100644 --- a/arch/mips/include/asm/local.h +++ b/arch/mips/include/asm/local.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index af42385245d5..d4fb4d852a6d 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include extern int smp_num_siblings; diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c index 9b734d74ae8e..b53970d80991 100644 --- a/arch/mips/kernel/irq.c +++ b/arch/mips/kernel/irq.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index b2259e7cd829..594ca69cb867 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c index 557ef72472e0..7a80b7cda7cc 100644 --- a/arch/mips/kernel/rtlx.c +++ b/arch/mips/kernel/rtlx.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c index cc81771b882c..fe3095160655 100644 --- a/arch/mips/kernel/smp-cmp.c +++ b/arch/mips/kernel/smp-cmp.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c index 1ec56e635d04..ce9e286f0a74 100644 --- a/arch/mips/kernel/smp-mt.c +++ b/arch/mips/kernel/smp-mt.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 32a256101082..32c1e954cd37 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smtc-proc.c b/arch/mips/kernel/smtc-proc.c index fe256559c997..928a5a61e1a6 100644 --- a/arch/mips/kernel/smtc-proc.c +++ b/arch/mips/kernel/smtc-proc.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index cedac4633741..f0895e70e283 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/sync-r4k.c b/arch/mips/kernel/sync-r4k.c index 05dd170a83f7..99f913c8d7a6 100644 --- a/arch/mips/kernel/sync-r4k.c +++ b/arch/mips/kernel/sync-r4k.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index dbb6b408f001..2cd50ad0d5c6 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/mipssim/sim_smtc.c b/arch/mips/mipssim/sim_smtc.c index 30df47258c2c..915063991f6e 100644 --- a/arch/mips/mipssim/sim_smtc.c +++ b/arch/mips/mipssim/sim_smtc.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/mips/sgi-ip27/ip27-nmi.c b/arch/mips/sgi-ip27/ip27-nmi.c index bc4fa8dd67f3..005c29ed419a 100644 --- a/arch/mips/sgi-ip27/ip27-nmi.c +++ b/arch/mips/sgi-ip27/ip27-nmi.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h index 9d773a639513..041b9d69d86c 100644 --- a/arch/mn10300/include/asm/atomic.h +++ b/arch/mn10300/include/asm/atomic.h @@ -269,7 +269,6 @@ static inline void atomic_dec(atomic_t *v) c != (u); \ }) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /** * atomic_clear_mask - Atomically clear bits in memory diff --git a/arch/mn10300/include/asm/mmu_context.h b/arch/mn10300/include/asm/mmu_context.h index c8f6c82672ad..c67c2b5365a6 100644 --- a/arch/mn10300/include/asm/mmu_context.h +++ b/arch/mn10300/include/asm/mmu_context.h @@ -22,7 +22,7 @@ #ifndef _ASM_MMU_CONTEXT_H #define _ASM_MMU_CONTEXT_H -#include +#include #include #include #include diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 93429154e898..1ae580f38933 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -11,7 +11,7 @@ #ifndef _ASM_SPINLOCK_H #define _ASM_SPINLOCK_H -#include +#include #include #include diff --git a/arch/mn10300/include/asm/system.h b/arch/mn10300/include/asm/system.h index 8ff3e5aaca41..94b4c5e1491b 100644 --- a/arch/mn10300/include/asm/system.h +++ b/arch/mn10300/include/asm/system.h @@ -19,7 +19,7 @@ #include #include -#include +#include #if !defined(CONFIG_LAZY_SAVE_FPU) struct fpu_state_struct; diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c index c5e12bfd9fcd..a45f0c7549a6 100644 --- a/arch/mn10300/kernel/mn10300-watchdog.c +++ b/arch/mn10300/kernel/mn10300-watchdog.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index bd3e5e73826e..9220a75a7b43 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/mm/misalignment.c b/arch/mn10300/mm/misalignment.c index eef989c1d0c1..f9bb8cb1c14a 100644 --- a/arch/mn10300/mm/misalignment.c +++ b/arch/mn10300/mm/misalignment.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mn10300/proc-mn2ws0050/proc-init.c b/arch/mn10300/proc-mn2ws0050/proc-init.c index c58249b9525a..fe6e24906ffc 100644 --- a/arch/mn10300/proc-mn2ws0050/proc-init.c +++ b/arch/mn10300/proc-mn2ws0050/proc-init.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index f81955934aeb..192488999b63 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -220,7 +220,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_add(i,v) ((void)(__atomic_add_return( (i),(v)))) #define atomic_sub(i,v) ((void)(__atomic_add_return(-(i),(v)))) diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 4e833aa05a44..8c9b631d2a78 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -8,7 +8,7 @@ #include #include /* for BITS_PER_LONG/SHIFT_PER_LONG */ #include -#include +#include /* * HP-PARISC specific bit operations diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index 354b2aca990e..59be25764433 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index df653663d3db..a7bb757a5497 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -31,7 +31,7 @@ #include EXPORT_SYMBOL(memset); -#include +#include EXPORT_SYMBOL(__xchg8); EXPORT_SYMBOL(__xchg32); EXPORT_SYMBOL(__cmpxchg_u32); diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 828305f19cff..32d588488f04 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 8b58bf0b7d5a..f19e6604026a 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/parisc/lib/bitops.c b/arch/parisc/lib/bitops.c index 353963d42059..a8bffd8af77d 100644 --- a/arch/parisc/lib/bitops.c +++ b/arch/parisc/lib/bitops.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #ifdef CONFIG_SMP arch_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned = { diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index b8f152ece025..b2bcbee622ea 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -212,7 +212,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return t != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_sub_and_test(a, v) (atomic_sub_return((a), (v)) == 0) #define atomic_dec_and_test(v) (atomic_dec_return((v)) == 0) diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h index 2cc41c715d2b..63f2a22e9954 100644 --- a/arch/powerpc/include/asm/emulated_ops.h +++ b/arch/powerpc/include/asm/emulated_ops.h @@ -18,7 +18,7 @@ #ifndef _ASM_POWERPC_EMULATED_OPS_H #define _ASM_POWERPC_EMULATED_OPS_H -#include +#include #include diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index c57a28e52b64..c0e1bc319e35 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -14,7 +14,7 @@ #include #include -#include +#include /* Define a way to iterate across irqs. */ diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h index c2410af6bfd9..b8da91363864 100644 --- a/arch/powerpc/include/asm/local.h +++ b/arch/powerpc/include/asm/local.h @@ -2,7 +2,7 @@ #define _ARCH_POWERPC_LOCAL_H #include -#include +#include typedef struct { diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index b823536375dc..b5c91901e384 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -18,7 +18,7 @@ */ #include #include -#include +#include #define HAVE_ARCH_DEVTREE_FIXUPS diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 24582181b6ec..59dbf6abaaf3 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #ifdef CONFIG_PPC_OF_PLATFORM_PCI diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 7d28f540200c..f5ae872a2ef0 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 0e0ea941156f..d5ca8236315c 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 67f6c3b51357..481ef064c8f1 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c index 03e45c4a9ef1..640de836e466 100644 --- a/arch/powerpc/kernel/smp-tbsync.c +++ b/arch/powerpc/kernel/smp-tbsync.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f932f8a0cf0c..7bf2187dfd99 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c index f8fa2fc3129f..c55129f5760a 100644 --- a/arch/powerpc/platforms/83xx/km83xx.c +++ b/arch/powerpc/platforms/83xx/km83xx.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc832x_mds.c b/arch/powerpc/platforms/83xx/mpc832x_mds.c index 93e60f1f21a9..32a52896822f 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc832x_mds.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc834x_itx.c b/arch/powerpc/platforms/83xx/mpc834x_itx.c index 81e44fa1c644..6b45969567d4 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_itx.c +++ b/arch/powerpc/platforms/83xx/mpc834x_itx.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc834x_mds.c b/arch/powerpc/platforms/83xx/mpc834x_mds.c index c1b1dc50b32a..041c5177e737 100644 --- a/arch/powerpc/platforms/83xx/mpc834x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc834x_mds.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/mpc836x_mds.c b/arch/powerpc/platforms/83xx/mpc836x_mds.c index 81c052b1353e..934cc8c46bbc 100644 --- a/arch/powerpc/platforms/83xx/mpc836x_mds.c +++ b/arch/powerpc/platforms/83xx/mpc836x_mds.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/83xx/sbc834x.c b/arch/powerpc/platforms/83xx/sbc834x.c index 49023dbe1576..af41d8c810a8 100644 --- a/arch/powerpc/platforms/83xx/sbc834x.c +++ b/arch/powerpc/platforms/83xx/sbc834x.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c index 6299a2a51ae8..2bf99786d249 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index 747d1ee661fd..973b3f4a4b49 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/85xx/sbc8548.c b/arch/powerpc/platforms/85xx/sbc8548.c index ecdd8c09e4ed..d07dcb7f4ee9 100644 --- a/arch/powerpc/platforms/85xx/sbc8548.c +++ b/arch/powerpc/platforms/85xx/sbc8548.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/cpufreq_spudemand.c b/arch/powerpc/platforms/cell/cpufreq_spudemand.c index d809836bcf5f..7f92096fe968 100644 --- a/arch/powerpc/platforms/cell/cpufreq_spudemand.c +++ b/arch/powerpc/platforms/cell/cpufreq_spudemand.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index dbb641ea90dd..f2e1dfe4bf31 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c index 0c87bcd2452a..bf4d41d8fa14 100644 --- a/arch/powerpc/platforms/cell/spufs/context.c +++ b/arch/powerpc/platforms/cell/spufs/context.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include "spufs.h" diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c index a800122e4dda..feab30bbae23 100644 --- a/arch/powerpc/platforms/chrp/smp.c +++ b/arch/powerpc/platforms/chrp/smp.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/iseries/smp.c b/arch/powerpc/platforms/iseries/smp.c index 2df48c2287bd..8bda9be06fa0 100644 --- a/arch/powerpc/platforms/iseries/smp.c +++ b/arch/powerpc/platforms/iseries/smp.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/powermac/backlight.c b/arch/powerpc/platforms/powermac/backlight.c index d679964ae2ab..c2f3e861f5ea 100644 --- a/arch/powerpc/platforms/powermac/backlight.c +++ b/arch/powerpc/platforms/powermac/backlight.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index d15fca322978..9a521dc8e485 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 46b55cf563e3..ada6e07532ec 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c index 8ed0d2d0e1b5..fc5ae767989e 100644 --- a/arch/powerpc/platforms/pseries/eeh_cache.c +++ b/arch/powerpc/platforms/pseries/eeh_cache.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 1672db2d1b0e..4e44c4dcd11c 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index 265313e8396b..2d66275e489f 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index ee056807b52c..9f51f97abb5d 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index d9db13810d15..29d756329228 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -108,7 +108,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != u; } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #undef __CS_LOOP diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 1ca3d1d6a86c..45df6d456aa1 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index e9372c77cced..ffabcd9d3363 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index c7983124d99d..8ddb2635cf92 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -30,7 +30,6 @@ #define atomic_inc_and_test(v) (atomic_inc_return(v) == 0) #define atomic_sub_and_test(i,v) (atomic_sub_return((i), (v)) == 0) #define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic_inc(v) atomic_add(1, (v)) #define atomic_dec(v) atomic_sub(1, (v)) diff --git a/arch/sh/include/asm/hw_irq.h b/arch/sh/include/asm/hw_irq.h index 603cdde813d1..693d44184058 100644 --- a/arch/sh/include/asm/hw_irq.h +++ b/arch/sh/include/asm/hw_irq.h @@ -3,7 +3,7 @@ #include #include -#include +#include extern atomic_t irq_err_count; diff --git a/arch/sh/include/asm/smp.h b/arch/sh/include/asm/smp.h index 9070d943ddde..78b0d0f4b24b 100644 --- a/arch/sh/include/asm/smp.h +++ b/arch/sh/include/asm/smp.h @@ -8,7 +8,7 @@ #ifdef CONFIG_SMP #include -#include +#include #include #include diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 425d604e3a28..84db0d6ccd0d 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include void (*pm_idle)(void) = NULL; diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 6207561ea34a..3147a9a6fb8b 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index 67110be83fd7..cd3a40483299 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/unwinder.c b/arch/sh/kernel/unwinder.c index 468889d958f4..521b5432471f 100644 --- a/arch/sh/kernel/unwinder.c +++ b/arch/sh/kernel/unwinder.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include /* * This is the most basic stack unwinder an architecture can diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 7ae128b19d3f..7646f2cef5d0 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -52,7 +52,6 @@ extern void atomic_set(atomic_t *, int); #define atomic_dec_and_test(v) (atomic_dec_return(v) == 0) #define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0) -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* This is the old 24-bit implementation. It's still used internally * by some sparc-specific code, notably the semaphore implementation. diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index bdb2ff880bdd..337139ef91be 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -85,7 +85,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) #define atomic64_cmpxchg(v, o, n) \ ((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n))) diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h index 56bbaadef646..edd3d3cde460 100644 --- a/arch/sparc/include/asm/prom.h +++ b/arch/sparc/include/asm/prom.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #define OF_ROOT_NODE_ADDR_CELLS_DEFAULT 2 #define OF_ROOT_NODE_SIZE_CELLS_DEFAULT 1 diff --git a/arch/sparc/include/asm/smp_32.h b/arch/sparc/include/asm/smp_32.h index 093f10843ff2..01c51c704341 100644 --- a/arch/sparc/include/asm/smp_32.h +++ b/arch/sparc/include/asm/smp_32.h @@ -22,7 +22,7 @@ #include #include -#include +#include /* * Private routines/data diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h index 20bca8950710..29862a9e9065 100644 --- a/arch/sparc/include/asm/smp_64.h +++ b/arch/sparc/include/asm/smp_64.h @@ -27,7 +27,7 @@ */ #include -#include +#include #include DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 4e78862d12fd..0dd8422a469c 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c index fe8fb44c609c..1210fde18740 100644 --- a/arch/sparc/kernel/leon_smp.c +++ b/arch/sparc/kernel/leon_smp.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 62a034318b18..171e8d84dc3f 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 21b125341bf7..f671e7fd6ddc 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 99cb17251bb5..4a442c32e117 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index d3c7a12ad879..1a371f8ae0b0 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -7,7 +7,7 @@ * Based on asm-parisc/atomic.h Copyright (C) 2000 Philipp Rumpf */ -#include +#include #include #include diff --git a/arch/tile/include/asm/atomic.h b/arch/tile/include/asm/atomic.h index 739cfe0499d1..e3272715c3cb 100644 --- a/arch/tile/include/asm/atomic.h +++ b/arch/tile/include/asm/atomic.h @@ -121,15 +121,6 @@ static inline int atomic_read(const atomic_t *v) */ #define atomic_add_negative(i, v) (atomic_add_return((i), (v)) < 0) -/** - * atomic_inc_not_zero - increment unless the number is zero - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1, so long as @v is non-zero. - * Returns non-zero if @v was non-zero, and zero otherwise. - */ -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) - /* Nonexistent functions intended to cause link errors. */ extern unsigned long __xchg_called_with_bad_pointer(void); extern unsigned long __cmpxchg_called_with_bad_pointer(void); diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index 92a8bee32311..246feed4794d 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -11,7 +11,7 @@ * NON INFRINGEMENT. See the GNU General Public License for * more details. * - * Do not include directly; use . + * Do not include directly; use . */ #ifndef _ASM_TILE_ATOMIC_32_H @@ -21,7 +21,7 @@ #ifndef __ASSEMBLY__ -/* Tile-specific routines to support . */ +/* Tile-specific routines to support . */ int _atomic_xchg(atomic_t *v, int n); int _atomic_xchg_add(atomic_t *v, int i); int _atomic_xchg_add_unless(atomic_t *v, int a, int u); diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h index 1c1e60d8ccb6..a48dda30cbcc 100644 --- a/arch/tile/include/asm/atomic_64.h +++ b/arch/tile/include/asm/atomic_64.h @@ -11,7 +11,7 @@ * NON INFRINGEMENT. See the GNU General Public License for * more details. * - * Do not include directly; use . + * Do not include directly; use . */ #ifndef _ASM_TILE_ATOMIC_64_H diff --git a/arch/tile/include/asm/bitops_32.h b/arch/tile/include/asm/bitops_32.h index d31ab905cfa7..571b118bfd9b 100644 --- a/arch/tile/include/asm/bitops_32.h +++ b/arch/tile/include/asm/bitops_32.h @@ -16,7 +16,7 @@ #define _ASM_TILE_BITOPS_32_H #include -#include +#include #include /* Tile-specific routines to support . */ diff --git a/arch/tile/include/asm/bitops_64.h b/arch/tile/include/asm/bitops_64.h index 68f8c5bc0679..e9c8e381ee0e 100644 --- a/arch/tile/include/asm/bitops_64.h +++ b/arch/tile/include/asm/bitops_64.h @@ -16,7 +16,7 @@ #define _ASM_TILE_BITOPS_64_H #include -#include +#include #include /* See for API comments. */ diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index a8f2c6e31a87..a5e4208d34f9 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -17,7 +17,7 @@ #ifndef _ASM_TILE_SPINLOCK_32_H #define _ASM_TILE_SPINLOCK_32_H -#include +#include #include #include #include diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S index 72ade79b621b..fc94607f0bd5 100644 --- a/arch/tile/kernel/intvec_32.S +++ b/arch/tile/kernel/intvec_32.S @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c index 46570211df52..771b251b409d 100644 --- a/arch/tile/lib/atomic_32.c +++ b/arch/tile/lib/atomic_32.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 24448734f6f1..1f75a2a56101 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S @@ -70,7 +70,7 @@ */ #include -#include +#include #include #include diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 5852519b2d0f..f6f5c53dc903 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4a0b7c7e2cce..7b3ca8324b69 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 952a826ac4e5..897969bdd4e6 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -244,7 +244,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) /* * atomic_dec_if_positive - decrement by 1 if old value positive diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 13f5504c76c0..09199052060f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 2e9972468a5d..9cdae5d47e8f 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include typedef struct { diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 716b48af7863..c9321f34e55b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -124,7 +124,7 @@ extern struct atomic_notifier_head x86_mce_decoder_chain; #include #include -#include +#include extern int mce_disabled; extern int mce_p5_enabled; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8b5393ec1080..69021528b43c 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -2,7 +2,7 @@ #define _ASM_X86_MMU_CONTEXT_H #include -#include +#include #include #include #include diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index df1287019e6d..644dd885f05a 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index e9e51f710e6c..ee67edf86fdd 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H -#include +#include #include #include #include diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1f2e61e28981..a1fe5c127b52 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -21,7 +21,7 @@ struct task_struct; struct exec_domain; #include #include -#include +#include struct thread_info { struct task_struct *task; /* main task structure */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b117efd24f71..8a439d364b94 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b24be38c8cf8..52fa56399a50 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 9536b3fe43f8..5d513bc47b6b 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -48,7 +48,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22a073d7fbff..62184390a601 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 65b8f5c2eebf..610485223bdb 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index f09d4bbe2d2d..b3300e6bacef 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fbc097a085ca..9682ec50180c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2b2255b1f04b..57dcbd4308fa 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index abd86e865be3..ae432ea1cd83 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "kvm_timer.h" static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c index 540179e8e9fa..042f6826bf57 100644 --- a/arch/x86/lib/atomic64_32.c +++ b/arch/x86/lib/atomic64_32.c @@ -4,7 +4,7 @@ #include #include -#include +#include long long atomic64_read_cx8(long long, const atomic64_t *v); EXPORT_SYMBOL(atomic64_read_cx8); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3adff7dcc148..67421f38a215 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -34,7 +34,7 @@ #include #include #include /* for ISA_START_ADDRESS */ -#include +#include #include #include diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h index a96a0619d0b7..7cca2fb18baf 100644 --- a/arch/xtensa/include/asm/atomic.h +++ b/arch/xtensa/include/asm/atomic.h @@ -248,7 +248,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) return c != (u); } -#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) static inline void atomic_clear_mask(unsigned int mask, atomic_t *v) { diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index e3558b9a58ba..47041e7c088c 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 940d70cb5c25..ac33d5f30778 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -12,7 +12,7 @@ * */ -#include +#include #include #include #include diff --git a/crypto/proc.c b/crypto/proc.c index 58fef67d4f4d..3808697814d7 100644 --- a/crypto/proc.c +++ b/crypto/proc.c @@ -13,7 +13,7 @@ * */ -#include +#include #include #include #include diff --git a/crypto/rng.c b/crypto/rng.c index f93cb5311182..45229ae782be 100644 --- a/crypto/rng.c +++ b/crypto/rng.c @@ -12,7 +12,7 @@ * */ -#include +#include #include #include #include diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c index bb3b016b6ce8..f8f41e0e8a8c 100644 --- a/drivers/atm/ambassador.c +++ b/drivers/atm/ambassador.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index 0b0625054a87..b22d71cac54c 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include extern int atm_init_aal5(struct atm_vcc *vcc); /* "raw" AAL5 transport */ diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 3230ea0df83c..93071417315f 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/atm/eni.h b/drivers/atm/eni.h index 493a6932507e..dc9a62cc2605 100644 --- a/drivers/atm/eni.h +++ b/drivers/atm/eni.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "midway.h" diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 7c7b571647f9..5072f8ac16fd 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index bc9e702186dd..361f5aee3be1 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #ifdef CONFIG_SBUS #include diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 287506183893..b81210330aca 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index be0dbfeb541c..db06f34419cf 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -46,7 +46,7 @@ #include #include -#include +#include #include #ifdef CONFIG_ATM_IDT77252_USE_SUNI diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index 957106f636ea..cb90f7a3e074 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -58,7 +58,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index 6b313ee9231b..1c70c45fa044 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include "nicstar.h" #ifdef CONFIG_ATM_NICSTAR_USE_SUNI #include "suni.h" diff --git a/drivers/atm/suni.c b/drivers/atm/suni.c index 41c56eae4c81..90f1ccca9e52 100644 --- a/drivers/atm/suni.c +++ b/drivers/atm/suni.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include "suni.h" diff --git a/drivers/atm/uPD98402.c b/drivers/atm/uPD98402.c index c45ae0573bbd..5120a96b3a89 100644 --- a/drivers/atm/uPD98402.c +++ b/drivers/atm/uPD98402.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include "uPD98402.h" diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index 7f8c5132ff32..d889f56e8d8c 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include "uPD98401.h" diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 45d7c8fc73bd..2840ed4668c1 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include static DEFINE_MUTEX(mem_sysfs_mutex); diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 942d6a7c9ae1..17b7934f31cb 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include "power.h" diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 696100241a6f..951a4e33b92b 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 320668f4c3aa..3302586655c4 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86 /* diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index 25d139c9dbed..5c0d96a820fa 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 0debc17c8e28..3ee1fdb31ea7 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -29,7 +29,8 @@ #include #include #include -#include +#include + #include #include diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index aab970760b75..86ad2eee1201 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -14,7 +14,7 @@ */ #include #include -#include +#include #include int edac_op_state = EDAC_OPSTATE_INVAL; diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 29d2423fae6d..85661b060ed7 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include "core.h" diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index 95a471401892..8ba7f7928f1f 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 193ed9233144..94d3b494ddfb 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index 0fe4e4e6eda7..b45be5767529 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -9,7 +9,7 @@ #include #include -#include +#include struct device; struct fw_card; diff --git a/drivers/firewire/nosy.c b/drivers/firewire/nosy.c index 0618145376ad..763626b739d1 100644 --- a/drivers/firewire/nosy.c +++ b/drivers/firewire/nosy.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include "nosy.h" diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index ef37a9b5a3cc..32807baf55e2 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -60,7 +60,7 @@ * are considered as fatal) */ -#include +#include #include #include #include diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 021d2b6b556f..7fd4e3e5ad5f 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -29,7 +29,7 @@ * Dave Airlie */ #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 2e618b5ac465..56619f64b6bf 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #define TTM_ASSERT_LOCKED(param) #define TTM_DEBUG(fmt, arg...) diff --git a/drivers/gpu/drm/ttm/ttm_lock.c b/drivers/gpu/drm/ttm/ttm_lock.c index de41e55a944a..075daf44bce4 100644 --- a/drivers/gpu/drm/ttm/ttm_lock.c +++ b/drivers/gpu/drm/ttm/ttm_lock.c @@ -30,7 +30,7 @@ #include "ttm/ttm_lock.h" #include "ttm/ttm_module.h" -#include +#include #include #include #include diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c index ebddd443d91a..93577f2e2954 100644 --- a/drivers/gpu/drm/ttm/ttm_object.c +++ b/drivers/gpu/drm/ttm/ttm_object.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include struct ttm_object_file { struct ttm_object_device *tdev; diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c index 170e751c283e..727e93daac3b 100644 --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include "ttm/ttm_bo_driver.h" #include "ttm/ttm_page_alloc.h" diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c index 7d231cf5d2ce..fe4104c6b764 100644 --- a/drivers/hwmon/sht15.c +++ b/drivers/hwmon/sht15.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include /* Commands */ #define SHT15_MEASURE_TEMP 0x03 diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 0347eed4a167..40c835309e49 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -31,7 +31,7 @@ */ #include -#include +#include #include "iw_cxgb4.h" diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h index f09914cccf53..54c0d23bad92 100644 --- a/drivers/infiniband/hw/ehca/ehca_tools.h +++ b/drivers/infiniband/hw/ehca/ehca_tools.h @@ -58,7 +58,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 73bc18465c9c..c118663e4437 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -34,7 +34,7 @@ #define TCPOPT_TIMESTAMP 8 -#include +#include #include #include #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 7b6985a2e652..b3cc1e062b17 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -45,7 +45,7 @@ #include -#include +#include #include #include diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 7d5109bbd1ad..0bfa545675b8 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/isdn/gigaset/gigaset.h b/drivers/isdn/gigaset/gigaset.h index 6dd360734cfd..212efaf9a4e4 100644 --- a/drivers/isdn/gigaset/gigaset.h +++ b/drivers/isdn/gigaset/gigaset.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include #define GIG_VERSION {0, 5, 0, 0} #define GIG_COMPAT {0, 4, 0, 0} diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index c8827ffd85bb..bae6c4e23d3f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 819e37eaaeba..320401dec104 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index aa4e570c2cb5..c3547016f0f1 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #define DM_MSG_PREFIX "multipath" #define MESG_STR(x) x, sizeof(x) diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index f92b6cea9d9c..03a837aa5ce6 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #define DM_MSG_PREFIX "multipath queue-length" #define QL_MIN_IO 128 diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 451c3bb176d2..bfe9c2333cea 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #define DM_MSG_PREFIX "table" diff --git a/drivers/media/video/hdpvr/hdpvr-core.c b/drivers/media/video/hdpvr/hdpvr-core.c index a27d93b503a5..5f1db46beb4e 100644 --- a/drivers/media/video/hdpvr/hdpvr-core.c +++ b/drivers/media/video/hdpvr/hdpvr-core.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/media/video/tlg2300/pd-dvb.c b/drivers/media/video/tlg2300/pd-dvb.c index edd78f8b1baa..d0da11ae19df 100644 --- a/drivers/media/video/tlg2300/pd-dvb.c +++ b/drivers/media/video/tlg2300/pd-dvb.c @@ -7,7 +7,7 @@ #include "vendorcmds.h" #include -#include +#include static void dvb_urb_cleanup(struct pd_dvb_adapter *pd_dvb); diff --git a/drivers/media/video/uvc/uvc_ctrl.c b/drivers/media/video/uvc/uvc_ctrl.c index a4db26fa2f53..2c8954ec6859 100644 --- a/drivers/media/video/uvc/uvc_ctrl.c +++ b/drivers/media/video/uvc/uvc_ctrl.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "uvcvideo.h" diff --git a/drivers/media/video/uvc/uvc_queue.c b/drivers/media/video/uvc/uvc_queue.c index f90ce9fce539..677691c44500 100644 --- a/drivers/media/video/uvc/uvc_queue.c +++ b/drivers/media/video/uvc/uvc_queue.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include "uvcvideo.h" diff --git a/drivers/media/video/uvc/uvc_v4l2.c b/drivers/media/video/uvc/uvc_v4l2.c index 543a80395b7f..dde6533e8e6d 100644 --- a/drivers/media/video/uvc/uvc_v4l2.c +++ b/drivers/media/video/uvc/uvc_v4l2.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/media/video/uvc/uvc_video.c b/drivers/media/video/uvc/uvc_video.c index 49994793cc77..8244167c8915 100644 --- a/drivers/media/video/uvc/uvc_video.c +++ b/drivers/media/video/uvc/uvc_video.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c index 74fbe56321ff..c8ed7b63fdf5 100644 --- a/drivers/message/i2o/i2o_scsi.c +++ b/drivers/message/i2o/i2o_scsi.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index b05db55c8c8e..21b28fc6d912 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #define PHANTOM_VERSION "n0.9.8" diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c index 6f0e9403004b..97e6954304ea 100644 --- a/drivers/net/atlx/atl1.c +++ b/drivers/net/atlx/atl1.c @@ -44,7 +44,7 @@ * SMP torture testing */ -#include +#include #include #include diff --git a/drivers/net/atlx/atl2.c b/drivers/net/atlx/atl2.c index e0f87cf1e2ba..d4f7dda39721 100644 --- a/drivers/net/atlx/atl2.c +++ b/drivers/net/atlx/atl2.c @@ -20,7 +20,7 @@ * Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -#include +#include #include #include #include diff --git a/drivers/net/atlx/atl2.h b/drivers/net/atlx/atl2.h index 78344ddf4bf0..bf9016ebdd9b 100644 --- a/drivers/net/atlx/atl2.h +++ b/drivers/net/atlx/atl2.h @@ -25,7 +25,7 @@ #ifndef _ATL2_H_ #define _ATL2_H_ -#include +#include #include #ifndef _ATL2_HW_H_ diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index b414f5ae0da5..646c86bcc545 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -98,7 +98,7 @@ #include -#include +#include #include #include #include diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c index 086ce0418b29..e0638cb4b07c 100644 --- a/drivers/net/cpmac.c +++ b/drivers/net/cpmac.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include MODULE_AUTHOR("Eugene Konev "); MODULE_DESCRIPTION("TI AR7 ethernet driver (CPMAC)"); diff --git a/drivers/net/cxgb3/cxgb3_offload.c b/drivers/net/cxgb3/cxgb3_offload.c index 32636a1d62a5..805076c54f1b 100644 --- a/drivers/net/cxgb3/cxgb3_offload.c +++ b/drivers/net/cxgb3/cxgb3_offload.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/cxgb3/l2t.h b/drivers/net/cxgb3/l2t.h index fd3eb07e3f40..7a12d52ed4fc 100644 --- a/drivers/net/cxgb3/l2t.h +++ b/drivers/net/cxgb3/l2t.h @@ -34,7 +34,7 @@ #include #include "t3cdev.h" -#include +#include enum { L2T_STATE_VALID, /* entry is up to date */ diff --git a/drivers/net/cxgb3/t3cdev.h b/drivers/net/cxgb3/t3cdev.h index be55e9ae74d1..705713b56636 100644 --- a/drivers/net/cxgb3/t3cdev.h +++ b/drivers/net/cxgb3/t3cdev.h @@ -33,7 +33,7 @@ #define _T3CDEV_H_ #include -#include +#include #include #include #include diff --git a/drivers/net/cxgb4/cxgb4_uld.h b/drivers/net/cxgb4/cxgb4_uld.h index 1b48c0170145..b1d39b8d141a 100644 --- a/drivers/net/cxgb4/cxgb4_uld.h +++ b/drivers/net/cxgb4/cxgb4_uld.h @@ -38,7 +38,7 @@ #include #include #include -#include +#include /* CPL message priority levels */ enum { diff --git a/drivers/net/cxgb4/l2t.h b/drivers/net/cxgb4/l2t.h index 7bd8f42378ff..02b31d0c6410 100644 --- a/drivers/net/cxgb4/l2t.h +++ b/drivers/net/cxgb4/l2t.h @@ -37,7 +37,7 @@ #include #include -#include +#include struct adapter; struct l2t_data; diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 0d283781bc5e..2a5a34d2d67b 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #define SIXPACK_VERSION "Revision: 0.3.0" diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c index 52b14256e2c0..ce555d9ac02c 100644 --- a/drivers/net/hamradio/dmascc.c +++ b/drivers/net/hamradio/dmascc.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c index 838c5b673767..ba99af05bf62 100644 --- a/drivers/net/ibmveth.c +++ b/drivers/net/ibmveth.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index a47595760751..3cbda0851f83 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 4609bc0e2f56..10e5d985afa3 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/wimax/i2400m/i2400m.h b/drivers/net/wimax/i2400m/i2400m.h index 5eacc653a94d..c421a6141854 100644 --- a/drivers/net/wimax/i2400m/i2400m.h +++ b/drivers/net/wimax/i2400m/i2400m.h @@ -155,7 +155,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/wireless/b43legacy/b43legacy.h b/drivers/net/wireless/b43legacy/b43legacy.h index 17a130d18dc9..a610a352102a 100644 --- a/drivers/net/wireless/b43legacy/b43legacy.h +++ b/drivers/net/wireless/b43legacy/b43legacy.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/wireless/b43legacy/dma.h b/drivers/net/wireless/b43legacy/dma.h index f89c34226288..686941c242fc 100644 --- a/drivers/net/wireless/b43legacy/dma.h +++ b/drivers/net/wireless/b43legacy/dma.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "b43legacy.h" diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h index 0b54e46c3c14..38b6fc028984 100644 --- a/drivers/oprofile/oprofile_stats.h +++ b/drivers/oprofile/oprofile_stats.h @@ -10,7 +10,7 @@ #ifndef OPROFILE_STATS_H #define OPROFILE_STATS_H -#include +#include struct oprofile_stat_struct { atomic_t sample_lost_no_mm; diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c index d703e73fffa7..3fadf2f135e8 100644 --- a/drivers/pci/hotplug/cpci_hotplug_core.c +++ b/drivers/pci/hotplug/cpci_hotplug_core.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include "cpci_hotplug.h" diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 492b7d807fe8..6fa215a38615 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index 77f778b7b070..16c5208c3dc7 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include "dasd_int.h" diff --git a/drivers/s390/char/sclp_quiesce.c b/drivers/s390/char/sclp_quiesce.c index 05909a7df8b3..a90a02c28d6a 100644 --- a/drivers/s390/char/sclp_quiesce.c +++ b/drivers/s390/char/sclp_quiesce.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index c837d7419a6a..524d988d89dd 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h index 7e297c7bb5ff..0b7245c72d5e 100644 --- a/drivers/s390/cio/device.h +++ b/drivers/s390/cio/device.h @@ -2,7 +2,7 @@ #define S390_DEVICE_H #include -#include +#include #include #include #include "io_sch.h" diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 570d4da10696..e58169c32474 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 68be6e157126..2a1d4dfaf859 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index f8134a44cefa..b77ae519d79c 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 8e65447f76b7..88ad33ed5d38 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/s390/crypto/zcrypt_cex2a.c b/drivers/s390/crypto/zcrypt_cex2a.c index 2176d00b395e..da171b5f3996 100644 --- a/drivers/s390/crypto/zcrypt_cex2a.c +++ b/drivers/s390/crypto/zcrypt_cex2a.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_mono.c b/drivers/s390/crypto/zcrypt_mono.c index 44253fdd4136..eb313c3fb2d1 100644 --- a/drivers/s390/crypto/zcrypt_mono.c +++ b/drivers/s390/crypto/zcrypt_mono.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_pcica.c b/drivers/s390/crypto/zcrypt_pcica.c index 1afb69c75fea..d84816f144df 100644 --- a/drivers/s390/crypto/zcrypt_pcica.c +++ b/drivers/s390/crypto/zcrypt_pcica.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_pcicc.c b/drivers/s390/crypto/zcrypt_pcicc.c index aa4c050a5694..bdbdbe192993 100644 --- a/drivers/s390/crypto/zcrypt_pcicc.c +++ b/drivers/s390/crypto/zcrypt_pcicc.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_pcixcc.c b/drivers/s390/crypto/zcrypt_pcixcc.c index 4f85eb725f4f..dd4737808e06 100644 --- a/drivers/s390/crypto/zcrypt_pcixcc.c +++ b/drivers/s390/crypto/zcrypt_pcixcc.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include "ap_bus.h" diff --git a/drivers/s390/net/fsm.h b/drivers/s390/net/fsm.h index 1e8b235d95b5..a4510cf59034 100644 --- a/drivers/s390/net/fsm.h +++ b/drivers/s390/net/fsm.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include /** * Define this to get debugging messages. diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index 2a4991d6d4d5..7cac873c7383 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include "zfcp_ext.h" #include "zfcp_dbf.h" #include "zfcp_fc.h" diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c index 740da4465447..965a1fccd66a 100644 --- a/drivers/sbus/char/display7seg.c +++ b/drivers/sbus/char/display7seg.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include /* put_/get_user */ #include diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h index 179ad77f6cc9..bd9e31e16249 100644 --- a/drivers/scsi/dpt/dpti_i2o.h +++ b/drivers/scsi/dpt/dpti_i2o.h @@ -22,7 +22,7 @@ #include #include -#include +#include /* diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 6bba23a26303..c6f99b1d2383 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include "hpsa_cmd.h" #include "hpsa.h" diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index aa05e661d113..b97c8ab0c20e 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -54,7 +54,7 @@ #include #include #include -#include +#include #include "pm8001_defs.h" #define DRV_NAME "pm8001" diff --git a/drivers/staging/octeon/ethernet-rx.c b/drivers/staging/octeon/ethernet-rx.c index 0f22f0f47446..1a7c19ae766f 100644 --- a/drivers/staging/octeon/ethernet-rx.c +++ b/drivers/staging/octeon/ethernet-rx.c @@ -42,7 +42,7 @@ #include #endif /* CONFIG_XFRM */ -#include +#include #include diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c index 6227571149f5..b445cd63f901 100644 --- a/drivers/staging/octeon/ethernet-tx.c +++ b/drivers/staging/octeon/ethernet-tx.c @@ -38,7 +38,7 @@ #include #endif /* CONFIG_XFRM */ -#include +#include #include diff --git a/drivers/staging/solo6x10/solo6x10.h b/drivers/staging/solo6x10/solo6x10.h index fd59b093dd4d..17c06bd6cc91 100644 --- a/drivers/staging/solo6x10/solo6x10.h +++ b/drivers/staging/solo6x10/solo6x10.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/staging/tidspbridge/include/dspbridge/host_os.h b/drivers/staging/tidspbridge/include/dspbridge/host_os.h index 1a38896f4331..a2f31c69d12e 100644 --- a/drivers/staging/tidspbridge/include/dspbridge/host_os.h +++ b/drivers/staging/tidspbridge/include/dspbridge/host_os.h @@ -18,7 +18,7 @@ #define _HOST_OS_H_ #include -#include +#include #include #include #include diff --git a/drivers/staging/winbond/mds_s.h b/drivers/staging/winbond/mds_s.h index eeedf0186365..07d835b3b706 100644 --- a/drivers/staging/winbond/mds_s.h +++ b/drivers/staging/winbond/mds_s.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include "localpara.h" #include "mac_structures.h" diff --git a/drivers/staging/winbond/wb35reg_s.h b/drivers/staging/winbond/wb35reg_s.h index eb274ffdd1ba..dc79faa4029f 100644 --- a/drivers/staging/winbond/wb35reg_s.h +++ b/drivers/staging/winbond/wb35reg_s.h @@ -3,7 +3,7 @@ #include #include -#include +#include struct hw_data; diff --git a/drivers/tty/bfin_jtag_comm.c b/drivers/tty/bfin_jtag_comm.c index 03c285bb2f18..3a997760ec32 100644 --- a/drivers/tty/bfin_jtag_comm.c +++ b/drivers/tty/bfin_jtag_comm.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #define pr_init(fmt, args...) ({ static const __initconst char __fmt[] = fmt; printk(__fmt, ## args); }) diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index 13043e8d37fe..6a1241c7f841 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -83,7 +83,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/serial/dz.c b/drivers/tty/serial/dz.c index 57421d776329..ddc487a2d42f 100644 --- a/drivers/tty/serial/dz.c +++ b/drivers/tty/serial/dz.c @@ -48,7 +48,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c index ea2340b814e9..6bc2e3f876f4 100644 --- a/drivers/tty/serial/sb1250-duart.c +++ b/drivers/tty/serial/sb1250-duart.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/tty/serial/zs.c b/drivers/tty/serial/zs.c index 1a7fd3e70315..0aebd7121b56 100644 --- a/drivers/tty/serial/zs.c +++ b/drivers/tty/serial/zs.c @@ -65,7 +65,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/usb/gadget/f_audio.c b/drivers/usb/gadget/f_audio.c index 02a02700b51d..a9a4eade7e80 100644 --- a/drivers/usb/gadget/f_audio.c +++ b/drivers/usb/gadget/f_audio.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "u_audio.h" diff --git a/drivers/usb/gadget/f_rndis.c b/drivers/usb/gadget/f_rndis.c index 8f3eae90919f..3ea4666be3d0 100644 --- a/drivers/usb/gadget/f_rndis.c +++ b/drivers/usb/gadget/f_rndis.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include "u_ether.h" #include "rndis.h" diff --git a/drivers/usb/gadget/uvc_queue.c b/drivers/usb/gadget/uvc_queue.c index f7395ac5dc17..aa0ad34e0f1f 100644 --- a/drivers/usb/gadget/uvc_queue.c +++ b/drivers/usb/gadget/uvc_queue.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include "uvc.h" diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c index a0037961e5bd..27e209a7222f 100644 --- a/drivers/usb/image/microtek.c +++ b/drivers/usb/image/microtek.c @@ -131,7 +131,7 @@ #include #include -#include +#include #include #include "../../scsi/scsi.h" #include diff --git a/drivers/usb/misc/appledisplay.c b/drivers/usb/misc/appledisplay.c index 68ab460a735c..ac0d75a9005a 100644 --- a/drivers/usb/misc/appledisplay.c +++ b/drivers/usb/misc/appledisplay.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #define APPLE_VENDOR_ID 0x05AC diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index b0a7a9e909a4..1a49ca9c8ea5 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/usb/wusbcore/wa-rpipe.c b/drivers/usb/wusbcore/wa-rpipe.c index ca80171f42c6..2acc7f504c51 100644 --- a/drivers/usb/wusbcore/wa-rpipe.c +++ b/drivers/usb/wusbcore/wa-rpipe.c @@ -58,7 +58,7 @@ * destination address. */ #include -#include +#include #include #include diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 14c9abf0d800..a801e2821d03 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include /* This is for zerocopy, used buffer len is set to 1 when lower device DMA * done */ diff --git a/drivers/video/sh_mobile_lcdcfb.c b/drivers/video/sh_mobile_lcdcfb.c index 019dbd3f12b2..b048417247e8 100644 --- a/drivers/video/sh_mobile_lcdcfb.c +++ b/drivers/video/sh_mobile_lcdcfb.c @@ -24,7 +24,7 @@ #include #include #include