From 91aa11fae1cf8c2fd67be0609692ea9741cdcc43 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Aug 2013 09:53:28 -0400 Subject: jbd2: Fix use after free after error in jbd2_journal_dirty_metadata() When jbd2_journal_dirty_metadata() returns error, __ext4_handle_dirty_metadata() stops the handle. However callers of this function do not count with that fact and still happily used now freed handle. This use after free can result in various issues but very likely we oops soon. The motivation of adding __ext4_journal_stop() into __ext4_handle_dirty_metadata() in commit 9ea7a0df seems to be only to improve error reporting. So replace __ext4_journal_stop() with ext4_journal_abort_handle() which was there before that commit and add WARN_ON_ONCE() to dump stack to provide useful information. Reported-by: Sage Weil Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org # 3.2+ --- fs/ext4/ext4_jbd2.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 72a3600aedbd..17ac112ab101 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -255,10 +255,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - if (err) { - /* Errors can only happen if there is a bug */ - handle->h_err = err; - __ext4_journal_stop(where, line, handle); + /* Errors can only happen if there is a bug */ + if (WARN_ON_ONCE(err)) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); } } else { if (inode) -- cgit v1.2.3 From 44512449c0ab368889dd13ae0031fba74ee7e1d2 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Thu, 15 Aug 2013 15:36:49 -0500 Subject: jfs: fix readdir cookie incompatibility with NFSv4 NFSv4 reserves readdir cookie values 0-2 for special entries (. and ..), but jfs allows a value of 2 for a non-special entry. This incompatibility can result in the nfs client reporting a readdir loop. This patch doesn't change the value stored internally, but adds one to the value exposed to the iterate method. Signed-off-by: Dave Kleikamp Tested-by: Christian Kujau --- fs/jfs/jfs_dtree.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 8743ba9c6742..984c2bbf4f61 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3047,6 +3047,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) dir_index = (u32) ctx->pos; + /* + * NFSv4 reserves cookies 1 and 2 for . and .. so the value + * we return to the vfs is one greater than the one we use + * internally. + */ + if (dir_index) + dir_index--; + if (dir_index > 1) { struct dir_table_slot dirtab_slot; @@ -3086,7 +3094,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); - ctx->pos = -1; + ctx->pos = DIREND; return 0; } } else { @@ -3094,14 +3102,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) /* * self "." */ - ctx->pos = 0; + ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ - ctx->pos = 1; + ctx->pos = 2; if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; @@ -3122,22 +3130,23 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * - * pn = index = 0: First entry "." - * pn = 0; index = 1: Second entry ".." + * pn = 0; index = 1: First entry "." + * pn = 0; index = 2: Second entry ".." * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ dtpos = ctx->pos; - if (dtpos == 0) { + if (dtpos < 2) { /* build "." entry */ + ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; - dtoffset->index = 1; + dtoffset->index = 2; ctx->pos = dtpos; } if (dtoffset->pn == 0) { - if (dtoffset->index == 1) { + if (dtoffset->index == 2) { /* build ".." entry */ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; @@ -3228,6 +3237,12 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) } jfs_dirent->position = unique_pos++; } + /* + * We add 1 to the index because we may + * use a value of 2 internally, and NFSv4 + * doesn't like that. + */ + jfs_dirent->position++; } else { jfs_dirent->position = dtpos; len = min(d_namleft, DTLHDRDATALEN_LEGACY); -- cgit v1.2.3 From 2b047252d087be7f2ba088b4933cd904f92e6fce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Aug 2013 11:42:25 -0700 Subject: Fix TLB gather virtual address range invalidation corner cases Ben Tebulin reported: "Since v3.7.2 on two independent machines a very specific Git repository fails in 9/10 cases on git-fsck due to an SHA1/memory failures. This only occurs on a very specific repository and can be reproduced stably on two independent laptops. Git mailing list ran out of ideas and for me this looks like some very exotic kernel issue" and bisected the failure to the backport of commit 53a59fc67f97 ("mm: limit mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT"). That commit itself is not actually buggy, but what it does is to make it much more likely to hit the partial TLB invalidation case, since it introduces a new case in tlb_next_batch() that previously only ever happened when running out of memory. The real bug is that the TLB gather virtual memory range setup is subtly buggered. It was introduced in commit 597e1c3580b7 ("mm/mmu_gather: enable tlb flush range in generic mmu_gather"), and the range handling was already fixed at least once in commit e6c495a96ce0 ("mm: fix the TLB range flushed when __tlb_remove_page() runs out of slots"), but that fix was not complete. The problem with the TLB gather virtual address range is that it isn't set up by the initial tlb_gather_mmu() initialization (which didn't get the TLB range information), but it is set up ad-hoc later by the functions that actually flush the TLB. And so any such case that forgot to update the TLB range entries would potentially miss TLB invalidates. Rather than try to figure out exactly which particular ad-hoc range setup was missing (I personally suspect it's the hugetlb case in zap_huge_pmd(), which didn't have the same logic as zap_pte_range() did), this patch just gets rid of the problem at the source: make the TLB range information available to tlb_gather_mmu(), and initialize it when initializing all the other tlb gather fields. This makes the patch larger, but conceptually much simpler. And the end result is much more understandable; even if you want to play games with partial ranges when invalidating the TLB contents in chunks, now the range information is always there, and anybody who doesn't want to bother with it won't introduce subtle bugs. Ben verified that this fixes his problem. Reported-bisected-and-tested-by: Ben Tebulin Build-testing-by: Stephen Rothwell Build-testing-by: Richard Weinberger Reviewed-by: Michal Hocko Acked-by: Peter Zijlstra Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 7 +++++-- arch/arm64/include/asm/tlb.h | 7 +++++-- arch/ia64/include/asm/tlb.h | 9 ++++++--- arch/s390/include/asm/tlb.h | 8 ++++++-- arch/sh/include/asm/tlb.h | 6 ++++-- arch/um/include/asm/tlb.h | 6 ++++-- fs/exec.c | 4 ++-- include/asm-generic/tlb.h | 2 +- mm/hugetlb.c | 2 +- mm/memory.c | 36 +++++++++++++++++++++--------------- mm/mmap.c | 4 ++-- 11 files changed, 57 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 46e7cfb3e721..0baf7f0d9394 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -43,6 +43,7 @@ struct mmu_gather { struct mm_struct *mm; unsigned int fullmm; struct vm_area_struct *vma; + unsigned long start, end; unsigned long range_start; unsigned long range_end; unsigned int nr; @@ -107,10 +108,12 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int fullmm) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = fullmm; + tlb->fullmm = !(start | (end+1)); + tlb->start = start; + tlb->end = end; tlb->vma = NULL; tlb->max = ARRAY_SIZE(tlb->local); tlb->pages = tlb->local; diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 46b3beb4b773..717031a762c2 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -35,6 +35,7 @@ struct mmu_gather { struct mm_struct *mm; unsigned int fullmm; struct vm_area_struct *vma; + unsigned long start, end; unsigned long range_start; unsigned long range_end; unsigned int nr; @@ -97,10 +98,12 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int fullmm) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = fullmm; + tlb->fullmm = !(start | (end+1)); + tlb->start = start; + tlb->end = end; tlb->vma = NULL; tlb->max = ARRAY_SIZE(tlb->local); tlb->pages = tlb->local; diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index ef3a9de01954..bc5efc7c3f3f 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -22,7 +22,7 @@ * unmapping a portion of the virtual address space, these hooks are called according to * the following template: * - * tlb <- tlb_gather_mmu(mm, full_mm_flush); // start unmap for address space MM + * tlb <- tlb_gather_mmu(mm, start, end); // start unmap for address space MM * { * for each vma that needs a shootdown do { * tlb_start_vma(tlb, vma); @@ -58,6 +58,7 @@ struct mmu_gather { unsigned int max; unsigned char fullmm; /* non-zero means full mm flush */ unsigned char need_flush; /* really unmapped some PTEs? */ + unsigned long start, end; unsigned long start_addr; unsigned long end_addr; struct page **pages; @@ -155,13 +156,15 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb) static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; tlb->max = ARRAY_SIZE(tlb->local); tlb->pages = tlb->local; tlb->nr = 0; - tlb->fullmm = full_mm_flush; + tlb->fullmm = !(start | (end+1)); + tlb->start = start; + tlb->end = end; tlb->start_addr = ~0UL; } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b75d7d686684..23a64d25f2b1 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -32,6 +32,7 @@ struct mmu_gather { struct mm_struct *mm; struct mmu_table_batch *batch; unsigned int fullmm; + unsigned long start, unsigned long end; }; struct mmu_table_batch { @@ -48,10 +49,13 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table); static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned int full_mm_flush) + unsigned long start, + unsigned long end) { tlb->mm = mm; - tlb->fullmm = full_mm_flush; + tlb->start = start; + tlb->end = end; + tlb->fullmm = !(start | (end+1)); tlb->batch = NULL; if (tlb->fullmm) __tlb_flush_mm(mm); diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index e61d43d9f689..362192ed12fe 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -36,10 +36,12 @@ static inline void init_tlb_gather(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = full_mm_flush; + tlb->start = start; + tlb->end = end; + tlb->fullmm = !(start | (end+1)); init_tlb_gather(tlb); } diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 4febacd1a8a1..29b0301c18aa 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -45,10 +45,12 @@ static inline void init_tlb_gather(struct mmu_gather *tlb) } static inline void -tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = full_mm_flush; + tlb->start = start; + tlb->end = end; + tlb->fullmm = !(start | (end+1)); init_tlb_gather(tlb); } diff --git a/fs/exec.c b/fs/exec.c index 9c73def87642..fd774c7cb483 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -608,7 +608,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return -ENOMEM; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, old_start, old_end); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. @@ -625,7 +625,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } - tlb_finish_mmu(&tlb, new_end, old_end); + tlb_finish_mmu(&tlb, old_start, old_end); /* * Shrink the vma to just the new range. Always succeeds. diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 13821c339a41..5672d7ea1fa0 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -112,7 +112,7 @@ struct mmu_gather { #define HAVE_GENERIC_MMU_GATHER -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm); +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end); void tlb_flush_mmu(struct mmu_gather *tlb); void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 83aff0a4d093..b60f33080a28 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2490,7 +2490,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); tlb_finish_mmu(&tlb, start, end); } diff --git a/mm/memory.c b/mm/memory.c index 40268410732a..af84bc0ec17c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -209,14 +209,15 @@ static int tlb_next_batch(struct mmu_gather *tlb) * tear-down from @mm. The @fullmm argument is used when @mm is without * users and we're going to destroy the full address space (exit/execve). */ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = fullmm; + /* Is it from 0 to ~0? */ + tlb->fullmm = !(start | (end+1)); tlb->need_flush_all = 0; - tlb->start = -1UL; - tlb->end = 0; + tlb->start = start; + tlb->end = end; tlb->need_flush = 0; tlb->local.next = NULL; tlb->local.nr = 0; @@ -256,8 +257,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e { struct mmu_gather_batch *batch, *next; - tlb->start = start; - tlb->end = end; tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ @@ -1099,7 +1098,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - unsigned long range_start = addr; again: init_rss_vec(rss); @@ -1205,17 +1203,25 @@ again: * and page-free while holding it. */ if (force_flush) { + unsigned long old_end; + force_flush = 0; -#ifdef HAVE_GENERIC_MMU_GATHER - tlb->start = range_start; + /* + * Flush the TLB just for the previous segment, + * then update the range to be the remaining + * TLB range. + */ + old_end = tlb->end; tlb->end = addr; -#endif + tlb_flush_mmu(tlb); - if (addr != end) { - range_start = addr; + + tlb->start = addr; + tlb->end = old_end; + + if (addr != end) goto again; - } } return addr; @@ -1400,7 +1406,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end = start + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) @@ -1426,7 +1432,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr unsigned long end = address + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, address, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, address, end); unmap_single_vma(&tlb, vma, address, end, details); diff --git a/mm/mmap.c b/mm/mmap.c index 1edbaa3136c3..f9c97d10b873 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2336,7 +2336,7 @@ static void unmap_region(struct mm_struct *mm, struct mmu_gather tlb; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, @@ -2709,7 +2709,7 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb_gather_mmu(&tlb, mm, 1); + tlb_gather_mmu(&tlb, mm, 0, -1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); -- cgit v1.2.3 From a361293f5fedea0016a10599f409631a15d47ee7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Aug 2013 21:19:41 -0400 Subject: jbd2: Fix oops in jbd2_journal_file_inode() Commit 0713ed0cde76438d05849f1537d3aab46e099475 added jbd2_journal_file_inode() call into ext4_block_zero_page_range(). However that function gets called from truncate path and thus inode needn't have jinode attached - that happens in ext4_file_open() but the file needn't be ever open since mount. Calling jbd2_journal_file_inode() without jinode attached results in the oops. We fix the problem by attaching jinode to inode also in ext4_truncate() and ext4_punch_hole() when we are going to zero out partial blocks. Reported-by: majianpeng Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/file.c | 21 ++++----------------- fs/ext4/inode.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b577e45425b0..0ab26fbf3380 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2086,6 +2086,7 @@ extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6f4cc567c382..319c9d26279a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -219,7 +219,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp) { struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; @@ -259,22 +258,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp) * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ - if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { - struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); - - spin_lock(&inode->i_lock); - if (!ei->jinode) { - if (!jinode) { - spin_unlock(&inode->i_lock); - return -ENOMEM; - } - ei->jinode = jinode; - jbd2_journal_init_jbd_inode(ei->jinode, inode); - jinode = NULL; - } - spin_unlock(&inode->i_lock); - if (unlikely(jinode != NULL)) - jbd2_free_inode(jinode); + if (filp->f_mode & FMODE_WRITE) { + int ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; } return dquot_file_open(inode, filp); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dd32a2eacd0d..c2ca04e67a4f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3533,6 +3533,18 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) offset; } + if (offset & (sb->s_blocksize - 1) || + (offset + length) & (sb->s_blocksize - 1)) { + /* + * Attach jinode to inode for jbd2 if we do any zeroing of + * partial block + */ + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + goto out_mutex; + + } + first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; @@ -3601,6 +3613,31 @@ out_mutex: return ret; } +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; +} + /* * ext4_truncate() * @@ -3661,6 +3698,12 @@ void ext4_truncate(struct inode *inode) return; } + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + if (ext4_inode_attach_jinode(inode) < 0) + return; + } + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else -- cgit v1.2.3 From 2523d47a798b5d985ba404d0d793270494ccf6e9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 17 Jul 2013 08:11:32 +0100 Subject: GFS2: Fix typo in gfs2_create_inode() PTR_RET should be PTR_ERR Reported-by: Sachin Kamat Cc: Rusty Russell Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index bbb2715171cd..a01b8fd3a1c1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -594,7 +594,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, } gfs2_glock_dq_uninit(ghs); if (IS_ERR(d)) - return PTR_RET(d); + return PTR_ERR(d); return error; } else if (error != -ENOENT) { goto fail_gunlock; -- cgit v1.2.3 From d08fa65a81625765ff4733b4a6556b7156954073 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 30 Jul 2013 08:40:25 -0400 Subject: GFS2: WQ_NON_REENTRANT is meaningless and going away dbf2576e37 ("workqueue: make all workqueues non-reentrant") made WQ_NON_REENTRANT no-op and the flag is going away. Remove its usages. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Steven Whitehouse Cc: cluster-devel@redhat.com --- fs/gfs2/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index e04d0e09ee7b..7b0f5043cf24 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -155,7 +155,7 @@ static int __init init_gfs2_fs(void) goto fail_wq; gfs2_control_wq = alloc_workqueue("gfs2_control", - WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); + WQ_UNBOUND | WQ_FREEZABLE, 0); if (!gfs2_control_wq) goto fail_recovery; -- cgit v1.2.3 From 1bc333f4cf601f77ba0f5046ff226fe654e83bee Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Fri, 26 Jul 2013 17:09:33 -0500 Subject: GFS2: don't overrun reserved revokes When run during fsync, a gfs2_log_flush could happen between the time when gfs2_ail_flush checked the number of blocks to revoke, and when it actually started the transaction to do those revokes. This occassionally caused it to need more revokes than it reserved, causing gfs2 to crash. Instead of just reserving enough revokes to handle the blocks that currently need them, this patch makes gfs2_ail_flush reserve the maximum number of revokes it can, without increasing the total number of reserved log blocks. This patch also passes the number of reserved revokes to __gfs2_ail_flush() so that it doesn't go over its limit and cause a crash like we're seeing. Non-fsync calls to __gfs2_ail_flush will still cause a BUG() necessary revokes are skipped. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5f2e5224c51c..e2e0a90396e7 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -47,7 +47,8 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) * None of the buffers should be dirty, locked, or pinned. */ -static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, + unsigned int nr_revokes) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; @@ -57,7 +58,9 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { + list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) { + if (nr_revokes == 0) + break; bh = bd->bd_bh; if (bh->b_state & b_state) { if (fsync) @@ -65,6 +68,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) gfs2_ail_error(gl, bh); } gfs2_trans_add_revoke(sdp, bd); + nr_revokes--; } GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); @@ -91,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; - __gfs2_ail_flush(gl, 0); + __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); @@ -101,15 +105,19 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); + unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); int ret; if (!revokes) return; - ret = gfs2_trans_begin(sdp, 0, revokes); + while (revokes > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + + ret = gfs2_trans_begin(sdp, 0, max_revokes); if (ret) return; - __gfs2_ail_flush(gl, fsync); + __gfs2_ail_flush(gl, fsync, max_revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } -- cgit v1.2.3 From dfc4616ddeb133290599d4d13936e208f6ba8142 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 15 Aug 2013 10:54:43 +0300 Subject: GFS2: alloc_workqueue() doesn't return an ERR_PTR alloc_workqueue() returns a NULL on error, it doesn't return an ERR_PTR. Signed-off-by: Dan Carpenter Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9435384562a2..544a809819c3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1838,14 +1838,14 @@ int __init gfs2_glock_init(void) glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0); - if (IS_ERR(glock_workqueue)) - return PTR_ERR(glock_workqueue); + if (!glock_workqueue) + return -ENOMEM; gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); - if (IS_ERR(gfs2_delete_workqueue)) { + if (!gfs2_delete_workqueue) { destroy_workqueue(glock_workqueue); - return PTR_ERR(gfs2_delete_workqueue); + return -ENOMEM; } register_shrinker(&glock_shrinker); -- cgit v1.2.3 From 7bd9ee58a4fe026514266757e812cb89c3c945eb Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 16 Aug 2013 21:10:28 +0100 Subject: GFS2: Check for glock already held in gfs2_getxattr Since the introduction of atomic_open, gfs2_getxattr can be called with the glock already held, so we need to allow for this. Signed-off-by: Steven Whitehouse Reported-by: David Teigland Tested-by: David Teigland --- fs/gfs2/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a01b8fd3a1c1..64915eeae5a7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1750,6 +1750,10 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, struct gfs2_holder gh; int ret; + /* For selinux during lookup */ + if (gfs2_glock_is_locked_by_me(ip->i_gl)) + return generic_getxattr(dentry, name, data, size); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { -- cgit v1.2.3 From 94fc5d9de5bd757ad46f0d94bc4ebf617c4487f6 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Mon, 19 Aug 2013 18:30:31 +0200 Subject: proc: return on proc_readdir error Commit f0c3b5093add ("[readdir] convert procfs") introduced a bug on the listing of the proc file-system. The return value of proc_readdir() isn't tested anymore in the proc_root_readdir function. This lead to an "interesting" behaviour when we are using the getdents() system call with a buffer too small: instead of failing, it returns the first entries of /proc (enough to fill the given buffer), plus the PID directories. This is not triggered on glibc (as getdents is called with a 32KB buffer), but on uclibc, the buffer size is only 1KB, thus some proc entries are missing. See https://lkml.org/lkml/2013/8/12/288 for more background. Signed-off-by: Richard Genoud Cc: Al Viro Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index 229e366598da..e0a790da726d 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -205,7 +205,9 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr static int proc_root_readdir(struct file *file, struct dir_context *ctx) { if (ctx->pos < FIRST_PROCESS_ENTRY) { - proc_readdir(file, ctx); + int error = proc_readdir(file, ctx); + if (unlikely(error <= 0)) + return error; ctx->pos = FIRST_PROCESS_ENTRY; } -- cgit v1.2.3 From fd3930f70c8d14008f3377d51ce039806dfc542e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 19 Aug 2013 16:26:12 -0700 Subject: proc: more readdir conversion bug-fixes In the previous commit, Richard Genoud fixed proc_root_readdir(), which had lost the check for whether all of the non-process /proc entries had been returned or not. But that in turn exposed _another_ bug, namely that the original readdir conversion patch had yet another problem: it had lost the return value of proc_readdir_de(), so now checking whether it had completed successfully or not didn't actually work right anyway. This reinstates the non-zero return for the "end of base entries" that had also gotten lost in commit f0c3b5093add ("[readdir] convert procfs"). So now you get all the base entries *and* you get all the process entries, regardless of getdents buffer size. (Side note: the Linux "getdents" manual page actually has a nice example application for testing getdents, which can be easily modified to use different buffers. Who knew? Man-pages can be useful) Reported-by: Emmanuel Benisty Reported-by: Marc Dionne Cc: Richard Genoud Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 94441a407337..737e15615b04 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -271,7 +271,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, de = next; } while (de); spin_unlock(&proc_subdir_lock); - return 0; + return 1; } int proc_readdir(struct file *file, struct dir_context *ctx) -- cgit v1.2.3 From 35dc248383bbab0a7203fca4d722875bc81ef091 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 5 Aug 2013 17:55:01 -0700 Subject: [SCSI] sg: Fix user memory corruption when SG_IO is interrupted by a signal There is a nasty bug in the SCSI SG_IO ioctl that in some circumstances leads to one process writing data into the address space of some other random unrelated process if the ioctl is interrupted by a signal. What happens is the following: - A process issues an SG_IO ioctl with direction DXFER_FROM_DEV (ie the underlying SCSI command will transfer data from the SCSI device to the buffer provided in the ioctl) - Before the command finishes, a signal is sent to the process waiting in the ioctl. This will end up waking up the sg_ioctl() code: result = wait_event_interruptible(sfp->read_wait, (srp_done(sfp, srp) || sdp->detached)); but neither srp_done() nor sdp->detached is true, so we end up just setting srp->orphan and returning to userspace: srp->orphan = 1; write_unlock_irq(&sfp->rq_list_lock); return result; /* -ERESTARTSYS because signal hit process */ At this point the original process is done with the ioctl and blithely goes ahead handling the signal, reissuing the ioctl, etc. - Eventually, the SCSI command issued by the first ioctl finishes and ends up in sg_rq_end_io(). At the end of that function, we run through: write_lock_irqsave(&sfp->rq_list_lock, iflags); if (unlikely(srp->orphan)) { if (sfp->keep_orphan) srp->sg_io_owned = 0; else done = 0; } srp->done = done; write_unlock_irqrestore(&sfp->rq_list_lock, iflags); if (likely(done)) { /* Now wake up any sg_read() that is waiting for this * packet. */ wake_up_interruptible(&sfp->read_wait); kill_fasync(&sfp->async_qp, SIGPOLL, POLL_IN); kref_put(&sfp->f_ref, sg_remove_sfp); } else { INIT_WORK(&srp->ew.work, sg_rq_end_io_usercontext); schedule_work(&srp->ew.work); } Since srp->orphan *is* set, we set done to 0 (assuming the userspace app has not set keep_orphan via an SG_SET_KEEP_ORPHAN ioctl), and therefore we end up scheduling sg_rq_end_io_usercontext() to run in a workqueue. - In workqueue context we go through sg_rq_end_io_usercontext() -> sg_finish_rem_req() -> blk_rq_unmap_user() -> ... -> bio_uncopy_user() -> __bio_copy_iov() -> copy_to_user(). The key point here is that we are doing copy_to_user() on a workqueue -- that is, we're on a kernel thread with current->mm equal to whatever random previous user process was scheduled before this kernel thread. So we end up copying whatever data the SCSI command returned to the virtual address of the buffer passed into the original ioctl, but it's quite likely we do this copying into a different address space! As suggested by James Bottomley , add a check for current->mm (which is NULL if we're on a kernel thread without a real userspace address space) in bio_uncopy_user(), and skip the copy if we're on a kernel thread. There's no reason that I can think of for any caller of bio_uncopy_user() to want to do copying on a kernel thread with a random active userspace address space. Huge thanks to Costa Sapuntzakis for the original pointer to this bug in the sg code. Signed-off-by: Roland Dreier Tested-by: David Milburn Cc: Jens Axboe Cc: Signed-off-by: James Bottomley --- fs/bio.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 94bbc04dba77..c5eae7251490 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1045,12 +1045,22 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - int ret = 0; + struct bio_vec *bvec; + int ret = 0, i; - if (!bio_flagged(bio, BIO_NULL_MAPPED)) - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, bio_data_dir(bio) == READ, - 0, bmd->is_our_pages); + if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free. + */ + if (current->mm) + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, + bmd->nr_sgvecs, bio_data_dir(bio) == READ, + 0, bmd->is_our_pages); + else if (bmd->is_our_pages) + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); + } bio_free_map_data(bmd); bio_put(bio); return ret; -- cgit v1.2.3 From 2df37a19c686c2d7c4e9b4ce1505b5141e3e5552 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Thu, 22 Aug 2013 16:35:44 -0700 Subject: nilfs2: remove double bio_put() in nilfs_end_bio_write() for BIO_EOPNOTSUPP error Remove double call of bio_put() in nilfs_end_bio_write() for the case of BIO_EOPNOTSUPP error detection. The issue was found by Dan Carpenter and he suggests first version of the fix too. Signed-off-by: Vyacheslav Dubeyko Reported-by: Dan Carpenter Acked-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segbuf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc9a913784ab..5bacf46dc4b3 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -345,8 +345,7 @@ static void nilfs_end_bio_write(struct bio *bio, int err) if (err == -EOPNOTSUPP) { set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - bio_put(bio); - /* to be detected by submit_seg_bio() */ + /* to be detected by nilfs_segbuf_submit_bio() */ } if (!uptodate) -- cgit v1.2.3 From 4bf93b50fd04118ac7f33a3c2b8a0a1f9fa80bc9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Thu, 22 Aug 2013 16:35:45 -0700 Subject: nilfs2: fix issue with counting number of bio requests for BIO_EOPNOTSUPP error detection Fix the issue with improper counting number of flying bio requests for BIO_EOPNOTSUPP error detection case. The sb_nbio must be incremented exactly the same number of times as complete() function was called (or will be called) because nilfs_segbuf_wait() will call wail_for_completion() for the number of times set to sb_nbio: do { wait_for_completion(&segbuf->sb_bio_event); } while (--segbuf->sb_nbio > 0); Two functions complete() and wait_for_completion() must be called the same number of times for the same sb_bio_event. Otherwise, wait_for_completion() will hang or leak. Signed-off-by: Vyacheslav Dubeyko Cc: Dan Carpenter Acked-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 5bacf46dc4b3..2d8be51f90dc 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -376,12 +376,12 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_private = segbuf; bio_get(bio); submit_bio(mode, bio); + segbuf->sb_nbio++; if (bio_flagged(bio, BIO_EOPNOTSUPP)) { bio_put(bio); err = -EOPNOTSUPP; goto failed; } - segbuf->sb_nbio++; bio_put(bio); wi->bio = NULL; -- cgit v1.2.3 From 118b23022512eb2f41ce42db70dc0568d00be4ba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 24 Aug 2013 12:08:17 -0400 Subject: cope with potentially long ->d_dname() output for shmem/hugetlb dynamic_dname() is both too much and too little for those - the output may be well in excess of 64 bytes dynamic_dname() assumes to be enough (thanks to ashmem feeding really long names to shmem_file_setup()) and vsnprintf() is an overkill for those guys. Signed-off-by: Al Viro --- fs/dcache.c | 11 +++++++++++ fs/hugetlbfs/inode.c | 8 +------- include/linux/dcache.h | 1 + mm/shmem.c | 8 +------- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 87bdb5329c3c..83cfb834db03 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2724,6 +2724,17 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, return memcpy(buffer, temp, sz); } +char *simple_dname(struct dentry *dentry, char *buffer, int buflen) +{ + char *end = buffer + buflen; + /* these dentries are never renamed, so d_lock is not needed */ + if (prepend(&end, &buflen, " (deleted)", 11) || + prepend_name(&end, &buflen, &dentry->d_name) || + prepend(&end, &buflen, "/", 1)) + end = ERR_PTR(-ENAMETOOLONG); + return end; +} + /* * Write full pathname from the root of the filesystem into the buffer. */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 34423978b170..d19b30ababf1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -926,14 +926,8 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } -static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen) -{ - return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", - dentry->d_name.name); -} - static struct dentry_operations anon_ops = { - .d_dname = hugetlb_dname + .d_dname = simple_dname }; /* diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b90337c9d468..4a12532da8c4 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -336,6 +336,7 @@ extern int d_validate(struct dentry *, struct dentry *); * helper function for dentry_operations.d_dname() members */ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); +extern char *simple_dname(struct dentry *, char *, int); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); diff --git a/mm/shmem.c b/mm/shmem.c index 8335dbd3fc35..e43dc555069d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2909,14 +2909,8 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen) -{ - return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)", - dentry->d_name.name); -} - static struct dentry_operations anon_ops = { - .d_dname = shmem_dname + .d_dname = simple_dname }; /** -- cgit v1.2.3 From a5a1955e0c2d4d325fabdf6b09aa3f9d33e78a10 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Aug 2013 17:14:34 +0200 Subject: proc: kill the extra proc_readfd_common()->dir_emit_dots() proc_readfd_common() does dir_emit_dots() twice in a row, we need to do this only once. Signed-off-by: Oleg Nesterov Signed-off-by: Al Viro --- fs/proc/fd.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 75f2890abbd8..0ff80f9b930f 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -228,8 +228,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!p) return -ENOENT; - if (!dir_emit_dots(file, ctx)) - goto out; if (!dir_emit_dots(file, ctx)) goto out; files = get_files_struct(p); -- cgit v1.2.3 From 136eefa48d638cd8e57e8e2f0087f59c76863bee Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Aug 2013 12:49:40 +0300 Subject: efs: iget_locked() doesn't return an ERR_PTR() The iget_locked() function returns NULL on error and never an ERR_PTR. Signed-off-by: Dan Carpenter Signed-off-by: Al Viro --- fs/efs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/efs/inode.c b/fs/efs/inode.c index f3913eb2c474..d15ccf20f1b3 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -57,7 +57,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) struct inode *inode; inode = iget_locked(super, ino); - if (IS_ERR(inode)) + if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; -- cgit v1.2.3 From 821ff77c6c524c40667e01483fc72428b58810b4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Aug 2013 12:51:03 +0300 Subject: bfs: iget_locked() doesn't return an ERR_PTR iget_locked() returns a NULL on error, it doesn't return an ERR_PTR. Signed-off-by: Dan Carpenter Signed-off-by: Al Viro --- fs/bfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 5e376bb93419..8defc6b3f9a2 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -40,7 +40,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) int block, off; inode = iget_locked(sb, ino); - if (IS_ERR(inode)) + if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; -- cgit v1.2.3 From 52e220d357a38cb29fa2e29f34ed94c1d66357f4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Aug 2013 12:44:39 +0300 Subject: VFS: collect_mounts() should return an ERR_PTR This should actually be returning an ERR_PTR on error instead of NULL. That was how it was designed and all the callers expect it. [AV: actually, that's what "VFS: Make clone_mnt()/copy_tree()/collect_mounts() return errors" missed - originally collect_mounts() was expected to return NULL on failure] Cc: # 3.10+ Signed-off-by: Dan Carpenter Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 7b1ca9ba0b0a..a45ba4f267fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1429,7 +1429,7 @@ struct vfsmount *collect_mounts(struct path *path) CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) - return NULL; + return ERR_CAST(tree); return &tree->mnt; } -- cgit v1.2.3 From f0cc6ffb8ce8961db587e5072168cac0cbc25f05 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 28 Aug 2013 09:18:05 -0700 Subject: Revert "fs: Allow unprivileged linkat(..., AT_EMPTY_PATH) aka flink" This reverts commit bb2314b47996491bbc5add73633905c3120b6268. It wasn't necessarily wrong per se, but we're still busily discussing the exact details of this all, so I'm going to revert it for now. It's true that you can already do flink() through /proc and that flink() isn't new. But as Brad Spengler points out, some secure environments do not mount proc, and flink adds a new interface that can avoid path lookup of the source for those kinds of environments. We may re-do this (and even mark it for stable backporting back in 3.11 and possibly earlier) once the whole discussion about the interface is done. Cc: Andy Lutomirski Cc: Al Viro Cc: Oleg Nesterov Cc: Brad Spengler Signed-off-by: Linus Torvalds --- fs/namei.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 89a612e392eb..8b61d103a8a7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3671,11 +3671,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; /* - * Using empty names is equivalent to using AT_SYMLINK_FOLLOW - * on /proc/self/fd/. + * To use null names we require CAP_DAC_READ_SEARCH + * This ensures that not everyone will be able to create + * handlink using the passed filedescriptor. */ - if (flags & AT_EMPTY_PATH) + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; how = LOOKUP_EMPTY; + } if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; -- cgit v1.2.3 From 98474236f72e5a8b89c14cd7c74f0bb77a4b1a99 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 28 Aug 2013 18:24:59 -0700 Subject: vfs: make the dentry cache use the lockref infrastructure This just replaces the dentry count/lock combination with the lockref structure that contains both a count and a spinlock, and does the mechanical conversion to use the lockref infrastructure. There are no semantic changes here, it's purely syntactic. The reference lockref implementation uses the spinlock exactly the same way that the old dcache code did, and the bulk of this patch is just expanding the internal "d_count" use in the dcache code to use "d_lockref.count" instead. This is purely preparation for the real change to make the reference count updates be lockless during the 3.12 merge window. [ As with the previous commit, this is a rewritten version of a concept originally from Waiman, so credit goes to him, blame for any errors goes to me. Waiman's patch had some semantic differences for taking advantage of the lockless update in dget_parent(), while this patch is intentionally a pure search-and-replace change with no semantic changes. - Linus ] Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- fs/dcache.c | 57 ++++++++++++++++++++------------------------------ fs/namei.c | 6 +++--- include/linux/dcache.h | 19 ++++++++--------- 3 files changed, 35 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 83cfb834db03..b949af850cd6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -229,7 +229,7 @@ static void __d_free(struct rcu_head *head) */ static void d_free(struct dentry *dentry) { - BUG_ON(dentry->d_count); + BUG_ON(dentry->d_lockref.count); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -467,7 +467,7 @@ relock: } if (ref) - dentry->d_count--; + dentry->d_lockref.count--; /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. @@ -513,15 +513,10 @@ void dput(struct dentry *dentry) return; repeat: - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) might_sleep(); - spin_lock(&dentry->d_lock); - BUG_ON(!dentry->d_count); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } if (dentry->d_flags & DCACHE_OP_DELETE) { if (dentry->d_op->d_delete(dentry)) @@ -535,7 +530,7 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); - dentry->d_count--; + dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return; @@ -590,7 +585,7 @@ int d_invalidate(struct dentry * dentry) * We also need to leave mountpoints alone, * directory or not. */ - if (dentry->d_count > 1 && dentry->d_inode) { + if (dentry->d_lockref.count > 1 && dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; @@ -606,14 +601,12 @@ EXPORT_SYMBOL(d_invalidate); /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) { - dentry->d_count++; + dentry->d_lockref.count++; } static inline void __dget(struct dentry *dentry) { - spin_lock(&dentry->d_lock); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); + lockref_get(&dentry->d_lockref); } struct dentry *dget_parent(struct dentry *dentry) @@ -634,8 +627,8 @@ repeat: goto repeat; } rcu_read_unlock(); - BUG_ON(!ret->d_count); - ret->d_count++; + BUG_ON(!ret->d_lockref.count); + ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } @@ -718,7 +711,7 @@ restart: spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); - if (!dentry->d_count) { + if (!dentry->d_lockref.count) { __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -763,12 +756,8 @@ static void try_prune_one_dentry(struct dentry *dentry) /* Prune ancestors. */ dentry = parent; while (dentry) { - spin_lock(&dentry->d_lock); - if (dentry->d_count > 1) { - dentry->d_count--; - spin_unlock(&dentry->d_lock); + if (lockref_put_or_lock(&dentry->d_lockref)) return; - } dentry = dentry_kill(dentry, 1); } } @@ -793,7 +782,7 @@ static void shrink_dentry_list(struct list_head *list) * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. */ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; @@ -913,7 +902,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry_lru_del(dentry); __d_shrink(dentry); - if (dentry->d_count != 0) { + if (dentry->d_lockref.count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" " still in use (%d)" @@ -922,7 +911,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry->d_name.name, - dentry->d_count, + dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); BUG(); @@ -933,7 +922,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) list_del(&dentry->d_u.d_child); } else { parent = dentry->d_parent; - parent->d_count--; + parent->d_lockref.count--; list_del(&dentry->d_u.d_child); } @@ -981,7 +970,7 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - dentry->d_count--; + dentry->d_lockref.count--; shrink_dcache_for_umount_subtree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { @@ -1147,7 +1136,7 @@ resume: * loop in shrink_dcache_parent() might not make any progress * and loop forever. */ - if (dentry->d_count) { + if (dentry->d_lockref.count) { dentry_lru_del(dentry); } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { dentry_lru_move_list(dentry, dispose); @@ -1269,7 +1258,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) smp_wmb(); dentry->d_name.name = dname; - dentry->d_count = 1; + dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_init(&dentry->d_seq); @@ -1970,7 +1959,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) goto next; } - dentry->d_count++; + dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; @@ -2069,7 +2058,7 @@ again: spin_lock(&dentry->d_lock); inode = dentry->d_inode; isdir = S_ISDIR(inode->i_mode); - if (dentry->d_count == 1) { + if (dentry->d_lockref.count == 1) { if (!spin_trylock(&inode->i_lock)) { spin_unlock(&dentry->d_lock); cpu_relax(); @@ -2948,7 +2937,7 @@ resume: } if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_count--; + dentry->d_lockref.count--; } spin_unlock(&dentry->d_lock); } @@ -2956,7 +2945,7 @@ resume: struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; - this_parent->d_count--; + this_parent->d_lockref.count--; } this_parent = try_to_ascend(this_parent, locked, seq); if (!this_parent) diff --git a/fs/namei.c b/fs/namei.c index 8b61d103a8a7..7720fbd5277b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -536,8 +536,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) * a reference at this point. */ BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; + BUG_ON(!parent->d_lockref.count); + parent->d_lockref.count++; spin_unlock(&dentry->d_lock); } spin_unlock(&parent->d_lock); @@ -3327,7 +3327,7 @@ void dentry_unhash(struct dentry *dentry) { shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) __d_drop(dentry); spin_unlock(&dentry->d_lock); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4a12532da8c4..efdc94434c30 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -9,6 +9,7 @@ #include #include #include +#include struct nameidata; struct path; @@ -100,6 +101,8 @@ extern unsigned int full_name_hash(const unsigned char *, unsigned int); # endif #endif +#define d_lock d_lockref.lock + struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ @@ -112,8 +115,7 @@ struct dentry { unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ /* Ref lookup also touches following */ - unsigned int d_count; /* protected by d_lock */ - spinlock_t d_lock; /* per dentry lock */ + struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ @@ -318,7 +320,7 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) assert_spin_locked(&dentry->d_lock); if (!read_seqcount_retry(&dentry->d_seq, seq)) { ret = 1; - dentry->d_count++; + dentry->d_lockref.count++; } return ret; @@ -326,7 +328,7 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) static inline unsigned d_count(const struct dentry *dentry) { - return dentry->d_count; + return dentry->d_lockref.count; } /* validate "insecure" dentry pointer */ @@ -357,17 +359,14 @@ extern char *dentry_path(struct dentry *, char *, int); static inline struct dentry *dget_dlock(struct dentry *dentry) { if (dentry) - dentry->d_count++; + dentry->d_lockref.count++; return dentry; } static inline struct dentry *dget(struct dentry *dentry) { - if (dentry) { - spin_lock(&dentry->d_lock); - dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - } + if (dentry) + lockref_get(&dentry->d_lockref); return dentry; } -- cgit v1.2.3 From 49fa8140e487b492da24c76ac3cccad0a0ae63e4 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 28 Aug 2013 16:35:21 -0700 Subject: fs/ocfs2/super.c: Use bigger nodestr to accomodate 32-bit node numbers While using pacemaker/corosync, the node numbers are generated using IP address as opposed to serial node number generation. This may not fit in a 8-byte string. Use a bigger string to print the complete node number. Signed-off-by: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 854d80955bf8..121da2dc3be8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1022,7 +1022,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; - char nodestr[8]; + char nodestr[12]; struct ocfs2_blockcheck_stats stats; trace_ocfs2_fill_super(sb, data, silent); -- cgit v1.2.3 From da9803bc8812f5bd3b26baaa90e515b843c65ff7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Aug 2013 17:29:38 -0400 Subject: FS-Cache: Add interface to check consistency of a cached object Extend the fscache netfs API so that the netfs can ask as to whether a cache object is up to date with respect to its corresponding netfs object: int fscache_check_consistency(struct fscache_cookie *cookie) This will call back to the netfs to check whether the auxiliary data associated with a cookie is correct. It returns 0 if it is and -ESTALE if it isn't; it may also return -ENOMEM and -ERESTARTSYS. The backends now have to implement a mandatory operation pointer: int (*check_consistency)(struct fscache_object *object) that corresponds to the above API call. FS-Cache takes care of pinning the object and the cookie in memory and managing this call with respect to the object state. Original-author: Hongyi Jia Signed-off-by: David Howells cc: Hongyi Jia cc: Milosz Tanski --- Documentation/filesystems/caching/backend-api.txt | 9 +++ Documentation/filesystems/caching/netfs-api.txt | 17 ++++-- fs/fscache/cookie.c | 71 +++++++++++++++++++++++ fs/fscache/internal.h | 6 ++ fs/fscache/page.c | 55 ++++++++++-------- include/linux/fscache-cache.h | 4 ++ include/linux/fscache.h | 20 +++++++ 7 files changed, 154 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt index d78bab9622c6..277d1e810670 100644 --- a/Documentation/filesystems/caching/backend-api.txt +++ b/Documentation/filesystems/caching/backend-api.txt @@ -299,6 +299,15 @@ performed on the denizens of the cache. These are held in a structure of type: enough space in the cache to permit this. + (*) Check coherency state of an object [mandatory]: + + int (*check_consistency)(struct fscache_object *object) + + This method is called to have the cache check the saved auxiliary data of + the object against the netfs's idea of the state. 0 should be returned + if they're consistent and -ESTALE otherwise. -ENOMEM and -ERESTARTSYS + may also be returned. + (*) Update object [mandatory]: int (*update_object)(struct fscache_object *object) diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt index 97e6c0ecc5ef..12b344251523 100644 --- a/Documentation/filesystems/caching/netfs-api.txt +++ b/Documentation/filesystems/caching/netfs-api.txt @@ -32,7 +32,7 @@ This document contains the following sections: (9) Setting the data file size (10) Page alloc/read/write (11) Page uncaching - (12) Index and data file update + (12) Index and data file consistency (13) Miscellaneous cookie operations (14) Cookie unregistration (15) Index invalidation @@ -690,9 +690,18 @@ written to the cache and for the cache to finish with the page generally. No error is returned. -========================== -INDEX AND DATA FILE UPDATE -========================== +=============================== +INDEX AND DATA FILE CONSISTENCY +=============================== + +To find out whether auxiliary data for an object is up to data within the +cache, the following function can be called: + + int fscache_check_consistency(struct fscache_cookie *cookie) + +This will call back to the netfs to check whether the auxiliary data associated +with a cookie is correct. It returns 0 if it is and -ESTALE if it isn't; it +may also return -ENOMEM and -ERESTARTSYS. To request an update of the index data for an index or other object, the following function should be called: diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0e91a3c9fdb2..318e8433527c 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -558,3 +558,74 @@ void __fscache_cookie_put(struct fscache_cookie *cookie) _leave(""); } + +/* + * check the consistency between the netfs inode and the backing cache + * + * NOTE: it only serves no-index type + */ +int __fscache_check_consistency(struct fscache_cookie *cookie) +{ + struct fscache_operation *op; + struct fscache_object *object; + int ret; + + _enter("%p,", cookie); + + ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + + if (fscache_wait_for_deferred_lookup(cookie) < 0) + return -ERESTARTSYS; + + if (hlist_empty(&cookie->backing_objects)) + return 0; + + op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!op) + return -ENOMEM; + + fscache_operation_init(op, NULL, NULL); + op->flags = FSCACHE_OP_MYTHREAD | + (1 << FSCACHE_OP_WAITING); + + spin_lock(&cookie->lock); + + if (hlist_empty(&cookie->backing_objects)) + goto inconsistent; + object = hlist_entry(cookie->backing_objects.first, + struct fscache_object, cookie_link); + if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) + goto inconsistent; + + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + + atomic_inc(&cookie->n_active); + if (fscache_submit_op(object, op) < 0) + goto submit_failed; + + /* the work queue now carries its own ref on the object */ + spin_unlock(&cookie->lock); + + ret = fscache_wait_for_operation_activation(object, op, + NULL, NULL, NULL); + if (ret == 0) { + /* ask the cache to honour the operation */ + ret = object->cache->ops->check_consistency(op); + fscache_op_complete(op, false); + } else if (ret == -ENOBUFS) { + ret = 0; + } + + fscache_put_operation(op); + _leave(" = %d", ret); + return ret; + +submit_failed: + atomic_dec(&cookie->n_active); +inconsistent: + spin_unlock(&cookie->lock); + kfree(op); + _leave(" = -ESTALE"); + return -ESTALE; +} +EXPORT_SYMBOL(__fscache_check_consistency); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 12d505bedb5c..4226f6680b06 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -130,6 +130,12 @@ extern void fscache_operation_gc(struct work_struct *); /* * page.c */ +extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); +extern int fscache_wait_for_operation_activation(struct fscache_object *, + struct fscache_operation *, + atomic_t *, + atomic_t *, + void (*)(struct fscache_operation *)); extern void fscache_invalidate_writes(struct fscache_cookie *); /* diff --git a/fs/fscache/page.c b/fs/fscache/page.c index d479ab3c63e4..793e3d5ca4b5 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -278,7 +278,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval( /* * wait for a deferred lookup to complete */ -static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) +int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) { unsigned long jif; @@ -322,42 +322,46 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op) /* * wait for an object to become active (or dead) */ -static int fscache_wait_for_retrieval_activation(struct fscache_object *object, - struct fscache_retrieval *op, - atomic_t *stat_op_waits, - atomic_t *stat_object_dead) +int fscache_wait_for_operation_activation(struct fscache_object *object, + struct fscache_operation *op, + atomic_t *stat_op_waits, + atomic_t *stat_object_dead, + void (*do_cancel)(struct fscache_operation *)) { int ret; - if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags)) + if (!test_bit(FSCACHE_OP_WAITING, &op->flags)) goto check_if_dead; _debug(">>> WT"); - fscache_stat(stat_op_waits); - if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + if (stat_op_waits) + fscache_stat(stat_op_waits); + if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval); + ret = fscache_cancel_op(op, do_cancel); if (ret == 0) return -ERESTARTSYS; /* it's been removed from the pending queue by another party, * so we should get to run shortly */ - wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, + wait_on_bit(&op->flags, FSCACHE_OP_WAITING, fscache_wait_bit, TASK_UNINTERRUPTIBLE); } _debug("<<< GO"); check_if_dead: - if (op->op.state == FSCACHE_OP_ST_CANCELLED) { - fscache_stat(stat_object_dead); + if (op->state == FSCACHE_OP_ST_CANCELLED) { + if (stat_object_dead) + fscache_stat(stat_object_dead); _leave(" = -ENOBUFS [cancelled]"); return -ENOBUFS; } if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state); - fscache_cancel_op(&op->op, fscache_do_cancel_retrieval); - fscache_stat(stat_object_dead); + pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); + fscache_cancel_op(op, do_cancel); + if (stat_object_dead) + fscache_stat(stat_object_dead); return -ENOBUFS; } return 0; @@ -432,10 +436,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_retrieval_activation( - object, op, + ret = fscache_wait_for_operation_activation( + object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); + __fscache_stat(&fscache_n_retrievals_object_dead), + fscache_do_cancel_retrieval); if (ret < 0) goto error; @@ -557,10 +562,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ - ret = fscache_wait_for_retrieval_activation( - object, op, + ret = fscache_wait_for_operation_activation( + object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead)); + __fscache_stat(&fscache_n_retrievals_object_dead), + fscache_do_cancel_retrieval); if (ret < 0) goto error; @@ -658,10 +664,11 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_alloc_ops); - ret = fscache_wait_for_retrieval_activation( - object, op, + ret = fscache_wait_for_operation_activation( + object, &op->op, __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead)); + __fscache_stat(&fscache_n_allocs_object_dead), + fscache_do_cancel_retrieval); if (ret < 0) goto error; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index a9ff9a36b86d..7823e9ef995e 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -251,6 +251,10 @@ struct fscache_cache_ops { /* unpin an object in the cache */ void (*unpin_object)(struct fscache_object *object); + /* check the consistency between the backing cache and the FS-Cache + * cookie */ + bool (*check_consistency)(struct fscache_operation *op); + /* store the updated auxiliary data on an object */ void (*update_object)(struct fscache_object *object); diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 7a086235da4b..d984aff32a11 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -183,6 +183,7 @@ extern struct fscache_cookie *__fscache_acquire_cookie( const struct fscache_cookie_def *, void *); extern void __fscache_relinquish_cookie(struct fscache_cookie *, int); +extern int __fscache_check_consistency(struct fscache_cookie *); extern void __fscache_update_cookie(struct fscache_cookie *); extern int __fscache_attr_changed(struct fscache_cookie *); extern void __fscache_invalidate(struct fscache_cookie *); @@ -325,6 +326,25 @@ void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) __fscache_relinquish_cookie(cookie, retire); } +/** + * fscache_check_consistency - Request that if the cache is updated + * @cookie: The cookie representing the cache object + * + * Request an consistency check from fscache, which passes the request + * to the backing cache. + * + * Returns 0 if consistent and -ESTALE if inconsistent. May also + * return -ENOMEM and -ERESTARTSYS. + */ +static inline +int fscache_check_consistency(struct fscache_cookie *cookie) +{ + if (fscache_cookie_valid(cookie)) + return __fscache_check_consistency(cookie); + else + return 0; +} + /** * fscache_update_cookie - Request that a cache object be updated * @cookie: The cookie representing the cache object -- cgit v1.2.3 From 5002d7bef81c9646bbb06fb57db4a100aa5a57c5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Aug 2013 17:29:21 -0400 Subject: CacheFiles: Implement interface to check cache consistency Implement the FS-Cache interface to check the consistency of a cache object in CacheFiles. Original-author: Hongyi Jia Signed-off-by: David Howells cc: Hongyi Jia cc: Milosz Tanski --- fs/cachefiles/interface.c | 26 ++++++++++++++++++++++++++ fs/cachefiles/internal.h | 1 + fs/cachefiles/xattr.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index d4c1206af9fc..43eb5592cdea 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -377,6 +377,31 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) ret); } +/* + * check if the backing cache is updated to FS-Cache + * - called by FS-Cache when evaluates if need to invalidate the cache + */ +static bool cachefiles_check_consistency(struct fscache_operation *op) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + int ret; + + _enter("{OBJ%x}", op->object->debug_id); + + object = container_of(op->object, struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + cachefiles_begin_secure(cache, &saved_cred); + ret = cachefiles_check_auxdata(object); + cachefiles_end_secure(cache, saved_cred); + + _leave(" = %d", ret); + return ret; +} + /* * notification the attributes on an object have changed * - called with reads/writes excluded by FS-Cache @@ -522,4 +547,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .write_page = cachefiles_write_page, .uncache_page = cachefiles_uncache_page, .dissociate_pages = cachefiles_dissociate_pages, + .check_consistency = cachefiles_check_consistency, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 49382519907a..5349473df1b1 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -235,6 +235,7 @@ extern int cachefiles_set_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); extern int cachefiles_update_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); +extern int cachefiles_check_auxdata(struct cachefiles_object *object); extern int cachefiles_check_object_xattr(struct cachefiles_object *object, struct cachefiles_xattr *auxdata); extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 2476e5162609..34c88b83e39f 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -156,6 +156,42 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, return ret; } +/* + * check the consistency between the backing cache and the FS-Cache cookie + */ +int cachefiles_check_auxdata(struct cachefiles_object *object) +{ + struct cachefiles_xattr *auxbuf; + struct dentry *dentry = object->dentry; + unsigned int dlen; + int ret; + + ASSERT(dentry); + ASSERT(dentry->d_inode); + ASSERT(object->fscache.cookie->def->check_aux); + + auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); + if (!auxbuf) + return -ENOMEM; + + auxbuf->len = vfs_getxattr(dentry, cachefiles_xattr_cache, + &auxbuf->type, 512 + 1); + if (auxbuf->len < 1) + return -ESTALE; + + if (auxbuf->type != object->fscache.cookie->def->type) + return -ESTALE; + + dlen = auxbuf->len - 1; + ret = fscache_check_aux(&object->fscache, &auxbuf->data, dlen); + + kfree(auxbuf); + if (ret != FSCACHE_CHECKAUX_OKAY) + return -ESTALE; + + return 0; +} + /* * check the state xattr on a cache file * - return -ESTALE if the object should be deleted -- cgit v1.2.3 From 5a6f282a2052bb13171b53f03b34501cf72c33f1 Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 21 Aug 2013 17:30:11 -0400 Subject: fscache: Netfs function for cleanup post readpages Currently the fscache code expect the netfs to call fscache_readpages_or_alloc inside the aops readpages callback. It marks all the pages in the list provided by readahead with PG_private_2. In the cases that the netfs fails to read all the pages (which is legal) it ends up returning to the readahead and triggering a BUG. This happens because the page list still contains marked pages. This patch implements a simple fscache_readpages_cancel function that the netfs should call before returning from readpages. It will revoke the pages from the underlying cache backend and unmark them. The problem was originally worked out in the Ceph devel tree, but it also occurs in CIFS. It appears that NFS, AFS and 9P are okay as read_cache_pages() will clean up the unprocessed pages in the case of an error. This can be used to address the following oops: [12410647.597278] BUG: Bad page state in process petabucket pfn:3d504e [12410647.597292] page:ffffea000f541380 count:0 mapcount:0 mapping: (null) index:0x0 [12410647.597298] page flags: 0x200000000001000(private_2) ... [12410647.597334] Call Trace: [12410647.597345] [] dump_stack+0x19/0x1b [12410647.597356] [] bad_page+0xc7/0x120 [12410647.597359] [] free_pages_prepare+0x10e/0x120 [12410647.597361] [] free_hot_cold_page+0x40/0x170 [12410647.597363] [] __put_single_page+0x27/0x30 [12410647.597365] [] put_page+0x25/0x40 [12410647.597376] [] ceph_readpages+0x2e9/0x6e0 [ceph] [12410647.597379] [] __do_page_cache_readahead+0x1af/0x260 [12410647.597382] [] ra_submit+0x21/0x30 [12410647.597384] [] filemap_fault+0x254/0x490 [12410647.597387] [] __do_fault+0x6f/0x4e0 [12410647.597391] [] ? __switch_to+0x16d/0x4a0 [12410647.597395] [] ? finish_task_switch+0x5a/0xc0 [12410647.597398] [] handle_pte_fault+0xf6/0x930 [12410647.597401] [] ? pte_mfn_to_pfn+0x93/0x110 [12410647.597403] [] ? xen_pmd_val+0xe/0x10 [12410647.597405] [] ? __raw_callee_save_xen_pmd_val+0x11/0x1e [12410647.597407] [] handle_mm_fault+0x251/0x370 [12410647.597411] [] ? call_rwsem_down_read_failed+0x14/0x30 [12410647.597414] [] __do_page_fault+0x1aa/0x550 [12410647.597418] [] ? up_write+0x1d/0x20 [12410647.597422] [] ? vm_mmap_pgoff+0xbc/0xe0 [12410647.597425] [] ? SyS_mmap_pgoff+0xd8/0x240 [12410647.597427] [] do_page_fault+0xe/0x10 [12410647.597431] [] page_fault+0x28/0x30 Signed-off-by: Milosz Tanski Signed-off-by: David Howells --- Documentation/filesystems/caching/netfs-api.txt | 18 +++++++++++++++++- fs/fscache/page.c | 16 ++++++++++++++++ include/linux/fscache.h | 22 ++++++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt index 26c1dd5a6a21..11a0a40ce445 100644 --- a/Documentation/filesystems/caching/netfs-api.txt +++ b/Documentation/filesystems/caching/netfs-api.txt @@ -499,7 +499,7 @@ Else if there's a copy of the page resident in the cache: (*) An argument that's 0 on success or negative for an error code. If an error occurs, it should be assumed that the page contains no usable - data. + data. fscache_readpages_cancel() may need to be called. end_io_func() will be called in process context if the read is results in an error, but it might be called in interrupt context if the read is @@ -623,6 +623,22 @@ some of the pages being read and some being allocated. Those pages will have been marked appropriately and will need uncaching. +CANCELLATION OF UNREAD PAGES +---------------------------- + +If one or more pages are passed to fscache_read_or_alloc_pages() but not then +read from the cache and also not read from the underlying filesystem then +those pages will need to have any marks and reservations removed. This can be +done by calling: + + void fscache_readpages_cancel(struct fscache_cookie *cookie, + struct list_head *pages); + +prior to returning to the caller. The cookie argument should be as passed to +fscache_read_or_alloc_pages(). Every page in the pages list will be examined +and any that have PG_fscache set will be uncached. + + ============== PAGE UNCACHING ============== diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 793e3d5ca4b5..8702b732109a 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -700,6 +700,22 @@ nobufs: } EXPORT_SYMBOL(__fscache_alloc_page); +/* + * Unmark pages allocate in the readahead code path (via: + * fscache_readpages_or_alloc) after delegating to the base filesystem + */ +void __fscache_readpages_cancel(struct fscache_cookie *cookie, + struct list_head *pages) +{ + struct page *page; + + list_for_each_entry(page, pages, lru) { + if (PageFsCache(page)) + __fscache_uncache_page(cookie, page); + } +} +EXPORT_SYMBOL(__fscache_readpages_cancel); + /* * release a write op reference */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index d984aff32a11..19b46458e4e8 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -209,6 +209,8 @@ extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *, gfp_t); extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *, struct inode *); +extern void __fscache_readpages_cancel(struct fscache_cookie *cookie, + struct list_head *pages); /** * fscache_register_netfs - Register a filesystem as desiring caching services @@ -589,6 +591,26 @@ int fscache_alloc_page(struct fscache_cookie *cookie, return -ENOBUFS; } +/** + * fscache_readpages_cancel - Cancel read/alloc on pages + * @cookie: The cookie representing the inode's cache object. + * @pages: The netfs pages that we canceled write on in readpages() + * + * Uncache/unreserve the pages reserved earlier in readpages() via + * fscache_readpages_or_alloc() and similar. In most successful caches in + * readpages() this doesn't do anything. In cases when the underlying netfs's + * readahead failed we need to clean up the pagelist (unmark and uncache). + * + * This function may sleep as it may have to clean up disk state. + */ +static inline +void fscache_readpages_cancel(struct fscache_cookie *cookie, + struct list_head *pages) +{ + if (fscache_cookie_valid(cookie)) + __fscache_readpages_cancel(cookie, pages); +} + /** * fscache_write_page - Request storage of a page in the cache * @cookie: The cookie representing the cache object -- cgit v1.2.3