diff options
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 187 |
1 files changed, 87 insertions, 100 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..1283fc825458 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -13,7 +13,6 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/uaccess.h> -#include <linux/aio.h> #include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> @@ -101,6 +100,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) + * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -175,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock and + * mem_cgroup_begin_page_stat(). */ -void __delete_from_page_cache(struct page *page, void *shadow) +void __delete_from_page_cache(struct page *page, void *shadow, + struct mem_cgroup *memcg) { struct address_space *mapping = page->mapping; @@ -197,22 +199,24 @@ void __delete_from_page_cache(struct page *page, void *shadow) page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - __dec_zone_page_state(page, NR_FILE_PAGES); + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(page)) + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); /* - * Some filesystems seem to re-dirty the page even after - * the VM has canceled the dirty bit (eg ext3 journaling). + * At this point page must be either written or cleaned by truncate. + * Dirty page here signals a bug and loss of unwritten data. * - * Fix it up by doing a final dirty accounting check after - * having removed the page entirely. + * This fixes dirty accounting after removing the page entirely but + * leaves PageDirty set: it has no effect for truncated page and + * anyway will be cleared before returning page into buddy allocator. */ - if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); - } + if (WARN_ON_ONCE(PageDirty(page))) + account_page_cleaned(page, mapping, memcg, + inode_to_wb(mapping->host)); } /** @@ -226,14 +230,20 @@ void __delete_from_page_cache(struct page *page, void *shadow) void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); freepage = mapping->a_ops->freepage; - spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page, NULL); - spin_unlock_irq(&mapping->tree_lock); + + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); + __delete_from_page_cache(page, NULL, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage) freepage(page); @@ -283,7 +293,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, if (!mapping_cap_writeback_dirty(mapping)) return 0; + wbc_attach_fdatawrite_inode(&wbc, mapping->host); ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); return ret; } @@ -472,6 +484,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); + struct mem_cgroup *memcg; + unsigned long flags; pgoff_t offset = old->index; freepage = mapping->a_ops->freepage; @@ -480,15 +494,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old, NULL); + memcg = mem_cgroup_begin_page_stat(old); + spin_lock_irqsave(&mapping->tree_lock, flags); + __delete_from_page_cache(old, NULL, memcg); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; - __inc_zone_page_state(new, NR_FILE_PAGES); + + /* + * hugetlb pages do not participate in page cache accounting. + */ + if (!PageHuge(new)) + __inc_zone_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) @@ -577,7 +598,10 @@ static int __add_to_page_cache_locked(struct page *page, radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - __inc_zone_page_state(page, NR_FILE_PAGES); + + /* hugetlb pages do not participate in page cache accounting. */ + if (!huge) + __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) mem_cgroup_commit_charge(page, memcg, false); @@ -1656,8 +1680,8 @@ no_cached_page: error = -ENOMEM; goto out; } - error = add_to_page_cache_lru(page, mapping, - index, GFP_KERNEL); + error = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL & mapping_gfp_mask(mapping)); if (error) { page_cache_release(page); if (error == -EEXIST) { @@ -1695,7 +1719,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; - if (io_is_direct(file)) { + if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); @@ -1708,7 +1732,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) pos + count - 1); if (!retval) { struct iov_iter data = *iter; - retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); + retval = mapping->a_ops->direct_IO(iocb, &data, pos); } if (retval > 0) { @@ -1758,7 +1782,8 @@ static int page_cache_read(struct file *file, pgoff_t offset) if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + ret = add_to_page_cache_lru(page, mapping, offset, + GFP_KERNEL & mapping_gfp_mask(mapping)); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2261,41 +2286,38 @@ EXPORT_SYMBOL(read_cache_page_gfp); * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ -inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { + struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; unsigned long limit = rlimit(RLIMIT_FSIZE); + loff_t pos; - if (unlikely(*pos < 0)) - return -EINVAL; + if (!iov_iter_count(from)) + return 0; - if (!isblk) { - /* FIXME: this is for backwards compatibility with 2.4 */ - if (file->f_flags & O_APPEND) - *pos = i_size_read(inode); + /* FIXME: this is for backwards compatibility with 2.4 */ + if (iocb->ki_flags & IOCB_APPEND) + iocb->ki_pos = i_size_read(inode); - if (limit != RLIM_INFINITY) { - if (*pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - if (*count > limit - (typeof(limit))*pos) { - *count = limit - (typeof(limit))*pos; - } + pos = iocb->ki_pos; + + if (limit != RLIM_INFINITY) { + if (iocb->ki_pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; } + iov_iter_truncate(from, limit - (unsigned long)pos); } /* * LFS rule */ - if (unlikely(*pos + *count > MAX_NON_LFS && + if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && !(file->f_flags & O_LARGEFILE))) { - if (*pos >= MAX_NON_LFS) { + if (pos >= MAX_NON_LFS) return -EFBIG; - } - if (*count > MAX_NON_LFS - (unsigned long)*pos) { - *count = MAX_NON_LFS - (unsigned long)*pos; - } + iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); } /* @@ -2305,34 +2327,11 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i * exceeded without writing data we send a signal and return EFBIG. * Linus frestrict idea will clean these up nicely.. */ - if (likely(!isblk)) { - if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { - if (*count || *pos > inode->i_sb->s_maxbytes) { - return -EFBIG; - } - /* zero-length writes at ->s_maxbytes are OK */ - } - - if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) - *count = inode->i_sb->s_maxbytes - *pos; - } else { -#ifdef CONFIG_BLOCK - loff_t isize; - if (bdev_read_only(I_BDEV(inode))) - return -EPERM; - isize = i_size_read(inode); - if (*pos >= isize) { - if (*count || *pos > isize) - return -ENOSPC; - } + if (unlikely(pos >= inode->i_sb->s_maxbytes)) + return -EFBIG; - if (*pos + *count > isize) - *count = isize - *pos; -#else - return -EPERM; -#endif - } - return 0; + iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); + return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); @@ -2396,7 +2395,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) } data = *from; - written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); + written = mapping->a_ops->direct_IO(iocb, &data, pos); /* * Finally, try again to invalidate clean pages which might have been @@ -2558,24 +2557,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos; ssize_t written = 0; ssize_t err; ssize_t status; - size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; - - if (count == 0) - goto out; - - iov_iter_truncate(from, count); - - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -2583,10 +2571,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; - if (io_is_direct(file)) { - loff_t endbyte; + if (iocb->ki_flags & IOCB_DIRECT) { + loff_t pos, endbyte; - written = generic_file_direct_write(iocb, from, pos); + written = generic_file_direct_write(iocb, from, iocb->ki_pos); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to @@ -2594,13 +2582,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ - if (written < 0 || written == count || IS_DAX(inode)) + if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) goto out; - pos += written; - count -= written; - - status = generic_perform_write(file, from, pos); + status = generic_perform_write(file, from, pos = iocb->ki_pos); /* * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were @@ -2612,15 +2597,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) err = status; goto out; } - iocb->ki_pos = pos + status; /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. */ endbyte = pos + status - 1; - err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + err = filemap_write_and_wait_range(mapping, pos, endbyte); if (err == 0) { + iocb->ki_pos = endbyte + 1; written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, @@ -2632,9 +2617,9 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) */ } } else { - written = generic_perform_write(file, from, pos); - if (likely(written >= 0)) - iocb->ki_pos = pos + written; + written = generic_perform_write(file, from, iocb->ki_pos); + if (likely(written > 0)) + iocb->ki_pos += written; } out: current->backing_dev_info = NULL; @@ -2658,7 +2643,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; mutex_lock(&inode->i_mutex); - ret = __generic_file_write_iter(iocb, from); + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (ret > 0) { |