diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-02-06 13:49:03 -0800 | 
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-02-06 13:49:03 -0800 | 
| commit | 9343224bfd4be6a02e6ae0c0d66426c955c7d76e (patch) | |
| tree | c5d2287ff3a8fdfc15186dd35b9c00896003114d | |
| parent | f2de3a159937bfb1ab1ca671e0f2d06cda286a24 (diff) | |
| parent | 227d53b397a32a7614667b3ecaf1d89902fb6c12 (diff) | |
| download | linux-9343224bfd4be6a02e6ae0c0d66426c955c7d76e.tar.bz2 | |
Merge branch 'akpm' (patches from Andrew Morton)
Merge a bunch of fixes from Andrew Morton:
 "Commit 579f82901f6f ("swap: add a simple detector for inappropriate
  swapin readahead") is a feature.  No probs if you decide to defer it
  until the next merge window.
  It has been sitting in my tree for over a year because of my dislike
  of all the magic numbers, but recent discussion with Hugh has made me
  give up"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: __set_page_dirty uses spin_lock_irqsave instead of spin_lock_irq
  arch/x86/mm/numa.c: fix array index overflow when synchronizing nid to memblock.reserved.
  arch/x86/mm/numa.c: initialize numa_kernel_nodes in numa_clear_kernel_node_hotplug()
  mm: __set_page_dirty_nobuffers() uses spin_lock_irqsave() instead of spin_lock_irq()
  mm/swap: fix race on swap_info reuse between swapoff and swapon
  swap: add a simple detector for inappropriate swapin readahead
  ocfs2: free allocated clusters if error occurs after ocfs2_claim_clusters
  Documentation/kernel-parameters.txt: fix memmap= language
| -rw-r--r-- | Documentation/kernel-parameters.txt | 8 | ||||
| -rw-r--r-- | arch/x86/mm/numa.c | 21 | ||||
| -rw-r--r-- | fs/buffer.c | 6 | ||||
| -rw-r--r-- | fs/ocfs2/alloc.c | 38 | ||||
| -rw-r--r-- | fs/ocfs2/localalloc.c | 42 | ||||
| -rw-r--r-- | fs/ocfs2/localalloc.h | 6 | ||||
| -rw-r--r-- | include/linux/page-flags.h | 4 | ||||
| -rw-r--r-- | mm/page-writeback.c | 5 | ||||
| -rw-r--r-- | mm/swap_state.c | 63 | ||||
| -rw-r--r-- | mm/swapfile.c | 11 | 
10 files changed, 178 insertions, 26 deletions
| diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8f441dab0396..7116fda7077f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1726,16 +1726,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted.  			option description.  	memmap=nn[KMG]@ss[KMG] -			[KNL] Force usage of a specific region of memory -			Region of memory to be used, from ss to ss+nn. +			[KNL] Force usage of a specific region of memory. +			Region of memory to be used is from ss to ss+nn.  	memmap=nn[KMG]#ss[KMG]  			[KNL,ACPI] Mark specific memory as ACPI data. -			Region of memory to be used, from ss to ss+nn. +			Region of memory to be marked is from ss to ss+nn.  	memmap=nn[KMG]$ss[KMG]  			[KNL,ACPI] Mark specific memory as reserved. -			Region of memory to be used, from ss to ss+nn. +			Region of memory to be reserved is from ss to ss+nn.  			Example: Exclude memory from 0x18690000-0x1869ffff  			         memmap=64K$0x18690000  			         or diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 81b2750f3666..27aa0455fab3 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)  		struct numa_memblk *mb = &mi->blk[i];  		memblock_set_node(mb->start, mb->end - mb->start,  				  &memblock.memory, mb->nid); - -		/* -		 * At this time, all memory regions reserved by memblock are -		 * used by the kernel. Set the nid in memblock.reserved will -		 * mark out all the nodes the kernel resides in. 
-		 */ -		memblock_set_node(mb->start, mb->end - mb->start, -				  &memblock.reserved, mb->nid);  	}  	/* @@ -565,10 +557,21 @@ static void __init numa_init_array(void)  static void __init numa_clear_kernel_node_hotplug(void)  {  	int i, nid; -	nodemask_t numa_kernel_nodes; +	nodemask_t numa_kernel_nodes = NODE_MASK_NONE;  	unsigned long start, end;  	struct memblock_type *type = &memblock.reserved; +	/* +	 * At this time, all memory regions reserved by memblock are +	 * used by the kernel. Set the nid in memblock.reserved will +	 * mark out all the nodes the kernel resides in. +	 */ +	for (i = 0; i < numa_meminfo.nr_blks; i++) { +		struct numa_memblk *mb = &numa_meminfo.blk[i]; +		memblock_set_node(mb->start, mb->end - mb->start, +				  &memblock.reserved, mb->nid); +	} +  	/* Mark all kernel nodes. */  	for (i = 0; i < type->cnt; i++)  		node_set(type->regions[i].nid, numa_kernel_nodes); diff --git a/fs/buffer.c b/fs/buffer.c index 651dba10b9c2..27265a8b43c1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);  static void __set_page_dirty(struct page *page,  		struct address_space *mapping, int warn)  { -	spin_lock_irq(&mapping->tree_lock); +	unsigned long flags; + +	spin_lock_irqsave(&mapping->tree_lock, flags);  	if (page->mapping) {	/* Race with truncate? 
*/  		WARN_ON_ONCE(warn && !PageUptodate(page));  		account_page_dirtied(page, mapping);  		radix_tree_tag_set(&mapping->page_tree,  				page_index(page), PAGECACHE_TAG_DIRTY);  	} -	spin_unlock_irq(&mapping->tree_lock); +	spin_unlock_irqrestore(&mapping->tree_lock, flags);  	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);  } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 8750ae1b8636..aada5801567a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,  				enum ocfs2_alloc_restarted *reason_ret)  {  	int status = 0, err = 0; +	int need_free = 0;  	int free_extents;  	enum ocfs2_alloc_restarted reason = RESTART_NONE;  	u32 bit_off, num_bits; @@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,  					      OCFS2_JOURNAL_ACCESS_WRITE);  	if (status < 0) {  		mlog_errno(status); -		goto leave; +		need_free = 1; +		goto bail;  	}  	block = ocfs2_clusters_to_blocks(osb->sb, bit_off); @@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,  				     num_bits, flags, meta_ac);  	if (status < 0) {  		mlog_errno(status); -		goto leave; +		need_free = 1; +		goto bail;  	}  	ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,  		reason = RESTART_TRANS;  	} +bail: +	if (need_free) { +		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) +			ocfs2_free_local_alloc_bits(osb, handle, data_ac, +					bit_off, num_bits); +		else +			ocfs2_free_clusters(handle, +					data_ac->ac_inode, +					data_ac->ac_bh, +					ocfs2_clusters_to_blocks(osb->sb, bit_off), +					num_bits); +	} +  leave:  	if (reason_ret)  		*reason_ret = reason; @@ -6805,6 +6821,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  					 struct buffer_head *di_bh)  {  	int ret, i, has_data, num_pages = 0; +	int need_free = 0; +	u32 bit_off, num;  	handle_t *handle;  	u64 uninitialized_var(block);  	struct ocfs2_inode_info *oi = 
OCFS2_I(inode); @@ -6850,7 +6868,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  	}  	if (has_data) { -		u32 bit_off, num;  		unsigned int page_end;  		u64 phys; @@ -6886,6 +6903,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);  		if (ret) {  			mlog_errno(ret); +			need_free = 1;  			goto out_commit;  		} @@ -6896,6 +6914,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);  		if (ret) {  			mlog_errno(ret); +			need_free = 1;  			goto out_commit;  		} @@ -6927,6 +6946,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);  		if (ret) {  			mlog_errno(ret); +			need_free = 1;  			goto out_commit;  		} @@ -6938,6 +6958,18 @@ out_commit:  		dquot_free_space_nodirty(inode,  					  ocfs2_clusters_to_bytes(osb->sb, 1)); +	if (need_free) { +		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) +			ocfs2_free_local_alloc_bits(osb, handle, data_ac, +					bit_off, num); +		else +			ocfs2_free_clusters(handle, +					data_ac->ac_inode, +					data_ac->ac_bh, +					ocfs2_clusters_to_blocks(osb->sb, bit_off), +					num); +	} +  	ocfs2_commit_trans(osb, handle);  out_unlock: diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index cd5496b7a0a3..044013455621 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -781,6 +781,48 @@ bail:  	return status;  } +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, +				handle_t *handle, +				struct ocfs2_alloc_context *ac, +				u32 bit_off, +				u32 num_bits) +{ +	int status, start; +	u32 clear_bits; +	struct inode *local_alloc_inode; +	void *bitmap; +	struct ocfs2_dinode *alloc; +	struct ocfs2_local_alloc *la; + +	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + +	local_alloc_inode = ac->ac_inode; +	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; +	la = 
OCFS2_LOCAL_ALLOC(alloc); + +	bitmap = la->la_bitmap; +	start = bit_off - le32_to_cpu(la->la_bm_off); +	clear_bits = num_bits; + +	status = ocfs2_journal_access_di(handle, +			INODE_CACHE(local_alloc_inode), +			osb->local_alloc_bh, +			OCFS2_JOURNAL_ACCESS_WRITE); +	if (status < 0) { +		mlog_errno(status); +		goto bail; +	} + +	while (clear_bits--) +		ocfs2_clear_bit(start++, bitmap); + +	le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits); +	ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: +	return status; +} +  static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)  {  	u32 count; diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 1be9b5864460..44a7d1fb2dec 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,  				 u32 *bit_off,  				 u32 *num_bits); +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, +				handle_t *handle, +				struct ocfs2_alloc_context *ac, +				u32 bit_off, +				u32 num_bits); +  void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,  				      unsigned int num_clusters);  void ocfs2_la_enable_worker(struct work_struct *work); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e464b4e987e8..d1fe1a761047 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -228,9 +228,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)  TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)  PAGEFLAG(MappedToDisk, mappedtodisk) -/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ +/* PG_readahead is only used for reads; PG_reclaim is only for writes */  PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim)		/* Reminder to do async read-ahead */ +PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)  #ifdef CONFIG_HIGHMEM  /* diff --git a/mm/page-writeback.c 
b/mm/page-writeback.c index 2d30e2cfe804..7106cb1aca8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2173,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page)  	if (!TestSetPageDirty(page)) {  		struct address_space *mapping = page_mapping(page);  		struct address_space *mapping2; +		unsigned long flags;  		if (!mapping)  			return 1; -		spin_lock_irq(&mapping->tree_lock); +		spin_lock_irqsave(&mapping->tree_lock, flags);  		mapping2 = page_mapping(page);  		if (mapping2) { /* Race with truncate? */  			BUG_ON(mapping2 != mapping); @@ -2186,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page)  			radix_tree_tag_set(&mapping->page_tree,  				page_index(page), PAGECACHE_TAG_DIRTY);  		} -		spin_unlock_irq(&mapping->tree_lock); +		spin_unlock_irqrestore(&mapping->tree_lock, flags);  		if (mapping->host) {  			/* !PageAnon && !swapper_space */  			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/mm/swap_state.c b/mm/swap_state.c index 98e85e9c2b2d..e76ace30d436 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)  	return ret;  } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); +  void show_swap_cache_info(void)  {  	printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)  	page = find_get_page(swap_address_space(entry), entry.val); -	if (page) +	if (page) {  		INC_CACHE_INFO(find_success); +		if (TestClearPageReadahead(page)) +			atomic_inc(&swapin_readahead_hits); +	}  	INC_CACHE_INFO(find_total);  	return page; @@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,  	return found_page;  } +static unsigned long swapin_nr_pages(unsigned long offset) +{ +	static unsigned long prev_offset; +	unsigned int pages, max_pages, last_ra; +	static atomic_t last_readahead_pages; + +	max_pages = 1 << ACCESS_ONCE(page_cluster); +	if (max_pages <= 1) +		
return 1; + +	/* +	 * This heuristic has been found to work well on both sequential and +	 * random loads, swapping to hard disk or to SSD: please don't ask +	 * what the "+ 2" means, it just happens to work well, that's all. +	 */ +	pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; +	if (pages == 2) { +		/* +		 * We can have no readahead hits to judge by: but must not get +		 * stuck here forever, so check for an adjacent offset instead +		 * (and don't even bother to check whether swap type is same). +		 */ +		if (offset != prev_offset + 1 && offset != prev_offset - 1) +			pages = 1; +		prev_offset = offset; +	} else { +		unsigned int roundup = 4; +		while (roundup < pages) +			roundup <<= 1; +		pages = roundup; +	} + +	if (pages > max_pages) +		pages = max_pages; + +	/* Don't shrink readahead too fast */ +	last_ra = atomic_read(&last_readahead_pages) / 2; +	if (pages < last_ra) +		pages = last_ra; +	atomic_set(&last_readahead_pages, pages); + +	return pages; +} +  /**   * swapin_readahead - swap in pages in hope we need them soon   * @entry: swap entry of this memory @@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,  			struct vm_area_struct *vma, unsigned long addr)  {  	struct page *page; -	unsigned long offset = swp_offset(entry); +	unsigned long entry_offset = swp_offset(entry); +	unsigned long offset = entry_offset;  	unsigned long start_offset, end_offset; -	unsigned long mask = (1UL << page_cluster) - 1; +	unsigned long mask;  	struct blk_plug plug; +	mask = swapin_nr_pages(offset) - 1; +	if (!mask) +		goto skip; +  	/* Read a page_cluster sized and aligned cluster around offset. 
*/  	start_offset = offset & ~mask;  	end_offset = offset | mask; @@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,  						gfp_mask, vma, addr);  		if (!page)  			continue; +		if (offset != entry_offset) +			SetPageReadahead(page);  		page_cache_release(page);  	}  	blk_finish_plug(&plug);  	lru_add_drain();	/* Push any new pages onto the LRU now */ +skip:  	return read_swap_cache_async(entry, gfp_mask, vma, addr);  } diff --git a/mm/swapfile.c b/mm/swapfile.c index c6c13b050a58..4a7f7e6992b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1923,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)  	p->swap_map = NULL;  	cluster_info = p->cluster_info;  	p->cluster_info = NULL; -	p->flags = 0;  	frontswap_map = frontswap_map_get(p);  	spin_unlock(&p->lock);  	spin_unlock(&swap_lock); @@ -1949,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)  		mutex_unlock(&inode->i_mutex);  	}  	filp_close(swap_file, NULL); + +	/* +	 * Clear the SWP_USED flag after all resources are freed so that swapon +	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to +	 * not hold p->lock after we cleared its SWP_WRITEOK. +	 */ +	spin_lock(&swap_lock); +	p->flags = 0; +	spin_unlock(&swap_lock); +  	err = 0;  	atomic_inc(&proc_poll_event);  	wake_up_interruptible(&proc_poll_wait); |