diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-12 15:01:38 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-12 15:01:38 -0700 |
commit | 26935fb06ee88f1188789807687c03041f3c70d9 (patch) | |
tree | 381c487716540b52348d78bee6555f8fa61d77ef /include | |
parent | 3cc69b638e11bfda5d013c2b75b60934aa0e88a1 (diff) | |
parent | bf2ba3bc185269eca274b458aac46ba1ad7c1121 (diff) | |
download | linux-26935fb06ee88f1188789807687c03041f3c70d9.tar.bz2 |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs pile 4 from Al Viro:
"list_lru pile, mostly"
This came out of Andrew's pile, Al ended up doing the merge work so that
Andrew didn't have to.
Additionally, a few fixes.
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (42 commits)
super: fix for destroy lrus
list_lru: dynamically adjust node arrays
shrinker: Kill old ->shrink API.
shrinker: convert remaining shrinkers to count/scan API
staging/lustre/libcfs: cleanup linux-mem.h
staging/lustre/ptlrpc: convert to new shrinker API
staging/lustre/obdclass: convert lu_object shrinker to count/scan API
staging/lustre/ldlm: convert to shrinkers to count/scan API
hugepage: convert huge zero page shrinker to new shrinker API
i915: bail out earlier when shrinker cannot acquire mutex
drivers: convert shrinkers to new count/scan API
fs: convert fs shrinkers to new scan/count API
xfs: fix dquot isolation hang
xfs-convert-dquot-cache-lru-to-list_lru-fix
xfs: convert dquot cache lru to list_lru
xfs: rework buffer dispose list tracking
xfs-convert-buftarg-lru-to-generic-code-fix
xfs: convert buftarg LRU to generic code
fs: convert inode and dentry shrinking to be node aware
vmscan: per-node deferred work
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/dcache.h | 14 | ||||
-rw-r--r-- | include/linux/fs.h | 26 | ||||
-rw-r--r-- | include/linux/list_lru.h | 131 | ||||
-rw-r--r-- | include/linux/shrinker.h | 54 | ||||
-rw-r--r-- | include/trace/events/vmscan.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/fs.h | 6 |
6 files changed, 196 insertions, 39 deletions
diff --git a/include/linux/dcache.h b/include/linux/dcache.h index feaa8d88eef7..59066e0b4ff1 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -55,11 +55,11 @@ struct qstr { #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) struct dentry_stat_t { - int nr_dentry; - int nr_unused; - int age_limit; /* age in seconds */ - int want_pages; /* pages requested by system */ - int dummy[2]; + long nr_dentry; + long nr_unused; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long dummy[2]; }; extern struct dentry_stat_t dentry_stat; @@ -395,4 +395,8 @@ static inline bool d_mountpoint(const struct dentry *dentry) extern int sysctl_vfs_cache_pressure; +static inline unsigned long vfs_pressure_ratio(unsigned long val) +{ + return mult_frac(val, sysctl_vfs_cache_pressure, 100); +} #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 529d8711baba..a4acd3c61190 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -10,6 +10,7 @@ #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> +#include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/rbtree.h> @@ -1269,15 +1270,6 @@ struct super_block { struct list_head s_files; #endif struct list_head s_mounts; /* list of mounts; _not_ for fs use */ - /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ - struct list_head s_dentry_lru; /* unused dentry lru */ - int s_nr_dentry_unused; /* # of dentry on lru */ - - /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ - spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp; - struct list_head s_inode_lru; /* unused inode lru */ - int s_nr_inodes_unused; /* # of inodes on lru */ - struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; @@ -1331,11 +1323,14 @@ struct super_block { /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; -}; -/* superblock cache pruning functions */ -extern void prune_icache_sb(struct super_block *sb, int nr_to_scan); -extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan); + /* + * Keep the lru lists last in the structure so they always sit on their + * own individual cachelines. + */ + struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; + struct list_lru s_inode_lru ____cacheline_aligned_in_smp; +}; extern struct timespec current_fs_time(struct super_block *sb); @@ -1629,8 +1624,8 @@ struct super_operations { ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); - int (*nr_cached_objects)(struct super_block *); - void (*free_cached_objects)(struct super_block *, int); + long (*nr_cached_objects)(struct super_block *, int); + long (*free_cached_objects)(struct super_block *, long, int); }; /* @@ -2494,7 +2489,6 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int vfs_readlink(struct dentry *, char __user *, int, const char *); -extern int vfs_follow_link(struct nameidata *, const char *); extern int page_readlink(struct dentry *, char __user *, int); extern void *page_follow_link_light(struct dentry *, struct nameidata *); extern void page_put_link(struct dentry *, struct nameidata *, void *); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h new file mode 100644 index 000000000000..3ce541753c88 --- /dev/null +++ b/include/linux/list_lru.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. + * Authors: David Chinner and Glauber Costa + * + * Generic LRU infrastructure + */ +#ifndef _LRU_LIST_H +#define _LRU_LIST_H + +#include <linux/list.h> +#include <linux/nodemask.h> + +/* list_lru_walk_cb has to always return one of those */ +enum lru_status { + LRU_REMOVED, /* item removed from list */ + LRU_ROTATE, /* item referenced, give another pass */ + LRU_SKIP, /* item cannot be locked, skip */ + LRU_RETRY, /* item not freeable. May drop the lock + internally, but has to return locked. */ +}; + +struct list_lru_node { + spinlock_t lock; + struct list_head list; + /* kept as signed so we can catch imbalance bugs */ + long nr_items; +} ____cacheline_aligned_in_smp; + +struct list_lru { + struct list_lru_node *node; + nodemask_t active_nodes; +}; + +void list_lru_destroy(struct list_lru *lru); +int list_lru_init(struct list_lru *lru); + +/** + * list_lru_add: add an element to the lru list's tail + * @list_lru: the lru pointer + * @item: the item to be added. + * + * If the element is already part of a list, this function returns doing + * nothing. Therefore the caller does not need to keep state about whether or + * not the element already belongs in the list and is allowed to lazy update + * it. Note however that this is valid for *a* list, not *this* list. If + * the caller organize itself in a way that elements can be in more than + * one type of list, it is up to the caller to fully remove the item from + * the previous list (with list_lru_del() for instance) before moving it + * to @list_lru + * + * Return value: true if the list was updated, false otherwise + */ +bool list_lru_add(struct list_lru *lru, struct list_head *item); + +/** + * list_lru_del: delete an element to the lru list + * @list_lru: the lru pointer + * @item: the item to be deleted. + * + * This function works analogously as list_lru_add in terms of list + * manipulation. The comments about an element already pertaining to + * a list are also valid for list_lru_del. + * + * Return value: true if the list was updated, false otherwise + */ +bool list_lru_del(struct list_lru *lru, struct list_head *item); + +/** + * list_lru_count_node: return the number of objects currently held by @lru + * @lru: the lru pointer. + * @nid: the node id to count from. + * + * Always return a non-negative number, 0 for empty lists. There is no + * guarantee that the list is not updated while the count is being computed. + * Callers that want such a guarantee need to provide an outer lock. + */ +unsigned long list_lru_count_node(struct list_lru *lru, int nid); +static inline unsigned long list_lru_count(struct list_lru *lru) +{ + long count = 0; + int nid; + + for_each_node_mask(nid, lru->active_nodes) + count += list_lru_count_node(lru, nid); + + return count; +} + +typedef enum lru_status +(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg); +/** + * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items. + * @lru: the lru pointer. + * @nid: the node id to scan from. + * @isolate: callback function that is resposible for deciding what to do with + * the item currently being scanned + * @cb_arg: opaque type that will be passed to @isolate + * @nr_to_walk: how many items to scan. + * + * This function will scan all elements in a particular list_lru, calling the + * @isolate callback for each of those items, along with the current list + * spinlock and a caller-provided opaque. The @isolate callback can choose to + * drop the lock internally, but *must* return with the lock held. The callback + * will return an enum lru_status telling the list_lru infrastructure what to + * do with the object being scanned. + * + * Please note that nr_to_walk does not mean how many objects will be freed, + * just how many objects will be scanned. + * + * Return value: the number of objects effectively removed from the LRU. + */ +unsigned long list_lru_walk_node(struct list_lru *lru, int nid, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk); + +static inline unsigned long +list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, + void *cb_arg, unsigned long nr_to_walk) +{ + long isolated = 0; + int nid; + + for_each_node_mask(nid, lru->active_nodes) { + isolated += list_lru_walk_node(lru, nid, isolate, + cb_arg, &nr_to_walk); + if (nr_to_walk <= 0) + break; + } + return isolated; +} +#endif /* _LRU_LIST_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index ac6b8ee07825..68c097077ef0 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -4,39 +4,67 @@ /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extention later. + * + * The 'gfpmask' refers to the allocation we are currently trying to + * fulfil. */ struct shrink_control { gfp_t gfp_mask; - /* How many slab objects shrinker() should scan and try to reclaim */ + /* + * How many objects scan_objects should scan and try to reclaim. + * This is reset before every call, so it is safe for callees + * to modify. + */ unsigned long nr_to_scan; + + /* shrink from these nodes */ + nodemask_t nodes_to_scan; + /* current node being shrunk (for NUMA aware shrinkers) */ + int nid; }; +#define SHRINK_STOP (~0UL) /* * A callback you can register to apply pressure to ageable caches. * - * 'sc' is passed shrink_control which includes a count 'nr_to_scan' - * and a 'gfpmask'. It should look through the least-recently-used - * 'nr_to_scan' entries and attempt to free them up. It should return - * the number of objects which remain in the cache. If it returns -1, it means - * it cannot do any scanning at this time (eg. there is a risk of deadlock). + * @count_objects should return the number of freeable items in the cache. If + * there are no objects to free or the number of freeable items cannot be + * determined, it should return 0. No deadlock checks should be done during the + * count callback - the shrinker relies on aggregating scan counts that couldn't + * be executed due to potential deadlocks to be run at a later call when the + * deadlock condition is no longer pending. * - * The 'gfpmask' refers to the allocation we are currently trying to - * fulfil. + * @scan_objects will only be called if @count_objects returned a non-zero + * value for the number of freeable objects. The callout should scan the cache + * and attempt to free items from the cache. It should then return the number + * of objects freed during the scan, or SHRINK_STOP if progress cannot be made + * due to potential deadlocks. If SHRINK_STOP is returned, then no further + * attempts to call the @scan_objects will be made from the current reclaim + * context. * - * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is - * querying the cache size, so a fastpath for that case is appropriate. + * @flags determine the shrinker abilities, like numa awareness */ struct shrinker { - int (*shrink)(struct shrinker *, struct shrink_control *sc); + unsigned long (*count_objects)(struct shrinker *, + struct shrink_control *sc); + unsigned long (*scan_objects)(struct shrinker *, + struct shrink_control *sc); + int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ + unsigned long flags; /* These are for internal use */ struct list_head list; - atomic_long_t nr_in_batch; /* objs pending delete */ + /* objs pending delete, per node */ + atomic_long_t *nr_deferred; }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ -extern void register_shrinker(struct shrinker *); + +/* Flags */ +#define SHRINKER_NUMA_AWARE (1 << 0) + +extern int register_shrinker(struct shrinker *); extern void unregister_shrinker(struct shrinker *); #endif diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 63cfcccaebb3..132a985aba8b 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -202,7 +202,7 @@ TRACE_EVENT(mm_shrink_slab_start, TP_fast_assign( __entry->shr = shr; - __entry->shrink = shr->shrink; + __entry->shrink = shr->scan_objects; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = sc->gfp_mask; __entry->pgs_scanned = pgs_scanned; @@ -241,7 +241,7 @@ TRACE_EVENT(mm_shrink_slab_end, TP_fast_assign( __entry->shr = shr; - __entry->shrink = shr->shrink; + __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; __entry->retval = shrinker_retval; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index a4ed56cf0eac..6c28b61bb690 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -49,9 +49,9 @@ struct files_stat_struct { }; struct inodes_stat_t { - int nr_inodes; - int nr_unused; - int dummy[5]; /* padding for sysctl ABI compatibility */ + long nr_inodes; + long nr_unused; + long dummy[5]; /* padding for sysctl ABI compatibility */ }; |