diff options
-rw-r--r-- | Documentation/admin-guide/ext4.rst | 574 | ||||
-rw-r--r-- | Documentation/admin-guide/index.rst | 1 | ||||
-rw-r--r-- | Documentation/conf.py | 4 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/about.rst (renamed from Documentation/filesystems/ext4/ondisk/about.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/allocators.rst (renamed from Documentation/filesystems/ext4/ondisk/allocators.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/attributes.rst (renamed from Documentation/filesystems/ext4/ondisk/attributes.rst) | 8 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/bigalloc.rst (renamed from Documentation/filesystems/ext4/ondisk/bigalloc.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/bitmaps.rst (renamed from Documentation/filesystems/ext4/ondisk/bitmaps.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/blockgroup.rst (renamed from Documentation/filesystems/ext4/ondisk/blockgroup.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/blockmap.rst (renamed from Documentation/filesystems/ext4/ondisk/blockmap.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/blocks.rst (renamed from Documentation/filesystems/ext4/ondisk/blocks.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/checksums.rst (renamed from Documentation/filesystems/ext4/ondisk/checksums.rst) | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/directory.rst (renamed from Documentation/filesystems/ext4/ondisk/directory.rst) | 18 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/dynamic.rst (renamed from Documentation/filesystems/ext4/ondisk/dynamic.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/eainode.rst (renamed from Documentation/filesystems/ext4/ondisk/eainode.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/ext4.rst | 613 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/globals.rst (renamed from Documentation/filesystems/ext4/ondisk/globals.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/group_descr.rst (renamed from Documentation/filesystems/ext4/ondisk/group_descr.rst) | 4 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/ifork.rst (renamed from Documentation/filesystems/ext4/ondisk/ifork.rst) | 8 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/index.rst | 19 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/inlinedata.rst (renamed from Documentation/filesystems/ext4/ondisk/inlinedata.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/inodes.rst) | 19 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/journal.rst (renamed from Documentation/filesystems/ext4/ondisk/journal.rst) | 32 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/mmp.rst (renamed from Documentation/filesystems/ext4/ondisk/mmp.rst) | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/ondisk/index.rst | 9 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/overview.rst (renamed from Documentation/filesystems/ext4/ondisk/overview.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/special_inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/special_inodes.rst) | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/super.rst (renamed from Documentation/filesystems/ext4/ondisk/super.rst) | 24 | ||||
-rw-r--r-- | fs/ext4/acl.c | 4 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 17 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 13 | ||||
-rw-r--r-- | fs/ext4/extents.c | 595 | ||||
-rw-r--r-- | fs/ext4/extents_status.c | 654 | ||||
-rw-r--r-- | fs/ext4/extents_status.h | 80 | ||||
-rw-r--r-- | fs/ext4/inline.c | 2 | ||||
-rw-r--r-- | fs/ext4/inode.c | 142 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 97 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 14 | ||||
-rw-r--r-- | fs/ext4/move_extent.c | 8 | ||||
-rw-r--r-- | fs/ext4/namei.c | 2 | ||||
-rw-r--r-- | fs/ext4/super.c | 81 | ||||
-rw-r--r-- | fs/jbd2/checkpoint.c | 4 | ||||
-rw-r--r-- | include/linux/buffer_head.h | 2 | ||||
-rw-r--r-- | include/trace/events/ext4.h | 99 |
44 files changed, 1984 insertions, 1169 deletions
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst new file mode 100644 index 000000000000..e506d3dae510 --- /dev/null +++ b/Documentation/admin-guide/ext4.rst @@ -0,0 +1,574 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +ext4 General Information +======================== + +Ext4 is an advanced level of the ext3 filesystem which incorporates +scalability and reliability enhancements for supporting large filesystems +(64 bit) in keeping with increasing disk capacities and state-of-the-art +feature requirements. + +Mailing list: linux-ext4@vger.kernel.org +Web site: http://ext4.wiki.kernel.org + + +Quick usage instructions +======================== + +Note: More extensive information for getting started with ext4 can be +found at the ext4 wiki site at the URL: +http://ext4.wiki.kernel.org/index.php/Ext4_Howto + + - The latest version of e2fsprogs can be found at: + + https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ + + or + + http://sourceforge.net/project/showfiles.php?group_id=2406 + + or grab the latest git repository from: + + https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git + + - Create a new filesystem using the ext4 filesystem type: + + # mke2fs -t ext4 /dev/hda1 + + Or to configure an existing ext3 filesystem to support extents: + + # tune2fs -O extents /dev/hda1 + + If the filesystem was created with 128 byte inodes, it can be + converted to use 256 byte for greater efficiency via: + + # tune2fs -I 256 /dev/hda1 + + - Mounting: + + # mount -t ext4 /dev/hda1 /wherever + + - When comparing performance with other filesystems, it's always + important to try multiple workloads; very often a subtle change in a + workload parameter can completely change the ranking of which + filesystems do well compared to others. When comparing versus ext3, + note that ext4 enables write barriers by default, while ext3 does + not enable write barriers by default. So it is useful to use + explicitly specify whether barriers are enabled or not when via the + '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems + for a fair comparison. When tuning ext3 for best benchmark numbers, + it is often worthwhile to try changing the data journaling mode; '-o + data=writeback' can be faster for some workloads. (Note however that + running mounted with data=writeback can potentially leave stale data + exposed in recently written files in case of an unclean shutdown, + which could be a security exposure in some situations.) Configuring + the filesystem with a large journal can also be helpful for + metadata-intensive workloads. + +Features +======== + +Currently Available +------------------- + +* ability to use filesystems > 16TB (e2fsprogs support not available yet) +* extent format reduces metadata overhead (RAM, IO for access, transactions) +* extent format more robust in face of on-disk corruption due to magics, +* internal redundancy in tree +* improved file allocation (multi-block alloc) +* lift 32000 subdirectory limit imposed by i_links_count[1] +* nsec timestamps for mtime, atime, ctime, create time +* inode version field on disk (NFSv4, Lustre) +* reduced e2fsck time via uninit_bg feature +* journal checksumming for robustness, performance +* persistent file preallocation (e.g for streaming media, databases) +* ability to pack bitmaps and inode tables into larger virtual groups via the + flex_bg feature +* large file support +* inode allocation using large virtual block groups via flex_bg +* delayed allocation +* large block (up to pagesize) support +* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force + the ordering) + +[1] Filesystems with a block size of 1k may see a limit imposed by the +directory hash tree having a maximum depth of two. + +Options +======= + +When mounting an ext4 filesystem, the following option are accepted: +(*) == default + + ro + Mount filesystem read only. Note that ext4 will replay the journal (and + thus write to the partition) even when mounted "read only". The mount + options "ro,noload" can be used to prevent writes to the filesystem. + + journal_checksum + Enable checksumming of the journal transactions. This will allow the + recovery code in e2fsck and the kernel to detect corruption in the + kernel. It is a compatible change and will be ignored by older + kernels. + + journal_async_commit + Commit block can be written to disk without waiting for descriptor + blocks. If enabled older kernels cannot mount the device. This will + enable 'journal_checksum' internally. + + journal_path=path, journal_dev=devnum + When the external journal device's major/minor numbers have changed, + these options allow the user to specify the new journal location. The + journal device is identified through either its new major/minor numbers + encoded in devnum, or via a path to the device. + + norecovery, noload + Don't load the journal on mounting. Note that if the filesystem was + not unmounted cleanly, skipping the journal replay will lead to the + filesystem containing inconsistencies that can lead to any number of + problems. + + data=journal + All data are committed into the journal prior to being written into the + main file system. Enabling this mode will disable delayed allocation + and O_DIRECT support. + + data=ordered (*) + All data are forced directly out to the main file system prior to its + metadata being committed to the journal. + + data=writeback + Data ordering is not preserved, data may be written into the main file + system after its metadata has been committed to the journal. + + commit=nrsec (*) + Ext4 can be told to sync all its data and metadata every 'nrsec' + seconds. The default value is 5 seconds. This means that if you lose + your power, you will lose as much as the latest 5 seconds of work (your + filesystem will not be damaged though, thanks to the journaling). This + default value (or any low value) will hurt performance, but it's good + for data-safety. Setting it to 0 will have the same effect as leaving + it at the default (5 seconds). Setting it to very large values will + improve performance. + + barrier=<0|1(*)>, barrier(*), nobarrier + This enables/disables the use of write barriers in the jbd code. + barrier=0 disables, barrier=1 enables. This also requires an IO stack + which can support barriers, and if jbd gets an error on a barrier + write, it will disable again with a warning. Write barriers enforce + proper on-disk ordering of journal commits, making volatile disk write + caches safe to use, at some performance penalty. If your disks are + battery-backed in one way or another, disabling barriers may safely + improve performance. The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for consistency with other + ext4 mount options. + + inode_readahead_blks=n + This tuning parameter controls the maximum number of inode table blocks + that ext4's inode table readahead algorithm will pre-read into the + buffer cache. The default value is 32 blocks. + + nouser_xattr + Disables Extended User Attributes. See the attr(5) manual page for + more information about extended attributes. + + noacl + This option disables POSIX Access Control List support. If ACL support + is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL + is enabled by default on mount. See the acl(5) manual page for more + information about acl. + + bsddf (*) + Make 'df' act like BSD. + + minixdf + Make 'df' act like Minix. + + debug + Extra debugging information is sent to syslog. + + abort + Simulate the effects of calling ext4_abort() for debugging purposes. + This is normally used while remounting a filesystem which is already + mounted. + + errors=remount-ro + Remount the filesystem read-only on an error. + + errors=continue + Keep going on a filesystem error. + + errors=panic + Panic and halt the machine if an error occurs. (These mount options + override the errors behavior specified in the superblock, which can be + configured using tune2fs) + + data_err=ignore(*) + Just print an error message if an error occurs in a file data buffer in + ordered mode. + data_err=abort + Abort the journal if an error occurs in a file data buffer in ordered + mode. + + grpid | bsdgroups + New objects have the group ID of their parent. + + nogrpid (*) | sysvgroups + New objects have the group ID of their creator. + + resgid=n + The group ID which may use the reserved blocks. + + resuid=n + The user ID which may use the reserved blocks. + + sb= + Use alternate superblock at this location. + + quota, noquota, grpquota, usrquota + These options are ignored by the filesystem. They are used only by + quota tools to recognize volumes where quota should be turned on. See + documentation in the quota-tools package for more details + (http://sourceforge.net/projects/linuxquota). + + jqfmt=<quota type>, usrjquota=<file>, grpjquota=<file> + These options tell filesystem details about quota so that quota + information can be properly updated during journal replay. They replace + the above quota options. See documentation in the quota-tools package + for more details (http://sourceforge.net/projects/linuxquota). + + stripe=n + Number of filesystem blocks that mballoc will try to use for allocation + size and alignment. For RAID5/6 systems this should be the number of + data disks * RAID chunk size in file system blocks. + + delalloc (*) + Defer block allocation until just before ext4 writes out the block(s) + in question. This allows ext4 to better allocation decisions more + efficiently. + + nodelalloc + Disable delayed allocation. Blocks are allocated when the data is + copied from userspace to the page cache, either via the write(2) system + call or when an mmap'ed page which was previously unallocated is + written for the first time. + + max_batch_time=usec + Maximum amount of time ext4 should wait for additional filesystem + operations to be batch together with a synchronous write operation. + Since a synchronous write operation is going to force a commit and then + a wait for the I/O complete, it doesn't cost much, and can be a huge + throughput win, we wait for a small amount of time to see if any other + transactions can piggyback on the synchronous write. The algorithm + used is designed to automatically tune for the speed of the disk, by + measuring the amount of time (on average) that it takes to finish + committing a transaction. Call this time the "commit time". If the + time that the transaction has been running is less than the commit + time, ext4 will try sleeping for the commit time to see if other + operations will join the transaction. The commit time is capped by + the max_batch_time, which defaults to 15000us (15ms). This + optimization can be turned off entirely by setting max_batch_time to 0. + + min_batch_time=usec + This parameter sets the commit time (as described above) to be at least + min_batch_time. It defaults to zero microseconds. Increasing this + parameter may improve the throughput of multi-threaded, synchronous + workloads on very fast disks, at the cost of increasing latency. + + journal_ioprio=prio + The I/O priority (from 0 to 7, where 0 is the highest priority) which + should be used for I/O operations submitted by kjournald2 during a + commit operation. This defaults to 3, which is a slightly higher + priority than the default I/O priority. + + auto_da_alloc(*), noauto_da_alloc + Many broken applications don't use fsync() when replacing existing + files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/ + rename("foo.new", "foo"), or worse yet, fd = open("foo", + O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4 + will detect the replace-via-rename and replace-via-truncate patterns + and force that any delayed allocation blocks are allocated such that at + the next journal commit, in the default data=ordered mode, the data + blocks of the new file are forced to disk before the rename() operation + is committed. This provides roughly the same level of guarantees as + ext3, and avoids the "zero-length" problem that can happen when a + system crashes before the delayed allocation blocks are forced to disk. + + noinit_itable + Do not initialize any uninitialized inode table blocks in the + background. This feature may be used by installation CD's so that the + install process can complete as quickly as possible; the inode table + initialization process would then be deferred until the next time the + file system is unmounted. + + init_itable=n + The lazy itable init code will wait n times the number of milliseconds + it took to zero out the previous block group's inode table. This + minimizes the impact on the system performance while file system's + inode table is being initialized. + + discard, nodiscard(*) + Controls whether ext4 should issue discard/TRIM commands to the + underlying block device when blocks are freed. This is useful for SSD + devices and sparse/thinly-provisioned LUNs, but it is off by default + until sufficient testing has been done. + + nouid32 + Disables 32-bit UIDs and GIDs. This is for interoperability with + older kernels which only store and expect 16-bit values. + + block_validity(*), noblock_validity + These options enable or disable the in-kernel facility for tracking + filesystem metadata blocks within internal data structures. This + allows multi- block allocator and other routines to notice bugs or + corrupted allocation bitmaps which cause blocks to be allocated which + overlap with filesystem metadata blocks. + + dioread_lock, dioread_nolock + Controls whether or not ext4 should use the DIO read locking. If the + dioread_nolock option is specified ext4 will allocate uninitialized + extent before buffer write and convert the extent to initialized after + IO completes. This approach allows ext4 code to avoid using inode + mutex, which improves scalability on high speed storages. However this + does not work with data journaling and dioread_nolock option will be + ignored with kernel warning. Note that dioread_nolock code path is only + used for extent-based files. Because of the restrictions this options + comprises it is off by default (e.g. dioread_lock). + + max_dir_size_kb=n + This limits the size of directories so that any attempt to expand them + beyond the specified limit in kilobytes will cause an ENOSPC error. + This is useful in memory constrained environments, where a very large + directory can cause severe performance problems or even provoke the Out + Of Memory killer. (For example, if there is only 512mb memory + available, a 176mb directory may seriously cramp the system's style.) + + i_version + Enable 64-bit inode version support. This option is off by default. + + dax + Use direct access (no page cache). See + Documentation/filesystems/dax.txt. Note that this option is + incompatible with data=journal. + +Data Mode +========= +There are 3 different data modes: + +* writeback mode + + In data=writeback mode, ext4 does not journal data at all. This mode provides + a similar level of journaling as that of XFS, JFS, and ReiserFS in its default + mode - metadata journaling. A crash+recovery can cause incorrect data to + appear in files which were written shortly before the crash. This mode will + typically provide the best ext4 performance. + +* ordered mode + + In data=ordered mode, ext4 only officially journals metadata, but it logically + groups metadata information related to data changes with the data blocks into + a single unit called a transaction. When it's time to write the new metadata + out to disk, the associated data blocks are written first. In general, this + mode performs slightly slower than writeback but significantly faster than + journal mode. + +* journal mode + + data=journal mode provides full data and metadata journaling. All new data is + written to the journal first, and then to its final location. In the event of + a crash, the journal can be replayed, bringing both data and metadata into a + consistent state. This mode is the slowest except when data needs to be read + from and written to disk at the same time where it outperforms all others + modes. Enabling this mode will disable delayed allocation and O_DIRECT + support. + +/proc entries +============= + +Information about mounted ext4 file systems can be found in +/proc/fs/ext4. Each mounted filesystem will have a directory in +/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or +/proc/fs/ext4/dm-0). The files in each per-device directory are shown +in table below. + +Files in /proc/fs/ext4/<devname> + + mb_groups + details of multiblock allocator buddy cache of free blocks + +/sys entries +============ + +Information about mounted ext4 file systems can be found in +/sys/fs/ext4. Each mounted filesystem will have a directory in +/sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or +/sys/fs/ext4/dm-0). The files in each per-device directory are shown +in table below. + +Files in /sys/fs/ext4/<devname>: + +(see also Documentation/ABI/testing/sysfs-fs-ext4) + + delayed_allocation_blocks + This file is read-only and shows the number of blocks that are dirty in + the page cache, but which do not have their location in the filesystem + allocated yet. + + inode_goal + Tuning parameter which (if non-zero) controls the goal inode used by + the inode allocator in preference to all other allocation heuristics. + This is intended for debugging use only, and should be 0 on production + systems. + + inode_readahead_blks + Tuning parameter which controls the maximum number of inode table + blocks that ext4's inode table readahead algorithm will pre-read into + the buffer cache. + + lifetime_write_kbytes + This file is read-only and shows the number of kilobytes of data that + have been written to this filesystem since it was created. + + max_writeback_mb_bump + The maximum number of megabytes the writeback code will try to write + out before move on to another inode. + + mb_group_prealloc + The multiblock allocator will round up allocation requests to a + multiple of this tuning parameter if the stripe size is not set in the + ext4 superblock + + mb_max_to_scan + The maximum number of extents the multiblock allocator will search to + find the best extent. + + mb_min_to_scan + The minimum number of extents the multiblock allocator will search to + find the best extent. + + mb_order2_req + Tuning parameter which controls the minimum size for requests (as a + power of 2) where the buddy cache is used. + + mb_stats + Controls whether the multiblock allocator should collect statistics, + which are shown during the unmount. 1 means to collect statistics, 0 + means not to collect statistics. + + mb_stream_req + Files which have fewer blocks than this tunable parameter will have + their blocks allocated out of a block group specific preallocation + pool, so that small files are packed closely together. Each large file + will have its blocks allocated out of its own unique preallocation + pool. + + session_write_kbytes + This file is read-only and shows the number of kilobytes of data that + have been written to this filesystem since it was mounted. + + reserved_clusters + This is RW file and contains number of reserved clusters in the file + system which will be used in the specific situations to avoid costly + zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or + 4096 clusters, whichever is smaller and this can be changed however it + can never exceed number of clusters in the file system. If there is not + enough space for the reserved space when mounting the file mount will + _not_ fail. + +Ioctls +====== + +There is some Ext4 specific functionality which can be accessed by applications +through the system call interfaces. The list of all Ext4 specific ioctls are +shown in the table below. + +Table of Ext4 specific ioctls + + EXT4_IOC_GETFLAGS + Get additional attributes associated with inode. The ioctl argument is + an integer bitfield, with bit values described in ext4.h. This ioctl is + an alias for FS_IOC_GETFLAGS. + + EXT4_IOC_SETFLAGS + Set additional attributes associated with inode. The ioctl argument is + an integer bitfield, with bit values described in ext4.h. This ioctl is + an alias for FS_IOC_SETFLAGS. + + EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD + Get the inode i_generation number stored for each inode. The + i_generation number is normally changed only when new inode is created + and it is particularly useful for network filesystems. The '_OLD' + version of this ioctl is an alias for FS_IOC_GETVERSION. + + EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD + Set the inode i_generation number stored for each inode. The '_OLD' + version of this ioctl is an alias for FS_IOC_SETVERSION. + + EXT4_IOC_GROUP_EXTEND + This ioctl has the same purpose as the resize mount option. It allows + to resize filesystem to the end of the last existing block group, + further resize has to be done with resize2fs, either online, or + offline. The argument points to the unsigned logn number representing + the filesystem new block count. + + EXT4_IOC_MOVE_EXT + Move the block extents from orig_fd (the one this ioctl is pointing to) + to the donor_fd (the one specified in move_extent structure passed as + an argument to this ioctl). Then, exchange inode metadata between + orig_fd and donor_fd. This is especially useful for online + defragmentation, because the allocator has the opportunity to allocate + moved blocks better, ideally into one contiguous extent. + + EXT4_IOC_GROUP_ADD + Add a new group descriptor to an existing or new group descriptor + block. The new group descriptor is described by ext4_new_group_input + structure, which is passed as an argument to this ioctl. This is + especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which + allows online resize of the filesystem to the end of the last existing + block group. Those two ioctls combined is used in userspace online + resize tool (e.g. resize2fs). + + EXT4_IOC_MIGRATE + This ioctl operates on the filesystem itself. It converts (migrates) + ext3 indirect block mapped inode to ext4 extent mapped inode by walking + through indirect block mapping of the original inode and converting + contiguous block ranges into ext4 extents of the temporary inode. Then, + inodes are swapped. This ioctl might help, when migrating from ext3 to + ext4 filesystem, however suggestion is to create fresh ext4 filesystem + and copy data from the backup. Note, that filesystem has to support + extents for this ioctl to work. + + EXT4_IOC_ALLOC_DA_BLKS + Force all of the delay allocated blocks to be allocated to preserve + application-expected ext3 behaviour. Note that this will also start + triggering a write of the data blocks, but this behaviour may change in + the future as it is not necessary and has been done this way only for + sake of simplicity. + + EXT4_IOC_RESIZE_FS + Resize the filesystem to a new size. The number of blocks of resized + filesystem is passed in via 64 bit integer argument. The kernel + allocates bitmaps and inode table, the userspace tool thus just passes + the new number of blocks. + + EXT4_IOC_SWAP_BOOT + Swap i_blocks and associated attributes (like i_blocks, i_size, + i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO + (#5). This is typically used to store a boot loader in a secure part of + the filesystem, where it can't be changed by a normal user by accident. + The data blocks of the previous boot loader will be associated with the + given inode. + +References +========== + +kernel source: <file:fs/ext4/> + <file:fs/jbd2/> + +programs: http://e2fsprogs.sourceforge.net/ + +useful links: http://fedoraproject.org/wiki/ext3-devel + http://www.bullopensource.org/ext4/ + http://ext4.wiki.kernel.org/index.php/Main_Page + http://fedoraproject.org/wiki/Features/Ext4 diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 0873685bab0f..965745d5fb9a 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -71,6 +71,7 @@ configure specific aspects of kernel behavior to your liking. java ras bcache + ext4 pm/index thunderbolt LSM/index diff --git a/Documentation/conf.py b/Documentation/conf.py index b691af4831fa..ede67ccafc29 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -383,6 +383,10 @@ latex_documents = [ 'The kernel development community', 'manual'), ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', 'The kernel development community', 'manual'), + ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', + 'ext4 Community', 'manual'), + ('filesystems/ext4/index', 'ext4-data-structures.tex', + 'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'), ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', 'The kernel development community', 'manual'), ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/about.rst index 0aadba052264..0aadba052264 100644 --- a/Documentation/filesystems/ext4/ondisk/about.rst +++ b/Documentation/filesystems/ext4/about.rst diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/allocators.rst index 7aa85152ace3..7aa85152ace3 100644 --- a/Documentation/filesystems/ext4/ondisk/allocators.rst +++ b/Documentation/filesystems/ext4/allocators.rst diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 0b01b67b81fe..54386a010a8d 100644 --- a/Documentation/filesystems/ext4/ondisk/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst @@ -30,7 +30,7 @@ Extended attributes, when stored after the inode, have a header ``ext4_xattr_ibody_header`` that is 4 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -47,7 +47,7 @@ The beginning of an extended attribute block is in ``struct ext4_xattr_header``, which is 32 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -92,7 +92,7 @@ entries must be stored in sorted order. The sort order is Attributes stored inside an inode do not need be stored in sorted order. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -157,7 +157,7 @@ attribute name index field is set, and matching string is removed from the key name. Here is a map of name index values to key prefixes: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Name Index diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst index c6d88557553c..c6d88557553c 100644 --- a/Documentation/filesystems/ext4/ondisk/bigalloc.rst +++ b/Documentation/filesystems/ext4/bigalloc.rst diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/bitmaps.rst index c7546dbc197a..c7546dbc197a 100644 --- a/Documentation/filesystems/ext4/ondisk/bitmaps.rst +++ b/Documentation/filesystems/ext4/bitmaps.rst diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst index baf888e4c06a..baf888e4c06a 100644 --- a/Documentation/filesystems/ext4/ondisk/blockgroup.rst +++ b/Documentation/filesystems/ext4/blockgroup.rst diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst index 30e25750d88a..30e25750d88a 100644 --- a/Documentation/filesystems/ext4/ondisk/blockmap.rst +++ b/Documentation/filesystems/ext4/blockmap.rst diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/blocks.rst index 73d4dc0f7bda..73d4dc0f7bda 100644 --- a/Documentation/filesystems/ext4/ondisk/blocks.rst +++ b/Documentation/filesystems/ext4/blocks.rst diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/checksums.rst index 9d6a793b2e03..5519e253810d 100644 --- a/Documentation/filesystems/ext4/ondisk/checksums.rst +++ b/Documentation/filesystems/ext4/checksums.rst @@ -28,7 +28,7 @@ of checksum. The checksum function is whatever the superblock describes (crc32c as of October 2013) unless noted otherwise. .. list-table:: - :widths: 1 1 4 + :widths: 20 8 50 :header-rows: 1 * - Metadata diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/directory.rst index 8fcba68c2884..614034e24669 100644 --- a/Documentation/filesystems/ext4/ondisk/directory.rst +++ b/Documentation/filesystems/ext4/directory.rst @@ -34,7 +34,7 @@ is at most 263 bytes long, though on disk you'll need to reference ``dirent.rec_len`` to know for sure. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -66,7 +66,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most ``dirent.rec_len`` to know for sure. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -99,7 +99,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most The directory file type is one of the following values: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -130,7 +130,7 @@ in the place where the name normally goes. The structure is ``struct ext4_dir_entry_tail``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -212,7 +212,7 @@ The root of the htree is in ``struct dx_root``, which is the full length of a data block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -305,7 +305,7 @@ of a data block: The directory hash is one of the following values: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -327,7 +327,7 @@ Interior nodes of an htree are recorded as ``struct dx_node``, which is also the full length of a data block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -375,7 +375,7 @@ The hash maps that exist in both ``struct dx_root`` and long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -405,7 +405,7 @@ directory index (which will ensure that there's space for the checksum. The dx\_tail structure is 8 bytes long and looks like this: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/dynamic.rst index bb0c84333341..bb0c84333341 100644 --- a/Documentation/filesystems/ext4/ondisk/dynamic.rst +++ b/Documentation/filesystems/ext4/dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/eainode.rst index ecc0d01a0a72..ecc0d01a0a72 100644 --- a/Documentation/filesystems/ext4/ondisk/eainode.rst +++ b/Documentation/filesystems/ext4/eainode.rst diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/filesystems/ext4/ext4.rst deleted file mode 100644 index 9d4368d591fa..000000000000 --- a/Documentation/filesystems/ext4/ext4.rst +++ /dev/null @@ -1,613 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -======================== -General Information -======================== - -Ext4 is an advanced level of the ext3 filesystem which incorporates -scalability and reliability enhancements for supporting large filesystems -(64 bit) in keeping with increasing disk capacities and state-of-the-art -feature requirements. - -Mailing list: linux-ext4@vger.kernel.org -Web site: http://ext4.wiki.kernel.org - - -Quick usage instructions -======================== - -Note: More extensive information for getting started with ext4 can be -found at the ext4 wiki site at the URL: -http://ext4.wiki.kernel.org/index.php/Ext4_Howto - - - The latest version of e2fsprogs can be found at: - - https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ - - or - - http://sourceforge.net/project/showfiles.php?group_id=2406 - - or grab the latest git repository from: - - https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git - - - Create a new filesystem using the ext4 filesystem type: - - # mke2fs -t ext4 /dev/hda1 - - Or to configure an existing ext3 filesystem to support extents: - - # tune2fs -O extents /dev/hda1 - - If the filesystem was created with 128 byte inodes, it can be - converted to use 256 byte for greater efficiency via: - - # tune2fs -I 256 /dev/hda1 - - - Mounting: - - # mount -t ext4 /dev/hda1 /wherever - - - When comparing performance with other filesystems, it's always - important to try multiple workloads; very often a subtle change in a - workload parameter can completely change the ranking of which - filesystems do well compared to others. When comparing versus ext3, - note that ext4 enables write barriers by default, while ext3 does - not enable write barriers by default. So it is useful to use - explicitly specify whether barriers are enabled or not when via the - '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems - for a fair comparison. When tuning ext3 for best benchmark numbers, - it is often worthwhile to try changing the data journaling mode; '-o - data=writeback' can be faster for some workloads. (Note however that - running mounted with data=writeback can potentially leave stale data - exposed in recently written files in case of an unclean shutdown, - which could be a security exposure in some situations.) Configuring - the filesystem with a large journal can also be helpful for - metadata-intensive workloads. - -Features -======== - -Currently Available -------------------- - -* ability to use filesystems > 16TB (e2fsprogs support not available yet) -* extent format reduces metadata overhead (RAM, IO for access, transactions) -* extent format more robust in face of on-disk corruption due to magics, -* internal redundancy in tree -* improved file allocation (multi-block alloc) -* lift 32000 subdirectory limit imposed by i_links_count[1] -* nsec timestamps for mtime, atime, ctime, create time -* inode version field on disk (NFSv4, Lustre) -* reduced e2fsck time via uninit_bg feature -* journal checksumming for robustness, performance -* persistent file preallocation (e.g for streaming media, databases) -* ability to pack bitmaps and inode tables into larger virtual groups via the - flex_bg feature -* large file support -* inode allocation using large virtual block groups via flex_bg -* delayed allocation -* large block (up to pagesize) support -* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force - the ordering) - -[1] Filesystems with a block size of 1k may see a limit imposed by the -directory hash tree having a maximum depth of two. - -Options -======= - -When mounting an ext4 filesystem, the following option are accepted: -(*) == default - -======================= ======================================================= -Mount Option Description -======================= ======================================================= -ro Mount filesystem read only. Note that ext4 will - replay the journal (and thus write to the - partition) even when mounted "read only". The - mount options "ro,noload" can be used to prevent - writes to the filesystem. - -journal_checksum Enable checksumming of the journal transactions. - This will allow the recovery code in e2fsck and the - kernel to detect corruption in the kernel. It is a - compatible change and will be ignored by older kernels. - -journal_async_commit Commit block can be written to disk without waiting - for descriptor blocks. If enabled older kernels cannot - mount the device. This will enable 'journal_checksum' - internally. - -journal_path=path -journal_dev=devnum When the external journal device's major/minor numbers - have changed, these options allow the user to specify - the new journal location. The journal device is - identified through either its new major/minor numbers - encoded in devnum, or via a path to the device. - -norecovery Don't load the journal on mounting. Note that -noload if the filesystem was not unmounted cleanly, - skipping the journal replay will lead to the - filesystem containing inconsistencies that can - lead to any number of problems. - -data=journal All data are committed into the journal prior to being - written into the main file system. Enabling - this mode will disable delayed allocation and - O_DIRECT support. - -data=ordered (*) All data are forced directly out to the main file - system prior to its metadata being committed to the - journal. - -data=writeback Data ordering is not preserved, data may be written - into the main file system after its metadata has been - committed to the journal. - -commit=nrsec (*) Ext4 can be told to sync all its data and metadata - every 'nrsec' seconds. The default value is 5 seconds. - This means that if you lose your power, you will lose - as much as the latest 5 seconds of work (your - filesystem will not be damaged though, thanks to the - journaling). This default value (or any low value) - will hurt performance, but it's good for data-safety. - Setting it to 0 will have the same effect as leaving - it at the default (5 seconds). - Setting it to very large values will improve - performance. - -barrier=<0|1(*)> This enables/disables the use of write barriers in -barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. -nobarrier This also requires an IO stack which can support - barriers, and if jbd gets an error on a barrier - write, it will disable again with a warning. - Write barriers enforce proper on-disk ordering - of journal commits, making volatile disk write caches - safe to use, at some performance penalty. If - your disks are battery-backed in one way or another, - disabling barriers may safely improve performance. - The mount options "barrier" and "nobarrier" can - also be used to enable or disable barriers, for - consistency with other ext4 mount options. - -inode_readahead_blks=n This tuning parameter controls the maximum - number of inode table blocks that ext4's inode - table readahead algorithm will pre-read into - the buffer cache. The default value is 32 blocks. - -nouser_xattr Disables Extended User Attributes. See the - attr(5) manual page for more information about - extended attributes. - -noacl This option disables POSIX Access Control List - support. If ACL support is enabled in the kernel - configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is - enabled by default on mount. See the acl(5) manual - page for more information about acl. - -bsddf (*) Make 'df' act like BSD. -minixdf Make 'df' act like Minix. - -debug Extra debugging information is sent to syslog. - -abort Simulate the effects of calling ext4_abort() for - debugging purposes. This is normally used while - remounting a filesystem which is already mounted. - -errors=remount-ro Remount the filesystem read-only on an error. -errors=continue Keep going on a filesystem error. -errors=panic Panic and halt the machine if an error occurs. - (These mount options override the errors behavior - specified in the superblock, which can be configured - using tune2fs) - -data_err=ignore(*) Just print an error message if an error occurs - in a file data buffer in ordered mode. -data_err=abort Abort the journal if an error occurs in a file - data buffer in ordered mode. - -grpid New objects have the group ID of their parent. -bsdgroups - -nogrpid (*) New objects have the group ID of their creator. -sysvgroups - -resgid=n The group ID which may use the reserved blocks. - -resuid=n The user ID which may use the reserved blocks. - -sb=n Use alternate superblock at this location. - -quota These options are ignored by the filesystem. They -noquota are used only by quota tools to recognize volumes -grpquota where quota should be turned on. See documentation -usrquota in the quota-tools package for more details - (http://sourceforge.net/projects/linuxquota). - -jqfmt=<quota type> These options tell filesystem details about quota -usrjquota=<file> so that quota information can be properly updated -grpjquota=<file> during journal replay. They replace the above - quota options. See documentation in the quota-tools - package for more details - (http://sourceforge.net/projects/linuxquota). - -stripe=n Number of filesystem blocks that mballoc will try - to use for allocation size and alignment. For RAID5/6 - systems this should be the number of data - disks * RAID chunk size in file system blocks. - -delalloc (*) Defer block allocation until just before ext4 - writes out the block(s) in question. This - allows ext4 to better allocation decisions - more efficiently. -nodelalloc Disable delayed allocation. Blocks are allocated - when the data is copied from userspace to the - page cache, either via the write(2) system call - or when an mmap'ed page which was previously - unallocated is written for the first time. - -max_batch_time=usec Maximum amount of time ext4 should wait for - additional filesystem operations to be batch - together with a synchronous write operation. - Since a synchronous write operation is going to - force a commit and then a wait for the I/O - complete, it doesn't cost much, and can be a - huge throughput win, we wait for a small amount - of time to see if any other transactions can - piggyback on the synchronous write. The - algorithm used is designed to automatically tune - for the speed of the disk, by measuring the - amount of time (on average) that it takes to - finish committing a transaction. Call this time - the "commit time". If the time that the - transaction has been running is less than the - commit time, ext4 will try sleeping for the - commit time to see if other operations will join - the transaction. The commit time is capped by - the max_batch_time, which defaults to 15000us - (15ms). This optimization can be turned off - entirely by setting max_batch_time to 0. - -min_batch_time=usec This parameter sets the commit time (as - described above) to be at least min_batch_time. - It defaults to zero microseconds. Increasing - this parameter may improve the throughput of - multi-threaded, synchronous workloads on very - fast disks, at the cost of increasing latency. - -journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the - highest priority) which should be used for I/O - operations submitted by kjournald2 during a - commit operation. This defaults to 3, which is - a slightly higher priority than the default I/O - priority. - -auto_da_alloc(*) Many broken applications don't use fsync() when -noauto_da_alloc replacing existing files via patterns such as - fd = open("foo.new")/write(fd,..)/close(fd)/ - rename("foo.new", "foo"), or worse yet, - fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). - If auto_da_alloc is enabled, ext4 will detect - the replace-via-rename and replace-via-truncate - patterns and force that any delayed allocation - blocks are allocated such that at the next - journal commit, in the default data=ordered - mode, the data blocks of the new file are forced - to disk before the rename() operation is - committed. This provides roughly the same level - of guarantees as ext3, and avoids the - "zero-length" problem that can happen when a - system crashes before the delayed allocation - blocks are forced to disk. - -noinit_itable Do not initialize any uninitialized inode table - blocks in the background. This feature may be - used by installation CD's so that the install - process can complete as quickly as possible; the - inode table initialization process would then be - deferred until the next time the file system - is unmounted. - -init_itable=n The lazy itable init code will wait n times the - number of milliseconds it took to zero out the - previous block group's inode table. This - minimizes the impact on the system performance - while file system's inode table is being initialized. - -discard Controls whether ext4 should issue discard/TRIM -nodiscard(*) commands to the underlying block device when - blocks are freed. This is useful for SSD devices - and sparse/thinly-provisioned LUNs, but it is off - by default until sufficient testing has been done. - -nouid32 Disables 32-bit UIDs and GIDs. This is for - interoperability with older kernels which only - store and expect 16-bit values. - -block_validity(*) These options enable or disable the in-kernel -noblock_validity facility for tracking filesystem metadata blocks - within internal data structures. This allows multi- - block allocator and other routines to notice - bugs or corrupted allocation bitmaps which cause - blocks to be allocated which overlap with - filesystem metadata blocks. - -dioread_lock Controls whether or not ext4 should use the DIO read -dioread_nolock locking. If the dioread_nolock option is specified - ext4 will allocate uninitialized extent before buffer - write and convert the extent to initialized after IO - completes. This approach allows ext4 code to avoid - using inode mutex, which improves scalability on high - speed storages. However this does not work with - data journaling and dioread_nolock option will be - ignored with kernel warning. Note that dioread_nolock - code path is only used for extent-based files. - Because of the restrictions this options comprises - it is off by default (e.g. dioread_lock). - -max_dir_size_kb=n This limits the size of directories so that any - attempt to expand them beyond the specified - limit in kilobytes will cause an ENOSPC error. - This is useful in memory constrained - environments, where a very large directory can - cause severe performance problems or even - provoke the Out Of Memory killer. (For example, - if there is only 512mb memory available, a 176mb - directory may seriously cramp the system's style.) - -i_version Enable 64-bit inode version support. This option is - off by default. - -dax Use direct access (no page cache). See - Documentation/filesystems/dax.txt. Note that - this option is incompatible with data=journal. -======================= ======================================================= - -Data Mode -========= -There are 3 different data modes: - -* writeback mode - - In data=writeback mode, ext4 does not journal data at all. This mode provides - a similar level of journaling as that of XFS, JFS, and ReiserFS in its default - mode - metadata journaling. A crash+recovery can cause incorrect data to - appear in files which were written shortly before the crash. This mode will - typically provide the best ext4 performance. - -* ordered mode - - In data=ordered mode, ext4 only officially journals metadata, but it logically - groups metadata information related to data changes with the data blocks into - a single unit called a transaction. When it's time to write the new metadata - out to disk, the associated data blocks are written first. In general, this - mode performs slightly slower than writeback but significantly faster than - journal mode. - -* journal mode - - data=journal mode provides full data and metadata journaling. All new data is - written to the journal first, and then to its final location. In the event of - a crash, the journal can be replayed, bringing both data and metadata into a - consistent state. This mode is the slowest except when data needs to be read - from and written to disk at the same time where it outperforms all others - modes. Enabling this mode will disable delayed allocation and O_DIRECT - support. - -/proc entries -============= - -Information about mounted ext4 file systems can be found in -/proc/fs/ext4. Each mounted filesystem will have a directory in -/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or -/proc/fs/ext4/dm-0). The files in each per-device directory are shown -in table below. - -Files in /proc/fs/ext4/<devname> - -================ ======= - File Content -================ ======= - mb_groups details of multiblock allocator buddy cache of free blocks -================ ======= - -/sys entries -============ - -Information about mounted ext4 file systems can be found in -/sys/fs/ext4. Each mounted filesystem will have a directory in -/sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or -/sys/fs/ext4/dm-0). The files in each per-device directory are shown -in table below. - -Files in /sys/fs/ext4/<devname>: - -(see also Documentation/ABI/testing/sysfs-fs-ext4) - -============================= ================================================= -File Content -============================= ================================================= - delayed_allocation_blocks This file is read-only and shows the number of - blocks that are dirty in the page cache, but - which do not have their location in the - filesystem allocated yet. - -inode_goal Tuning parameter which (if non-zero) controls - the goal inode used by the inode allocator in - preference to all other allocation heuristics. - This is intended for debugging use only, and - should be 0 on production systems. - -inode_readahead_blks Tuning parameter which controls the maximum - number of inode table blocks that ext4's inode - table readahead algorithm will pre-read into - the buffer cache - -lifetime_write_kbytes This file is read-only and shows the number of - kilobytes of data that have been written to this - filesystem since it was created. - - max_writeback_mb_bump The maximum number of megabytes the writeback - code will try to write out before move on to - another inode. - - mb_group_prealloc The multiblock allocator will round up allocation - requests to a multiple of this tuning parameter if - the stripe size is not set in the ext4 superblock - - mb_max_to_scan The maximum number of extents the multiblock - allocator will search to find the best extent - - mb_min_to_scan The minimum number of extents the multiblock - allocator will search to find the best extent - - mb_order2_req Tuning parameter which controls the minimum size - for requests (as a power of 2) where the buddy - cache is used - - mb_stats Controls whether the multiblock allocator should - collect statistics, which are shown during the - unmount. 1 means to collect statistics, 0 means - not to collect statistics - - mb_stream_req Files which have fewer blocks than this tunable - parameter will have their blocks allocated out - of a block group specific preallocation pool, so - that small files are packed closely together. - Each large file will have its blocks allocated - out of its own unique preallocation pool. - - session_write_kbytes This file is read-only and shows the number of - kilobytes of data that have been written to this - filesystem since it was mounted. - - reserved_clusters This is RW file and contains number of reserved - clusters in the file system which will be used - in the specific situations to avoid costly - zeroout, unexpected ENOSPC, or possible data - loss. The default is 2% or 4096 clusters, - whichever is smaller and this can be changed - however it can never exceed number of clusters - in the file system. If there is not enough space - for the reserved space when mounting the file - mount will _not_ fail. -============================= ================================================= - -Ioctls -====== - -There is some Ext4 specific functionality which can be accessed by applications -through the system call interfaces. The list of all Ext4 specific ioctls are -shown in the table below. - -Table of Ext4 specific ioctls - -============================= ================================================= -Ioctl Description -============================= ================================================= - EXT4_IOC_GETFLAGS Get additional attributes associated with inode. - The ioctl argument is an integer bitfield, with - bit values described in ext4.h. This ioctl is an - alias for FS_IOC_GETFLAGS. - - EXT4_IOC_SETFLAGS Set additional attributes associated with inode. - The ioctl argument is an integer bitfield, with - bit values described in ext4.h. This ioctl is an - alias for FS_IOC_SETFLAGS. - - EXT4_IOC_GETVERSION - EXT4_IOC_GETVERSION_OLD - Get the inode i_generation number stored for - each inode. The i_generation number is normally - changed only when new inode is created and it is - particularly useful for network filesystems. The - '_OLD' version of this ioctl is an alias for - FS_IOC_GETVERSION. - - EXT4_IOC_SETVERSION - EXT4_IOC_SETVERSION_OLD - Set the inode i_generation number stored for - each inode. The '_OLD' version of this ioctl - is an alias for FS_IOC_SETVERSION. - - EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize - mount option. It allows to resize filesystem - to the end of the last existing block group, - further resize has to be done with resize2fs, - either online, or offline. The argument points - to the unsigned logn number representing the - filesystem new block count. - - EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one - this ioctl is pointing to) to the donor_fd (the - one specified in move_extent structure passed - as an argument to this ioctl). Then, exchange - inode metadata between orig_fd and donor_fd. - This is especially useful for online - defragmentation, because the allocator has the - opportunity to allocate moved blocks better, - ideally into one contiguous extent. - - EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or - new group descriptor block. The new group - descriptor is described by ext4_new_group_input - structure, which is passed as an argument to - this ioctl. This is especially useful in - conjunction with EXT4_IOC_GROUP_EXTEND, - which allows online resize of the filesystem - to the end of the last existing block group. - Those two ioctls combined is used in userspace - online resize tool (e.g. resize2fs). - - EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself. - It converts (migrates) ext3 indirect block mapped - inode to ext4 extent mapped inode by walking - through indirect block mapping of the original - inode and converting contiguous block ranges - into ext4 extents of the temporary inode. Then, - inodes are swapped. This ioctl might help, when - migrating from ext3 to ext4 filesystem, however - suggestion is to create fresh ext4 filesystem - and copy data from the backup. Note, that - filesystem has to support extents for this ioctl - to work. - - EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be - allocated to preserve application-expected ext3 - behaviour. Note that this will also start - triggering a write of the data blocks, but this - behaviour may change in the future as it is - not necessary and has been done this way only - for sake of simplicity. - - EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number - of blocks of resized filesystem is passed in via - 64 bit integer argument. The kernel allocates - bitmaps and inode table, the userspace tool thus - just passes the new number of blocks. - - EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes - (like i_blocks, i_size, i_flags, ...) from - the specified inode with inode - EXT4_BOOT_LOADER_INO (#5). This is typically - used to store a boot loader in a secure part of - the filesystem, where it can't be changed by a - normal user by accident. - The data blocks of the previous boot loader - will be associated with the given inode. -============================= ================================================= - -References -========== - -kernel source: <file:fs/ext4/> - <file:fs/jbd2/> - -programs: http://e2fsprogs.sourceforge.net/ - -useful links: http://fedoraproject.org/wiki/ext3-devel - http://www.bullopensource.org/ext4/ - http://ext4.wiki.kernel.org/index.php/Main_Page - http://fedoraproject.org/wiki/Features/Ext4 diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/globals.rst index 368bf7662b96..368bf7662b96 100644 --- a/Documentation/filesystems/ext4/ondisk/globals.rst +++ b/Documentation/filesystems/ext4/globals.rst diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst index 759827e5d2cf..0f783ed88592 100644 --- a/Documentation/filesystems/ext4/ondisk/group_descr.rst +++ b/Documentation/filesystems/ext4/group_descr.rst @@ -43,7 +43,7 @@ entire bitmap. The block group descriptor is laid out in ``struct ext4_group_desc``. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -157,7 +157,7 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. Block group flags can be any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ifork.rst index 5dbe3b2b121a..b9816d5a896b 100644 --- a/Documentation/filesystems/ext4/ondisk/ifork.rst +++ b/Documentation/filesystems/ext4/ifork.rst @@ -68,7 +68,7 @@ The extent tree header is recorded in ``struct ext4_extent_header``, which is 12 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -104,7 +104,7 @@ Internal nodes of the extent tree, also known as index nodes, are recorded as ``struct ext4_extent_idx``, and are 12 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -134,7 +134,7 @@ Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, and are also 12 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -174,7 +174,7 @@ including) the checksum itself. ``struct ext4_extent_tail`` is 4 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst index 71121605558c..3be3e54d480d 100644 --- a/Documentation/filesystems/ext4/index.rst +++ b/Documentation/filesystems/ext4/index.rst @@ -1,17 +1,14 @@ .. SPDX-License-Identifier: GPL-2.0 -=============== -ext4 Filesystem -=============== - -General usage and on-disk artifacts writen by ext4. More documentation may -be ported from the wiki as time permits. This should be considered the -canonical source of information as the details here have been reviewed by -the ext4 community. +=================================== +ext4 Data Structures and Algorithms +=================================== .. toctree:: - :maxdepth: 5 + :maxdepth: 6 :numbered: - ext4 - ondisk/index + about.rst + overview.rst + globals.rst + dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/inlinedata.rst index d1075178ce0b..d1075178ce0b 100644 --- a/Documentation/filesystems/ext4/ondisk/inlinedata.rst +++ b/Documentation/filesystems/ext4/inlinedata.rst diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/inodes.rst index 655ce898f3f5..6bd35e506b6f 100644 --- a/Documentation/filesystems/ext4/ondisk/inodes.rst +++ b/Documentation/filesystems/ext4/inodes.rst @@ -29,8 +29,9 @@ and the inode structure itself. The inode table entry is laid out in ``struct ext4_inode``. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 + :class: longtable * - Offset - Size @@ -176,7 +177,7 @@ The inode table entry is laid out in ``struct ext4_inode``. The ``i_mode`` value is a combination of the following flags: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -227,7 +228,7 @@ The ``i_mode`` value is a combination of the following flags: The ``i_flags`` field is a combination of these values: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -314,7 +315,7 @@ The ``osd1`` field has multiple meanings depending on the creator: Linux: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -331,7 +332,7 @@ Linux: Hurd: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -346,7 +347,7 @@ Hurd: Masix: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -365,7 +366,7 @@ The ``osd2`` field has multiple meanings depending on the filesystem creator: Linux: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -402,7 +403,7 @@ Linux: Hurd: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -433,7 +434,7 @@ Hurd: Masix: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/journal.rst index e7031af86876..ea613ee701f5 100644 --- a/Documentation/filesystems/ext4/ondisk/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst @@ -48,7 +48,7 @@ Layout Generally speaking, the journal has this format: .. list-table:: - :widths: 1 1 78 + :widths: 16 48 16 :header-rows: 1 * - Superblock @@ -76,7 +76,7 @@ The journal superblock will be in the next full block after the superblock. .. list-table:: - :widths: 1 1 1 1 76 + :widths: 12 12 12 32 12 :header-rows: 1 * - 1024 bytes of padding @@ -98,7 +98,7 @@ Every block in the journal starts with a common 12-byte header ``struct journal_header_s``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -124,7 +124,7 @@ Every block in the journal starts with a common 12-byte header The journal block type can be any one of: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -154,7 +154,7 @@ The journal superblock is recorded as ``struct journal_superblock_s``, which is 1024 bytes long: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -264,7 +264,7 @@ which is 1024 bytes long: The journal compat features are any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -278,7 +278,7 @@ The journal compat features are any combination of the following: The journal incompat features are any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -306,7 +306,7 @@ Journal checksum type codes are one of the following. crc32 or crc32c are the most likely choices. .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -330,7 +330,7 @@ described by a data structure, but here is the block structure anyway. Descriptor blocks consume at least 36 bytes, but use a full block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -355,7 +355,7 @@ defined as ``struct journal_block_tag3_s``, which looks like the following. The size is 16 or 32 bytes. .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -400,7 +400,7 @@ following. The size is 16 or 32 bytes. The journal tag flags are any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -421,7 +421,7 @@ is defined as ``struct journal_block_tag_s``, which looks like the following. The size is 8, 12, 24, or 28 bytes: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -471,7 +471,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a ``struct jbd2_journal_block_tail``, which looks like this: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -513,7 +513,7 @@ Revocation blocks are described in length, but use a full block: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -543,7 +543,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation block is a ``struct jbd2_journal_revoke_tail``, which has this format: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -567,7 +567,7 @@ The commit block is described by ``struct commit_header``, which is 32 bytes long (but uses a full block): .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/mmp.rst index b7d7a3137f80..25660981d93c 100644 --- a/Documentation/filesystems/ext4/ondisk/mmp.rst +++ b/Documentation/filesystems/ext4/mmp.rst @@ -32,7 +32,7 @@ The checksum is calculated against the FS UUID and the MMP structure. The MMP structure (``struct mmp_struct``) is as follows: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 12 20 40 :header-rows: 1 * - Offset diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst deleted file mode 100644 index f7d082c3a435..000000000000 --- a/Documentation/filesystems/ext4/ondisk/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============================== -Data Structures and Algorithms -============================== -.. include:: about.rst -.. include:: overview.rst -.. include:: globals.rst -.. include:: dynamic.rst diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/overview.rst index cbab18baba12..cbab18baba12 100644 --- a/Documentation/filesystems/ext4/ondisk/overview.rst +++ b/Documentation/filesystems/ext4/overview.rst diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst index a82f70c9baeb..9061aabba827 100644 --- a/Documentation/filesystems/ext4/ondisk/special_inodes.rst +++ b/Documentation/filesystems/ext4/special_inodes.rst @@ -6,7 +6,7 @@ Special inodes ext4 reserves some inode for special features, as follows: .. list-table:: - :widths: 1 79 + :widths: 6 70 :header-rows: 1 * - inode Number diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/super.rst index 5f81dd87e0b9..04ff079a2acf 100644 --- a/Documentation/filesystems/ext4/ondisk/super.rst +++ b/Documentation/filesystems/ext4/super.rst @@ -19,7 +19,7 @@ The ext4 superblock is laid out as follows in ``struct ext4_super_block``: .. list-table:: - :widths: 1 1 1 77 + :widths: 8 8 24 40 :header-rows: 1 * - Offset @@ -483,7 +483,7 @@ The ext4 superblock is laid out as follows in The superblock state is some combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -500,7 +500,7 @@ The superblock state is some combination of the following: The superblock error policy is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -517,7 +517,7 @@ The superblock error policy is one of the following: The filesystem creator is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -538,7 +538,7 @@ The filesystem creator is one of the following: The superblock revision is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -556,7 +556,7 @@ The superblock compatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -595,7 +595,7 @@ The superblock incompatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -647,7 +647,7 @@ The superblock read-only compatible features field is a combination of any of the following: .. list-table:: - :widths: 1 79 + :widths: 16 64 :header-rows: 1 * - Value @@ -702,7 +702,7 @@ the following: The ``s_def_hash_version`` field is one of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -725,7 +725,7 @@ The ``s_def_hash_version`` field is one of the following: The ``s_default_mount_opts`` field is any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -767,7 +767,7 @@ The ``s_default_mount_opts`` field is any combination of the following: The ``s_flags`` field is any combination of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value @@ -784,7 +784,7 @@ The ``s_flags`` field is any combination of the following: The ``s_encrypt_algos`` list can contain any of the following: .. list-table:: - :widths: 1 79 + :widths: 8 72 :header-rows: 1 * - Value diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fb50f9aa6ead..c1d570ee1d9f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index caff935fbeb8..12f90d48ba61 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -628,6 +628,7 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 /* * ioctl commands @@ -1030,6 +1031,9 @@ struct ext4_inode_info { ext4_lblk_t i_da_metadata_calc_last_lblock; int i_da_metadata_calc_len; + /* pending cluster reservations for bigalloc file systems */ + struct ext4_pending_tree i_pending_tree; + /* on-disk additional length */ __u16 i_extra_isize; @@ -1401,7 +1405,8 @@ struct ext4_sb_info { u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ + /* Names of quota files with journalled quota */ + char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ @@ -2483,10 +2488,11 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); -extern int ext4_page_mkwrite(struct vm_fault *vmf); -extern int ext4_filemap_fault(struct vm_fault *vmf); +extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); +extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, @@ -3142,10 +3148,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -3156,6 +3158,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); +extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index adf6668b596f..98bd0e9ee7df 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -120,6 +120,19 @@ struct ext4_ext_path { }; /* + * Used to record a portion of a cluster found at the beginning or end + * of an extent while traversing the extent tree during space removal. + * A partial cluster may be removed if it does not contain blocks shared + * with extents that aren't being deleted (tofree state). Otherwise, + * it cannot be removed (nofree state). + */ +struct partial_cluster { + ext4_fsblk_t pclu; /* physical cluster number */ + ext4_lblk_t lblk; /* logical block number within logical cluster */ + enum {initial, tofree, nofree} state; +}; + +/* * structure for external API */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 72a361d5ef74..240b6dea5441 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, { struct extent_status es; - ext4_es_find_delayed_extent_range(inode, hole_start, - hole_start + hole_len - 1, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, + hole_start + hole_len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? */ if (es.es_lblk <= hole_start) @@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode) return 0; } +/* + * ext4_rereserve_cluster - increment the reserved cluster count when + * freeing a cluster with a pending reservation + * + * @inode - file containing the cluster + * @lblk - logical block in cluster to be reserved + * + * Increments the reserved cluster count and adjusts quota in a bigalloc + * file system when freeing a partial cluster containing at least one + * delayed and unwritten block. A partial cluster meeting that + * requirement will have a pending reservation. If so, the + * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to + * defer reserved and allocated space accounting to a subsequent call + * to this function. + */ +static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); + + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); + spin_unlock(&ei->i_block_reservation_lock); + + percpu_counter_add(&sbi->s_freeclusters_counter, 1); + ext4_remove_pending(inode, lblk); +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); - ext4_fsblk_t pblk; - int flags = get_default_free_blocks_flags(inode); + ext4_fsblk_t last_pblk, pblk; + ext4_lblk_t num; + int flags; + + /* only extent tail removal is allowed */ + if (from < le32_to_cpu(ex->ee_block) || + to != le32_to_cpu(ex->ee_block) + ee_len - 1) { + ext4_error(sbi->s_sb, + "strange request: removal(2) %u-%u from %u:%u", + from, to, le32_to_cpu(ex->ee_block), ee_len); + return 0; + } + +#ifdef EXTENTS_STATS + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); +#endif + + trace_ext4_remove_blocks(inode, ex, from, to, partial); /* - * For bigalloc file systems, we never free a partial cluster - * at the beginning of the extent. Instead, we make a note - * that we tried freeing the cluster, and check to see if we - * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. + * if we have a partial cluster, and it's different from the + * cluster of the last block in the extent, we free it */ - flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + last_pblk = ext4_ext_pblock(ex) + ee_len - 1; + + if (partial->state != initial && + partial->pclu != EXT4_B2C(sbi, last_pblk)) { + if (partial->state == tofree) { + flags = get_default_free_blocks_flags(inode); + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); + } + partial->state = initial; + } + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + pblk = ext4_ext_pblock(ex) + ee_len - num; - trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); /* - * If we have a partial cluster, and it's different from the - * cluster of the last block, we need to explicitly free the - * partial cluster here. + * We free the partial cluster at the end of the extent (if any), + * unless the cluster is used by another extent (partial_cluster + * state is nofree). If a partial cluster exists here, it must be + * shared with the last block in the extent. */ - pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster > 0 && - *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + flags = get_default_free_blocks_flags(inode); + + /* partial, left end cluster aligned, right end unaligned */ + if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && + (EXT4_LBLK_CMASK(sbi, to) >= from) && + (partial->state != nofree)) { + if (ext4_is_pending(inode, to)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), + EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); - *partial_cluster = 0; + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, to); + partial->state = initial; + flags = get_default_free_blocks_flags(inode); } -#ifdef EXTENTS_STATS - { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - spin_lock(&sbi->s_ext_stats_lock); - sbi->s_ext_blocks += ee_len; - sbi->s_ext_extents++; - if (ee_len < sbi->s_ext_min) - sbi->s_ext_min = ee_len; - if (ee_len > sbi->s_ext_max) - sbi->s_ext_max = ee_len; - if (ext_depth(inode) > sbi->s_depth_max) - sbi->s_depth_max = ext_depth(inode); - spin_unlock(&sbi->s_ext_stats_lock); - } -#endif - if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* tail removal */ - ext4_lblk_t num; - long long first_cluster; - - num = le32_to_cpu(ex->ee_block) + ee_len - from; - pblk = ext4_ext_pblock(ex) + ee_len - num; - /* - * Usually we want to free partial cluster at the end of the - * extent, except for the situation when the cluster is still - * used by any other extent (partial_cluster is negative). - */ - if (*partial_cluster < 0 && - *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) - flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; - ext_debug("free last %u blocks starting %llu partial %lld\n", - num, pblk, *partial_cluster); - ext4_free_blocks(handle, inode, NULL, pblk, num, flags); - /* - * If the block range to be freed didn't start at the - * beginning of a cluster, and we removed the entire - * extent and the cluster is not used by any other extent, - * save the partial cluster here, since we might need to - * delete if we determine that the truncate or punch hole - * operation has removed all of the blocks in the cluster. - * If that cluster is used by another extent, preserve its - * negative value so it isn't freed later on. - * - * If the whole extent wasn't freed, we've reached the - * start of the truncated/punched region and have finished - * removing blocks. If there's a partial cluster here it's - * shared with the remainder of the extent and is no longer - * a candidate for removal. - */ - if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { - first_cluster = (long long) EXT4_B2C(sbi, pblk); - if (first_cluster != -*partial_cluster) - *partial_cluster = first_cluster; - } else { - *partial_cluster = 0; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + + /* reset the partial cluster if we've freed past it */ + if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) + partial->state = initial; + + /* + * If we've freed the entire extent but the beginning is not left + * cluster aligned and is not marked as ineligible for freeing we + * record the partial cluster at the beginning of the extent. It + * wasn't freed by the preceding ext4_free_blocks() call, and we + * need to look farther to the left to determine if it's to be freed + * (not shared with another extent). Else, reset the partial + * cluster - we're either done freeing or the beginning of the + * extent is left cluster aligned. + */ + if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { + if (partial->state == initial) { + partial->pclu = EXT4_B2C(sbi, pblk); + partial->lblk = from; + partial->state = tofree; } - } else - ext4_error(sbi->s_sb, "strange request: removal(2) " - "%u-%u from %u:%u", - from, to, le32_to_cpu(ex->ee_block), ee_len); + } else { + partial->state = initial; + } + return 0; } - /* * ext4_ext_rm_leaf() Removes the extents associated with the * blocks appearing between "start" and "end". Both "start" @@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - long long *partial_cluster, + struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { @@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); - *partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial->pclu = EXT4_B2C(sbi, pblk); + partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, partial_cluster, - a, b); + err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; @@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent - * we zero partial_cluster because we've reached the start of the + * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. */ - if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + if (partial->pclu != EXT4_B2C(sbi, pblk)) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial->lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial->pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial->lblk); } - *partial_cluster = 0; + partial->state = initial; } /* if this leaf is free, then we should @@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; - long long partial_cluster = 0; + struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + partial.pclu = 0; + partial.lblk = 0; + partial.state = initial; + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ @@ -2882,8 +2941,8 @@ again: */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 2; - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; } /* @@ -2911,9 +2970,10 @@ again: &ex); if (err) goto out; - if (pblk) - partial_cluster = - -(long long) EXT4_B2C(sbi, pblk); + if (pblk) { + partial.pclu = EXT4_B2C(sbi, pblk); + partial.state = nofree; + } } } /* @@ -2948,8 +3008,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, start, - end); + &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3021,21 +3080,24 @@ again: } } - trace_ext4_ext_remove_space_done(inode, start, end, depth, - partial_cluster, path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, + path->p_hdr->eh_entries); /* - * If we still have something in the partial cluster and we have removed - * even the first extent, then we should free the blocks in the partial - * cluster as well. (This code will only run when there are no leaves - * to the immediate left of the truncated/punched region.) + * if there's a partial cluster and we have removed the first extent + * in the file, then we also free the partial cluster, if any */ - if (partial_cluster > 0 && err == 0) { - /* don't zero partial_cluster since it's not used afterwards */ + if (partial.state == tofree && err == 0) { + int flags = get_default_free_blocks_flags(inode); + + if (ext4_is_pending(inode, partial.lblk)) + flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, partial_cluster), - sbi->s_cluster_ratio, - get_default_free_blocks_flags(inode)); + EXT4_C2B(sbi, partial.pclu), + sbi->s_cluster_ratio, flags); + if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) + ext4_rereserve_cluster(inode, partial.lblk); + partial.state = initial; } /* TODO: flexible tree reduction should be here */ @@ -3819,114 +3881,6 @@ out: return ext4_mark_inode_dirty(handle, inode); } -/** - * ext4_find_delalloc_range: find delayed allocated block in the given range. - * - * Return 1 if there is a delalloc block in the range, otherwise 0. - */ -int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end) -{ - struct extent_status es; - - ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); - if (es.es_len == 0) - return 0; /* there is no delay extent in this tree */ - else if (es.es_lblk <= lblk_start && - lblk_start < es.es_lblk + es.es_len) - return 1; - else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) - return 1; - else - return 0; -} - -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t lblk_start, lblk_end; - lblk_start = EXT4_LBLK_CMASK(sbi, lblk); - lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - - return ext4_find_delalloc_range(inode, lblk_start, lblk_end); -} - -/** - * Determines how many complete clusters (out of those specified by the 'map') - * are under delalloc and were reserved quota for. - * This function is called when we are writing out the blocks that were - * originally written with their allocation delayed, but then the space was - * allocated using fallocate() before the delayed allocation could be resolved. - * The cases to look for are: - * ('=' indicated delayed allocated blocks - * '-' indicates non-delayed allocated blocks) - * (a) partial clusters towards beginning and/or end outside of allocated range - * are not delalloc'ed. - * Ex: - * |----c---=|====c====|====c====|===-c----| - * |++++++ allocated ++++++| - * ==> 4 complete clusters in above example - * - * (b) partial cluster (outside of allocated range) towards either end is - * marked for delayed allocation. In this case, we will exclude that - * cluster. - * Ex: - * |----====c========|========c========| - * |++++++ allocated ++++++| - * ==> 1 complete clusters in above example - * - * Ex: - * |================c================| - * |++++++ allocated ++++++| - * ==> 0 complete clusters in above example - * - * The ext4_da_update_reserve_space will be called only if we - * determine here that there were some "entire" clusters that span - * this 'allocated' range. - * In the non-bigalloc case, this function will just end up returning num_blks - * without ever calling ext4_find_delalloc_range. - */ -static unsigned int -get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, - unsigned int num_blks) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t alloc_cluster_start, alloc_cluster_end; - ext4_lblk_t lblk_from, lblk_to, c_offset; - unsigned int allocated_clusters = 0; - - alloc_cluster_start = EXT4_B2C(sbi, lblk_start); - alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); - - /* max possible clusters for this allocation */ - allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; - - trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); - - /* Check towards left side */ - c_offset = EXT4_LBLK_COFF(sbi, lblk_start); - if (c_offset) { - lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); - lblk_to = lblk_from + c_offset - 1; - - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) - allocated_clusters--; - } - - /* Now check towards right. */ - c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); - if (allocated_clusters && c_offset) { - lblk_from = lblk_start + num_blks; - lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) - allocated_clusters--; - } - - return allocated_clusters; -} - static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -4108,23 +4062,6 @@ out: } map->m_len = allocated; - /* - * If we have done fallocate with the offset that is already - * delayed allocated, we would have block reservation - * and quota reservation done in the delayed write path. - * But fallocate would have already updated quota and block - * count for this offset. So cancel these reservation - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, map->m_len); - if (reserved_clusters) - ext4_da_update_reserve_space(inode, - reserved_clusters, - 0); - } - map_out: map->m_flags |= EXT4_MAP_MAPPED; if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { @@ -4513,77 +4450,39 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_NEW; /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. + * Reduce the reserved cluster count to reflect successful deferred + * allocation of delayed allocated clusters or direct allocation of + * clusters discovered to be delayed allocated. Once allocated, a + * cluster is not included in the reserved count. */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - /* - * Check how many clusters we had reserved this allocated range - */ - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, allocated); - if (!map_from_cluster) { - BUG_ON(allocated_clusters < reserved_clusters); - if (reserved_clusters < allocated_clusters) { - struct ext4_inode_info *ei = EXT4_I(inode); - int reservation = allocated_clusters - - reserved_clusters; - /* - * It seems we claimed few clusters outside of - * the range of this allocation. We should give - * it back to the reservation pool. This can - * happen in the following case: - * - * * Suppose s_cluster_ratio is 4 (i.e., each - * cluster has 4 blocks. Thus, the clusters - * are [0-3],[4-7],[8-11]... - * * First comes delayed allocation write for - * logical blocks 10 & 11. Since there were no - * previous delayed allocated blocks in the - * range [8-11], we would reserve 1 cluster - * for this write. - * * Next comes write for logical blocks 3 to 8. - * In this case, we will reserve 2 clusters - * (for [0-3] and [4-7]; and not for [8-11] as - * that range has a delayed allocated blocks. - * Thus total reserved clusters now becomes 3. - * * Now, during the delayed allocation writeout - * time, we will first write blocks [3-8] and - * allocate 3 clusters for writing these - * blocks. Also, we would claim all these - * three clusters above. - * * Now when we come here to writeout the - * blocks [10-11], we would expect to claim - * the reservation of 1 cluster we had made - * (and we would claim it since there are no - * more delayed allocated blocks in the range - * [8-11]. But our reserved cluster count had - * already gone to 0. - * - * Thus, at the step 4 above when we determine - * that there are still some unwritten delayed - * allocated blocks outside of our current - * block range, we should increment the - * reserved clusters count so that when the - * remaining blocks finally gets written, we - * could claim them. - */ - dquot_reserve_block(inode, - EXT4_C2B(sbi, reservation)); - spin_lock(&ei->i_block_reservation_lock); - ei->i_reserved_data_blocks += reservation; - spin_unlock(&ei->i_block_reservation_lock); - } + if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* - * We will claim quota for all newly allocated blocks. - * We're updating the reserved space *after* the - * correction above so we do not accidentally free - * all the metadata reservation because we might - * actually need it later on. + * When allocating delayed allocated clusters, simply + * reduce the reserved cluster count and claim quota */ ext4_da_update_reserve_space(inode, allocated_clusters, 1); + } else { + ext4_lblk_t lblk, len; + unsigned int n; + + /* + * When allocating non-delayed allocated clusters + * (from fallocate, filemap, DIO, or clusters + * allocated when delalloc has been disabled by + * ext4_nonda_switch), reduce the reserved cluster + * count by the number of allocated clusters that + * have previously been delayed allocated. Quota + * has been claimed by ext4_mb_new_blocks() above, + * so release the quota reservations made for any + * previously delayed allocated clusters. + */ + lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); + len = allocated_clusters << sbi->s_cluster_bits; + n = ext4_es_delayed_clu(inode, lblk, len); + if (n > 0) + ext4_da_update_reserve_space(inode, (int) n, 0); } } @@ -5075,8 +4974,10 @@ static int ext4_find_delayed_extent(struct inode *inode, ext4_lblk_t block, next_del; if (newes->es_pblk == 0) { - ext4_es_find_delayed_extent_range(inode, newes->es_lblk, - newes->es_lblk + newes->es_len - 1, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + newes->es_lblk, + newes->es_lblk + newes->es_len - 1, + &es); /* * No extent in extent-tree contains block @newes->es_pblk, @@ -5097,7 +4998,8 @@ static int ext4_find_delayed_extent(struct inode *inode, } block = newes->es_lblk + newes->es_len; - ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block, + EXT_MAX_BLOCKS, &es); if (es.es_len == 0) next_del = EXT_MAX_BLOCKS; else @@ -5958,3 +5860,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, } return replaced_count; } + +/* + * ext4_clu_mapped - determine whether any block in a logical cluster has + * been mapped to a physical cluster + * + * @inode - file containing the logical cluster + * @lclu - logical cluster of interest + * + * Returns 1 if any block in the logical cluster is mapped, signifying + * that a physical cluster has been allocated for it. Otherwise, + * returns 0. Can also return negative error codes. Derived from + * ext4_ext_map_blocks(). + */ +int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_ext_path *path; + int depth, mapped = 0, err = 0; + struct ext4_extent *extent; + ext4_lblk_t first_lblk, first_lclu, last_lclu; + + /* search for the extent closest to the first block in the cluster */ + path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out; + } + + depth = ext_depth(inode); + + /* + * A consistent leaf must not be empty. This situation is possible, + * though, _during_ tree modification, and it's why an assert can't + * be put in ext4_find_extent(). + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, + "bad extent address - lblock: %lu, depth: %d, pblock: %lld", + (unsigned long) EXT4_C2B(sbi, lclu), + depth, path[depth].p_block); + err = -EFSCORRUPTED; + goto out; + } + + extent = path[depth].p_ext; + + /* can't be mapped if the extent tree is empty */ + if (extent == NULL) + goto out; + + first_lblk = le32_to_cpu(extent->ee_block); + first_lclu = EXT4_B2C(sbi, first_lblk); + + /* + * Three possible outcomes at this point - found extent spanning + * the target cluster, to the left of the target cluster, or to the + * right of the target cluster. The first two cases are handled here. + * The last case indicates the target cluster is not mapped. + */ + if (lclu >= first_lclu) { + last_lclu = EXT4_B2C(sbi, first_lblk + + ext4_ext_get_actual_len(extent) - 1); + if (lclu <= last_lclu) { + mapped = 1; + } else { + first_lblk = ext4_ext_next_allocated_block(path); + first_lclu = EXT4_B2C(sbi, first_lblk); + if (lclu == first_lclu) + mapped = 1; + } + } + +out: + ext4_ext_drop_refs(path); + kfree(path); + + return err ? err : mapped; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c4e6fb15101b..2b439afafe13 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -142,6 +142,7 @@ */ static struct kmem_cache *ext4_es_cachep; +static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, @@ -149,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); int __init ext4_init_es(void) { @@ -233,30 +236,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering - * @es->lblk if it exists, otherwise, the next extent after @es->lblk. + * ext4_es_find_extent_range - find extent with specified status within block + * range or next extent following block range in + * extents status tree * - * @inode: the inode which owns delayed extents - * @lblk: the offset where we start to search - * @end: the offset where we stop to search - * @es: delayed extent that we found + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * @es - extent found, if any + * + * Find the first extent within the block range specified by @lblk and @end + * in the extents status tree that satisfies @matching_fn. If a match + * is found, it's returned in @es. If not, and a matching extent is found + * beyond the block range, it's returned in @es. If no match is found, an + * extent is returned in @es whose es_lblk, es_len, and es_pblk components + * are 0. */ -void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es) +static void __es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; - BUG_ON(es == NULL); - BUG_ON(end < lblk); - trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + WARN_ON(es == NULL); + WARN_ON(end < lblk); - read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - /* find extent in cache firstly */ + /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; if (tree->cache_es) { es1 = tree->cache_es; @@ -271,28 +282,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, es1 = __es_tree_search(&tree->root, lblk); out: - if (es1 && !ext4_es_is_delayed(es1)) { + if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } - if (ext4_es_is_delayed(es1)) + if (matching_fn(es1)) break; } } - if (es1 && ext4_es_is_delayed(es1)) { + if (es1 && matching_fn(es1)) { tree->cache_es = es1; es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; } +} + +/* + * Locking for __es_find_extent_range() for external use + */ +void ext4_es_find_extent_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + trace_ext4_es_find_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + __es_find_extent_range(inode, matching_fn, lblk, end, es); + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_extent_range_exit(inode, es); +} + +/* + * __es_scan_range - search block range for block with specified status + * in extents status tree + * + * @inode - file containing the range + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block defining start of range + * @end - logical block defining end of range + * + * Returns true if at least one block in the specified block range satisfies + * the criterion specified by @matching_fn, and false if not. If at least + * one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. + */ +static bool __es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t start, ext4_lblk_t end) +{ + struct extent_status es; + + __es_find_extent_range(inode, matching_fn, start, end, &es); + if (es.es_len == 0) + return false; /* no matching extent in the tree */ + else if (es.es_lblk <= start && + start < es.es_lblk + es.es_len) + return true; + else if (start <= es.es_lblk && es.es_lblk <= end) + return true; + else + return false; +} +/* + * Locking for __es_scan_range() for external use + */ +bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end) +{ + bool ret; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_range(inode, matching_fn, lblk, end); read_unlock(&EXT4_I(inode)->i_es_lock); - trace_ext4_es_find_delayed_extent_range_exit(inode, es); + return ret; +} + +/* + * __es_scan_clu - search cluster for block with specified status in + * extents status tree + * + * @inode - file containing the cluster + * @matching_fn - pointer to function that matches extents with desired status + * @lblk - logical block in cluster to be searched + * + * Returns true if at least one extent in the cluster containing @lblk + * satisfies the criterion specified by @matching_fn, and false if not. If at + * least one extent has the specified status, then there is at least one block + * in the cluster with that status. Should only be called by code that has + * taken i_es_lock. + */ +static bool __es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); +} + +/* + * Locking for __es_scan_clu() for external use + */ +bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk) +{ + bool ret; + + read_lock(&EXT4_I(inode)->i_es_lock); + ret = __es_scan_clu(inode, matching_fn, lblk); + read_unlock(&EXT4_I(inode)->i_es_lock); + + return ret; } static void ext4_es_list_add(struct inode *inode) @@ -694,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); @@ -730,6 +847,11 @@ retry: if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; + if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && + (status & EXTENT_STATUS_WRITTEN || + status & EXTENT_STATUS_UNWRITTEN)) + __revise_pending(inode, lblk, len); + error: write_unlock(&EXT4_I(inode)->i_es_lock); @@ -1252,3 +1374,499 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) ei->i_es_tree.cache_es = NULL; return nr_shrunk; } + +#ifdef ES_DEBUG__ +static void ext4_print_pending_tree(struct inode *inode) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr; + + printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_pending_tree; + node = rb_first(&tree->root); + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + printk(KERN_DEBUG " %u", pr->lclu); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_print_pending_tree(inode) +#endif + +int __init ext4_init_pending(void) +{ + ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", + sizeof(struct pending_reservation), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); + if (ext4_pending_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_pending(void) +{ + kmem_cache_destroy(ext4_pending_cachep); +} + +void ext4_init_pending_tree(struct ext4_pending_tree *tree) +{ + tree->root = RB_ROOT; +} + +/* + * __get_pending - retrieve a pointer to a pending reservation + * + * @inode - file containing the pending cluster reservation + * @lclu - logical cluster of interest + * + * Returns a pointer to a pending reservation if it's a member of + * the set, and NULL if not. Must be called holding i_es_lock. + */ +static struct pending_reservation *__get_pending(struct inode *inode, + ext4_lblk_t lclu) +{ + struct ext4_pending_tree *tree; + struct rb_node *node; + struct pending_reservation *pr = NULL; + + tree = &EXT4_I(inode)->i_pending_tree; + node = (&tree->root)->rb_node; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else if (lclu == pr->lclu) + return pr; + } + return NULL; +} + +/* + * __insert_pending - adds a pending cluster reservation to the set of + * pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster to be added + * + * Returns 0 on successful insertion and -ENOMEM on failure. If the + * pending reservation is already in the set, returns successfully. + */ +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct pending_reservation *pr; + ext4_lblk_t lclu; + int ret = 0; + + lclu = EXT4_B2C(sbi, lblk); + /* search to find parent for insertion */ + while (*p) { + parent = *p; + pr = rb_entry(parent, struct pending_reservation, rb_node); + + if (lclu < pr->lclu) { + p = &(*p)->rb_left; + } else if (lclu > pr->lclu) { + p = &(*p)->rb_right; + } else { + /* pending reservation already inserted */ + goto out; + } + } + + pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + if (pr == NULL) { + ret = -ENOMEM; + goto out; + } + pr->lclu = lclu; + + rb_link_node(&pr->rb_node, parent, p); + rb_insert_color(&pr->rb_node, &tree->root); + +out: + return ret; +} + +/* + * __remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Returns successfully if pending reservation is not a member of the set. + */ +static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } +} + +/* + * ext4_remove_pending - removes a pending cluster reservation from the set + * of pending reservations + * + * @inode - file containing the cluster + * @lblk - logical block in the pending cluster reservation to be removed + * + * Locking for external use of __remove_pending. + */ +void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + write_lock(&ei->i_es_lock); + __remove_pending(inode, lblk); + write_unlock(&ei->i_es_lock); +} + +/* + * ext4_is_pending - determine whether a cluster has a pending reservation + * on it + * + * @inode - file containing the cluster + * @lblk - logical block in the cluster + * + * Returns true if there's a pending reservation for the cluster in the + * set of pending reservations, and false if not. + */ +bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + bool ret; + + read_lock(&ei->i_es_lock); + ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); + read_unlock(&ei->i_es_lock); + + return ret; +} + +/* + * ext4_es_insert_delayed_block - adds a delayed block to the extents status + * tree, adding a pending reservation where + * needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * @allocated - indicates whether a physical cluster has been allocated for + * the logical cluster that contains the block + * + * Returns 0 on success, negative error code on failure. + */ +int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated) +{ + struct extent_status newes; + int err = 0; + + es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", + lblk, inode->i_ino); + + newes.es_lblk = lblk; + newes.es_len = 1; + ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); + trace_ext4_es_insert_delayed_block(inode, &newes, allocated); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + + err = __es_remove_extent(inode, lblk, lblk); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) + goto retry; + if (err != 0) + goto error; + + if (allocated) + __insert_pending(inode, lblk); + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + ext4_print_pending_tree(inode); + + return err; +} + +/* + * __es_delayed_clu - count number of clusters containing blocks that + * are delayed only + * + * @inode - file containing block range + * @start - logical block defining start of range + * @end - logical block defining end of range + * + * Returns the number of clusters containing only delayed (not delayed + * and unwritten) blocks in the range specified by @start and @end. Any + * cluster or part of a cluster within the range and containing a delayed + * and not unwritten block within the range is counted as a whole cluster. + */ +static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + unsigned long long last_counted_lclu; + unsigned int n = 0; + + /* guaranteed to be unequal to any ext4_lblk_t value */ + last_counted_lclu = ~0ULL; + + es = __es_tree_search(&tree->root, start); + + while (es && (es->es_lblk <= end)) { + if (ext4_es_is_delonly(es)) { + if (es->es_lblk <= start) + first_lclu = EXT4_B2C(sbi, start); + else + first_lclu = EXT4_B2C(sbi, es->es_lblk); + + if (ext4_es_end(es) >= end) + last_lclu = EXT4_B2C(sbi, end); + else + last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); + + if (first_lclu == last_counted_lclu) + n += last_lclu - first_lclu; + else + n += last_lclu - first_lclu + 1; + last_counted_lclu = last_lclu; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + + return n; +} + +/* + * ext4_es_delayed_clu - count number of clusters containing blocks that + * are both delayed and unwritten + * + * @inode - file containing block range + * @lblk - logical block defining start of range + * @len - number of blocks in range + * + * Locking for external use of __es_delayed_clu(). + */ +unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_lblk_t end; + unsigned int n; + + if (len == 0) + return 0; + + end = lblk + len - 1; + WARN_ON(end < lblk); + + read_lock(&ei->i_es_lock); + + n = __es_delayed_clu(inode, lblk, end); + + read_unlock(&ei->i_es_lock); + + return n; +} + +/* + * __revise_pending - makes, cancels, or leaves unchanged pending cluster + * reservations for a specified block range depending + * upon the presence or absence of delayed blocks + * outside the range within clusters at the ends of the + * range + * + * @inode - file containing the range + * @lblk - logical block defining the start of range + * @len - length of range in blocks + * + * Used after a newly allocated extent is added to the extents status tree. + * Requires that the extents in the range have either written or unwritten + * status. Must be called while holding i_es_lock. + */ +static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t end = lblk + len - 1; + ext4_lblk_t first, last; + bool f_del = false, l_del = false; + + if (len == 0) + return; + + /* + * Two cases - block range within single cluster and block range + * spanning two or more clusters. Note that a cluster belonging + * to a range starting and/or ending on a cluster boundary is treated + * as if it does not contain a delayed extent. The new range may + * have allocated space for previously delayed blocks out to the + * cluster boundary, requiring that any pre-existing pending + * reservation be canceled. Because this code only looks at blocks + * outside the range, it should revise pending reservations + * correctly even if the extent represented by the range can't be + * inserted in the extents status tree due to ENOSPC. + */ + + if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) { + __insert_pending(inode, first); + } else { + last = EXT4_LBLK_CMASK(sbi, end) + + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, + &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } + } else { + first = EXT4_LBLK_CMASK(sbi, lblk); + if (first != lblk) + f_del = __es_scan_range(inode, &ext4_es_is_delonly, + first, lblk - 1); + if (f_del) + __insert_pending(inode, first); + else + __remove_pending(inode, first); + + last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; + if (last != end) + l_del = __es_scan_range(inode, &ext4_es_is_delonly, + end + 1, last); + if (l_del) + __insert_pending(inode, last); + else + __remove_pending(inode, last); + } +} + +/* + * ext4_es_remove_blks - remove block range from extents status tree and + * reduce reservation count or cancel pending + * reservation as needed + * + * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + */ +void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int clu_size, reserved = 0; + ext4_lblk_t last_lclu, first, length, remainder, last; + bool delonly; + int err = 0; + struct pending_reservation *pr; + struct ext4_pending_tree *tree; + + /* + * Process cluster by cluster for bigalloc - there may be up to + * two clusters in a 4k page with a 1k block size and two blocks + * per cluster. Also necessary for systems with larger page sizes + * and potentially larger block sizes. + */ + clu_size = sbi->s_cluster_ratio; + last_lclu = EXT4_B2C(sbi, lblk + len - 1); + + write_lock(&EXT4_I(inode)->i_es_lock); + + for (first = lblk, remainder = len; + remainder > 0; + first += length, remainder -= length) { + + if (EXT4_B2C(sbi, first) == last_lclu) + length = remainder; + else + length = clu_size - EXT4_LBLK_COFF(sbi, first); + + /* + * The BH_Delay flag, which triggers calls to this function, + * and the contents of the extents status tree can be + * inconsistent due to writepages activity. So, note whether + * the blocks to be removed actually belong to an extent with + * delayed only status. + */ + delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); + + /* + * because of the writepages effect, written and unwritten + * blocks could be removed here + */ + last = first + length - 1; + err = __es_remove_extent(inode, first, last); + if (err) + ext4_warning(inode->i_sb, + "%s: couldn't remove page (err = %d)", + __func__, err); + + /* non-bigalloc case: simply count the cluster for release */ + if (sbi->s_cluster_ratio == 1 && delonly) { + reserved++; + continue; + } + + /* + * bigalloc case: if all delayed allocated only blocks have + * just been removed from a cluster, either cancel a pending + * reservation if it exists or count a cluster for release + */ + if (delonly && + !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { + pr = __get_pending(inode, EXT4_B2C(sbi, first)); + if (pr != NULL) { + tree = &EXT4_I(inode)->i_pending_tree; + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + } else { + reserved++; + } + } + } + + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_da_release_space(inode, reserved); +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8efdeb903d6b..131a8b7df265 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -78,6 +78,51 @@ struct ext4_es_stats { struct percpu_counter es_stats_shk_cnt; }; +/* + * Pending cluster reservations for bigalloc file systems + * + * A cluster with a pending reservation is a logical cluster shared by at + * least one extent in the extents status tree with delayed and unwritten + * status and at least one other written or unwritten extent. The + * reservation is said to be pending because a cluster reservation would + * have to be taken in the event all blocks in the cluster shared with + * written or unwritten extents were deleted while the delayed and + * unwritten blocks remained. + * + * The set of pending cluster reservations is an auxiliary data structure + * used with the extents status tree to implement reserved cluster/block + * accounting for bigalloc file systems. The set is kept in memory and + * records all pending cluster reservations. + * + * Its primary function is to avoid the need to read extents from the + * disk when invalidating pages as a result of a truncate, punch hole, or + * collapse range operation. Page invalidation requires a decrease in the + * reserved cluster count if it results in the removal of all delayed + * and unwritten extents (blocks) from a cluster that is not shared with a + * written or unwritten extent, and no decrease otherwise. Determining + * whether the cluster is shared can be done by searching for a pending + * reservation on it. + * + * Secondarily, it provides a potentially faster method for determining + * whether the reserved cluster count should be increased when a physical + * cluster is deallocated as a result of a truncate, punch hole, or + * collapse range operation. The necessary information is also present + * in the extents status tree, but might be more rapidly accessed in + * the pending reservation set in many cases due to smaller size. + * + * The pending cluster reservation set is implemented as a red-black tree + * with the goal of minimizing per page search time overhead. + */ + +struct pending_reservation { + struct rb_node rb_node; + ext4_lblk_t lclu; +}; + +struct ext4_pending_tree { + struct rb_root root; +}; + extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); @@ -90,11 +135,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, unsigned int status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_find_delayed_extent_range(struct inode *inode, - ext4_lblk_t lblk, ext4_lblk_t end, - struct extent_status *es); +extern void ext4_es_find_extent_range(struct inode *inode, + int (*match_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern bool ext4_es_scan_range(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk, ext4_lblk_t end); +extern bool ext4_es_scan_clu(struct inode *inode, + int (*matching_fn)(struct extent_status *es), + ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { @@ -126,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es) return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } +static inline int ext4_es_is_mapped(struct extent_status *es) +{ + return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); +} + +static inline int ext4_es_is_delonly(struct extent_status *es) +{ + return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); +} + static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; @@ -175,4 +237,16 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); +extern int __init ext4_init_pending(void); +extern void ext4_exit_pending(void); +extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); +extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); +extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); +extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); + #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7b4736022761..9c4bac18cc6c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; - int retries; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d767e993591d..c3d9a42c561e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -701,8 +701,8 @@ found: EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && - ext4_find_delalloc_range(inode, map->m_lblk, - map->m_lblk + map->m_len - 1)) + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode) return 0; /* success */ } -static void ext4_da_release_space(struct inode *inode, int to_free) +void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); @@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page, unsigned int offset, unsigned int length) { - int to_release = 0, contiguous_blks = 0; + int contiguous_blks = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int stop = offset + length; - int num_clusters; ext4_fsblk_t lblk; BUG_ON(stop > PAGE_SIZE || stop < length); @@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page, break; if ((offset <= curr_off) && (buffer_delay(bh))) { - to_release++; contiguous_blks++; clear_buffer_delay(bh); } else if (contiguous_blks) { @@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page, (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); contiguous_blks = 0; } curr_off = next_off; @@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page, if (contiguous_blks) { lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_extent(inode, lblk, contiguous_blks); + ext4_es_remove_blks(inode, lblk, contiguous_blks); } - /* If we have released all the blocks belonging to a cluster, then we - * need to release the reserved space for that cluster. */ - num_clusters = EXT4_NUM_B2C(sbi, to_release); - while (num_clusters > 0) { - lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + - ((num_clusters - 1) << sbi->s_cluster_bits); - if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk)) - ext4_da_release_space(inode, 1); - - num_clusters--; - } } /* @@ -1781,6 +1766,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) } /* + * ext4_insert_delayed_block - adds a delayed block to the extents status + * tree, incrementing the reserved cluster/block + * count or making a pending reservation + * where needed + * + * @inode - file containing the newly added block + * @lblk - logical block to be added + * + * Returns 0 on success, negative error code on failure. + */ +static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int ret; + bool allocated = false; + + /* + * If the cluster containing lblk is shared with a delayed, + * written, or unwritten extent in a bigalloc file system, it's + * already been accounted for and does not need to be reserved. + * A pending reservation must be made for the cluster if it's + * shared with a written or unwritten extent and doesn't already + * have one. Written and unwritten extents can be purged from the + * extents status tree if the system is under memory pressure, so + * it's necessary to examine the extent tree if a search of the + * extents status tree doesn't get a match. + */ + if (sbi->s_cluster_ratio == 1) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { /* bigalloc */ + if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { + if (!ext4_es_scan_clu(inode, + &ext4_es_is_mapped, lblk)) { + ret = ext4_clu_mapped(inode, + EXT4_B2C(sbi, lblk)); + if (ret < 0) + goto errout; + if (ret == 0) { + ret = ext4_da_reserve_space(inode); + if (ret != 0) /* ENOSPC */ + goto errout; + } else { + allocated = true; + } + } else { + allocated = true; + } + } + } + + ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + +errout: + return ret; +} + +/* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write * time. This function looks up the requested blocks and sets the @@ -1859,28 +1903,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, add_delayed: if (retval == 0) { int ret; + /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* - * If the block was allocated from previously allocated cluster, - * then we don't need to reserve it again. However we still need - * to reserve metadata for every block we're going to write. - */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, map->m_lblk)) { - ret = ext4_da_reserve_space(inode); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } - } - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - ~0, EXTENT_STATUS_DELAYED); - if (ret) { + ret = ext4_insert_delayed_block(inode, map->m_lblk); + if (ret != 0) { retval = ret; goto out_unlock; } @@ -3450,7 +3480,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, ext4_lblk_t end = map.m_lblk + map.m_len - 1; struct extent_status es; - ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map.m_lblk, end, &es); if (!es.es_len || es.es_lblk > end) { /* entire range is a hole */ @@ -6153,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_fault *vmf) +vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; - int ret; + int err; + vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; @@ -6172,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf) down_read(&EXT4_I(inode)->i_mmap_sem); - ret = ext4_convert_inline_data(inode); - if (ret) + err = ext4_convert_inline_data(inode); + if (err) goto out_ret; /* Delalloc case is easy... */ @@ -6181,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf) !ext4_should_journal_data(inode) && !ext4_nonda_switch(inode->i_sb)) { do { - ret = block_page_mkwrite(vma, vmf, + err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); - } while (ret == -ENOSPC && + } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } @@ -6228,8 +6260,8 @@ retry_alloc: ret = VM_FAULT_SIGBUS; goto out; } - ret = block_page_mkwrite(vma, vmf, get_block); - if (!ret && ext4_should_journal_data(inode)) { + err = block_page_mkwrite(vma, vmf, get_block); + if (!err && ext4_should_journal_data(inode)) { if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); @@ -6240,24 +6272,24 @@ retry_alloc: ext4_set_inode_state(inode, EXT4_STATE_JDATA); } ext4_journal_stop(handle); - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(ret); + ret = block_page_mkwrite_return(err); out: up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } -int ext4_filemap_fault(struct vm_fault *vmf) +vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - int err; + vm_fault_t ret; down_read(&EXT4_I(inode)->i_mmap_sem); - err = filemap_fault(vmf); + ret = filemap_fault(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); - return err; + return ret; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a7074115d6f6..0edee31913d1 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); - swap(inode1->i_flags, inode2->i_flags); swap(inode1->i_version, inode2->i_version); swap(inode1->i_blocks, inode2->i_blocks); swap(inode1->i_bytes, inode2->i_bytes); @@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) i_size_write(inode2, isize); } +static void reset_inode_seed(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + __u32 csum; + + if (!ext4_has_metadata_csum(inode->i_sb)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); +} + /** * Swap the information from the given @inode and the inode * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other @@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb, struct inode *inode_bl; struct ext4_inode_info *ei_bl; - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + ext4_has_inline_data(inode)) return -EINVAL; - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); @@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); - truncate_inode_pages(&inode->i_data, 0); - truncate_inode_pages(&inode_bl->i_data, 0); - /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; @@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_generation = prandom_u32(); inode_bl->i_generation = prandom_u32(); + reset_inode_seed(inode); + reset_inode_seed(inode_bl); ext4_discard_preallocations(inode); @@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb, inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); } else { err = ext4_mark_inode_dirty(handle, inode_bl); if (err < 0) { @@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb, /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); } } ext4_journal_stop(handle); @@ -339,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; - err = mnt_want_write_file(filp); - if (err) - return err; - err = -EPERM; - inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) - goto out_unlock; + return err; err = ext4_get_inode_loc(inode, &iloc); if (err) - goto out_unlock; + return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { @@ -359,20 +375,20 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) - goto out_unlock; + return err; } else { brelse(iloc.bh); } - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) + return err; handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out_unlock; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) @@ -400,9 +416,6 @@ out_dirty: err = rc; out_stop: ext4_journal_stop(handle); -out_unlock: - inode_unlock(inode); - mnt_drop_write_file(filp); return err; } #else @@ -626,6 +639,30 @@ group_add_out: return err; } +static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid) + return -EINVAL; + + if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1025,19 +1062,19 @@ resizefs_out: return err; inode_lock(inode); + err = ext4_ioctl_check_project(inode, &fa); + if (err) + goto out; flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); err = ext4_ioctl_setflags(inode, flags); - inode_unlock(inode); - mnt_drop_write_file(filp); if (err) - return err; - + goto out; err = ext4_ioctl_setproject(filp, fa.fsx_projid); - if (err) - return err; - - return 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..e2248083cdca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4915,9 +4915,17 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + /* + * on a bigalloc file system, defer the s_freeclusters_counter + * update to the caller (ext4_remove_space and friends) so they + * can determine if a cluster freed here should be rereserved + */ + if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, + count_clusters); + } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a409ff70d67b..2f5be02fc6f6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_eof < orig_start + *len - 1) + if (orig_eof <= orig_start) + *len = 0; + else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; - if (donor_eof < donor_start + *len - 1) + if (donor_eof <= donor_start) + *len = 0; + else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 377d516c475f..67a38532032a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2261,7 +2261,7 @@ again: dxroot->info.indirect_levels += 1; dxtrace(printk(KERN_DEBUG "Creating %d level index...\n", - info->indirect_levels)); + dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1145109968ef..a221f1cdf704 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb) for (type = 0; type < EXT4_MAXQUOTAS; type++) ext4_quota_off(sb, type); } + +/* + * This is a helper function which is used in the mount/remount + * codepaths (which holds s_umount) to fetch the quota file name. + */ +static inline char *get_qf_name(struct super_block *sb, + struct ext4_sb_info *sbi, + int type) +{ + return rcu_dereference_protected(sbi->s_qf_names[type], + lockdep_is_held(&sb->s_umount)); +} #else static inline void ext4_quota_off_umount(struct super_block *sb) { @@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb) percpu_free_rwsem(&sbi->s_journal_flag_rwsem); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list @@ -1040,6 +1052,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_da_metadata_calc_len = 0; ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); @@ -1530,11 +1543,10 @@ static const char deprecated_msg[] = static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *qname; + char *qname, *old_qname = get_qf_name(sb, sbi, qtype); int ret = -1; - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && !old_qname) { ext4_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); @@ -1551,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "Not enough memory for storing quotafile name"); return -1; } - if (sbi->s_qf_names[qtype]) { - if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + if (old_qname) { + if (strcmp(old_qname, qname) == 0) ret = 1; else ext4_msg(sb, KERN_ERR, @@ -1565,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quotafile must be on filesystem root"); goto errout; } - sbi->s_qf_names[qtype] = qname; + rcu_assign_pointer(sbi->s_qf_names[qtype], qname); set_opt(sb, QUOTA); return 1; errout: @@ -1577,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype) { struct ext4_sb_info *sbi = EXT4_SB(sb); + char *old_qname = get_qf_name(sb, sbi, qtype); - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && old_qname) { ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); return -1; } - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); + synchronize_rcu(); + kfree(old_qname); return 1; } #endif @@ -1960,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *p; + char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -1991,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb, "Cannot enable project quota enforcement."); return 0; } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); + grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); + if (usr_qf_name || grp_qf_name) { + if (test_opt(sb, USRQUOTA) && usr_qf_name) clear_opt(sb, USRQUOTA); - if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && grp_qf_name) clear_opt(sb, GRPQUOTA); if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { @@ -2029,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); + char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; @@ -2047,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq, seq_printf(seq, ",jqfmt=%s", fmtname); } - if (sbi->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + rcu_read_lock(); + usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); + grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); + if (usr_qf_name) + seq_show_option(seq, "usrjquota", usr_qf_name); + if (grp_qf_name) + seq_show_option(seq, "grpjquota", grp_qf_name); + rcu_read_unlock(); #endif } @@ -5103,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int err = 0; #ifdef CONFIG_QUOTA int i, j; + char *to_free[EXT4_MAXQUOTAS]; #endif char *orig_data = kstrdup(data, GFP_KERNEL); @@ -5122,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { - old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); + char *qf_name = get_qf_name(sb, sbi, i); + + old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); @@ -5352,9 +5373,12 @@ restore_opts: #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + to_free[i] = get_qf_name(sb, sbi, i); + rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } + synchronize_rcu(); + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(to_free[i]); #endif kfree(orig_data); return err; @@ -5545,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type) */ static int ext4_quota_on_mount(struct super_block *sb, int type) { - return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), EXT4_SB(sb)->s_jquota_fmt, type); } @@ -5954,6 +5978,10 @@ static int __init ext4_init_fs(void) if (err) return err; + err = ext4_init_pending(); + if (err) + goto out6; + err = ext4_init_pageio(); if (err) goto out5; @@ -5992,6 +6020,8 @@ out3: out4: ext4_exit_pageio(); out5: + ext4_exit_pending(); +out6: ext4_exit_es(); return err; @@ -6009,6 +6039,7 @@ static void __exit ext4_exit_fs(void) ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_es(); + ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c125d662777c..26f8d7e46462 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -251,8 +251,8 @@ restart: bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -333,8 +333,8 @@ restart2: jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 96225a77c112..7b73ef7f902d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -242,7 +242,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); /* Convert errno to return value from ->page_mkwrite() call */ -static inline int block_page_mkwrite_return(int err) +static inline vm_fault_t block_page_mkwrite_return(int err) { if (err == 0) return VM_FAULT_LOCKED; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0e31eb136c57..698e0d8a5ca4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -17,6 +17,7 @@ struct mpage_da_data; struct ext4_map_blocks; struct extent_status; struct ext4_fsmap; +struct partial_cluster; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -2035,21 +2036,23 @@ TRACE_EVENT(ext4_ext_show_extent, ); TRACE_EVENT(ext4_remove_blocks, - TP_PROTO(struct inode *inode, struct ext4_extent *ex, - ext4_lblk_t from, ext4_fsblk_t to, - long long partial_cluster), + TP_PROTO(struct inode *inode, struct ext4_extent *ex, + ext4_lblk_t from, ext4_fsblk_t to, + struct partial_cluster *pc), - TP_ARGS(inode, ex, from, to, partial_cluster), + TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) - __field( long long, partial ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state) ), TP_fast_assign( @@ -2057,14 +2060,16 @@ TRACE_EVENT(ext4_remove_blocks, __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; - __entry->partial = partial_cluster; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" - "from %u to %u partial_cluster %lld", + "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->ee_lblk, @@ -2072,45 +2077,53 @@ TRACE_EVENT(ext4_remove_blocks, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, - (long long) __entry->partial) + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, - long long partial_cluster), + struct partial_cluster *pc), - TP_ARGS(inode, start, ex, partial_cluster), + TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( long long, partial ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->partial = partial_cluster; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" - "partial_cluster %lld", + "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, - (long long) __entry->partial) + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, @@ -2168,9 +2181,9 @@ TRACE_EVENT(ext4_ext_remove_space, TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, - int depth, long long partial, __le16 eh_entries), + int depth, struct partial_cluster *pc, __le16 eh_entries), - TP_ARGS(inode, start, end, depth, partial, eh_entries), + TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( dev_t, dev ) @@ -2178,7 +2191,9 @@ TRACE_EVENT(ext4_ext_remove_space_done, __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) - __field( long long, partial ) + __field( ext4_fsblk_t, pc_pclu ) + __field( ext4_lblk_t, pc_lblk ) + __field( int, pc_state ) __field( unsigned short, eh_entries ) ), @@ -2188,18 +2203,23 @@ TRACE_EVENT(ext4_ext_remove_space_done, __entry->start = start; __entry->end = end; __entry->depth = depth; - __entry->partial = partial; + __entry->pc_pclu = pc->pclu; + __entry->pc_lblk = pc->lblk; + __entry->pc_state = pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), - TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld " + TP_printk("dev %d,%d ino %lu since %u end %u depth %d " + "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, - (long long) __entry->partial, + (long long) __entry->pc_pclu, + (unsigned int) __entry->pc_lblk, + (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); @@ -2270,7 +2290,7 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->lblk, __entry->len) ); -TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, +TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), @@ -2292,7 +2312,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, (unsigned long) __entry->ino, __entry->lblk) ); -TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, +TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), @@ -2512,6 +2532,41 @@ TRACE_EVENT(ext4_es_shrink, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); +TRACE_EVENT(ext4_es_insert_delayed_block, + TP_PROTO(struct inode *inode, struct extent_status *es, + bool allocated), + + TP_ARGS(inode, es, allocated), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, lblk ) + __field( ext4_lblk_t, len ) + __field( ext4_fsblk_t, pblk ) + __field( char, status ) + __field( bool, allocated ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->lblk = es->es_lblk; + __entry->len = es->es_len; + __entry->pblk = ext4_es_pblock(es); + __entry->status = ext4_es_status(es); + __entry->allocated = allocated; + ), + + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " + "allocated %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->lblk, __entry->len, + __entry->pblk, show_extent_status(__entry->status), + __entry->allocated) +); + /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, |