summaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/procfs-diskstats10
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu24
-rw-r--r--Documentation/RCU/Design/Data-Structures/Data-Structures.html118
-rw-r--r--Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html22
-rw-r--r--Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-cleanup.svg123
-rw-r--r--Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-1.svg16
-rw-r--r--Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-3.svg56
-rw-r--r--Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg237
-rw-r--r--Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg12
-rw-r--r--Documentation/RCU/stallwarn.txt24
-rw-r--r--Documentation/RCU/whatisRCU.txt18
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst92
-rw-r--r--Documentation/admin-guide/index.rst9
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt89
-rw-r--r--Documentation/admin-guide/l1tf.rst610
-rw-r--r--Documentation/block/null_blk.txt7
-rw-r--r--Documentation/block/stat.txt28
-rw-r--r--Documentation/conf.py2
-rw-r--r--Documentation/core-api/atomic_ops.rst2
-rw-r--r--Documentation/device-mapper/writecache.txt2
-rw-r--r--Documentation/devicetree/bindings/arm/samsung/samsung-boards.txt2
-rw-r--r--Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt2
-rw-r--r--Documentation/devicetree/bindings/gpio/nintendo,hollywood-gpio.txt2
-rw-r--r--Documentation/devicetree/bindings/hwmon/npcm750-pwm-fan.txt84
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/hideep.txt2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.txt1
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra20-ictlr.txt2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt2
-rw-r--r--Documentation/devicetree/bindings/mips/brcm/soc.txt2
-rw-r--r--Documentation/devicetree/bindings/net/fsl-fman.txt2
-rw-r--r--Documentation/devicetree/bindings/phy/phy-ath79-usb.txt4
-rw-r--r--Documentation/devicetree/bindings/power/power_domain.txt2
-rw-r--r--Documentation/devicetree/bindings/regulator/tps65090.txt2
-rw-r--r--Documentation/devicetree/bindings/reset/st,sti-softreset.txt2
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,apq8016-sbc.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,apq8096.txt2
-rw-r--r--Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt34
-rw-r--r--Documentation/devicetree/bindings/usb/rockchip,dwc3.txt3
-rw-r--r--Documentation/devicetree/bindings/w1/w1-gpio.txt2
-rw-r--r--Documentation/features/sched/membarrier-sync-core/arch-support.txt8
-rw-r--r--Documentation/filesystems/Locking2
-rw-r--r--Documentation/filesystems/cifs/CHANGES1072
-rw-r--r--Documentation/filesystems/cifs/README22
-rw-r--r--Documentation/filesystems/ext4/ext4.rst (renamed from Documentation/filesystems/ext4.txt)142
-rw-r--r--Documentation/filesystems/ext4/index.rst17
-rw-r--r--Documentation/filesystems/ext4/ondisk/about.rst44
-rw-r--r--Documentation/filesystems/ext4/ondisk/allocators.rst56
-rw-r--r--Documentation/filesystems/ext4/ondisk/attributes.rst191
-rw-r--r--Documentation/filesystems/ext4/ondisk/bigalloc.rst22
-rw-r--r--Documentation/filesystems/ext4/ondisk/bitmaps.rst28
-rw-r--r--Documentation/filesystems/ext4/ondisk/blockgroup.rst135
-rw-r--r--Documentation/filesystems/ext4/ondisk/blockmap.rst49
-rw-r--r--Documentation/filesystems/ext4/ondisk/blocks.rst142
-rw-r--r--Documentation/filesystems/ext4/ondisk/checksums.rst73
-rw-r--r--Documentation/filesystems/ext4/ondisk/directory.rst426
-rw-r--r--Documentation/filesystems/ext4/ondisk/dynamic.rst12
-rw-r--r--Documentation/filesystems/ext4/ondisk/eainode.rst18
-rw-r--r--Documentation/filesystems/ext4/ondisk/globals.rst13
-rw-r--r--Documentation/filesystems/ext4/ondisk/group_descr.rst170
-rw-r--r--Documentation/filesystems/ext4/ondisk/ifork.rst194
-rw-r--r--Documentation/filesystems/ext4/ondisk/index.rst9
-rw-r--r--Documentation/filesystems/ext4/ondisk/inlinedata.rst37
-rw-r--r--Documentation/filesystems/ext4/ondisk/inodes.rst575
-rw-r--r--Documentation/filesystems/ext4/ondisk/journal.rst611
-rw-r--r--Documentation/filesystems/ext4/ondisk/mmp.rst77
-rw-r--r--Documentation/filesystems/ext4/ondisk/overview.rst26
-rw-r--r--Documentation/filesystems/ext4/ondisk/special_inodes.rst38
-rw-r--r--Documentation/filesystems/ext4/ondisk/super.rst801
-rw-r--r--Documentation/filesystems/index.rst33
-rw-r--r--Documentation/filesystems/porting20
-rw-r--r--Documentation/filesystems/vfs.txt18
-rw-r--r--Documentation/filesystems/xfs.txt4
-rw-r--r--Documentation/hwmon/max3444016
-rw-r--r--Documentation/hwmon/mlxreg-fan60
-rw-r--r--Documentation/hwmon/npcm750-pwm-fan22
-rw-r--r--Documentation/hwmon/sysfs-interface48
-rw-r--r--Documentation/index.rst11
-rw-r--r--Documentation/iostats.txt15
-rw-r--r--Documentation/kbuild/kbuild.txt17
-rw-r--r--Documentation/kbuild/kconfig.txt51
-rw-r--r--Documentation/kprobes.txt35
-rw-r--r--Documentation/memory-barriers.txt43
-rw-r--r--Documentation/networking/bonding.txt2
-rw-r--r--Documentation/networking/dpaa2/overview.rst1
-rw-r--r--Documentation/networking/e100.rst27
-rw-r--r--Documentation/networking/e1000.rst187
-rw-r--r--Documentation/translations/ko_KR/memory-barriers.txt22
-rw-r--r--Documentation/virtual/kvm/api.txt16
-rw-r--r--Documentation/x86/intel_rdt_ui.txt380
-rw-r--r--Documentation/x86/x86_64/boot-options.txt8
92 files changed, 6033 insertions, 1672 deletions
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index f91a973a37fe..abac31d216de 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -5,6 +5,7 @@ Description:
The /proc/diskstats file displays the I/O statistics
of block devices. Each line contains the following 14
fields:
+
1 - major number
2 - minor mumber
3 - device name
@@ -19,4 +20,13 @@ Description:
12 - I/Os currently in progress
13 - time spent doing I/Os (ms)
14 - weighted time spent doing I/Os (ms)
+
+ Kernel 4.18+ appends four more fields for discard
+ tracking putting the total at 18:
+
+ 15 - discards completed successfully
+ 16 - discards merged
+ 17 - sectors discarded
+ 18 - time spent discarding
+
For more details refer to Documentation/iostats.txt
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 9c5e7732d249..73318225a368 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -476,6 +476,7 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/spectre_v1
/sys/devices/system/cpu/vulnerabilities/spectre_v2
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
+ /sys/devices/system/cpu/vulnerabilities/l1tf
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
@@ -487,3 +488,26 @@ Description: Information about CPU vulnerabilities
"Not affected" CPU is not affected by the vulnerability
"Vulnerable" CPU is affected and no mitigation in effect
"Mitigation: $M" CPU is affected and mitigation $M is in effect
+
+ Details about the l1tf file can be found in
+ Documentation/admin-guide/l1tf.rst
+
+What: /sys/devices/system/cpu/smt
+ /sys/devices/system/cpu/smt/active
+ /sys/devices/system/cpu/smt/control
+Date: June 2018
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description: Control Symetric Multi Threading (SMT)
+
+ active: Tells whether SMT is active (enabled and siblings online)
+
+ control: Read/write interface to control SMT. Possible
+ values:
+
+ "on" SMT is enabled
+ "off" SMT is disabled
+ "forceoff" SMT is force disabled. Cannot be changed.
+ "notsupported" SMT is not supported by the CPU
+
+ If control status is "forceoff" or "notsupported" writes
+ are rejected.
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
index 6c06e10bd04b..f5120a00f511 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@@ -380,31 +380,26 @@ and therefore need no protection.
as follows:
<pre>
- 1 unsigned long gpnum;
- 2 unsigned long completed;
+ 1 unsigned long gp_seq;
</pre>
<p>RCU grace periods are numbered, and
-the <tt>-&gt;gpnum</tt> field contains the number of the grace
-period that started most recently.
-The <tt>-&gt;completed</tt> field contains the number of the
-grace period that completed most recently.
-If the two fields are equal, the RCU grace period that most recently
-started has already completed, and therefore the corresponding
-flavor of RCU is idle.
-If <tt>-&gt;gpnum</tt> is one greater than <tt>-&gt;completed</tt>,
-then <tt>-&gt;gpnum</tt> gives the number of the current RCU
-grace period, which has not yet completed.
-Any other combination of values indicates that something is broken.
-These two fields are protected by the root <tt>rcu_node</tt>'s
+the <tt>-&gt;gp_seq</tt> field contains the current grace-period
+sequence number.
+The bottom two bits are the state of the current grace period,
+which can be zero for not yet started or one for in progress.
+In other words, if the bottom two bits of <tt>-&gt;gp_seq</tt> are
+zero, the corresponding flavor of RCU is idle.
+Any other value in the bottom two bits indicates that something is broken.
+This field is protected by the root <tt>rcu_node</tt> structure's
<tt>-&gt;lock</tt> field.
-</p><p>There are <tt>-&gt;gpnum</tt> and <tt>-&gt;completed</tt> fields
+</p><p>There are <tt>-&gt;gp_seq</tt> fields
in the <tt>rcu_node</tt> and <tt>rcu_data</tt> structures
as well.
The fields in the <tt>rcu_state</tt> structure represent the
-most current values, and those of the other structures are compared
-in order to detect the start of a new grace period in a distributed
+most current value, and those of the other structures are compared
+in order to detect the beginnings and ends of grace periods in a distributed
fashion.
The values flow from <tt>rcu_state</tt> to <tt>rcu_node</tt>
(down the tree from the root to the leaves) to <tt>rcu_data</tt>.
@@ -512,27 +507,47 @@ than to be heisenbugged out of existence.
as follows:
<pre>
- 1 unsigned long gpnum;
- 2 unsigned long completed;
+ 1 unsigned long gp_seq;
+ 2 unsigned long gp_seq_needed;
</pre>
-<p>These fields are the counterparts of the fields of the same name in
-the <tt>rcu_state</tt> structure.
-They each may lag up to one behind their <tt>rcu_state</tt>
-counterparts.
-If a given <tt>rcu_node</tt> structure's <tt>-&gt;gpnum</tt> and
-<tt>-&gt;complete</tt> fields are equal, then this <tt>rcu_node</tt>
+<p>The <tt>rcu_node</tt> structures' <tt>-&gt;gp_seq</tt> fields are
+the counterparts of the field of the same name in the <tt>rcu_state</tt>
+structure.
+They each may lag up to one step behind their <tt>rcu_state</tt>
+counterpart.
+If the bottom two bits of a given <tt>rcu_node</tt> structure's
+<tt>-&gt;gp_seq</tt> field is zero, then this <tt>rcu_node</tt>
structure believes that RCU is idle.
-Otherwise, as with the <tt>rcu_state</tt> structure,
-the <tt>-&gt;gpnum</tt> field will be one greater than the
-<tt>-&gt;complete</tt> fields, with <tt>-&gt;gpnum</tt>
-indicating which grace period this <tt>rcu_node</tt> believes
-is still being waited for.
+</p><p>The <tt>&gt;gp_seq</tt> field of each <tt>rcu_node</tt>
+structure is updated at the beginning and the end
+of each grace period.
+
+<p>The <tt>-&gt;gp_seq_needed</tt> fields record the
+furthest-in-the-future grace period request seen by the corresponding
+<tt>rcu_node</tt> structure. The request is considered fulfilled when
+the value of the <tt>-&gt;gp_seq</tt> field equals or exceeds that of
+the <tt>-&gt;gp_seq_needed</tt> field.
-</p><p>The <tt>&gt;gpnum</tt> field of each <tt>rcu_node</tt>
-structure is updated at the beginning
-of each grace period, and the <tt>-&gt;completed</tt> fields are
-updated at the end of each grace period.
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Suppose that this <tt>rcu_node</tt> structure doesn't see
+ a request for a very long time.
+ Won't wrapping of the <tt>-&gt;gp_seq</tt> field cause
+ problems?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ No, because if the <tt>-&gt;gp_seq_needed</tt> field lags behind the
+ <tt>-&gt;gp_seq</tt> field, the <tt>-&gt;gp_seq_needed</tt> field
+ will be updated at the end of the grace period.
+ Modulo-arithmetic comparisons therefore will always get the
+ correct answer, even with wrapping.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<h5>Quiescent-State Tracking</h5>
@@ -626,9 +641,8 @@ normal and expedited grace periods, respectively.
</ol>
<p><font color="ffffff">So the locking is absolutely required in
- order to coordinate
- clearing of the bits with the grace-period numbers in
- <tt>-&gt;gpnum</tt> and <tt>-&gt;completed</tt>.
+ order to coordinate clearing of the bits with updating of the
+ grace-period sequence number in <tt>-&gt;gp_seq</tt>.
</font></td></tr>
<tr><td>&nbsp;</td></tr>
</table>
@@ -1038,15 +1052,15 @@ out any <tt>rcu_data</tt> structure for which this flag is not set.
as follows:
<pre>
- 1 unsigned long completed;
- 2 unsigned long gpnum;
+ 1 unsigned long gp_seq;
+ 2 unsigned long gp_seq_needed;
3 bool cpu_no_qs;
4 bool core_needs_qs;
5 bool gpwrap;
6 unsigned long rcu_qs_ctr_snap;
</pre>
-<p>The <tt>completed</tt> and <tt>gpnum</tt>
+<p>The <tt>-&gt;gp_seq</tt> and <tt>-&gt;gp_seq_needed</tt>
fields are the counterparts of the fields of the same name
in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.
They may each lag up to one behind their <tt>rcu_node</tt>
@@ -1054,15 +1068,9 @@ counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and
<tt>CONFIG_NO_HZ_FULL</tt> kernels can lag
arbitrarily far behind for CPUs in dyntick-idle mode (but these counters
will catch up upon exit from dyntick-idle mode).
-If a given <tt>rcu_data</tt> structure's <tt>-&gt;gpnum</tt> and
-<tt>-&gt;complete</tt> fields are equal, then this <tt>rcu_data</tt>
+If the lower two bits of a given <tt>rcu_data</tt> structure's
+<tt>-&gt;gp_seq</tt> are zero, then this <tt>rcu_data</tt>
structure believes that RCU is idle.
-Otherwise, as with the <tt>rcu_state</tt> and <tt>rcu_node</tt>
-structure,
-the <tt>-&gt;gpnum</tt> field will be one greater than the
-<tt>-&gt;complete</tt> fields, with <tt>-&gt;gpnum</tt>
-indicating which grace period this <tt>rcu_data</tt> believes
-is still being waited for.
<table>
<tr><th>&nbsp;</th></tr>
@@ -1070,13 +1078,13 @@ is still being waited for.
<tr><td>
All this replication of the grace period numbers can only cause
massive confusion.
- Why not just keep a global pair of counters and be done with it???
+ Why not just keep a global sequence number and be done with it???
</td></tr>
<tr><th align="left">Answer:</th></tr>
<tr><td bgcolor="#ffffff"><font color="ffffff">
- Because if there was only a single global pair of grace-period
+ Because if there was only a single global sequence
numbers, there would need to be a single global lock to allow
- safely accessing and updating them.
+ safely accessing and updating it.
And if we are not going to have a single global lock, we need
to carefully manage the numbers on a per-node basis.
Recall from the answer to a previous Quick Quiz that the consequences
@@ -1091,8 +1099,8 @@ CPU has not yet passed through a quiescent state,
while the <tt>-&gt;core_needs_qs</tt> flag indicates that the
RCU core needs a quiescent state from the corresponding CPU.
The <tt>-&gt;gpwrap</tt> field indicates that the corresponding
-CPU has remained idle for so long that the <tt>completed</tt>
-and <tt>gpnum</tt> counters are in danger of overflow, which
+CPU has remained idle for so long that the
+<tt>gp_seq</tt> counter is in danger of overflow, which
will cause the CPU to disregard the values of its counters on
its next exit from idle.
Finally, the <tt>rcu_qs_ctr_snap</tt> field is used to detect
@@ -1130,10 +1138,10 @@ The CPU advances the callbacks in its <tt>rcu_data</tt> structure
whenever it notices that another RCU grace period has completed.
The CPU detects the completion of an RCU grace period by noticing
that the value of its <tt>rcu_data</tt> structure's
-<tt>-&gt;completed</tt> field differs from that of its leaf
+<tt>-&gt;gp_seq</tt> field differs from that of its leaf
<tt>rcu_node</tt> structure.
Recall that each <tt>rcu_node</tt> structure's
-<tt>-&gt;completed</tt> field is updated at the end of each
+<tt>-&gt;gp_seq</tt> field is updated at the beginnings and ends of each
grace period.
<p>
diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html
index 8651b0b4fd79..a346ce0116eb 100644
--- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html
+++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html
@@ -357,7 +357,7 @@ parts, starting in this section with the various phases of
grace-period initialization.
<p>The first ordering-related grace-period initialization action is to
-increment the <tt>rcu_state</tt> structure's <tt>-&gt;gpnum</tt>
+advance the <tt>rcu_state</tt> structure's <tt>-&gt;gp_seq</tt>
grace-period-number counter, as shown below:
</p><p><img src="TreeRCU-gp-init-1.svg" alt="TreeRCU-gp-init-1.svg" width="75%">
@@ -388,7 +388,7 @@ its last CPU and if the next <tt>rcu_node</tt> structure has no online CPUs).
<p>The final <tt>rcu_gp_init()</tt> pass through the <tt>rcu_node</tt>
tree traverses breadth-first, setting each <tt>rcu_node</tt> structure's
-<tt>-&gt;gpnum</tt> field to the newly incremented value from the
+<tt>-&gt;gp_seq</tt> field to the newly advanced value from the
<tt>rcu_state</tt> structure, as shown in the following diagram.
</p><p><img src="TreeRCU-gp-init-3.svg" alt="TreeRCU-gp-init-1.svg" width="75%">
@@ -398,9 +398,9 @@ tree traverses breadth-first, setting each <tt>rcu_node</tt> structure's
to notice that a new grace period has started, as described in the next
section.
But because the grace-period kthread started the grace period at the
-root (with the increment of the <tt>rcu_state</tt> structure's
-<tt>-&gt;gpnum</tt> field) before setting each leaf <tt>rcu_node</tt>
-structure's <tt>-&gt;gpnum</tt> field, each CPU's observation of
+root (with the advancing of the <tt>rcu_state</tt> structure's
+<tt>-&gt;gp_seq</tt> field) before setting each leaf <tt>rcu_node</tt>
+structure's <tt>-&gt;gp_seq</tt> field, each CPU's observation of
the start of the grace period will happen after the actual start
of the grace period.
@@ -466,7 +466,7 @@ section that the grace period must wait on.
<tr><td>
But a RCU read-side critical section might have started
after the beginning of the grace period
- (the <tt>-&gt;gpnum++</tt> from earlier), so why should
+ (the advancing of <tt>-&gt;gp_seq</tt> from earlier), so why should
the grace period wait on such a critical section?
</td></tr>
<tr><th align="left">Answer:</th></tr>
@@ -609,10 +609,8 @@ states outstanding from other CPUs.
<h4><a name="Grace-Period Cleanup">Grace-Period Cleanup</a></h4>
<p>Grace-period cleanup first scans the <tt>rcu_node</tt> tree
-breadth-first setting all the <tt>-&gt;completed</tt> fields equal
-to the number of the newly completed grace period, then it sets
-the <tt>rcu_state</tt> structure's <tt>-&gt;completed</tt> field,
-again to the number of the newly completed grace period.
+breadth-first advancing all the <tt>-&gt;gp_seq</tt> fields, then it
+advances the <tt>rcu_state</tt> structure's <tt>-&gt;gp_seq</tt> field.
The ordering effects are shown below:
</p><p><img src="TreeRCU-gp-cleanup.svg" alt="TreeRCU-gp-cleanup.svg" width="75%">
@@ -634,7 +632,7 @@ grace-period cleanup is complete, the next grace period can begin.
CPU has reported its quiescent state, but it may be some
milliseconds before RCU becomes aware of this.
The latest reasonable candidate is once the <tt>rcu_state</tt>
- structure's <tt>-&gt;completed</tt> field has been updated,
+ structure's <tt>-&gt;gp_seq</tt> field has been updated,
but it is quite possible that some CPUs have already completed
phase two of their updates by that time.
In short, if you are going to work with RCU, you need to
@@ -647,7 +645,7 @@ grace-period cleanup is complete, the next grace period can begin.
<h4><a name="Callback Invocation">Callback Invocation</a></h4>
<p>Once a given CPU's leaf <tt>rcu_node</tt> structure's
-<tt>-&gt;completed</tt> field has been updated, that CPU can begin
+<tt>-&gt;gp_seq</tt> field has been updated, that CPU can begin
invoking its RCU callbacks that were waiting for this grace period
to end.
These callbacks are identified by <tt>rcu_advance_cbs()</tt>,
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-cleanup.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-cleanup.svg
index 754f426b297a..bf84fbab27ee 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-cleanup.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-cleanup.svg
@@ -384,11 +384,11 @@
inkscape:window-height="1144"
id="namedview208"
showgrid="true"
- inkscape:zoom="0.70710678"
- inkscape:cx="617.89017"
- inkscape:cy="542.52419"
- inkscape:window-x="86"
- inkscape:window-y="28"
+ inkscape:zoom="0.78716603"
+ inkscape:cx="513.06403"
+ inkscape:cy="623.1214"
+ inkscape:window-x="102"
+ inkscape:window-y="38"
inkscape:window-maximized="0"
inkscape:current-layer="g3188-3"
fit-margin-top="5"
@@ -417,13 +417,15 @@
id="g3188">
<text
xml:space="preserve"
- x="3199.1516"
+ x="3145.9592"
y="13255.592"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3143">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
<g
id="g3107"
transform="translate(947.90548,11584.029)">
@@ -502,13 +504,15 @@
</g>
<text
xml:space="preserve"
- x="5324.5371"
- y="15414.598"
+ x="5264.4731"
+ y="15428.84"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-753"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ id="text202-36-7"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-5">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<g
style="fill:none;stroke-width:0.025in"
@@ -547,15 +551,6 @@
sodipodi:linespacing="125%"><tspan
style="font-size:159.57754517px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Liberation Sans;-inkscape-font-specification:Liberation Sans"
id="tspan3104-6-5-6-0">Leaf</tspan></text>
- <text
- xml:space="preserve"
- x="7479.5796"
- y="17699.943"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-9"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
<path
sodipodi:nodetypes="cc"
inkscape:connector-curvature="0"
@@ -566,15 +561,6 @@
style="fill:none;stroke-width:0.025in"
transform="translate(-737.93887,7732.6672)"
id="g3188-3">
- <text
- xml:space="preserve"
- x="3225.7478"
- y="13175.802"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-60"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">rsp-&gt;completed =</text>
<g
id="g3107-62"
transform="translate(947.90548,11584.029)">
@@ -607,15 +593,6 @@
sodipodi:linespacing="125%"><tspan
style="font-size:159.57754517px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Liberation Sans;-inkscape-font-specification:Liberation Sans"
id="tspan3104-6-5-7">Root</tspan></text>
- <text
- xml:space="preserve"
- x="3225.7478"
- y="13390.038"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-60-3"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"> rnp-&gt;completed</text>
<flowRoot
xml:space="preserve"
id="flowRoot3356"
@@ -627,7 +604,18 @@
height="63.63961"
x="332.34018"
y="681.87292" /></flowRegion><flowPara
- id="flowPara3362" /></flowRoot> </g>
+ id="flowPara3362" /></flowRoot> <text
+ xml:space="preserve"
+ x="3156.6121"
+ y="13317.754"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-36-6"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-0">rcu_seq_end(&amp;rsp-&gt;gp_seq)</tspan></text>
+ </g>
<g
style="fill:none;stroke-width:0.025in"
transform="translate(-858.40227,7769.0342)"
@@ -859,6 +847,17 @@
id="path3414-8-3-6-6"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ x="7418.769"
+ y="17646.104"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-36-70"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-93">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<g
transform="translate(-1642.5377,-11611.245)"
@@ -887,13 +886,15 @@
</g>
<text
xml:space="preserve"
- x="5327.3057"
+ x="5274.1133"
y="15428.84"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202-36"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<g
transform="translate(-151.71746,-11647.612)"
@@ -972,13 +973,15 @@
id="tspan3104-6-5-6-0-92">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7486.4907"
- y="17670.119"
+ x="7408.5918"
+ y="17619.504"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-6"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ id="text202-36-2"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-9">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<g
transform="translate(-6817.1997,-11647.612)"
@@ -1019,13 +1022,15 @@
id="tspan3104-6-5-6-0-1">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7474.1382"
- y="17688.926"
+ x="7416.8003"
+ y="17619.504"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-5"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ id="text202-36-3"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-56">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<path
style="fill:none;stroke:#000000;stroke-width:13.29812908px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)"
@@ -1059,15 +1064,6 @@
id="path3414-8-3-6"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
- <text
- xml:space="preserve"
- x="7318.9653"
- y="6031.6353"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-2"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
<g
style="fill:none;stroke-width:0.025in"
id="g4504-3-9"
@@ -1123,4 +1119,15 @@
id="path3134-9-0-3-5"
d="m 6875.6003,15833.906 1595.7755,0"
style="fill:none;stroke:#969696;stroke-width:53.19251633;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Send-36)" />
+ <text
+ xml:space="preserve"
+ x="7275.2612"
+ y="5971.8916"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-36-1"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-2">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</svg>
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-1.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-1.svg
index 0161262904ec..8c207550818f 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-1.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-1.svg
@@ -272,13 +272,13 @@
inkscape:window-height="1144"
id="namedview208"
showgrid="true"
- inkscape:zoom="0.70710678"
- inkscape:cx="617.89019"
- inkscape:cy="636.57143"
- inkscape:window-x="697"
+ inkscape:zoom="2.6330492"
+ inkscape:cx="524.82797"
+ inkscape:cy="519.31194"
+ inkscape:window-x="79"
inkscape:window-y="28"
inkscape:window-maximized="0"
- inkscape:current-layer="svg2"
+ inkscape:current-layer="g3188"
fit-margin-top="5"
fit-margin-right="5"
fit-margin-left="5"
@@ -305,13 +305,15 @@
id="g3188">
<text
xml:space="preserve"
- x="3305.5364"
+ x="3119.363"
y="13255.592"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">rsp-&gt;gpnum++</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3071">rcu_seq_start(rsp-&gt;gp_seq)</tspan></text>
<g
id="g3107"
transform="translate(947.90548,11584.029)">
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-3.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-3.svg
index de6ecc51b00e..d24d7d555dbc 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-3.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-init-3.svg
@@ -19,7 +19,7 @@
id="svg2"
version="1.1"
inkscape:version="0.48.4 r9939"
- sodipodi:docname="TreeRCU-gp-init-2.svg">
+ sodipodi:docname="TreeRCU-gp-init-3.svg">
<metadata
id="metadata212">
<rdf:RDF>
@@ -257,18 +257,22 @@
inkscape:window-width="1087"
inkscape:window-height="1144"
id="namedview208"
- showgrid="false"
- inkscape:zoom="0.70710678"
+ showgrid="true"
+ inkscape:zoom="0.68224756"
inkscape:cx="617.89019"
inkscape:cy="625.84293"
- inkscape:window-x="697"
+ inkscape:window-x="54"
inkscape:window-y="28"
inkscape:window-maximized="0"
- inkscape:current-layer="svg2"
+ inkscape:current-layer="g3153"
fit-margin-top="5"
fit-margin-right="5"
fit-margin-left="5"
- fit-margin-bottom="5" />
+ fit-margin-bottom="5">
+ <inkscape:grid
+ type="xygrid"
+ id="grid3090" />
+ </sodipodi:namedview>
<path
sodipodi:nodetypes="cccccccccccccccccccccccc"
inkscape:connector-curvature="0"
@@ -281,13 +285,13 @@
id="g3188">
<text
xml:space="preserve"
- x="3305.5364"
+ x="3145.9592"
y="13255.592"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
<g
id="g3107"
transform="translate(947.90548,11584.029)">
@@ -366,13 +370,13 @@
</g>
<text
xml:space="preserve"
- x="5392.3345"
- y="15407.104"
+ x="5253.6904"
+ y="15407.032"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202-6"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
style="fill:none;stroke-width:0.025in"
@@ -413,13 +417,13 @@
id="tspan3104-6-5-6-0">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7536.4883"
- y="17640.934"
+ x="7415.4365"
+ y="17670.572"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202-9"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
transform="translate(-1642.5375,-11610.962)"
@@ -448,13 +452,13 @@
</g>
<text
xml:space="preserve"
- x="5378.4146"
- y="15436.927"
+ x="5258.0688"
+ y="15412.313"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202-3"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
transform="translate(-151.71726,-11647.329)"
@@ -533,13 +537,13 @@
id="tspan3104-6-5-6-0-92">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7520.1294"
- y="17673.639"
+ x="7405.2607"
+ y="17670.572"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202-35"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
transform="translate(-6817.1998,-11647.329)"
@@ -580,13 +584,13 @@
id="tspan3104-6-5-6-0-1">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7521.4663"
- y="17666.062"
+ x="7413.4688"
+ y="17670.566"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202-75"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<path
style="fill:none;stroke:#000000;stroke-width:13.29812908px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)"
@@ -622,11 +626,11 @@
sodipodi:nodetypes="cc" />
<text
xml:space="preserve"
- x="7370.856"
- y="5997.5972"
+ x="7271.9297"
+ y="6023.2412"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202-62"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</svg>
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
index b13b7b01bb3a..acd73c7ad0f4 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
@@ -1070,13 +1070,13 @@
inkscape:window-height="1144"
id="namedview208"
showgrid="true"
- inkscape:zoom="0.6004608"
- inkscape:cx="826.65969"
- inkscape:cy="483.3047"
- inkscape:window-x="66"
- inkscape:window-y="28"
+ inkscape:zoom="0.81932583"
+ inkscape:cx="840.45848"
+ inkscape:cy="5052.4242"
+ inkscape:window-x="787"
+ inkscape:window-y="24"
inkscape:window-maximized="0"
- inkscape:current-layer="svg2"
+ inkscape:current-layer="g4"
fit-margin-top="5"
fit-margin-right="5"
fit-margin-left="5"
@@ -1543,15 +1543,6 @@
style="fill:none;stroke-width:0.025in"
transform="translate(1749.0282,658.72243)"
id="g3188">
- <text
- xml:space="preserve"
- x="3305.5364"
- y="13255.592"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-5"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">rsp-&gt;gpnum++</text>
<g
id="g3107-62"
transform="translate(947.90548,11584.029)">
@@ -1584,6 +1575,17 @@
sodipodi:linespacing="125%"><tspan
style="font-size:159.57754517px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Liberation Sans;-inkscape-font-specification:Liberation Sans"
id="tspan3104-6-5-7">Root</tspan></text>
+ <text
+ xml:space="preserve"
+ x="3137.9988"
+ y="13271.316"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-626"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3071">rcu_seq_start(rsp-&gt;gp_seq)</tspan></text>
</g>
<rect
ry="0"
@@ -2318,15 +2320,6 @@
style="fill:none;stroke-width:0.025in"
transform="translate(1739.0986,17188.625)"
id="g3188-6">
- <text
- xml:space="preserve"
- x="3305.5364"
- y="13255.592"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-1"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
<g
id="g3107-5"
transform="translate(947.90548,11584.029)">
@@ -2359,6 +2352,15 @@
sodipodi:linespacing="125%"><tspan
style="font-size:159.57754517px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Liberation Sans;-inkscape-font-specification:Liberation Sans"
id="tspan3104-6-5-1">Root</tspan></text>
+ <text
+ xml:space="preserve"
+ x="3147.9268"
+ y="13240.524"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-1"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
style="fill:none;stroke-width:0.025in"
@@ -2387,13 +2389,13 @@
</g>
<text
xml:space="preserve"
- x="5392.3345"
- y="15407.104"
+ x="5263.1094"
+ y="15411.646"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-6-7"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ id="text202-92"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
style="fill:none;stroke-width:0.025in"
@@ -2434,13 +2436,13 @@
id="tspan3104-6-5-6-0-94">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7536.4883"
- y="17640.934"
+ x="7417.4053"
+ y="17655.502"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-9"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ id="text202-759"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
transform="translate(-2353.8462,17224.992)"
@@ -2469,13 +2471,13 @@
</g>
<text
xml:space="preserve"
- x="5378.4146"
- y="15436.927"
+ x="5246.1548"
+ y="15411.648"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-3"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ id="text202-87"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
transform="translate(-863.02613,17188.625)"
@@ -2554,13 +2556,13 @@
id="tspan3104-6-5-6-0-92-6">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7520.1294"
- y="17673.639"
+ x="7433.8257"
+ y="17682.098"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-35"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ id="text202-2"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<g
transform="translate(-7528.5085,17188.625)"
@@ -2601,13 +2603,13 @@
id="tspan3104-6-5-6-0-1-8">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7521.4663"
- y="17666.062"
+ x="7415.4404"
+ y="17682.098"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-75-1"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
+ id="text202-0"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
<path
style="fill:none;stroke:#000000;stroke-width:13.29812813px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)"
@@ -2641,15 +2643,6 @@
id="path3414-8-3-6-4"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
- <text
- xml:space="preserve"
- x="6659.5469"
- y="34833.551"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-62"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gpnum = rsp-&gt;gpnum</text>
<path
sodipodi:nodetypes="ccc"
inkscape:connector-curvature="0"
@@ -3844,7 +3837,7 @@
font-weight="bold"
font-size="192"
id="text202-6-6-5"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rdp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rdp-&gt;gp_seq</text>
<text
xml:space="preserve"
x="5035.4155"
@@ -4284,15 +4277,6 @@
style="fill:none;stroke-width:0.025in"
transform="translate(1874.038,53203.538)"
id="g3188-7">
- <text
- xml:space="preserve"
- x="3199.1516"
- y="13255.592"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-82"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
<g
id="g3107-53"
transform="translate(947.90548,11584.029)">
@@ -4325,6 +4309,17 @@
sodipodi:linespacing="125%"><tspan
style="font-size:159.57754517px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Liberation Sans;-inkscape-font-specification:Liberation Sans"
id="tspan3104-6-5-19">Root</tspan></text>
+ <text
+ xml:space="preserve"
+ x="3175.896"
+ y="13240.11"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-36-3"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<rect
ry="0"
@@ -4371,13 +4366,15 @@
</g>
<text
xml:space="preserve"
- x="5324.5371"
- y="15414.598"
+ x="5264.4829"
+ y="15411.231"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-753"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ id="text202-36-7"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-5">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<g
style="fill:none;stroke-width:0.025in"
@@ -4412,30 +4409,12 @@
sodipodi:linespacing="125%"><tspan
style="font-size:159.57754517px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Liberation Sans;-inkscape-font-specification:Liberation Sans"
id="tspan3104-6-5-6-0-4">Leaf</tspan></text>
- <text
- xml:space="preserve"
- x="10084.225"
- y="70903.312"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-9-0"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
<path
sodipodi:nodetypes="ccc"
inkscape:connector-curvature="0"
id="path3134-9-0-3-9"
d="m 6315.6122,72629.054 -20.9533,8108.684 1648.968,0"
style="fill:none;stroke:#969696;stroke-width:53.19251251;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow1Send)" />
- <text
- xml:space="preserve"
- x="5092.4683"
- y="74111.672"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-60"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rsp-&gt;completed =</text>
<g
style="fill:none;stroke-width:0.025in"
id="g3107-62-6"
@@ -4469,15 +4448,6 @@
sodipodi:linespacing="125%"><tspan
style="font-size:159.57754517px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:125%;writing-mode:lr-tb;text-anchor:start;font-family:Liberation Sans;-inkscape-font-specification:Liberation Sans"
id="tspan3104-6-5-7-7">Root</tspan></text>
- <text
- xml:space="preserve"
- x="5092.4683"
- y="74325.906"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-60-3"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"> rnp-&gt;completed</text>
<g
style="fill:none;stroke-width:0.025in"
transform="translate(1746.2528,60972.572)"
@@ -4736,13 +4706,15 @@
</g>
<text
xml:space="preserve"
- x="5327.3057"
- y="15428.84"
+ x="5274.1216"
+ y="15411.231"
font-style="normal"
font-weight="bold"
font-size="192"
id="text202-36"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-6">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<g
transform="translate(-728.08545,53203.538)"
@@ -4821,13 +4793,15 @@
id="tspan3104-6-5-6-0-92-5">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7486.4907"
- y="17670.119"
+ x="7435.1987"
+ y="17708.281"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-6-2"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ id="text202-36-9"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-1">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<g
transform="translate(-7393.5687,53203.538)"
@@ -4868,13 +4842,15 @@
id="tspan3104-6-5-6-0-1-5">Leaf</tspan></text>
<text
xml:space="preserve"
- x="7474.1382"
- y="17688.926"
+ x="7416.8125"
+ y="17708.281"
font-style="normal"
font-weight="bold"
font-size="192"
- id="text202-5-1"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
+ id="text202-36-35"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-62">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
</g>
<path
style="fill:none;stroke:#000000;stroke-width:13.29812813px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)"
@@ -4908,15 +4884,6 @@
id="path3414-8-3-6-67"
inkscape:connector-curvature="0"
sodipodi:nodetypes="cc" />
- <text
- xml:space="preserve"
- x="6742.6001"
- y="70882.617"
- font-style="normal"
- font-weight="bold"
- font-size="192"
- id="text202-2"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;completed = -&gt;gpnum</text>
<g
style="fill:none;stroke-width:0.025in"
id="g4504-3-9-6"
@@ -5131,5 +5098,47 @@
font-size="192"
id="text202-7-9-6-6-7"
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_do_batch()</text>
+ <text
+ xml:space="preserve"
+ x="6698.9019"
+ y="70885.211"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-36-2"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-7">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
+ <text
+ xml:space="preserve"
+ x="10023.457"
+ y="70885.234"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-36-0"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-9">rcu_seq_end(&amp;rnp-&gt;gp_seq)</tspan></text>
+ <text
+ xml:space="preserve"
+ x="5023.3389"
+ y="74209.773"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-36-36"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"><tspan
+ style="font-size:172.87567139px"
+ id="tspan3166-0">rcu_seq_end(&amp;rsp-&gt;gp_seq)</tspan></text>
+ <text
+ xml:space="preserve"
+ x="6562.5884"
+ y="34870.727"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ id="text202-3"
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">-&gt;gp_seq = rsp-&gt;gp_seq</text>
</g>
</svg>
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg
index de3992f4cbe1..149bec2a4493 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg
@@ -300,13 +300,13 @@
inkscape:window-height="1144"
id="namedview208"
showgrid="true"
- inkscape:zoom="0.70710678"
- inkscape:cx="616.47598"
- inkscape:cy="595.41964"
- inkscape:window-x="813"
+ inkscape:zoom="0.96484375"
+ inkscape:cx="507.0191"
+ inkscape:cy="885.62207"
+ inkscape:window-x="47"
inkscape:window-y="28"
inkscape:window-maximized="0"
- inkscape:current-layer="g4405"
+ inkscape:current-layer="g3115"
fit-margin-top="5"
fit-margin-right="5"
fit-margin-left="5"
@@ -710,7 +710,7 @@
font-weight="bold"
font-size="192"
id="text202-6-6"
- style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rdp-&gt;gpnum</text>
+ style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rdp-&gt;gp_seq</text>
<text
xml:space="preserve"
x="5035.4155"
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 4259f95c3261..f99cf11b314b 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -172,7 +172,7 @@ it will print a message similar to the following:
INFO: rcu_sched detected stalls on CPUs/tasks:
2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0
16-...: (0 ticks this GP) idle=81c/0/0 softirq=764/764 fqs=0
- (detected by 32, t=2603 jiffies, g=7073, c=7072, q=625)
+ (detected by 32, t=2603 jiffies, g=7075, q=625)
This message indicates that CPU 32 detected that CPUs 2 and 16 were both
causing stalls, and that the stall was affecting RCU-sched. This message
@@ -215,11 +215,10 @@ CPU since the last time that this CPU noted the beginning of a grace
period.
The "detected by" line indicates which CPU detected the stall (in this
-case, CPU 32), how many jiffies have elapsed since the start of the
-grace period (in this case 2603), the number of the last grace period
-to start and to complete (7073 and 7072, respectively), and an estimate
-of the total number of RCU callbacks queued across all CPUs (625 in
-this case).
+case, CPU 32), how many jiffies have elapsed since the start of the grace
+period (in this case 2603), the grace-period sequence number (7075), and
+an estimate of the total number of RCU callbacks queued across all CPUs
+(625 in this case).
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
for each CPU:
@@ -266,15 +265,16 @@ If the relevant grace-period kthread has been unable to run prior to
the stall warning, as was the case in the "All QSes seen" line above,
the following additional line is printed:
- kthread starved for 23807 jiffies! g7073 c7072 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1
+ kthread starved for 23807 jiffies! g7075 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 ->cpu=5
Starving the grace-period kthreads of CPU time can of course result
in RCU CPU stall warnings even when all CPUs and tasks have passed
-through the required quiescent states. The "g" and "c" numbers flag the
-number of the last grace period started and completed, respectively,
-the "f" precedes the ->gp_flags command to the grace-period kthread,
-the "RCU_GP_WAIT_FQS" indicates that the kthread is waiting for a short
-timeout, and the "state" precedes value of the task_struct ->state field.
+through the required quiescent states. The "g" number shows the current
+grace-period sequence number, the "f" precedes the ->gp_flags command
+to the grace-period kthread, the "RCU_GP_WAIT_FQS" indicates that the
+kthread is waiting for a short timeout, the "state" precedes value of the
+task_struct ->state field, and the "cpu" indicates that the grace-period
+kthread last ran on CPU 5.
Multiple Warnings From One Stall
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 65eb856526b7..c2a7facf7ff9 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -588,6 +588,7 @@ It is extremely simple:
void synchronize_rcu(void)
{
write_lock(&rcu_gp_mutex);
+ smp_mb__after_spinlock();
write_unlock(&rcu_gp_mutex);
}
@@ -609,12 +610,15 @@ don't forget about them when submitting patches making use of RCU!]
The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
and release a global reader-writer lock. The synchronize_rcu()
-primitive write-acquires this same lock, then immediately releases
-it. This means that once synchronize_rcu() exits, all RCU read-side
-critical sections that were in progress before synchronize_rcu() was
-called are guaranteed to have completed -- there is no way that
-synchronize_rcu() would have been able to write-acquire the lock
-otherwise.
+primitive write-acquires this same lock, then releases it. This means
+that once synchronize_rcu() exits, all RCU read-side critical sections
+that were in progress before synchronize_rcu() was called are guaranteed
+to have completed -- there is no way that synchronize_rcu() would have
+been able to write-acquire the lock otherwise. The smp_mb__after_spinlock()
+promotes synchronize_rcu() to a full memory barrier in compliance with
+the "Memory-Barrier Guarantees" listed in:
+
+ Documentation/RCU/Design/Requirements/Requirements.html.
It is possible to nest rcu_read_lock(), since reader-writer locks may
be recursively acquired. Note also that rcu_read_lock() is immune
@@ -816,11 +820,13 @@ RCU list traversal:
list_next_rcu
list_for_each_entry_rcu
list_for_each_entry_continue_rcu
+ list_for_each_entry_from_rcu
hlist_first_rcu
hlist_next_rcu
hlist_pprev_rcu
hlist_for_each_entry_rcu
hlist_for_each_entry_rcu_bh
+ hlist_for_each_entry_from_rcu
hlist_for_each_entry_continue_rcu
hlist_for_each_entry_continue_rcu_bh
hlist_nulls_first_rcu
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8a2c52d5c53b..1746131bc9cb 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -51,6 +51,9 @@ v1 is available under Documentation/cgroup-v1/.
5-3. IO
5-3-1. IO Interface Files
5-3-2. Writeback
+ 5-3-3. IO Latency
+ 5-3-3-1. How IO Latency Throttling Works
+ 5-3-3-2. IO Latency Interface Files
5-4. PID
5-4-1. PID Interface Files
5-5. Device
@@ -1314,17 +1317,19 @@ IO Interface Files
Lines are keyed by $MAJ:$MIN device numbers and not ordered.
The following nested keys are defined.
- ====== ===================
+ ====== =====================
rbytes Bytes read
wbytes Bytes written
rios Number of read IOs
wios Number of write IOs
- ====== ===================
+ dbytes Bytes discarded
+ dios Number of discard IOs
+ ====== =====================
An example read output follows:
- 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
- 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
+ 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
+ 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
io.weight
A read-write flat-keyed file which exists on non-root cgroups.
@@ -1446,6 +1451,85 @@ writeback as follows.
vm.dirty[_background]_ratio.
+IO Latency
+~~~~~~~~~~
+
+This is a cgroup v2 controller for IO workload protection. You provide a group
+with a latency target, and if the average latency exceeds that target the
+controller will throttle any peers that have a lower latency target than the
+protected workload.
+
+The limits are only applied at the peer level in the hierarchy. This means that
+in the diagram below, only groups A, B, and C will influence each other, and
+groups D and F will influence each other. Group G will influence nobody.
+
+ [root]
+ / | \
+ A B C
+ / \ |
+ D F G
+
+
+So the ideal way to configure this is to set io.latency in groups A, B, and C.
+Generally you do not want to set a value lower than the latency your device
+supports. Experiment to find the value that works best for your workload.
+Start at higher than the expected latency for your device and watch the
+avg_lat value in io.stat for your workload group to get an idea of the
+latency you see during normal operation. Use the avg_lat value as a basis for
+your real setting, setting at 10-15% higher than the value in io.stat.
+
+How IO Latency Throttling Works
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+io.latency is work conserving; so as long as everybody is meeting their latency
+target the controller doesn't do anything. Once a group starts missing its
+target it begins throttling any peer group that has a higher target than itself.
+This throttling takes 2 forms:
+
+- Queue depth throttling. This is the number of outstanding IO's a group is
+ allowed to have. We will clamp down relatively quickly, starting at no limit
+ and going all the way down to 1 IO at a time.
+
+- Artificial delay induction. There are certain types of IO that cannot be
+ throttled without possibly adversely affecting higher priority groups. This
+ includes swapping and metadata IO. These types of IO are allowed to occur
+ normally, however they are "charged" to the originating group. If the
+ originating group is being throttled you will see the use_delay and delay
+ fields in io.stat increase. The delay value is how many microseconds that are
+ being added to any process that runs in this group. Because this number can
+ grow quite large if there is a lot of swapping or metadata IO occurring we
+ limit the individual delay events to 1 second at a time.
+
+Once the victimized group starts meeting its latency target again it will start
+unthrottling any peer groups that were throttled previously. If the victimized
+group simply stops doing IO the global counter will unthrottle appropriately.
+
+IO Latency Interface Files
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ io.latency
+ This takes a similar format as the other controllers.
+
+ "MAJOR:MINOR target=<target time in microseconds"
+
+ io.stat
+ If the controller is enabled you will see extra stats in io.stat in
+ addition to the normal ones.
+
+ depth
+ This is the current queue depth for the group.
+
+ avg_lat
+ This is an exponential moving average with a decay rate of 1/exp
+ bound by the sampling interval. The decay rate interval can be
+ calculated by multiplying the win value in io.stat by the
+ corresponding number of samples based on the win value.
+
+ win
+ The sampling window size in milliseconds. This is the minimum
+ duration of time between evaluation events. Windows only elapse
+ with IO activity. Idle periods extend the most recent window.
+
PID
---
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 48d70af11652..0873685bab0f 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -17,6 +17,15 @@ etc.
kernel-parameters
devices
+This section describes CPU vulnerabilities and provides an overview of the
+possible mitigations along with guidance for selecting mitigations if they
+are configurable at compile, boot or run time.
+
+.. toctree::
+ :maxdepth: 1
+
+ l1tf
+
Here is a set of documents aimed at users who are trying to track down
problems and bugs in particular.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index efc7aa7a0670..5a67e409d370 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1967,10 +1967,84 @@
(virtualized real and unpaged mode) on capable
Intel chips. Default is 1 (enabled)
+ kvm-intel.vmentry_l1d_flush=[KVM,Intel] Mitigation for L1 Terminal Fault
+ CVE-2018-3620.
+
+ Valid arguments: never, cond, always
+
+ always: L1D cache flush on every VMENTER.
+ cond: Flush L1D on VMENTER only when the code between
+ VMEXIT and VMENTER can leak host memory.
+ never: Disables the mitigation
+
+ Default is cond (do L1 cache flush in specific instances)
+
kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
feature (tagged TLBs) on capable Intel chips.
Default is 1 (enabled)
+ l1tf= [X86] Control mitigation of the L1TF vulnerability on
+ affected CPUs
+
+ The kernel PTE inversion protection is unconditionally
+ enabled and cannot be disabled.
+
+ full
+ Provides all available mitigations for the
+ L1TF vulnerability. Disables SMT and
+ enables all mitigations in the
+ hypervisors, i.e. unconditional L1D flush.
+
+ SMT control and L1D flush control via the
+ sysfs interface is still possible after
+ boot. Hypervisors will issue a warning
+ when the first VM is started in a
+ potentially insecure configuration,
+ i.e. SMT enabled or L1D flush disabled.
+
+ full,force
+ Same as 'full', but disables SMT and L1D
+ flush runtime control. Implies the
+ 'nosmt=force' command line option.
+ (i.e. sysfs control of SMT is disabled.)
+
+ flush
+ Leaves SMT enabled and enables the default
+ hypervisor mitigation, i.e. conditional
+ L1D flush.
+
+ SMT control and L1D flush control via the
+ sysfs interface is still possible after
+ boot. Hypervisors will issue a warning
+ when the first VM is started in a
+ potentially insecure configuration,
+ i.e. SMT enabled or L1D flush disabled.
+
+ flush,nosmt
+
+ Disables SMT and enables the default
+ hypervisor mitigation.
+
+ SMT control and L1D flush control via the
+ sysfs interface is still possible after
+ boot. Hypervisors will issue a warning
+ when the first VM is started in a
+ potentially insecure configuration,
+ i.e. SMT enabled or L1D flush disabled.
+
+ flush,nowarn
+ Same as 'flush', but hypervisors will not
+ warn when a VM is started in a potentially
+ insecure configuration.
+
+ off
+ Disables hypervisor mitigations and doesn't
+ emit any warnings.
+
+ Default is 'flush'.
+
+ For details see: Documentation/admin-guide/l1tf.rst
+
l2cr= [PPC]
l3cr= [PPC]
@@ -2687,6 +2761,10 @@
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
Equivalent to smt=1.
+ [KNL,x86] Disable symmetric multithreading (SMT).
+ nosmt=force: Force disable SMT, cannot be undone
+ via the sysfs control file.
+
nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
(indirect branch prediction) vulnerability. System may
allow data leaks with this option, which is equivalent
@@ -2835,8 +2913,6 @@
nosync [HW,M68K] Disables sync negotiation for all devices.
- notsc [BUGS=X86-32] Disable Time Stamp Counter
-
nowatchdog [KNL] Disable both lockup detectors, i.e.
soft-lockup and NMI watchdog (hard-lockup).
@@ -3632,8 +3708,8 @@
Set time (s) after boot for CPU-hotplug testing.
rcutorture.onoff_interval= [KNL]
- Set time (s) between CPU-hotplug operations, or
- zero to disable CPU-hotplug testing.
+ Set time (jiffies) between CPU-hotplug operations,
+ or zero to disable CPU-hotplug testing.
rcutorture.shuffle_interval= [KNL]
Set task-shuffle interval (s). Shuffling tasks
@@ -4846,3 +4922,8 @@
xirc2ps_cs= [NET,PCMCIA]
Format:
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
+
+ xhci-hcd.quirks [USB,KNL]
+ A hex value specifying bitmask with supplemental xhci
+ host controller quirks. Meaning of each bit can be
+ consulted in header drivers/usb/host/xhci.h.
diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst
new file mode 100644
index 000000000000..bae52b845de0
--- /dev/null
+++ b/Documentation/admin-guide/l1tf.rst
@@ -0,0 +1,610 @@
+L1TF - L1 Terminal Fault
+========================
+
+L1 Terminal Fault is a hardware vulnerability which allows unprivileged
+speculative access to data which is available in the Level 1 Data Cache
+when the page table entry controlling the virtual address, which is used
+for the access, has the Present bit cleared or other reserved bits set.
+
+Affected processors
+-------------------
+
+This vulnerability affects a wide range of Intel processors. The
+vulnerability is not present on:
+
+ - Processors from AMD, Centaur and other non Intel vendors
+
+ - Older processor models, where the CPU family is < 6
+
+ - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
+ Penwell, Pineview, Silvermont, Airmont, Merrifield)
+
+ - The Intel XEON PHI family
+
+ - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
+ IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
+ by the Meltdown vulnerability either. These CPUs should become
+ available by end of 2018.
+
+Whether a processor is affected or not can be read out from the L1TF
+vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entries are related to the L1TF vulnerability:
+
+ ============= ================= ==============================
+ CVE-2018-3615 L1 Terminal Fault SGX related aspects
+ CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
+ CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
+ ============= ================= ==============================
+
+Problem
+-------
+
+If an instruction accesses a virtual address for which the relevant page
+table entry (PTE) has the Present bit cleared or other reserved bits set,
+then speculative execution ignores the invalid PTE and loads the referenced
+data if it is present in the Level 1 Data Cache, as if the page referenced
+by the address bits in the PTE was still present and accessible.
+
+While this is a purely speculative mechanism and the instruction will raise
+a page fault when it is retired eventually, the pure act of loading the
+data and making it available to other speculative instructions opens up the
+opportunity for side channel attacks to unprivileged malicious code,
+similar to the Meltdown attack.
+
+While Meltdown breaks the user space to kernel space protection, L1TF
+allows to attack any physical memory address in the system and the attack
+works across all protection domains. It allows an attack of SGX and also
+works from inside virtual machines because the speculation bypasses the
+extended page table (EPT) protection mechanism.
+
+
+Attack scenarios
+----------------
+
+1. Malicious user space
+^^^^^^^^^^^^^^^^^^^^^^^
+
+ Operating Systems store arbitrary information in the address bits of a
+ PTE which is marked non present. This allows a malicious user space
+ application to attack the physical memory to which these PTEs resolve.
+ In some cases user-space can maliciously influence the information
+ encoded in the address bits of the PTE, thus making attacks more
+ deterministic and more practical.
+
+ The Linux kernel contains a mitigation for this attack vector, PTE
+ inversion, which is permanently enabled and has no performance
+ impact. The kernel ensures that the address bits of PTEs, which are not
+ marked present, never point to cacheable physical memory space.
+
+ A system with an up to date kernel is protected against attacks from
+ malicious user space applications.
+
+2. Malicious guest in a virtual machine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ The fact that L1TF breaks all domain protections allows malicious guest
+ OSes, which can control the PTEs directly, and malicious guest user
+ space applications, which run on an unprotected guest kernel lacking the
+ PTE inversion mitigation for L1TF, to attack physical host memory.
+
+ A special aspect of L1TF in the context of virtualization is symmetric
+ multi threading (SMT). The Intel implementation of SMT is called
+ HyperThreading. The fact that Hyperthreads on the affected processors
+ share the L1 Data Cache (L1D) is important for this. As the flaw allows
+ only to attack data which is present in L1D, a malicious guest running
+ on one Hyperthread can attack the data which is brought into the L1D by
+ the context which runs on the sibling Hyperthread of the same physical
+ core. This context can be host OS, host user space or a different guest.
+
+ If the processor does not support Extended Page Tables, the attack is
+ only possible, when the hypervisor does not sanitize the content of the
+ effective (shadow) page tables.
+
+ While solutions exist to mitigate these attack vectors fully, these
+ mitigations are not enabled by default in the Linux kernel because they
+ can affect performance significantly. The kernel provides several
+ mechanisms which can be utilized to address the problem depending on the
+ deployment scenario. The mitigations, their protection scope and impact
+ are described in the next sections.
+
+ The default mitigations and the rationale for choosing them are explained
+ at the end of this document. See :ref:`default_mitigations`.
+
+.. _l1tf_sys_info:
+
+L1TF system information
+-----------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current L1TF
+status of the system: whether the system is vulnerable, and which
+mitigations are active. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/l1tf
+
+The possible values in this file are:
+
+ =========================== ===============================
+ 'Not affected' The processor is not vulnerable
+ 'Mitigation: PTE Inversion' The host protection is active
+ =========================== ===============================
+
+If KVM/VMX is enabled and the processor is vulnerable then the following
+information is appended to the 'Mitigation: PTE Inversion' part:
+
+ - SMT status:
+
+ ===================== ================
+ 'VMX: SMT vulnerable' SMT is enabled
+ 'VMX: SMT disabled' SMT is disabled
+ ===================== ================
+
+ - L1D Flush mode:
+
+ ================================ ====================================
+ 'L1D vulnerable' L1D flushing is disabled
+
+ 'L1D conditional cache flushes' L1D flush is conditionally enabled
+
+ 'L1D cache flushes' L1D flush is unconditionally enabled
+ ================================ ====================================
+
+The resulting grade of protection is discussed in the following sections.
+
+
+Host mitigation mechanism
+-------------------------
+
+The kernel is unconditionally protected against L1TF attacks from malicious
+user space running on the host.
+
+
+Guest mitigation mechanisms
+---------------------------
+
+.. _l1d_flush:
+
+1. L1D flush on VMENTER
+^^^^^^^^^^^^^^^^^^^^^^^
+
+ To make sure that a guest cannot attack data which is present in the L1D
+ the hypervisor flushes the L1D before entering the guest.
+
+ Flushing the L1D evicts not only the data which should not be accessed
+ by a potentially malicious guest, it also flushes the guest
+ data. Flushing the L1D has a performance impact as the processor has to
+ bring the flushed guest data back into the L1D. Depending on the
+ frequency of VMEXIT/VMENTER and the type of computations in the guest
+ performance degradation in the range of 1% to 50% has been observed. For
+ scenarios where guest VMEXIT/VMENTER are rare the performance impact is
+ minimal. Virtio and mechanisms like posted interrupts are designed to
+ confine the VMEXITs to a bare minimum, but specific configurations and
+ application scenarios might still suffer from a high VMEXIT rate.
+
+ The kernel provides two L1D flush modes:
+ - conditional ('cond')
+ - unconditional ('always')
+
+ The conditional mode avoids L1D flushing after VMEXITs which execute
+ only audited code paths before the corresponding VMENTER. These code
+ paths have been verified that they cannot expose secrets or other
+ interesting data to an attacker, but they can leak information about the
+ address space layout of the hypervisor.
+
+ Unconditional mode flushes L1D on all VMENTER invocations and provides
+ maximum protection. It has a higher overhead than the conditional
+ mode. The overhead cannot be quantified correctly as it depends on the
+ workload scenario and the resulting number of VMEXITs.
+
+ The general recommendation is to enable L1D flush on VMENTER. The kernel
+ defaults to conditional mode on affected processors.
+
+ **Note**, that L1D flush does not prevent the SMT problem because the
+ sibling thread will also bring back its data into the L1D which makes it
+ attackable again.
+
+ L1D flush can be controlled by the administrator via the kernel command
+ line and sysfs control files. See :ref:`mitigation_control_command_line`
+ and :ref:`mitigation_control_kvm`.
+
+.. _guest_confinement:
+
+2. Guest VCPU confinement to dedicated physical cores
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ To address the SMT problem, it is possible to make a guest or a group of
+ guests affine to one or more physical cores. The proper mechanism for
+ that is to utilize exclusive cpusets to ensure that no other guest or
+ host tasks can run on these cores.
+
+ If only a single guest or related guests run on sibling SMT threads on
+ the same physical core then they can only attack their own memory and
+ restricted parts of the host memory.
+
+ Host memory is attackable, when one of the sibling SMT threads runs in
+ host OS (hypervisor) context and the other in guest context. The amount
+ of valuable information from the host OS context depends on the context
+ which the host OS executes, i.e. interrupts, soft interrupts and kernel
+ threads. The amount of valuable data from these contexts cannot be
+ declared as non-interesting for an attacker without deep inspection of
+ the code.
+
+ **Note**, that assigning guests to a fixed set of physical cores affects
+ the ability of the scheduler to do load balancing and might have
+ negative effects on CPU utilization depending on the hosting
+ scenario. Disabling SMT might be a viable alternative for particular
+ scenarios.
+
+ For further information about confining guests to a single or to a group
+ of cores consult the cpusets documentation:
+
+ https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+
+.. _interrupt_isolation:
+
+3. Interrupt affinity
+^^^^^^^^^^^^^^^^^^^^^
+
+ Interrupts can be made affine to logical CPUs. This is not universally
+ true because there are types of interrupts which are truly per CPU
+ interrupts, e.g. the local timer interrupt. Aside of that multi queue
+ devices affine their interrupts to single CPUs or groups of CPUs per
+ queue without allowing the administrator to control the affinities.
+
+ Moving the interrupts, which can be affinity controlled, away from CPUs
+ which run untrusted guests, reduces the attack vector space.
+
+ Whether the interrupts with are affine to CPUs, which run untrusted
+ guests, provide interesting data for an attacker depends on the system
+ configuration and the scenarios which run on the system. While for some
+ of the interrupts it can be assumed that they won't expose interesting
+ information beyond exposing hints about the host OS memory layout, there
+ is no way to make general assumptions.
+
+ Interrupt affinity can be controlled by the administrator via the
+ /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
+ available at:
+
+ https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
+
+.. _smt_control:
+
+4. SMT control
+^^^^^^^^^^^^^^
+
+ To prevent the SMT issues of L1TF it might be necessary to disable SMT
+ completely. Disabling SMT can have a significant performance impact, but
+ the impact depends on the hosting scenario and the type of workloads.
+ The impact of disabling SMT needs also to be weighted against the impact
+ of other mitigation solutions like confining guests to dedicated cores.
+
+ The kernel provides a sysfs interface to retrieve the status of SMT and
+ to control it. It also provides a kernel command line interface to
+ control SMT.
+
+ The kernel command line interface consists of the following options:
+
+ =========== ==========================================================
+ nosmt Affects the bring up of the secondary CPUs during boot. The
+ kernel tries to bring all present CPUs online during the
+ boot process. "nosmt" makes sure that from each physical
+ core only one - the so called primary (hyper) thread is
+ activated. Due to a design flaw of Intel processors related
+ to Machine Check Exceptions the non primary siblings have
+ to be brought up at least partially and are then shut down
+ again. "nosmt" can be undone via the sysfs interface.
+
+ nosmt=force Has the same effect as "nosmt" but it does not allow to
+ undo the SMT disable via the sysfs interface.
+ =========== ==========================================================
+
+ The sysfs interface provides two files:
+
+ - /sys/devices/system/cpu/smt/control
+ - /sys/devices/system/cpu/smt/active
+
+ /sys/devices/system/cpu/smt/control:
+
+ This file allows to read out the SMT control state and provides the
+ ability to disable or (re)enable SMT. The possible states are:
+
+ ============== ===================================================
+ on SMT is supported by the CPU and enabled. All
+ logical CPUs can be onlined and offlined without
+ restrictions.
+
+ off SMT is supported by the CPU and disabled. Only
+ the so called primary SMT threads can be onlined
+ and offlined without restrictions. An attempt to
+ online a non-primary sibling is rejected
+
+ forceoff Same as 'off' but the state cannot be controlled.
+ Attempts to write to the control file are rejected.
+
+ notsupported The processor does not support SMT. It's therefore
+ not affected by the SMT implications of L1TF.
+ Attempts to write to the control file are rejected.
+ ============== ===================================================
+
+ The possible states which can be written into this file to control SMT
+ state are:
+
+ - on
+ - off
+ - forceoff
+
+ /sys/devices/system/cpu/smt/active:
+
+ This file reports whether SMT is enabled and active, i.e. if on any
+ physical core two or more sibling threads are online.
+
+ SMT control is also possible at boot time via the l1tf kernel command
+ line parameter in combination with L1D flush control. See
+ :ref:`mitigation_control_command_line`.
+
+5. Disabling EPT
+^^^^^^^^^^^^^^^^
+
+ Disabling EPT for virtual machines provides full mitigation for L1TF even
+ with SMT enabled, because the effective page tables for guests are
+ managed and sanitized by the hypervisor. Though disabling EPT has a
+ significant performance impact especially when the Meltdown mitigation
+ KPTI is enabled.
+
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+
+There is ongoing research and development for new mitigation mechanisms to
+address the performance impact of disabling SMT or EPT.
+
+.. _mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+The kernel command line allows to control the L1TF mitigations at boot
+time with the option "l1tf=". The valid arguments for this option are:
+
+ ============ =============================================================
+ full Provides all available mitigations for the L1TF
+ vulnerability. Disables SMT and enables all mitigations in
+ the hypervisors, i.e. unconditional L1D flushing
+
+ SMT control and L1D flush control via the sysfs interface
+ is still possible after boot. Hypervisors will issue a
+ warning when the first VM is started in a potentially
+ insecure configuration, i.e. SMT enabled or L1D flush
+ disabled.
+
+ full,force Same as 'full', but disables SMT and L1D flush runtime
+ control. Implies the 'nosmt=force' command line option.
+ (i.e. sysfs control of SMT is disabled.)
+
+ flush Leaves SMT enabled and enables the default hypervisor
+ mitigation, i.e. conditional L1D flushing
+
+ SMT control and L1D flush control via the sysfs interface
+ is still possible after boot. Hypervisors will issue a
+ warning when the first VM is started in a potentially
+ insecure configuration, i.e. SMT enabled or L1D flush
+ disabled.
+
+ flush,nosmt Disables SMT and enables the default hypervisor mitigation,
+ i.e. conditional L1D flushing.
+
+ SMT control and L1D flush control via the sysfs interface
+ is still possible after boot. Hypervisors will issue a
+ warning when the first VM is started in a potentially
+ insecure configuration, i.e. SMT enabled or L1D flush
+ disabled.
+
+ flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
+ started in a potentially insecure configuration.
+
+ off Disables hypervisor mitigations and doesn't emit any
+ warnings.
+ ============ =============================================================
+
+The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
+
+
+.. _mitigation_control_kvm:
+
+Mitigation control for KVM - module parameter
+-------------------------------------------------------------
+
+The KVM hypervisor mitigation mechanism, flushing the L1D cache when
+entering a guest, can be controlled with a module parameter.
+
+The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
+following arguments:
+
+ ============ ==============================================================
+ always L1D cache flush on every VMENTER.
+
+ cond Flush L1D on VMENTER only when the code between VMEXIT and
+ VMENTER can leak host memory which is considered
+ interesting for an attacker. This still can leak host memory
+ which allows e.g. to determine the hosts address space layout.
+
+ never Disables the mitigation
+ ============ ==============================================================
+
+The parameter can be provided on the kernel command line, as a module
+parameter when loading the modules and at runtime modified via the sysfs
+file:
+
+/sys/module/kvm_intel/parameters/vmentry_l1d_flush
+
+The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
+line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
+module parameter is ignored and writes to the sysfs file are rejected.
+
+
+Mitigation selection guide
+--------------------------
+
+1. No virtualization in use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ The system is protected by the kernel unconditionally and no further
+ action is required.
+
+2. Virtualization with trusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ If the guest comes from a trusted source and the guest OS kernel is
+ guaranteed to have the L1TF mitigations in place the system is fully
+ protected against L1TF and no further action is required.
+
+ To avoid the overhead of the default L1D flushing on VMENTER the
+ administrator can disable the flushing via the kernel command line and
+ sysfs control files. See :ref:`mitigation_control_command_line` and
+ :ref:`mitigation_control_kvm`.
+
+
+3. Virtualization with untrusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+3.1. SMT not supported or disabled
+""""""""""""""""""""""""""""""""""
+
+ If SMT is not supported by the processor or disabled in the BIOS or by
+ the kernel, it's only required to enforce L1D flushing on VMENTER.
+
+ Conditional L1D flushing is the default behaviour and can be tuned. See
+ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+
+3.2. EPT not supported or disabled
+""""""""""""""""""""""""""""""""""
+
+ If EPT is not supported by the processor or disabled in the hypervisor,
+ the system is fully protected. SMT can stay enabled and L1D flushing on
+ VMENTER is not required.
+
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+
+3.3. SMT and EPT supported and active
+"""""""""""""""""""""""""""""""""""""
+
+ If SMT and EPT are supported and active then various degrees of
+ mitigations can be employed:
+
+ - L1D flushing on VMENTER:
+
+ L1D flushing on VMENTER is the minimal protection requirement, but it
+ is only potent in combination with other mitigation methods.
+
+ Conditional L1D flushing is the default behaviour and can be tuned. See
+ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+
+ - Guest confinement:
+
+ Confinement of guests to a single or a group of physical cores which
+ are not running any other processes, can reduce the attack surface
+ significantly, but interrupts, soft interrupts and kernel threads can
+ still expose valuable data to a potential attacker. See
+ :ref:`guest_confinement`.
+
+ - Interrupt isolation:
+
+ Isolating the guest CPUs from interrupts can reduce the attack surface
+ further, but still allows a malicious guest to explore a limited amount
+ of host physical memory. This can at least be used to gain knowledge
+ about the host address space layout. The interrupts which have a fixed
+ affinity to the CPUs which run the untrusted guests can depending on
+ the scenario still trigger soft interrupts and schedule kernel threads
+ which might expose valuable information. See
+ :ref:`interrupt_isolation`.
+
+The above three mitigation methods combined can provide protection to a
+certain degree, but the risk of the remaining attack surface has to be
+carefully analyzed. For full protection the following methods are
+available:
+
+ - Disabling SMT:
+
+ Disabling SMT and enforcing the L1D flushing provides the maximum
+ amount of protection. This mitigation is not depending on any of the
+ above mitigation methods.
+
+ SMT control and L1D flushing can be tuned by the command line
+ parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
+ time with the matching sysfs control files. See :ref:`smt_control`,
+ :ref:`mitigation_control_command_line` and
+ :ref:`mitigation_control_kvm`.
+
+ - Disabling EPT:
+
+ Disabling EPT provides the maximum amount of protection as well. It is
+ not depending on any of the above mitigation methods. SMT can stay
+ enabled and L1D flushing is not required, but the performance impact is
+ significant.
+
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
+ parameter.
+
+3.4. Nested virtual machines
+""""""""""""""""""""""""""""
+
+When nested virtualization is in use, three operating systems are involved:
+the bare metal hypervisor, the nested hypervisor and the nested virtual
+machine. VMENTER operations from the nested hypervisor into the nested
+guest will always be processed by the bare metal hypervisor. If KVM is the
+bare metal hypervisor it wiil:
+
+ - Flush the L1D cache on every switch from the nested hypervisor to the
+ nested virtual machine, so that the nested hypervisor's secrets are not
+ exposed to the nested virtual machine;
+
+ - Flush the L1D cache on every switch from the nested virtual machine to
+ the nested hypervisor; this is a complex operation, and flushing the L1D
+ cache avoids that the bare metal hypervisor's secrets are exposed to the
+ nested virtual machine;
+
+ - Instruct the nested hypervisor to not perform any L1D cache flush. This
+ is an optimization to avoid double L1D flushing.
+
+
+.. _default_mitigations:
+
+Default mitigations
+-------------------
+
+ The kernel default mitigations for vulnerable processors are:
+
+ - PTE inversion to protect against malicious user space. This is done
+ unconditionally and cannot be controlled.
+
+ - L1D conditional flushing on VMENTER when EPT is enabled for
+ a guest.
+
+ The kernel does not by default enforce the disabling of SMT, which leaves
+ SMT systems vulnerable when running untrusted guests with EPT enabled.
+
+ The rationale for this choice is:
+
+ - Force disabling SMT can break existing setups, especially with
+ unattended updates.
+
+ - If regular users run untrusted guests on their machine, then L1TF is
+ just an add on to other malware which might be embedded in an untrusted
+ guest, e.g. spam-bots or attacks on the local network.
+
+ There is no technical way to prevent a user from running untrusted code
+ on their machines blindly.
+
+ - It's technically extremely unlikely and from today's knowledge even
+ impossible that L1TF can be exploited via the most popular attack
+ mechanisms like JavaScript because these mechanisms have no way to
+ control PTEs. If this would be possible and not other mitigation would
+ be possible, then the default might be different.
+
+ - The administrators of cloud and hosting setups have to carefully
+ analyze the risk for their scenarios and make the appropriate
+ mitigation choices, which might even vary across their deployed
+ machines and also result in other changes of their overall setup.
+ There is no way for the kernel to provide a sensible default for this
+ kind of scenarios.
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 07f147381f32..ea2dafe49ae8 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0
0: Tag set is not shared.
1: Tag set shared between devices for blk-mq. Only makes sense with
nr_devices > 1, otherwise there's no tag set to share.
+
+zoned=[0/1]: Default: 0
+ 0: Block device is exposed as a random-access block device.
+ 1: Block device is exposed as a host-managed zoned block device.
+
+zone_size=[MB]: Default: 256
+ Per zone size when exposed as a zoned block device. Must be a power of two.
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt
index 0dbc946de2ea..0aace9cc536c 100644
--- a/Documentation/block/stat.txt
+++ b/Documentation/block/stat.txt
@@ -31,28 +31,32 @@ write ticks milliseconds total wait time for write requests
in_flight requests number of I/Os currently in flight
io_ticks milliseconds total time this block device has been active
time_in_queue milliseconds total wait time for all requests
+discard I/Os requests number of discard I/Os processed
+discard merges requests number of discard I/Os merged with in-queue I/O
+discard sectors sectors number of sectors discarded
+discard ticks milliseconds total wait time for discard requests
-read I/Os, write I/Os
-=====================
+read I/Os, write I/Os, discard I/0s
+===================================
These values increment when an I/O request completes.
-read merges, write merges
-=========================
+read merges, write merges, discard merges
+=========================================
These values increment when an I/O request is merged with an
already-queued I/O request.
-read sectors, write sectors
-===========================
+read sectors, write sectors, discard_sectors
+============================================
-These values count the number of sectors read from or written to this
-block device. The "sectors" in question are the standard UNIX 512-byte
-sectors, not any device- or filesystem-specific block size. The
-counters are incremented when the I/O completes.
+These values count the number of sectors read from, written to, or
+discarded from this block device. The "sectors" in question are the
+standard UNIX 512-byte sectors, not any device- or filesystem-specific
+block size. The counters are incremented when the I/O completes.
-read ticks, write ticks
-=======================
+read ticks, write ticks, discard ticks
+======================================
These values count the number of milliseconds that I/O requests have
waited on this block device. If there are multiple I/O requests waiting,
diff --git a/Documentation/conf.py b/Documentation/conf.py
index 62ac5a9f3a9f..b691af4831fa 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -34,7 +34,7 @@ needs_sphinx = '1.3'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
-extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure']
+extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig']
# The name of the math extension changed on Sphinx 1.4
if major == 1 and minor > 3:
diff --git a/Documentation/core-api/atomic_ops.rst b/Documentation/core-api/atomic_ops.rst
index 2e7165f86f55..724583453e1f 100644
--- a/Documentation/core-api/atomic_ops.rst
+++ b/Documentation/core-api/atomic_ops.rst
@@ -29,7 +29,7 @@ updated by one CPU, local_t is probably more appropriate. Please see
local_t.
The first operations to implement for atomic_t's are the initializers and
-plain reads. ::
+plain writes. ::
#define ATOMIC_INIT(i) { (i) }
#define atomic_set(v, i) ((v)->counter = (i))
diff --git a/Documentation/device-mapper/writecache.txt b/Documentation/device-mapper/writecache.txt
index 4424fa2c67d7..01532b3008ae 100644
--- a/Documentation/device-mapper/writecache.txt
+++ b/Documentation/device-mapper/writecache.txt
@@ -15,6 +15,8 @@ Constructor parameters:
size)
5. the number of optional parameters (the parameters with an argument
count as two)
+ start_sector n (default: 0)
+ offset from the start of cache device in 512-byte sectors
high_watermark n (default: 50)
start writeback when the number of used blocks reach this
watermark
diff --git a/Documentation/devicetree/bindings/arm/samsung/samsung-boards.txt b/Documentation/devicetree/bindings/arm/samsung/samsung-boards.txt
index bdadc3da9556..6970f30a3770 100644
--- a/Documentation/devicetree/bindings/arm/samsung/samsung-boards.txt
+++ b/Documentation/devicetree/bindings/arm/samsung/samsung-boards.txt
@@ -66,7 +66,7 @@ Required root node properties:
- "insignal,arndale-octa" - for Exynos5420-based Insignal Arndale
Octa board.
- "insignal,origen" - for Exynos4210-based Insignal Origen board.
- - "insignal,origen4412 - for Exynos4412-based Insignal Origen board.
+ - "insignal,origen4412" - for Exynos4412-based Insignal Origen board.
Optional nodes:
diff --git a/Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt b/Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt
index 6fddb4f4f71a..3055d5c2c04e 100644
--- a/Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt
+++ b/Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt
@@ -36,7 +36,7 @@ Optional nodes:
- port/ports: to describe a connection to an external encoder. The
binding follows Documentation/devicetree/bindings/graph.txt and
- suppors a single port with a single endpoint.
+ supports a single port with a single endpoint.
- See also Documentation/devicetree/bindings/display/tilcdc/panel.txt and
Documentation/devicetree/bindings/display/tilcdc/tfp410.txt for connecting
diff --git a/Documentation/devicetree/bindings/gpio/nintendo,hollywood-gpio.txt b/Documentation/devicetree/bindings/gpio/nintendo,hollywood-gpio.txt
index 20fc72d9e61e..45a61b462287 100644
--- a/Documentation/devicetree/bindings/gpio/nintendo,hollywood-gpio.txt
+++ b/Documentation/devicetree/bindings/gpio/nintendo,hollywood-gpio.txt
@@ -1,7 +1,7 @@
Nintendo Wii (Hollywood) GPIO controller
Required properties:
-- compatible: "nintendo,hollywood-gpio
+- compatible: "nintendo,hollywood-gpio"
- reg: Physical base address and length of the controller's registers.
- gpio-controller: Marks the device node as a GPIO controller.
- #gpio-cells: Should be <2>. The first cell is the pin number and the
diff --git a/Documentation/devicetree/bindings/hwmon/npcm750-pwm-fan.txt b/Documentation/devicetree/bindings/hwmon/npcm750-pwm-fan.txt
new file mode 100644
index 000000000000..28f43e929f6d
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/npcm750-pwm-fan.txt
@@ -0,0 +1,84 @@
+Nuvoton NPCM7xx PWM and Fan Tacho controller device
+
+The Nuvoton BMC NPCM7XX supports 8 Pulse-width modulation (PWM)
+controller outputs and 16 Fan tachometer controller inputs.
+
+Required properties for pwm-fan node
+- #address-cells : should be 1.
+- #size-cells : should be 0.
+- compatible : "nuvoton,npcm750-pwm-fan" for Poleg NPCM7XX.
+- reg : specifies physical base address and size of the registers.
+- reg-names : must contain:
+ * "pwm" for the PWM registers.
+ * "fan" for the Fan registers.
+- clocks : phandle of reference clocks.
+- clock-names : must contain
+ * "pwm" for PWM controller operating clock.
+ * "fan" for Fan controller operating clock.
+- interrupts : contain the Fan interrupts with flags for falling edge.
+- pinctrl-names : a pinctrl state named "default" must be defined.
+- pinctrl-0 : phandle referencing pin configuration of the PWM and Fan
+ controller ports.
+
+fan subnode format:
+===================
+Under fan subnode can be upto 8 child nodes, each child node representing a fan.
+Each fan subnode must have one PWM channel and atleast one Fan tach channel.
+
+For PWM channel can be configured cooling-levels to create cooling device.
+Cooling device could be bound to a thermal zone for the thermal control.
+
+Required properties for each child node:
+- reg : specify the PWM output channel.
+ integer value in the range 0 through 7, that represent
+ the PWM channel number that used.
+
+- fan-tach-ch : specify the Fan tach input channel.
+ integer value in the range 0 through 15, that represent
+ the fan tach channel number that used.
+
+ At least one Fan tach input channel is required
+
+Optional property for each child node:
+- cooling-levels: PWM duty cycle values in a range from 0 to 255
+ which correspond to thermal cooling states.
+
+Examples:
+
+pwm_fan:pwm-fan-controller@103000 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "nuvoton,npcm750-pwm-fan";
+ reg = <0x103000 0x2000>,
+ <0x180000 0x8000>;
+ reg-names = "pwm", "fan";
+ clocks = <&clk NPCM7XX_CLK_APB3>,
+ <&clk NPCM7XX_CLK_APB4>;
+ clock-names = "pwm","fan";
+ interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 97 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 102 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 103 IRQ_TYPE_LEVEL_HIGH>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pwm0_pins &pwm1_pins &pwm2_pins
+ &fanin0_pins &fanin1_pins &fanin2_pins
+ &fanin3_pins &fanin4_pins>;
+ fan@0 {
+ reg = <0x00>;
+ fan-tach-ch = /bits/ 8 <0x00 0x01>;
+ cooling-levels = <127 255>;
+ };
+ fan@1 {
+ reg = <0x01>;
+ fan-tach-ch = /bits/ 8 <0x02 0x03>;
+ };
+ fan@2 {
+ reg = <0x02>;
+ fan-tach-ch = /bits/ 8 <0x04>;
+ };
+
+};
diff --git a/Documentation/devicetree/bindings/input/touchscreen/hideep.txt b/Documentation/devicetree/bindings/input/touchscreen/hideep.txt
index 121d9b7c79a2..1063c30d53f7 100644
--- a/Documentation/devicetree/bindings/input/touchscreen/hideep.txt
+++ b/Documentation/devicetree/bindings/input/touchscreen/hideep.txt
@@ -32,7 +32,7 @@ i2c@00000000 {
reg = <0x6c>;
interrupt-parent = <&gpx1>;
interrupts = <2 IRQ_TYPE_LEVEL_LOW>;
- vdd-supply = <&ldo15_reg>";
+ vdd-supply = <&ldo15_reg>;
vid-supply = <&ldo18_reg>;
reset-gpios = <&gpx1 5 0>;
touchscreen-size-x = <1080>;
diff --git a/Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.txt b/Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.txt
index 5f89fb635a1b..f97fd8ab5e45 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.txt
@@ -4,6 +4,7 @@ Required properties:
- compatible : should be "ingenic,<socname>-intc". Valid strings are:
ingenic,jz4740-intc
+ ingenic,jz4725b-intc
ingenic,jz4770-intc
ingenic,jz4775-intc
ingenic,jz4780-intc
diff --git a/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra20-ictlr.txt b/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra20-ictlr.txt
index 1099fe0788fa..f246ccbf8838 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra20-ictlr.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/nvidia,tegra20-ictlr.txt
@@ -15,7 +15,7 @@ Required properties:
include "nvidia,tegra30-ictlr".
- reg : Specifies base physical address and size of the registers.
Each controller must be described separately (Tegra20 has 4 of them,
- whereas Tegra30 and later have 5"
+ whereas Tegra30 and later have 5).
- interrupt-controller : Identifies the node as an interrupt controller.
- #interrupt-cells : Specifies the number of cells needed to encode an
interrupt source. The value must be 3.
diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt
index 20f121daa910..697ca2f26d1b 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.txt
@@ -7,6 +7,7 @@ Required properties:
- "renesas,irqc-r8a73a4" (R-Mobile APE6)
- "renesas,irqc-r8a7743" (RZ/G1M)
- "renesas,irqc-r8a7745" (RZ/G1E)
+ - "renesas,irqc-r8a77470" (RZ/G1C)
- "renesas,irqc-r8a7790" (R-Car H2)
- "renesas,irqc-r8a7791" (R-Car M2-W)
- "renesas,irqc-r8a7792" (R-Car V2H)
@@ -16,6 +17,7 @@ Required properties:
- "renesas,intc-ex-r8a7796" (R-Car M3-W)
- "renesas,intc-ex-r8a77965" (R-Car M3-N)
- "renesas,intc-ex-r8a77970" (R-Car V3M)
+ - "renesas,intc-ex-r8a77980" (R-Car V3H)
- "renesas,intc-ex-r8a77995" (R-Car D3)
- #interrupt-cells: has to be <2>: an interrupt index and flags, as defined in
interrupts.txt in this directory
diff --git a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt
index 136bd612bd83..6a36bf66d932 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.txt
@@ -12,7 +12,7 @@ Required properties:
specifier, shall be 2
- interrupts: interrupts references to primary interrupt controller
(only needed for exti controller with multiple exti under
- same parent interrupt: st,stm32-exti and st,stm32h7-exti")
+ same parent interrupt: st,stm32-exti and st,stm32h7-exti)
Example:
diff --git a/Documentation/devicetree/bindings/mips/brcm/soc.txt b/Documentation/devicetree/bindings/mips/brcm/soc.txt
index 356c29789cf5..3a66d3c483e1 100644
--- a/Documentation/devicetree/bindings/mips/brcm/soc.txt
+++ b/Documentation/devicetree/bindings/mips/brcm/soc.txt
@@ -152,7 +152,7 @@ Required properties:
- compatible : should contain one of:
"brcm,bcm7425-timers"
"brcm,bcm7429-timers"
- "brcm,bcm7435-timers and
+ "brcm,bcm7435-timers" and
"brcm,brcmstb-timers"
- reg : the timers register range
- interrupts : the interrupt line for this timer block
diff --git a/Documentation/devicetree/bindings/net/fsl-fman.txt b/Documentation/devicetree/bindings/net/fsl-fman.txt
index df873d1f3b7c..f8c33890bc29 100644
--- a/Documentation/devicetree/bindings/net/fsl-fman.txt
+++ b/Documentation/devicetree/bindings/net/fsl-fman.txt
@@ -238,7 +238,7 @@ PROPERTIES
Must include one of the following:
- "fsl,fman-dtsec" for dTSEC MAC
- "fsl,fman-xgec" for XGEC MAC
- - "fsl,fman-memac for mEMAC MAC
+ - "fsl,fman-memac" for mEMAC MAC
- cell-index
Usage: required
diff --git a/Documentation/devicetree/bindings/phy/phy-ath79-usb.txt b/Documentation/devicetree/bindings/phy/phy-ath79-usb.txt
index cafe2197dad9..c3a29c5feea3 100644
--- a/Documentation/devicetree/bindings/phy/phy-ath79-usb.txt
+++ b/Documentation/devicetree/bindings/phy/phy-ath79-usb.txt
@@ -3,7 +3,7 @@
Required properties:
- compatible: "qca,ar7100-usb-phy"
- #phys-cells: should be 0
-- reset-names: "usb-phy"[, "usb-suspend-override"]
+- reset-names: "phy"[, "suspend-override"]
- resets: references to the reset controllers
Example:
@@ -11,7 +11,7 @@ Example:
usb-phy {
compatible = "qca,ar7100-usb-phy";
- reset-names = "usb-phy", "usb-suspend-override";
+ reset-names = "phy", "suspend-override";
resets = <&rst 4>, <&rst 3>;
#phy-cells = <0>;
diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 9b387f861aed..7dec508987c7 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -133,7 +133,7 @@ located inside a PM domain with index 0 of a power controller represented by a
node with the label "power".
In the second example the consumer device are partitioned across two PM domains,
the first with index 0 and the second with index 1, of a power controller that
-is represented by a node with the label "power.
+is represented by a node with the label "power".
Optional properties:
- required-opps: This contains phandle to an OPP node in another device's OPP
diff --git a/Documentation/devicetree/bindings/regulator/tps65090.txt b/Documentation/devicetree/bindings/regulator/tps65090.txt
index ca69f5e3040c..ae326f263597 100644
--- a/Documentation/devicetree/bindings/regulator/tps65090.txt
+++ b/Documentation/devicetree/bindings/regulator/tps65090.txt
@@ -16,7 +16,7 @@ Required properties:
Optional properties:
- ti,enable-ext-control: This is applicable for DCDC1, DCDC2 and DCDC3.
If DCDCs are externally controlled then this property should be there.
-- "dcdc-ext-control-gpios: This is applicable for DCDC1, DCDC2 and DCDC3.
+- dcdc-ext-control-gpios: This is applicable for DCDC1, DCDC2 and DCDC3.
If DCDCs are externally controlled and if it is from GPIO then GPIO
number should be provided. If it is externally controlled and no GPIO
entry then driver will just configure this rails as external control
diff --git a/Documentation/devicetree/bindings/reset/st,sti-softreset.txt b/Documentation/devicetree/bindings/reset/st,sti-softreset.txt
index a21658f18fe6..3661e6153a92 100644
--- a/Documentation/devicetree/bindings/reset/st,sti-softreset.txt
+++ b/Documentation/devicetree/bindings/reset/st,sti-softreset.txt
@@ -15,7 +15,7 @@ Please refer to reset.txt in this directory for common reset
controller binding usage.
Required properties:
-- compatible: Should be st,stih407-softreset";
+- compatible: Should be "st,stih407-softreset";
- #reset-cells: 1, see below
example:
diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt
index d330c73de9a2..68b7d6207e3d 100644
--- a/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.txt
@@ -39,7 +39,7 @@ Required properties:
Optional property:
- clock-frequency: Desired I2C bus clock frequency in Hz.
- When missing default to 400000Hz.
+ When missing default to 100000Hz.
Child nodes should conform to I2C bus binding as described in i2c.txt.
diff --git a/Documentation/devicetree/bindings/sound/qcom,apq8016-sbc.txt b/Documentation/devicetree/bindings/sound/qcom,apq8016-sbc.txt
index 6a4aadc4ce06..84b28dbe9f15 100644
--- a/Documentation/devicetree/bindings/sound/qcom,apq8016-sbc.txt
+++ b/Documentation/devicetree/bindings/sound/qcom,apq8016-sbc.txt
@@ -30,7 +30,7 @@ Required properties:
Board connectors:
* Headset Mic
- * Secondary Mic",
+ * Secondary Mic
* DMIC
* Ext Spk
diff --git a/Documentation/devicetree/bindings/sound/qcom,apq8096.txt b/Documentation/devicetree/bindings/sound/qcom,apq8096.txt
index aa54e49fc8a2..c7600a93ab39 100644
--- a/Documentation/devicetree/bindings/sound/qcom,apq8096.txt
+++ b/Documentation/devicetree/bindings/sound/qcom,apq8096.txt
@@ -35,7 +35,7 @@ This binding describes the APQ8096 sound card, which uses qdsp for audio.
"Digital Mic3"
Audio pins and MicBias on WCD9335 Codec:
- "MIC_BIAS1
+ "MIC_BIAS1"
"MIC_BIAS2"
"MIC_BIAS3"
"MIC_BIAS4"
diff --git a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt
index b1fe7e9de1b4..18d4d0166c76 100644
--- a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt
+++ b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt
@@ -1,19 +1,25 @@
-Mediatek MT6577, MT6572 and MT6589 Timers
----------------------------------------
+Mediatek Timers
+---------------
+
+Mediatek SoCs have two different timers on different platforms,
+- GPT (General Purpose Timer)
+- SYST (System Timer)
+
+The proper timer will be selected automatically by driver.
Required properties:
- compatible should contain:
- * "mediatek,mt2701-timer" for MT2701 compatible timers
- * "mediatek,mt6580-timer" for MT6580 compatible timers
- * "mediatek,mt6589-timer" for MT6589 compatible timers
- * "mediatek,mt7623-timer" for MT7623 compatible timers
- * "mediatek,mt8127-timer" for MT8127 compatible timers
- * "mediatek,mt8135-timer" for MT8135 compatible timers
- * "mediatek,mt8173-timer" for MT8173 compatible timers
- * "mediatek,mt6577-timer" for MT6577 and all above compatible timers
-- reg: Should contain location and length for timers register.
-- clocks: Clocks driving the timer hardware. This list should include two
- clocks. The order is system clock and as second clock the RTC clock.
+ * "mediatek,mt2701-timer" for MT2701 compatible timers (GPT)
+ * "mediatek,mt6580-timer" for MT6580 compatible timers (GPT)
+ * "mediatek,mt6589-timer" for MT6589 compatible timers (GPT)
+ * "mediatek,mt7623-timer" for MT7623 compatible timers (GPT)
+ * "mediatek,mt8127-timer" for MT8127 compatible timers (GPT)
+ * "mediatek,mt8135-timer" for MT8135 compatible timers (GPT)
+ * "mediatek,mt8173-timer" for MT8173 compatible timers (GPT)
+ * "mediatek,mt6577-timer" for MT6577 and all above compatible timers (GPT)
+ * "mediatek,mt6765-timer" for MT6765 compatible timers (SYST)
+- reg: Should contain location and length for timer register.
+- clocks: Should contain system clock.
Examples:
@@ -21,5 +27,5 @@ Examples:
compatible = "mediatek,mt6577-timer";
reg = <0x10008000 0x80>;
interrupts = <GIC_SPI 113 IRQ_TYPE_LEVEL_LOW>;
- clocks = <&system_clk>, <&rtc_clk>;
+ clocks = <&system_clk>;
};
diff --git a/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt b/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt
index 252a05c5d976..c8c4b00ecb94 100644
--- a/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt
+++ b/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt
@@ -16,7 +16,8 @@ A child node must exist to represent the core DWC3 IP block. The name of
the node is not important. The content of the node is defined in dwc3.txt.
Phy documentation is provided in the following places:
-Documentation/devicetree/bindings/phy/qcom-dwc3-usb-phy.txt
+Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.txt - USB2.0 PHY
+Documentation/devicetree/bindings/phy/phy-rockchip-typec.txt - Type-C PHY
Example device nodes:
diff --git a/Documentation/devicetree/bindings/w1/w1-gpio.txt b/Documentation/devicetree/bindings/w1/w1-gpio.txt
index 6e09c35d9f1a..37091902a021 100644
--- a/Documentation/devicetree/bindings/w1/w1-gpio.txt
+++ b/Documentation/devicetree/bindings/w1/w1-gpio.txt
@@ -15,7 +15,7 @@ Optional properties:
Examples:
- onewire@0 {
+ onewire {
compatible = "w1-gpio";
gpios = <&gpio 126 0>, <&gpio 105 0>;
};
diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index dbdf62907703..c7858dd1ea8f 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -5,10 +5,10 @@
#
# Architecture requirements
#
-# * arm64
+# * arm/arm64
#
-# Rely on eret context synchronization when returning from IPI handler, and
-# when returning to user-space.
+# Rely on implicit context synchronization as a result of exception return
+# when returning from IPI handler, and when returning to user-space.
#
# * x86
#
@@ -31,7 +31,7 @@
-----------------------
| alpha: | TODO |
| arc: | TODO |
- | arm: | TODO |
+ | arm: | ok |
| arm64: | ok |
| c6x: | TODO |
| h8300: | TODO |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 37bf0a9de75c..7cca82ed2ed1 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -64,7 +64,7 @@ prototypes:
void (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *,
struct file *, unsigned open_flag,
- umode_t create_mode, int *opened);
+ umode_t create_mode);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
locking rules:
diff --git a/Documentation/filesystems/cifs/CHANGES b/Documentation/filesystems/cifs/CHANGES
index 455e1cc494a9..1df7f4910eb2 100644
--- a/Documentation/filesystems/cifs/CHANGES
+++ b/Documentation/filesystems/cifs/CHANGES
@@ -1,1068 +1,4 @@
-See https://wiki.samba.org/index.php/LinuxCIFSKernel for
-more current information.
-
-Version 1.62
-------------
-Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
-to more strictly handle corrupt frames.
-
-Version 1.61
-------------
-Fix append problem to Samba servers (files opened with O_APPEND could
-have duplicated data). Fix oops in cifs_lookup. Workaround problem
-mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
-Disable use of server inode numbers when server only
-partially supports them (e.g. for one server querying inode numbers on
-FindFirst fails but QPathInfo queries works). Fix oops with dfs in
-cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
-for OpenOffice when on forcedirectio mount e.g.)
-
-Version 1.60
--------------
-Fix memory leak in reconnect. Fix oops in DFS mount error path.
-Set s_maxbytes to smaller (the max that vfs can handle) so that
-sendfile will now work over cifs mounts again. Add noforcegid
-and noforceuid mount parameters. Fix small mem leak when using
-ntlmv2. Fix 2nd mount to same server but with different port to
-be allowed (rather than reusing the 1st port) - only when the
-user explicitly overrides the port on the 2nd mount.
-
-Version 1.59
-------------
-Client uses server inode numbers (which are persistent) rather than
-client generated ones by default (mount option "serverino" turned
-on by default if server supports it). Add forceuid and forcegid
-mount options (so that when negotiating unix extensions specifying
-which uid mounted does not immediately force the server's reported
-uids to be overridden). Add support for scope mount parm. Improve
-hard link detection to use same inode for both. Do not set
-read-only dos attribute on directories (for chmod) since Windows
-explorer special cases this attribute bit for directories for
-a different purpose.
-
-Version 1.58
-------------
-Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
-when the UTF-8 string is composed of unusually long (more than 4 byte) converted
-characters. Add support for mounting root of a share which redirects immediately
-to DFS target. Convert string conversion functions from Unicode to more
-accurately mark string length before allocating memory (which may help the
-rare cases where a UTF-8 string is much larger than the UCS2 string that
-we converted from). Fix endianness of the vcnum field used during
-session setup to distinguish multiple mounts to same server from different
-userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
-flag to be set to 2, and mount must enable krb5 to turn on extended security).
-Performance of file create to Samba improved (posix create on lookup
-removes 1 of 2 network requests sent on file create)
-
-Version 1.57
-------------
-Improve support for multiple security contexts to the same server. We
-used to use the same "vcnumber" for all connections which could cause
-the server to treat subsequent connections, especially those that
-are authenticated as guest, as reconnections, invalidating the earlier
-user's smb session. This fix allows cifs to mount multiple times to the
-same server with different userids without risking invalidating earlier
-established security contexts. fsync now sends SMB Flush operation
-to better ensure that we wait for server to write all of the data to
-server disk (not just write it over the network). Add new mount
-parameter to allow user to disable sending the (slow) SMB flush on
-fsync if desired (fsync still flushes all cached write data to the server).
-Posix file open support added (turned off after one attempt if server
-fails to support it properly, as with Samba server versions prior to 3.3.2)
-Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
-little memory for the "nativeFileSystem" field returned by the server
-during mount). Endian convert inode numbers if necessary (makes it easier
-to compare inode numbers on network files from big endian systems).
-
-Version 1.56
-------------
-Add "forcemandatorylock" mount option to allow user to use mandatory
-rather than posix (advisory) byte range locks, even though server would
-support posix byte range locks. Fix query of root inode when prefixpath
-specified and user does not have access to query information about the
-top of the share. Fix problem in 2.6.28 resolving DFS paths to
-Samba servers (worked to Windows). Fix rmdir so that pending search
-(readdir) requests do not get invalid results which include the now
-removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
-when using DFS. Add better file create support to servers which support
-the CIFS POSIX protocol extensions (this adds support for new flags
-on create, and improves semantics for write of locked ranges).
-
-Version 1.55
-------------
-Various fixes to make delete of open files behavior more predictable
-(when delete of an open file fails we mark the file as "delete-on-close"
-in a way that more servers accept, but only if we can first rename the
-file to a temporary name). Add experimental support for more safely
-handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp
-sends, and also let tcp autotune the socket send and receive buffers.
-This reduces the number of EAGAIN errors returned by TCP/IP in
-high stress workloads (and the number of retries on socket writes
-when sending large SMBWriteX requests). Fix case in which a portion of
-data can in some cases not get written to the file on the server before the
-file is closed. Fix DFS parsing to properly handle path consumed field,
-and to handle certain codepage conversions better. Fix mount and
-umount race that can cause oops in mount or umount or reconnect.
-
-Version 1.54
-------------
-Fix premature write failure on congested networks (we would give up
-on EAGAIN from the socket too quickly on large writes).
-Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
-Fix endian problems in acl (mode from/to cifs acl) on bigendian
-architectures. Fix problems with preserving timestamps on copying open
-files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
-on parent directory when server supports Unix Extensions but not POSIX
-create. Update cifs.upcall version to handle new Kerberos sec flags
-(this requires update of cifs.upcall program from Samba). Fix memory leak
-on dns_upcall (resolving DFS referralls). Fix plain text password
-authentication (requires setting SecurityFlags to 0x30030 to enable
-lanman and plain text though). Fix writes to be at correct offset when
-file is open with O_APPEND and file is on a directio (forcediretio) mount.
-Fix bug in rewinding readdir directory searches. Add nodfs mount option.
-
-Version 1.53
-------------
-DFS support added (Microsoft Distributed File System client support needed
-for referrals which enable a hierarchical name space among servers).
-Disable temporary caching of mode bits to servers which do not support
-storing of mode (e.g. Windows servers, when client mounts without cifsacl
-mount option) and add new "dynperm" mount option to enable temporary caching
-of mode (enable old behavior). Fix hang on mount caused when server crashes
-tcp session during negotiate protocol.
-
-Version 1.52
-------------
-Fix oops on second mount to server when null auth is used.
-Enable experimental Kerberos support. Return writebehind errors on flush
-and sync so that events like out of disk space get reported properly on
-cached files. Fix setxattr failure to certain Samba versions. Fix mount
-of second share to disconnected server session (autoreconnect on this).
-Add ability to modify cifs acls for handling chmod (when mounted with
-cifsacl flag). Fix prefixpath path separator so we can handle mounts
-with prefixpaths longer than one directory (one path component) when
-mounted to Windows servers. Fix slow file open when cifsacl
-enabled. Fix memory leak in FindNext when the SMB call returns -EBADF.
-
-
-Version 1.51
-------------
-Fix memory leak in statfs when mounted to very old servers (e.g.
-Windows 9x). Add new feature "POSIX open" which allows servers
-which support the current POSIX Extensions to provide better semantics
-(e.g. delete for open files opened with posix open). Take into
-account umask on posix mkdir not just older style mkdir. Add
-ability to mount to IPC$ share (which allows CIFS named pipes to be
-opened, read and written as if they were files). When 1st tree
-connect fails (e.g. due to signing negotiation failure) fix
-leak that causes cifsd not to stop and rmmod to fail to cleanup
-cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on
-bigendian architectures. Fix possible memory corruption when
-EAGAIN returned on kern_recvmsg. Return better error if server
-requires packet signing but client has disabled it. When mounted
-with cifsacl mount option - mode bits are approximated based
-on the contents of the ACL of the file or directory. When cifs
-mount helper is missing convert make sure that UNC name
-has backslash (not forward slash) between ip address of server
-and the share name.
-
-Version 1.50
-------------
-Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is
-done with "serverino" mount option). Add support for POSIX Unlink
-(helps with certain sharing violation cases when server such as
-Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix"
-mount option to allow disabling the CIFS Unix Extensions for just
-that mount. Fix hang on spinlock in find_writable_file (race when
-reopening file after session crash). Byte range unlock request to
-windows server could unlock more bytes (on server copy of file)
-than intended if start of unlock request is well before start of
-a previous byte range lock that we issued.
-
-Version 1.49
-------------
-IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6
-address after the "ip=" mount option, at least until mount.cifs is fixed to
-handle DNS host to ipv6 name translation). Accept override of uid or gid
-on mount even when Unix Extensions are negotiated (it used to be ignored
-when Unix Extensions were ignored). This allows users to override the
-default uid and gid for files when they are certain that the uids or
-gids on the server do not match those of the client. Make "sec=none"
-mount override username (so that null user connection is attempted)
-to match what documentation said. Support for very large reads, over 127K,
-available to some newer servers (such as Samba 3.0.26 and later but
-note that it also requires setting CIFSMaxBufSize at module install
-time to a larger value which may hurt performance in some cases).
-Make sign option force signing (or fail if server does not support it).
-
-Version 1.48
-------------
-Fix mtime bouncing around from local idea of last write times to remote time.
-Fix hang (in i_size_read) when simultaneous size update of same remote file
-on smp system corrupts sequence number. Do not reread unnecessarily partial page
-(which we are about to overwrite anyway) when writing out file opened rw.
-When DOS attribute of file on non-Unix server's file changes on the server side
-from read-only back to read-write, reflect this change in default file mode
-(we had been leaving a file's mode read-only until the inode were reloaded).
-Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
-when archive dos attribute not set and we are changing mode back to writeable
-on server which does not support the Unix Extensions). Remove read only dos
-attribute on chmod when adding any write permission (ie on any of
-user/group/other (not all of user/group/other ie 0222) when
-mounted to windows. Add support for POSIX MkDir (slight performance
-enhancement and eliminates the network race between the mkdir and set
-path info of the mode).
-
-
-Version 1.47
-------------
-Fix oops in list_del during mount caused by unaligned string.
-Fix file corruption which could occur on some large file
-copies caused by writepages page i/o completion bug.
-Seek to SEEK_END forces check for update of file size for non-cached
-files. Allow file size to be updated on remote extend of locally open,
-non-cached file. Fix reconnect to newer Samba servers (or other servers
-which support the CIFS Unix/POSIX extensions) so that we again tell the
-server the Unix/POSIX cifs capabilities which we support (SetFSInfo).
-Add experimental support for new POSIX Open/Mkdir (which returns
-stat information on the open, and allows setting the mode).
-
-Version 1.46
-------------
-Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps.
-Allow null user to be specified on mount ("username="). Do not return
-EINVAL on readdir when filldir fails due to overwritten blocksize
-(fixes FC problem). Return error in rename 2nd attempt retry (ie report
-if rename by handle also fails, after rename by path fails, we were
-not reporting whether the retry worked or not). Fix NTLMv2 to
-work to Windows servers (mount with option "sec=ntlmv2").
-
-Version 1.45
-------------
-Do not time out lockw calls when using posix extensions. Do not
-time out requests if server still responding reasonably fast
-on requests on other threads. Improve POSIX locking emulation,
-(lock cancel now works, and unlock of merged range works even
-to Windows servers now). Fix oops on mount to lanman servers
-(win9x, os/2 etc.) when null password. Do not send listxattr
-(SMB to query all EAs) if nouser_xattr specified. Fix SE Linux
-problem (instantiate inodes/dentries in right order for readdir).
-
-Version 1.44
-------------
-Rewritten sessionsetup support, including support for legacy SMB
-session setup needed for OS/2 and older servers such as Windows 95 and 98.
-Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst
-so we can do search (ls etc.) to OS/2. Do not send NTCreateX
-or recent levels of FindFirst unless server says it supports NT SMBs
-(instead use legacy equivalents from LANMAN dialect). Fix to allow
-NTLMv2 authentication support (now can use stronger password hashing
-on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004).
-Allow override of global cifs security flags on mount via "sec=" option(s).
-
-Version 1.43
-------------
-POSIX locking to servers which support CIFS POSIX Extensions
-(disabled by default controlled by proc/fs/cifs/Experimental).
-Handle conversion of long share names (especially Asian languages)
-to Unicode during mount. Fix memory leak in sess struct on reconnect.
-Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on
-cifs open which helps rare case when setpathinfo fails or server does
-not support it.
-
-Version 1.42
-------------
-Fix slow oplock break when mounted to different servers at the same time and
-the tids match and we try to find matching fid on wrong server. Fix read
-looping when signing required by server (2.6.16 kernel only). Fix readdir
-vs. rename race which could cause each to hang. Return . and .. even
-if server does not. Allow searches to skip first three entries and
-begin at any location. Fix oops in find_writeable_file.
-
-Version 1.41
-------------
-Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can
-configure stronger authentication. Fix sfu symlinks so they can
-be followed (not just recognized). Fix wraparound of bcc on
-read responses when buffer size over 64K and also fix wrap of
-max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in
-cifs_user_read and cifs_readpages (when EAGAIN on send of smb
-on socket is returned over and over). Add POSIX (advisory) byte range
-locking support (requires server with newest CIFS UNIX Extensions
-to the protocol implemented). Slow down negprot slightly in port 139
-RFC1001 case to give session_init time on buggy servers.
-
-Version 1.40
-------------
-Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance
-of readpages by eliminating one extra memcpy. Allow update of file size
-from remote server even if file is open for write as long as mount is
-directio. Recognize share mode security and send NTLM encrypted password
-on tree connect if share mode negotiated.
-
-Version 1.39
-------------
-Defer close of a file handle slightly if pending writes depend on that handle
-(this reduces the EBADF bad file handle errors that can be logged under heavy
-stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2
-Fix SFU style symlinks and mknod needed for servers which do not support the
-CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative
-dentries so files that the client sees as deleted but that later get created
-on the server will be recognized. Add client side permission check on setattr.
-Timeout stuck requests better (where server has never responded or sent corrupt
-responses)
-
-Version 1.38
-------------
-Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket)
-to be smaller at first (but increasing) so large write performance performance
-over GigE is better. Do not hang thread on illegal byte range lock response
-from Windows (Windows can send an RFC1001 size which does not match smb size) by
-allowing an SMBs TCP length to be up to a few bytes longer than it should be.
-wsize and rsize can now be larger than negotiated buffer size if server
-supports large readx/writex, even when directio mount flag not specified.
-Write size will in many cases now be 16K instead of 4K which greatly helps
-file copy performance on lightly loaded networks. Fix oops in dnotify
-when experimental config flag enabled. Make cifsFYI more granular.
-
-Version 1.37
-------------
-Fix readdir caching when unlink removes file in current search buffer,
-and this is followed by a rewind search to just before the deleted entry.
-Do not attempt to set ctime unless atime and/or mtime change requested
-(most servers throw it away anyway). Fix length check of received smbs
-to be more accurate. Fix big endian problem with mapchars mount option,
-and with a field returned by statfs.
-
-Version 1.36
-------------
-Add support for mounting to older pre-CIFS servers such as Windows9x and ME.
-For these older servers, add option for passing netbios name of server in
-on mount (servernetbiosname). Add suspend support for power management, to
-avoid cifsd thread preventing software suspend from working.
-Add mount option for disabling the default behavior of sending byte range lock
-requests to the server (necessary for certain applications which break with
-mandatory lock behavior such as Evolution), and also mount option for
-requesting case insensitive matching for path based requests (requesting
-case sensitive is the default).
-
-Version 1.35
-------------
-Add writepage performance improvements. Fix path name conversions
-for long filenames on mounts which were done with "mapchars" mount option
-specified. Ensure multiplex ids do not collide. Fix case in which
-rmmod can oops if done soon after last unmount. Fix truncated
-search (readdir) output when resume filename was a long filename.
-Fix filename conversion when mapchars mount option was specified and
-filename was a long filename.
-
-Version 1.34
-------------
-Fix error mapping of the TOO_MANY_LINKS (hardlinks) case.
-Do not oops if root user kills cifs oplock kernel thread or
-kills the cifsd thread (NB: killing the cifs kernel threads is not
-recommended, unmount and rmmod cifs will kill them when they are
-no longer needed). Fix readdir to ASCII servers (ie older servers
-which do not support Unicode) and also require asterisk.
-Fix out of memory case in which data could be written one page
-off in the page cache.
-
-Version 1.33
-------------
-Fix caching problem, in which readdir of directory containing a file
-which was cached could cause the file's time stamp to be updated
-without invalidating the readahead data (so we could get stale
-file data on the client for that file even as the server copy changed).
-Cleanup response processing so cifsd can not loop when abnormally
-terminated.
-
-
-Version 1.32
-------------
-Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one
-transact response for an SMB request and search entry split across two frames.
-Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server)
-as new protocol extensions. Do not send Get/Set calls for POSIX ACLs
-unless server explicitly claims to support them in CIFS Unix extensions
-POSIX ACL capability bit. Fix packet signing when multiuser mounting with
-different users from the same client to the same server. Fix oops in
-cifs_close. Add mount option for remapping reserved characters in
-filenames (also allow recognizing files with created by SFU which have any
-of these seven reserved characters, except backslash, to be recognized).
-Fix invalid transact2 message (we were sometimes trying to interpret
-oplock breaks as SMB responses). Add ioctl for checking that the
-current uid matches the uid of the mounter (needed by umount.cifs).
-Reduce the number of large buffer allocations in cifs response processing
-(significantly reduces memory pressure under heavy stress with multiple
-processes accessing the same server at the same time).
-
-Version 1.31
-------------
-Fix updates of DOS attributes and time fields so that files on NT4 servers
-do not get marked delete on close. Display sizes of cifs buffer pools in
-cifs stats. Fix oops in unmount when cifsd thread being killed by
-shutdown. Add generic readv/writev and aio support. Report inode numbers
-consistently in readdir and lookup (when serverino mount option is
-specified use the inode number that the server reports - for both lookup
-and readdir, otherwise by default the locally generated inode number is used
-for inodes created in either path since servers are not always able to
-provide unique inode numbers when exporting multiple volumes from under one
-sharename).
-
-Version 1.30
-------------
-Allow new nouser_xattr mount parm to disable xattr support for user namespace.
-Do not flag user_xattr mount parm in dmesg. Retry failures setting file time
-(mostly affects NT4 servers) by retry with handle based network operation.
-Add new POSIX Query FS Info for returning statfs info more accurately.
-Handle passwords with multiple commas in them.
-
-Version 1.29
-------------
-Fix default mode in sysfs of cifs module parms. Remove old readdir routine.
-Fix capabilities flags for large readx so as to allow reads larger than 64K.
-
-Version 1.28
-------------
-Add module init parm for large SMB buffer size (to allow it to be changed
-from its default of 16K) which is especially useful for large file copy
-when mounting with the directio mount option. Fix oops after
-returning from mount when experimental ExtendedSecurity enabled and
-SpnegoNegotiated returning invalid error. Fix case to retry better when
-peek returns from 1 to 3 bytes on socket which should have more data.
-Fixed path based calls (such as cifs lookup) to handle path names
-longer than 530 (now can handle PATH_MAX). Fix pass through authentication
-from Samba server to DC (Samba required dummy LM password).
-
-Version 1.27
-------------
-Turn off DNOTIFY (directory change notification support) by default
-(unless built with the experimental flag) to fix hang with KDE
-file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event
-waiting on an SMB response) in SendReceive when session dies but
-reconnects quickly from another task. Add module init parms for
-minimum number of large and small network buffers in the buffer pools,
-and for the maximum number of simultaneous requests.
-
-Version 1.26
-------------
-Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later
-and other POSIX CIFS compliant servers. Fix error mapping for getfacl
-to EOPNOTSUPP when server does not support posix acls on the wire. Fix
-improperly zeroed buffer in CIFS Unix extensions set times call.
-
-Version 1.25
-------------
-Fix internationalization problem in cifs readdir with filenames that map to
-longer UTF-8 strings than the string on the wire was in Unicode. Add workaround
-for readdir to netapp servers. Fix search rewind (seek into readdir to return
-non-consecutive entries). Do not do readdir when server negotiates
-buffer size to small to fit filename. Add support for reading POSIX ACLs from
-the server (add also acl and noacl mount options).
-
-Version 1.24
-------------
-Optionally allow using server side inode numbers, rather than client generated
-ones by specifying mount option "serverino" - this is required for some apps
-to work which double check hardlinked files and have persistent inode numbers.
-
-Version 1.23
-------------
-Multiple bigendian fixes. On little endian systems (for reconnect after
-network failure) fix tcp session reconnect code so we do not try first
-to reconnect on reverse of port 445. Treat reparse points (NTFS junctions)
-as directories rather than symlinks because we can do follow link on them.
-
-Version 1.22
-------------
-Add config option to enable XATTR (extended attribute) support, mapping
-xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of
-minor fixes pointed out by the Stanford SWAT checker (mostly missing
-or out of order NULL pointer checks in little used error paths).
-
-Version 1.21
-------------
-Add new mount parm to control whether mode check (generic_permission) is done
-on the client. If Unix extensions are enabled and the uids on the client
-and server do not match, client permission checks are meaningless on
-server uids that do not exist on the client (this does not affect the
-normal ACL check which occurs on the server). Fix default uid
-on mknod to match create and mkdir. Add optional mount parm to allow
-override of the default uid behavior (in which the server sets the uid
-and gid of newly created files). Normally for network filesystem mounts
-user want the server to set the uid/gid on newly created files (rather than
-using uid of the client processes you would in a local filesystem).
-
-Version 1.20
-------------
-Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps
-info into /proc/fs/cifs/DebugData. Fix oops in rare oops in readdir
-(in build_wildcard_path_from_dentry). Fix mknod to pass type field
-(block/char/fifo) properly. Remove spurious mount warning log entry when
-credentials passed as mount argument. Set major/minor device number in
-inode for block and char devices when unix extensions enabled.
-
-Version 1.19
-------------
-Fix /proc/fs/cifs/Stats and DebugData display to handle larger
-amounts of return data. Properly limit requests to MAX_REQ (50
-is the usual maximum active multiplex SMB/CIFS requests per server).
-Do not kill cifsd (and thus hurt the other SMB session) when more than one
-session to the same server (but with different userids) exists and one
-of the two user's smb sessions is being removed while leaving the other.
-Do not loop reconnecting in cifsd demultiplex thread when admin
-kills the thread without going through unmount.
-
-Version 1.18
-------------
-Do not rename hardlinked files (since that should be a noop). Flush
-cached write behind data when reopening a file after session abend,
-except when already in write. Grab per socket sem during reconnect
-to avoid oops in sendmsg if overlapping with reconnect. Do not
-reset cached inode file size on readdir for files open for write on
-client.
-
-
-Version 1.17
-------------
-Update number of blocks in file so du command is happier (in Linux a fake
-blocksize of 512 is required for calculating number of blocks in inode).
-Fix prepare write of partial pages to read in data from server if possible.
-Fix race on tcpStatus field between unmount and reconnection code, causing
-cifsd process sometimes to hang around forever. Improve out of memory
-checks in cifs_filldir
-
-Version 1.16
-------------
-Fix incorrect file size in file handle based setattr on big endian hardware.
-Fix oops in build_path_from_dentry when out of memory. Add checks for invalid
-and closing file structs in writepage/partialpagewrite. Add statistics
-for each mounted share (new menuconfig option). Fix endianness problem in
-volume information displayed in /proc/fs/cifs/DebugData (only affects
-affects big endian architectures). Prevent renames while constructing
-path names for open, mkdir and rmdir.
-
-Version 1.15
-------------
-Change to mempools for alloc smb request buffers and multiplex structs
-to better handle low memory problems (and potential deadlocks).
-
-Version 1.14
-------------
-Fix incomplete listings of large directories on Samba servers when Unix
-extensions enabled. Fix oops when smb_buffer can not be allocated. Fix
-rename deadlock when writing out dirty pages at same time.
-
-Version 1.13
-------------
-Fix open of files in which O_CREATE can cause the mode to change in
-some cases. Fix case in which retry of write overlaps file close.
-Fix PPC64 build error. Reduce excessive stack usage in smb password
-hashing. Fix overwrite of Linux user's view of file mode to Windows servers.
-
-Version 1.12
-------------
-Fixes for large file copy, signal handling, socket retry, buffer
-allocation and low memory situations.
-
-Version 1.11
-------------
-Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize)
-also now allowing support for specifying client netbiosname. NT4 support added.
-
-Version 1.10
-------------
-Fix reconnection (and certain failed mounts) to properly wake up the
-blocked users thread so it does not seem hung (in some cases was blocked
-until the cifs receive timeout expired). Fix spurious error logging
-to kernel log when application with open network files killed.
-
-Version 1.09
-------------
-Fix /proc/fs module unload warning message (that could be logged
-to the kernel log). Fix intermittent failure in connectathon
-test7 (hardlink count not immediately refreshed in case in which
-inode metadata can be incorrectly kept cached when time near zero)
-
-Version 1.08
-------------
-Allow file_mode and dir_mode (specified at mount time) to be enforced
-locally (the server already enforced its own ACLs too) for servers
-that do not report the correct mode (do not support the
-CIFS Unix Extensions).
-
-Version 1.07
-------------
-Fix some small memory leaks in some unmount error paths. Fix major leak
-of cache pages in readpages causing multiple read oriented stress
-testcases (including fsx, and even large file copy) to fail over time.
-
-Version 1.06
-------------
-Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server.
-This allows files that differ only in case and improves performance of file
-creation and file open to such servers. Fix semaphore conflict which causes
-slow delete of open file to Samba (which unfortunately can cause an oplock
-break to self while vfs_unlink held i_sem) which can hang for 20 seconds.
-
-Version 1.05
-------------
-fixes to cifs_readpages for fsx test case
-
-Version 1.04
-------------
-Fix caching data integrity bug when extending file size especially when no
-oplock on file. Fix spurious logging of valid already parsed mount options
-that are parsed outside of the cifs vfs such as nosuid.
-
-
-Version 1.03
-------------
-Connect to server when port number override not specified, and tcp port
-unitialized. Reset search to restart at correct file when kernel routine
-filldir returns error during large directory searches (readdir).
-
-Version 1.02
-------------
-Fix caching problem when files opened by multiple clients in which
-page cache could contain stale data, and write through did
-not occur often enough while file was still open when read ahead
-(read oplock) not allowed. Treat "sep=" when first mount option
-as an override of comma as the default separator between mount
-options.
-
-Version 1.01
-------------
-Allow passwords longer than 16 bytes. Allow null password string.
-
-Version 1.00
-------------
-Gracefully clean up failed mounts when attempting to mount to servers such as
-Windows 98 that terminate tcp sessions during protocol negotiation. Handle
-embedded commas in mount parsing of passwords.
-
-Version 0.99
-------------
-Invalidate local inode cached pages on oplock break and when last file
-instance is closed so that the client does not continue using stale local
-copy rather than later modified server copy of file. Do not reconnect
-when server drops the tcp session prematurely before negotiate
-protocol response. Fix oops in reopen_file when dentry freed. Allow
-the support for CIFS Unix Extensions to be disabled via proc interface.
-
-Version 0.98
-------------
-Fix hang in commit_write during reconnection of open files under heavy load.
-Fix unload_nls oops in a mount failure path. Serialize writes to same socket
-which also fixes any possible races when cifs signatures are enabled in SMBs
-being sent out of signature sequence number order.
-
-Version 0.97
-------------
-Fix byte range locking bug (endian problem) causing bad offset and
-length.
-
-Version 0.96
-------------
-Fix oops (in send_sig) caused by CIFS unmount code trying to
-wake up the demultiplex thread after it had exited. Do not log
-error on harmless oplock release of closed handle.
-
-Version 0.95
-------------
-Fix unsafe global variable usage and password hash failure on gcc 3.3.1
-Fix problem reconnecting secondary mounts to same server after session
-failure. Fix invalid dentry - race in mkdir when directory gets created
-by another client between the lookup and mkdir.
-
-Version 0.94
-------------
-Fix to list processing in reopen_files. Fix reconnection when server hung
-but tcpip session still alive. Set proper timeout on socket read.
-
-Version 0.93
-------------
-Add missing mount options including iocharset. SMP fixes in write and open.
-Fix errors in reconnecting after TCP session failure. Fix module unloading
-of default nls codepage
-
-Version 0.92
-------------
-Active smb transactions should never go negative (fix double FreeXid). Fix
-list processing in file routines. Check return code on kmalloc in open.
-Fix spinlock usage for SMP.
-
-Version 0.91
-------------
-Fix oops in reopen_files when invalid dentry. drop dentry on server rename
-and on revalidate errors. Fix cases where pid is now tgid. Fix return code
-on create hard link when server does not support them.
-
-Version 0.90
-------------
-Fix scheduling while atomic error in getting inode info on newly created file.
-Fix truncate of existing files opened with O_CREAT but not O_TRUNC set.
-
-Version 0.89
-------------
-Fix oops on write to dead tcp session. Remove error log write for case when file open
-O_CREAT but not O_EXCL
-
-Version 0.88
-------------
-Fix non-POSIX behavior on rename of open file and delete of open file by taking
-advantage of trans2 SetFileInfo rename facility if available on target server.
-Retry on ENOSPC and EAGAIN socket errors.
-
-Version 0.87
-------------
-Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix
-allocation size miscalculation. After oplock token lost do not read through
-cache.
-
-Version 0.86
-------------
-Fix oops on empty file readahead. Fix for file size handling for locally cached files.
-
-Version 0.85
-------------
-Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files
-during auto reconnection to server after server recovered from failure.
-
-Version 0.84
-------------
-Finish support for Linux 2.5 open/create changes, which removes the
-redundant NTCreate/QPathInfo/close that was sent during file create.
-Enable oplock by default. Enable packet signing by default (needed to
-access many recent Windows servers)
-
-Version 0.83
-------------
-Fix oops when mounting to long server names caused by inverted parms to kmalloc.
-Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled
-we will choose a cifs user session (smb uid) that better matches the local
-uid if a) the mount uid does not match the current uid and b) we have another
-session to the same server (ip address) for a different mount which
-matches the current local uid.
-
-Version 0.82
-------------
-Add support for mknod of block or character devices. Fix oplock
-code (distributed caching) to properly send response to oplock
-break from server.
-
-Version 0.81
-------------
-Finish up CIFS packet digital signing for the default
-NTLM security case. This should help Windows 2003
-network interoperability since it is common for
-packet signing to be required now. Fix statfs (stat -f)
-which recently started returning errors due to
-invalid value (-1 instead of 0) being set in the
-struct kstatfs f_ffiles field.
-
-Version 0.80
------------
-Fix oops on stopping oplock thread when removing cifs when
-built as module.
-
-Version 0.79
-------------
-Fix mount options for ro (readonly), uid, gid and file and directory mode.
-
-Version 0.78
-------------
-Fix errors displayed on failed mounts to be more understandable.
-Fixed various incorrect or misleading smb to posix error code mappings.
-
-Version 0.77
-------------
-Fix display of NTFS DFS junctions to display as symlinks.
-They are the network equivalent. Fix oops in
-cifs_partialpagewrite caused by missing spinlock protection
-of openfile linked list. Allow writebehind caching errors to
-be returned to the application at file close.
-
-Version 0.76
-------------
-Clean up options displayed in /proc/mounts by show_options to
-be more consistent with other filesystems.
-
-Version 0.75
-------------
-Fix delete of readonly file to Windows servers. Reflect
-presence or absence of read only dos attribute in mode
-bits for servers that do not support CIFS Unix extensions.
-Fix shortened results on readdir of large directories to
-servers supporting CIFS Unix extensions (caused by
-incorrect resume key).
-
-Version 0.74
-------------
-Fix truncate bug (set file size) that could cause hangs e.g. running fsx
-
-Version 0.73
-------------
-unload nls if mount fails.
-
-Version 0.72
-------------
-Add resume key support to search (readdir) code to workaround
-Windows bug. Add /proc/fs/cifs/LookupCacheEnable which
-allows disabling caching of attribute information for
-lookups.
-
-Version 0.71
-------------
-Add more oplock handling (distributed caching code). Remove
-dead code. Remove excessive stack space utilization from
-symlink routines.
-
-Version 0.70
-------------
-Fix oops in get dfs referral (triggered when null path sent in to
-mount). Add support for overriding rsize at mount time.
-
-Version 0.69
-------------
-Fix buffer overrun in readdir which caused intermittent kernel oopses.
-Fix writepage code to release kmap on write data. Allow "-ip=" new
-mount option to be passed in on parameter distinct from the first part
-(server name portion of) the UNC name. Allow override of the
-tcp port of the target server via new mount option "-port="
-
-Version 0.68
-------------
-Fix search handle leak on rewind. Fix setuid and gid so that they are
-reflected in the local inode immediately. Cleanup of whitespace
-to make 2.4 and 2.5 versions more consistent.
-
-
-Version 0.67
-------------
-Fix signal sending so that captive thread (cifsd) exits on umount
-(which was causing the warning in kmem_cache_free of the request buffers
-at rmmod time). This had broken as a sideeffect of the recent global
-kernel change to daemonize. Fix memory leak in readdir code which
-showed up in "ls -R" (and applications that did search rewinding).
-
-Version 0.66
-------------
-Reconnect tids and fids after session reconnection (still do not
-reconnect byte range locks though). Fix problem caching
-lookup information for directory inodes, improving performance,
-especially in deep directory trees. Fix various build warnings.
-
-Version 0.65
-------------
-Finish fixes to commit write for caching/readahead consistency. fsx
-now works to Samba servers. Fix oops caused when readahead
-was interrupted by a signal.
-
-Version 0.64
-------------
-Fix data corruption (in partial page after truncate) that caused fsx to
-fail to Windows servers. Cleaned up some extraneous error logging in
-common error paths. Add generic sendfile support.
-
-Version 0.63
-------------
-Fix memory leak in AllocMidQEntry.
-Finish reconnection logic, so connection with server can be dropped
-(or server rebooted) and the cifs client will reconnect.
-
-Version 0.62
-------------
-Fix temporary socket leak when bad userid or password specified
-(or other SMBSessSetup failure). Increase maximum buffer size to slightly
-over 16K to allow negotiation of up to Samba and Windows server default read
-sizes. Add support for readpages
-
-Version 0.61
-------------
-Fix oops when username not passed in on mount. Extensive fixes and improvements
-to error logging (strip redundant newlines, change debug macros to ensure newline
-passed in and to be more consistent). Fix writepage wrong file handle problem,
-a readonly file handle could be incorrectly used to attempt to write out
-file updates through the page cache to multiply open files. This could cause
-the iozone benchmark to fail on the fwrite test. Fix bug mounting two different
-shares to the same Windows server when using different usernames
-(doing this to Samba servers worked but Windows was rejecting it) - now it is
-possible to use different userids when connecting to the same server from a
-Linux client. Fix oops when treeDisconnect called during unmount on
-previously freed socket.
-
-Version 0.60
-------------
-Fix oops in readpages caused by not setting address space operations in inode in
-rare code path.
-
-Version 0.59
-------------
-Includes support for deleting of open files and renaming over existing files (per POSIX
-requirement). Add readlink support for Windows junction points (directory symlinks).
-
-Version 0.58
-------------
-Changed read and write to go through pagecache. Added additional address space operations.
-Memory mapped operations now working.
-
-Version 0.57
-------------
-Added writepage code for additional memory mapping support. Fixed leak in xids causing
-the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on
-every stat call. Additional formatting cleanup.
-
-Version 0.56
-------------
-Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup.
-
-Version 0.55
-------------
-Fixes from Zwane Mwaikambo for adding missing return code checking in a few places.
-Also included a modified version of his fix to protect global list manipulation of
-the smb session and tree connection and mid related global variables.
-
-Version 0.54
-------------
-Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre
-changes to superblock layout. Remove wasteful allocation of smb buffers (now the send
-buffer is reused for responses). Add more oplock handling. Additional minor cleanup.
-
-Version 0.53
-------------
-More stylistic updates to better match kernel style. Add additional statistics
-for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2
-and CIFS Packet Signing enablement.
-
-Version 0.52
-------------
-Replace call to sleep_on with safer wait_on_event.
-Make stylistic changes to better match kernel style recommendations.
-Remove most typedef usage (except for the PDUs themselves).
-
-Version 0.51
-------------
-Update mount so the -unc mount option is no longer required (the ip address can be specified
-in a UNC style device name. Implementation of readpage/writepage started.
-
-Version 0.50
-------------
-Fix intermittent problem with incorrect smb header checking on badly
-fragmented tcp responses
-
-Version 0.49
-------------
-Fixes to setting of allocation size and file size.
-
-Version 0.48
-------------
-Various 2.5.38 fixes. Now works on 2.5.38
-
-Version 0.47
-------------
-Prepare for 2.5 kernel merge. Remove ifdefs.
-
-Version 0.46
-------------
-Socket buffer management fixes. Fix dual free.
-
-Version 0.45
-------------
-Various big endian fixes for hardlinks and symlinks and also for dfs.
-
-Version 0.44
-------------
-Various big endian fixes for servers with Unix extensions such as Samba
-
-Version 0.43
-------------
-Various FindNext fixes for incorrect filenames on large directory searches on big endian
-clients. basic posix file i/o tests now work on big endian machines, not just le
-
-Version 0.42
-------------
-SessionSetup and NegotiateProtocol now work from Big Endian machines.
-Various Big Endian fixes found during testing on the Linux on 390. Various fixes for compatibility with older
-versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7).
-
-Version 0.41
-------------
-Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked
-files now return the correct number of links on fstat as they are repeatedly linked and unlinked.
-
-Version 0.40
-------------
-Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate
-session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP.
-Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs
-(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos).
-
-Version 0.38
-------------
-Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable
-it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU).
-
-Version 0.37
-------------
-Rewrote much of connection and mount/unmount logic to handle bugs with
-multiple uses to same share, multiple users to same server etc.
-
-Version 0.36
-------------
-Fixed major problem with dentry corruption (missing call to dput)
-
-Version 0.35
-------------
-Rewrite of readdir code to fix bug. Various fixes for bigendian machines.
-Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs
-although corresponding function not fully implemented in the vfs yet
-
-Version 0.34
-------------
-Fixed dentry caching bug, misc. cleanup
-
-Version 0.33
-------------
-Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build
-on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels.
-Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added.
-
-Version 0.32
-------------
-Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented
-and tested against Samba 2.2.5
-
-
-Version 0.31
-------------
-1) Fixed lockrange to be correct (it was one byte too short)
-
-2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly
-show range as locked when there is a conflict with an existing lock.
-
-3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories
-in most cases. Eventually will offer optional ability to query server for the correct perms.
-
-3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded
-but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb
-session)
-
-4) Fixed error logging of valid mount options
-
-5) Removed logging of password field.
-
-6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c
-and cleaned them up and made them more consistent with other cifs functions.
-
-7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways
-(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed,
-nor is the symlink support using the Unix extensions
-
-8) Started adding the readlink and follow_link code
-
-Version 0.3
------------
-Initial drop
-
+See https://wiki.samba.org/index.php/LinuxCIFSKernel for summary
+information (that may be easier to read than parsing the output of
+"git log fs/cifs") about fixes/improvements to CIFS/SMB2/SMB3 support (changes
+to cifs.ko module) by kernel version (and cifs internal module version).
diff --git a/Documentation/filesystems/cifs/README b/Documentation/filesystems/cifs/README
index 99ce3d25003d..4a804619cff2 100644
--- a/Documentation/filesystems/cifs/README
+++ b/Documentation/filesystems/cifs/README
@@ -603,8 +603,7 @@ DebugData Displays information about active CIFS sessions and
shares, features enabled as well as the cifs.ko
version.
Stats Lists summary resource usage information as well as per
- share statistics, if CONFIG_CIFS_STATS in enabled
- in the kernel configuration.
+ share statistics.
Configuration pseudo-files:
SecurityFlags Flags which control security negotiation and
@@ -687,23 +686,22 @@ cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel
logging of various informational messages. 2 enables logging of non-zero
SMB return codes while 4 enables logging of requests that take longer
than one second to complete (except for byte range lock requests).
-Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the
-source code (typically by setting it in the beginning of cifsglob.h),
-and setting it to seven enables all three. Finally, tracing
+Setting it to 4 requires CONFIG_CIFS_STATS2 to be set in kernel configuration
+(.config). Setting it to seven enables all three. Finally, tracing
the start of smb requests and responses can be enabled via:
echo 1 > /proc/fs/cifs/traceSMB
-Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
-if the kernel was configured with cifs statistics enabled. The statistics
-represent the number of successful (ie non-zero return code from the server)
-SMB responses to some of the more common commands (open, delete, mkdir etc.).
+Per share (per client mount) statistics are available in /proc/fs/cifs/Stats.
+Additional information is available if CONFIG_CIFS_STATS2 is enabled in the
+kernel configuration (.config). The statistics returned include counters which
+represent the number of attempted and failed (ie non-zero return code from the
+server) SMB3 (or cifs) requests grouped by request type (read, write, close etc.).
Also recorded is the total bytes read and bytes written to the server for
that share. Note that due to client caching effects this can be less than the
number of bytes read and written by the application running on the client.
-The statistics for the number of total SMBs and oplock breaks are different in
-that they represent all for that share, not just those for which the server
-returned success.
+Statistics can be reset to zero by "echo 0 > /proc/fs/cifs/Stats" which may be
+useful if comparing performance of two different scenarios.
Also note that "cat /proc/fs/cifs/DebugData" will display information about
the active sessions and the shares that are mounted.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4/ext4.rst
index 7f628b9f7c4b..9d4368d591fa 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4/ext4.rst
@@ -1,6 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
-Ext4 Filesystem
-===============
+========================
+General Information
+========================
Ext4 is an advanced level of the ext3 filesystem which incorporates
scalability and reliability enhancements for supporting large filesystems
@@ -11,37 +13,30 @@ Mailing list: linux-ext4@vger.kernel.org
Web site: http://ext4.wiki.kernel.org
-1. Quick usage instructions:
-===========================
+Quick usage instructions
+========================
Note: More extensive information for getting started with ext4 can be
- found at the ext4 wiki site at the URL:
- http://ext4.wiki.kernel.org/index.php/Ext4_Howto
+found at the ext4 wiki site at the URL:
+http://ext4.wiki.kernel.org/index.php/Ext4_Howto
- - Compile and install the latest version of e2fsprogs (as of this
- writing version 1.41.3) from:
+ - The latest version of e2fsprogs can be found at:
+
+ https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
- http://sourceforge.net/project/showfiles.php?group_id=2406
-
or
- https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
+ http://sourceforge.net/project/showfiles.php?group_id=2406
or grab the latest git repository from:
- git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
-
- - Note that it is highly important to install the mke2fs.conf file
- that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
- you have edited the /etc/mke2fs.conf file installed on your system,
- you will need to merge your changes with the version from e2fsprogs
- 1.41.x.
+ https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
- Create a new filesystem using the ext4 filesystem type:
- # mke2fs -t ext4 /dev/hda1
+ # mke2fs -t ext4 /dev/hda1
- Or to configure an existing ext3 filesystem to support extents:
+ Or to configure an existing ext3 filesystem to support extents:
# tune2fs -O extents /dev/hda1
@@ -50,10 +45,6 @@ Note: More extensive information for getting started with ext4 can be
# tune2fs -I 256 /dev/hda1
- (Note: we currently do not have tools to convert an ext4
- filesystem back to ext3; so please do not do try this on production
- filesystems.)
-
- Mounting:
# mount -t ext4 /dev/hda1 /wherever
@@ -75,10 +66,11 @@ Note: More extensive information for getting started with ext4 can be
the filesystem with a large journal can also be helpful for
metadata-intensive workloads.
-2. Features
-===========
+Features
+========
-2.1 Currently available
+Currently Available
+-------------------
* ability to use filesystems > 16TB (e2fsprogs support not available yet)
* extent format reduces metadata overhead (RAM, IO for access, transactions)
@@ -103,31 +95,15 @@ Note: More extensive information for getting started with ext4 can be
[1] Filesystems with a block size of 1k may see a limit imposed by the
directory hash tree having a maximum depth of two.
-2.2 Candidate features for future inclusion
-
-* online defrag (patches available but not well tested)
-* reduced mke2fs time via lazy itable initialization in conjunction with
- the uninit_bg feature (capability to do this is available in e2fsprogs
- but a kernel thread to do lazy zeroing of unused inode table blocks
- after filesystem is first mounted is required for safety)
-
-There are several others under discussion, whether they all make it in is
-partly a function of how much time everyone has to work on them. Features like
-metadata checksumming have been discussed and planned for a bit but no patches
-exist yet so I'm not sure they're in the near-term roadmap.
-
-The big performance win will come with mballoc, delalloc and flex_bg
-grouping of bitmaps and inode tables. Some test results available here:
-
- - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html
- - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html
-
-3. Options
-==========
+Options
+=======
When mounting an ext4 filesystem, the following option are accepted:
(*) == default
+======================= =======================================================
+Mount Option Description
+======================= =======================================================
ro Mount filesystem read only. Note that ext4 will
replay the journal (and thus write to the
partition) even when mounted "read only". The
@@ -387,33 +363,38 @@ i_version Enable 64-bit inode version support. This option is
dax Use direct access (no page cache). See
Documentation/filesystems/dax.txt. Note that
this option is incompatible with data=journal.
+======================= =======================================================
Data Mode
=========
There are 3 different data modes:
* writeback mode
-In data=writeback mode, ext4 does not journal data at all. This mode provides
-a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
-mode - metadata journaling. A crash+recovery can cause incorrect data to
-appear in files which were written shortly before the crash. This mode will
-typically provide the best ext4 performance.
+
+ In data=writeback mode, ext4 does not journal data at all. This mode provides
+ a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
+ mode - metadata journaling. A crash+recovery can cause incorrect data to
+ appear in files which were written shortly before the crash. This mode will
+ typically provide the best ext4 performance.
* ordered mode
-In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata information related to data changes with the data blocks into a
-single unit called a transaction. When it's time to write the new metadata
-out to disk, the associated data blocks are written first. In general,
-this mode performs slightly slower than writeback but significantly faster than journal mode.
+
+ In data=ordered mode, ext4 only officially journals metadata, but it logically
+ groups metadata information related to data changes with the data blocks into
+ a single unit called a transaction. When it's time to write the new metadata
+ out to disk, the associated data blocks are written first. In general, this
+ mode performs slightly slower than writeback but significantly faster than
+ journal mode.
* journal mode
-data=journal mode provides full data and metadata journaling. All new data is
-written to the journal first, and then to its final location.
-In the event of a crash, the journal can be replayed, bringing both data and
-metadata into a consistent state. This mode is the slowest except when data
-needs to be read from and written to disk at the same time where it
-outperforms all others modes. Enabling this mode will disable delayed
-allocation and O_DIRECT support.
+
+ data=journal mode provides full data and metadata journaling. All new data is
+ written to the journal first, and then to its final location. In the event of
+ a crash, the journal can be replayed, bringing both data and metadata into a
+ consistent state. This mode is the slowest except when data needs to be read
+ from and written to disk at the same time where it outperforms all others
+ modes. Enabling this mode will disable delayed allocation and O_DIRECT
+ support.
/proc entries
=============
@@ -425,10 +406,12 @@ Information about mounted ext4 file systems can be found in
in table below.
Files in /proc/fs/ext4/<devname>
-..............................................................................
+
+================ =======
File Content
+================ =======
mb_groups details of multiblock allocator buddy cache of free blocks
-..............................................................................
+================ =======
/sys entries
============
@@ -439,28 +422,30 @@ Information about mounted ext4 file systems can be found in
/sys/fs/ext4/dm-0). The files in each per-device directory are shown
in table below.
-Files in /sys/fs/ext4/<devname>
+Files in /sys/fs/ext4/<devname>:
+
(see also Documentation/ABI/testing/sysfs-fs-ext4)
-..............................................................................
- File Content
+============================= =================================================
+File Content
+============================= =================================================
delayed_allocation_blocks This file is read-only and shows the number of
blocks that are dirty in the page cache, but
which do not have their location in the
filesystem allocated yet.
- inode_goal Tuning parameter which (if non-zero) controls
+inode_goal Tuning parameter which (if non-zero) controls
the goal inode used by the inode allocator in
preference to all other allocation heuristics.
This is intended for debugging use only, and
should be 0 on production systems.
- inode_readahead_blks Tuning parameter which controls the maximum
+inode_readahead_blks Tuning parameter which controls the maximum
number of inode table blocks that ext4's inode
table readahead algorithm will pre-read into
the buffer cache
- lifetime_write_kbytes This file is read-only and shows the number of
+lifetime_write_kbytes This file is read-only and shows the number of
kilobytes of data that have been written to this
filesystem since it was created.
@@ -508,7 +493,7 @@ Files in /sys/fs/ext4/<devname>
in the file system. If there is not enough space
for the reserved space when mounting the file
mount will _not_ fail.
-..............................................................................
+============================= =================================================
Ioctls
======
@@ -518,8 +503,10 @@ through the system call interfaces. The list of all Ext4 specific ioctls are
shown in the table below.
Table of Ext4 specific ioctls
-..............................................................................
- Ioctl Description
+
+============================= =================================================
+Ioctl Description
+============================= =================================================
EXT4_IOC_GETFLAGS Get additional attributes associated with inode.
The ioctl argument is an integer bitfield, with
bit values described in ext4.h. This ioctl is an
@@ -610,8 +597,7 @@ Table of Ext4 specific ioctls
normal user by accident.
The data blocks of the previous boot loader
will be associated with the given inode.
-
-..............................................................................
+============================= =================================================
References
==========
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst
new file mode 100644
index 000000000000..71121605558c
--- /dev/null
+++ b/Documentation/filesystems/ext4/index.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+ext4 Filesystem
+===============
+
+General usage and on-disk artifacts writen by ext4. More documentation may
+be ported from the wiki as time permits. This should be considered the
+canonical source of information as the details here have been reviewed by
+the ext4 community.
+
+.. toctree::
+ :maxdepth: 5
+ :numbered:
+
+ ext4
+ ondisk/index
diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/ondisk/about.rst
new file mode 100644
index 000000000000..0aadba052264
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/about.rst
@@ -0,0 +1,44 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+About this Book
+===============
+
+This document attempts to describe the on-disk format for ext4
+filesystems. The same general ideas should apply to ext2/3 filesystems
+as well, though they do not support all the features that ext4 supports,
+and the fields will be shorter.
+
+**NOTE**: This is a work in progress, based on notes that the author
+(djwong) made while picking apart a filesystem by hand. The data
+structure definitions should be current as of Linux 4.18 and
+e2fsprogs-1.44. All comments and corrections are welcome, since there is
+undoubtedly plenty of lore that might not be reflected in freshly
+created demonstration filesystems.
+
+License
+-------
+This book is licensed under the terms of the GNU Public License, v2.
+
+Terminology
+-----------
+
+ext4 divides a storage device into an array of logical blocks both to
+reduce bookkeeping overhead and to increase throughput by forcing larger
+transfer sizes. Generally, the block size will be 4KiB (the same size as
+pages on x86 and the block layer's default block size), though the
+actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes.
+Throughout this document, disk locations are given in terms of these
+logical blocks, not raw LBAs, and not 1024-byte blocks. For the sake of
+convenience, the logical block size will be referred to as
+``$block_size`` throughout the rest of the document.
+
+When referenced in ``preformatted text`` blocks, ``sb`` refers to fields
+in the super block, and ``inode`` refers to fields in an inode table
+entry.
+
+Other References
+----------------
+
+Also see http://www.nongnu.org/ext2-doc/ for quite a collection of
+information about ext2/3. Here's another old reference:
+http://wiki.osdev.org/Ext2
diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/ondisk/allocators.rst
new file mode 100644
index 000000000000..7aa85152ace3
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/allocators.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Block and Inode Allocation Policy
+---------------------------------
+
+ext4 recognizes (better than ext3, anyway) that data locality is
+generally a desirably quality of a filesystem. On a spinning disk,
+keeping related blocks near each other reduces the amount of movement
+that the head actuator and disk must perform to access a data block,
+thus speeding up disk IO. On an SSD there of course are no moving parts,
+but locality can increase the size of each transfer request while
+reducing the total number of requests. This locality may also have the
+effect of concentrating writes on a single erase block, which can speed
+up file rewrites significantly. Therefore, it is useful to reduce
+fragmentation whenever possible.
+
+The first tool that ext4 uses to combat fragmentation is the multi-block
+allocator. When a file is first created, the block allocator
+speculatively allocates 8KiB of disk space to the file on the assumption
+that the space will get written soon. When the file is closed, the
+unused speculative allocations are of course freed, but if the
+speculation is correct (typically the case for full writes of small
+files) then the file data gets written out in a single multi-block
+extent. A second related trick that ext4 uses is delayed allocation.
+Under this scheme, when a file needs more blocks to absorb file writes,
+the filesystem defers deciding the exact placement on the disk until all
+the dirty buffers are being written out to disk. By not committing to a
+particular placement until it's absolutely necessary (the commit timeout
+is hit, or sync() is called, or the kernel runs out of memory), the hope
+is that the filesystem can make better location decisions.
+
+The third trick that ext4 (and ext3) uses is that it tries to keep a
+file's data blocks in the same block group as its inode. This cuts down
+on the seek penalty when the filesystem first has to read a file's inode
+to learn where the file's data blocks live and then seek over to the
+file's data blocks to begin I/O operations.
+
+The fourth trick is that all the inodes in a directory are placed in the
+same block group as the directory, when feasible. The working assumption
+here is that all the files in a directory might be related, therefore it
+is useful to try to keep them all together.
+
+The fifth trick is that the disk volume is cut up into 128MB block
+groups; these mini-containers are used as outlined above to try to
+maintain data locality. However, there is a deliberate quirk -- when a
+directory is created in the root directory, the inode allocator scans
+the block groups and puts that directory into the least heavily loaded
+block group that it can find. This encourages directories to spread out
+over a disk; as the top-level directory/file blobs fill up one block
+group, the allocators simply move on to the next block group. Allegedly
+this scheme evens out the loading on the block groups, though the author
+suspects that the directories which are so unlucky as to land towards
+the end of a spinning drive get a raw deal performance-wise.
+
+Of course if all of these mechanisms fail, one can always use e4defrag
+to defragment files.
diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/ondisk/attributes.rst
new file mode 100644
index 000000000000..0b01b67b81fe
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/attributes.rst
@@ -0,0 +1,191 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Extended Attributes
+-------------------
+
+Extended attributes (xattrs) are typically stored in a separate data
+block on the disk and referenced from inodes via ``inode.i_file_acl*``.
+The first use of extended attributes seems to have been for storing file
+ACLs and other security data (selinux). With the ``user_xattr`` mount
+option it is possible for users to store extended attributes so long as
+all attribute names begin with “user”; this restriction seems to have
+disappeared as of Linux 3.0.
+
+There are two places where extended attributes can be found. The first
+place is between the end of each inode entry and the beginning of the
+next inode entry. For example, if inode.i\_extra\_isize = 28 and
+sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes
+available for in-inode extended attribute storage. The second place
+where extended attributes can be found is in the block pointed to by
+``inode.i_file_acl``. As of Linux 3.11, it is not possible for this
+block to contain a pointer to a second extended attribute block (or even
+the remaining blocks of a cluster). In theory it is possible for each
+attribute's value to be stored in a separate data block, though as of
+Linux 3.11 the code does not permit this.
+
+Keys are generally assumed to be ASCIIZ strings, whereas values can be
+strings or binary data.
+
+Extended attributes, when stored after the inode, have a header
+``ext4_xattr_ibody_header`` that is 4 bytes long:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - h\_magic
+ - Magic number for identification, 0xEA020000. This value is set by the
+ Linux driver, though e2fsprogs doesn't seem to check it(?)
+
+The beginning of an extended attribute block is in
+``struct ext4_xattr_header``, which is 32 bytes long:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - h\_magic
+ - Magic number for identification, 0xEA020000.
+ * - 0x4
+ - \_\_le32
+ - h\_refcount
+ - Reference count.
+ * - 0x8
+ - \_\_le32
+ - h\_blocks
+ - Number of disk blocks used.
+ * - 0xC
+ - \_\_le32
+ - h\_hash
+ - Hash value of all attributes.
+ * - 0x10
+ - \_\_le32
+ - h\_checksum
+ - Checksum of the extended attribute block.
+ * - 0x14
+ - \_\_u32
+ - h\_reserved[2]
+ - Zero.
+
+The checksum is calculated against the FS UUID, the 64-bit block number
+of the extended attribute block, and the entire block (header +
+entries).
+
+Following the ``struct ext4_xattr_header`` or
+``struct ext4_xattr_ibody_header`` is an array of
+``struct ext4_xattr_entry``; each of these entries is at least 16 bytes
+long. When stored in an external block, the ``struct ext4_xattr_entry``
+entries must be stored in sorted order. The sort order is
+``e_name_index``, then ``e_name_len``, and finally ``e_name``.
+Attributes stored inside an inode do not need be stored in sorted order.
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_u8
+ - e\_name\_len
+ - Length of name.
+ * - 0x1
+ - \_\_u8
+ - e\_name\_index
+ - Attribute name index. There is a discussion of this below.
+ * - 0x2
+ - \_\_le16
+ - e\_value\_offs
+ - Location of this attribute's value on the disk block where it is stored.
+ Multiple attributes can share the same value. For an inode attribute
+ this value is relative to the start of the first entry; for a block this
+ value is relative to the start of the block (i.e. the header).
+ * - 0x4
+ - \_\_le32
+ - e\_value\_inum
+ - The inode where the value is stored. Zero indicates the value is in the
+ same block as this entry. This field is only used if the
+ INCOMPAT\_EA\_INODE feature is enabled.
+ * - 0x8
+ - \_\_le32
+ - e\_value\_size
+ - Length of attribute value.
+ * - 0xC
+ - \_\_le32
+ - e\_hash
+ - Hash value of attribute name and attribute value. The kernel doesn't
+ update the hash for in-inode attributes, so for that case this value
+ must be zero, because e2fsck validates any non-zero hash regardless of
+ where the xattr lives.
+ * - 0x10
+ - char
+ - e\_name[e\_name\_len]
+ - Attribute name. Does not include trailing NULL.
+
+Attribute values can follow the end of the entry table. There appears to
+be a requirement that they be aligned to 4-byte boundaries. The values
+are stored starting at the end of the block and grow towards the
+xattr\_header/xattr\_entry table. When the two collide, the overflow is
+put into a separate disk block. If the disk block fills up, the
+filesystem returns -ENOSPC.
+
+The first four fields of the ``ext4_xattr_entry`` are set to zero to
+mark the end of the key list.
+
+Attribute Name Indices
+~~~~~~~~~~~~~~~~~~~~~~
+
+Logically speaking, extended attributes are a series of key=value pairs.
+The keys are assumed to be NULL-terminated strings. To reduce the amount
+of on-disk space that the keys consume, the beginning of the key string
+is matched against the attribute name index. If a match is found, the
+attribute name index field is set, and matching string is removed from
+the key name. Here is a map of name index values to key prefixes:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Name Index
+ - Key Prefix
+ * - 0
+ - (no prefix)
+ * - 1
+ - “user.”
+ * - 2
+ - “system.posix\_acl\_access”
+ * - 3
+ - “system.posix\_acl\_default”
+ * - 4
+ - “trusted.”
+ * - 6
+ - “security.”
+ * - 7
+ - “system.” (inline\_data only?)
+ * - 8
+ - “system.richacl” (SuSE kernels only?)
+
+For example, if the attribute key is “user.fubar”, the attribute name
+index is set to 1 and the “fubar” name is recorded on disk.
+
+POSIX ACLs
+~~~~~~~~~~
+
+POSIX ACLs are stored in a reduced version of the Linux kernel (and
+libacl's) internal ACL format. The key difference is that the version
+number is different (1) and the ``e_id`` field is only stored for named
+user and group ACLs.
diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/ondisk/bigalloc.rst
new file mode 100644
index 000000000000..c6d88557553c
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/bigalloc.rst
@@ -0,0 +1,22 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Bigalloc
+--------
+
+At the moment, the default size of a block is 4KiB, which is a commonly
+supported page size on most MMU-capable hardware. This is fortunate, as
+ext4 code is not prepared to handle the case where the block size
+exceeds the page size. However, for a filesystem of mostly huge files,
+it is desirable to be able to allocate disk blocks in units of multiple
+blocks to reduce both fragmentation and metadata overhead. The
+`bigalloc <Bigalloc>`__ feature provides exactly this ability. The
+administrator can set a block cluster size at mkfs time (which is stored
+in the s\_log\_cluster\_size field in the superblock); from then on, the
+block bitmaps track clusters, not individual blocks. This means that
+block groups can be several gigabytes in size (instead of just 128MiB);
+however, the minimum allocation unit becomes a cluster, not a block,
+even for directories. TaoBao had a patchset to extend the “use units of
+clusters instead of blocks” to the extent tree, though it is not clear
+where those patches went-- they eventually morphed into “extent tree v2”
+but that code has not landed as of May 2015.
+
diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/ondisk/bitmaps.rst
new file mode 100644
index 000000000000..c7546dbc197a
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/bitmaps.rst
@@ -0,0 +1,28 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Block and inode Bitmaps
+-----------------------
+
+The data block bitmap tracks the usage of data blocks within the block
+group.
+
+The inode bitmap records which entries in the inode table are in use.
+
+As with most bitmaps, one bit represents the usage status of one data
+block or inode table entry. This implies a block group size of 8 \*
+number\_of\_bytes\_in\_a\_logical\_block.
+
+NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts
+of the kernel and e2fsprogs code pretends that the block bitmap contains
+zeros (i.e. all blocks in the group are free). However, it is not
+necessarily the case that no blocks are in use -- if ``meta_bg`` is set,
+the bitmaps and group descriptor live inside the group. Unfortunately,
+ext2fs\_test\_block\_bitmap2() will return '0' for those locations,
+which produces confusing debugfs output.
+
+Inode Table
+-----------
+Inode tables are statically allocated at mkfs time. Each block group
+descriptor points to the start of the table, and the superblock records
+the number of inodes per group. See the section on inodes for more
+information.
diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/ondisk/blockgroup.rst
new file mode 100644
index 000000000000..baf888e4c06a
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/blockgroup.rst
@@ -0,0 +1,135 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Layout
+------
+
+The layout of a standard block group is approximately as follows (each
+of these fields is discussed in a separate section below):
+
+.. list-table::
+ :widths: 1 1 1 1 1 1 1 1
+ :header-rows: 1
+
+ * - Group 0 Padding
+ - ext4 Super Block
+ - Group Descriptors
+ - Reserved GDT Blocks
+ - Data Block Bitmap
+ - inode Bitmap
+ - inode Table
+ - Data Blocks
+ * - 1024 bytes
+ - 1 block
+ - many blocks
+ - many blocks
+ - 1 block
+ - 1 block
+ - many blocks
+ - many more blocks
+
+For the special case of block group 0, the first 1024 bytes are unused,
+to allow for the installation of x86 boot sectors and other oddities.
+The superblock will start at offset 1024 bytes, whichever block that
+happens to be (usually 0). However, if for some reason the block size =
+1024, then block 0 is marked in use and the superblock goes in block 1.
+For all other block groups, there is no padding.
+
+The ext4 driver primarily works with the superblock and the group
+descriptors that are found in block group 0. Redundant copies of the
+superblock and group descriptors are written to some of the block groups
+across the disk in case the beginning of the disk gets trashed, though
+not all block groups necessarily host a redundant copy (see following
+paragraph for more details). If the group does not have a redundant
+copy, the block group begins with the data block bitmap. Note also that
+when the filesystem is freshly formatted, mkfs will allocate “reserve
+GDT block” space after the block group descriptors and before the start
+of the block bitmaps to allow for future expansion of the filesystem. By
+default, a filesystem is allowed to increase in size by a factor of
+1024x over the original filesystem size.
+
+The location of the inode table is given by ``grp.bg_inode_table_*``. It
+is continuous range of blocks large enough to contain
+``sb.s_inodes_per_group * sb.s_inode_size`` bytes.
+
+As for the ordering of items in a block group, it is generally
+established that the super block and the group descriptor table, if
+present, will be at the beginning of the block group. The bitmaps and
+the inode table can be anywhere, and it is quite possible for the
+bitmaps to come after the inode table, or for both to be in different
+groups (flex\_bg). Leftover space is used for file data blocks, indirect
+block maps, extent tree blocks, and extended attributes.
+
+Flexible Block Groups
+---------------------
+
+Starting in ext4, there is a new feature called flexible block groups
+(flex\_bg). In a flex\_bg, several block groups are tied together as one
+logical block group; the bitmap spaces and the inode table space in the
+first block group of the flex\_bg are expanded to include the bitmaps
+and inode tables of all other block groups in the flex\_bg. For example,
+if the flex\_bg size is 4, then group 0 will contain (in order) the
+superblock, group descriptors, data block bitmaps for groups 0-3, inode
+bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining
+space in group 0 is for file data. The effect of this is to group the
+block metadata close together for faster loading, and to enable large
+files to be continuous on disk. Backup copies of the superblock and
+group descriptors are always at the beginning of block groups, even if
+flex\_bg is enabled. The number of block groups that make up a flex\_bg
+is given by 2 ^ ``sb.s_log_groups_per_flex``.
+
+Meta Block Groups
+-----------------
+
+Without the option META\_BG, for safety concerns, all block group
+descriptors copies are kept in the first block group. Given the default
+128MiB(2^27 bytes) block group size and 64-byte group descriptors, ext4
+can have at most 2^27/64 = 2^21 block groups. This limits the entire
+filesystem size to 2^21 ∗ 2^27 = 2^48bytes or 256TiB.
+
+The solution to this problem is to use the metablock group feature
+(META\_BG), which is already in ext3 for all 2.6 releases. With the
+META\_BG feature, ext4 filesystems are partitioned into many metablock
+groups. Each metablock group is a cluster of block groups whose group
+descriptor structures can be stored in a single disk block. For ext4
+filesystems with 4 KB block size, a single metablock group partition
+includes 64 block groups, or 8 GiB of disk space. The metablock group
+feature moves the location of the group descriptors from the congested
+first block group of the whole filesystem into the first group of each
+metablock group itself. The backups are in the second and last group of
+each metablock group. This increases the 2^21 maximum block groups limit
+to the hard limit 2^32, allowing support for a 512PiB filesystem.
+
+The change in the filesystem format replaces the current scheme where
+the superblock is followed by a variable-length set of block group
+descriptors. Instead, the superblock and a single block group descriptor
+block is placed at the beginning of the first, second, and last block
+groups in a meta-block group. A meta-block group is a collection of
+block groups which can be described by a single block group descriptor
+block. Since the size of the block group descriptor structure is 32
+bytes, a meta-block group contains 32 block groups for filesystems with
+a 1KB block size, and 128 block groups for filesystems with a 4KB
+blocksize. Filesystems can either be created using this new block group
+descriptor layout, or existing filesystems can be resized on-line, and
+the field s\_first\_meta\_bg in the superblock will indicate the first
+block group using this new layout.
+
+Please see an important note about ``BLOCK_UNINIT`` in the section about
+block and inode bitmaps.
+
+Lazy Block Group Initialization
+-------------------------------
+
+A new feature for ext4 are three block group descriptor flags that
+enable mkfs to skip initializing other parts of the block group
+metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean
+that the inode and block bitmaps for that group can be calculated and
+therefore the on-disk bitmap blocks are not initialized. This is
+generally the case for an empty block group or a block group containing
+only fixed-location block group metadata. The INODE\_ZEROED flag means
+that the inode table has been initialized; mkfs will unset this flag and
+rely on the kernel to initialize the inode tables in the background.
+
+By not writing zeroes to the bitmaps and inode table, mkfs time is
+reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM,
+but the dumpe2fs output prints this as “uninit\_bg”. They are the same
+thing.
diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/ondisk/blockmap.rst
new file mode 100644
index 000000000000..30e25750d88a
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/blockmap.rst
@@ -0,0 +1,49 @@
+.. SPDX-License-Identifier: GPL-2.0
+
++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| i.i\_block Offset | Where It Points |
++=====================+==============================================================================================================================================================================================================================+
+| 0 to 11 | Direct map to file blocks 0 to 11. |
++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) |
+| | |
+| | +------------------------------+--------------------------------------------------------------------+ |
+| | | Indirect Block Offset | Where It Points | |
+| | +==============================+====================================================================+ |
+| | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | |
+| | +------------------------------+--------------------------------------------------------------------+ |
++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| 13 | Double-indirect block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) |
+| | |
+| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ |
+| | | Double Indirect Block Offset | Where It Points | |
+| | +================================+=========================================================================================================+ |
+| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | |
+| | | | | |
+| | | | +------------------------------+--------------------------------------------------------------------+ | |
+| | | | | Indirect Block Offset | Where It Points | | |
+| | | | +==============================+====================================================================+ | |
+| | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | |
+| | | | +------------------------------+--------------------------------------------------------------------+ | |
+| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ |
++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12, or 1049612 to 1074791436 if 4KiB blocks) |
+| | |
+| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ |
+| | | Triple Indirect Block Offset | Where It Points | |
+| | +================================+================================================================================================================================================+ |
+| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | |
+| | | | | |
+| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | |
+| | | | | Double Indirect Block Offset | Where It Points | | |
+| | | | +================================+=========================================================================================================+ | |
+| | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | |
+| | | | | | | | |
+| | | | | | +------------------------------+--------------------------------------------------------------------+ | | |
+| | | | | | | Indirect Block Offset | Where It Points | | | |
+| | | | | | +==============================+====================================================================+ | | |
+| | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | |
+| | | | | | +------------------------------+--------------------------------------------------------------------+ | | |
+| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | |
+| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ |
++---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/ondisk/blocks.rst
new file mode 100644
index 000000000000..73d4dc0f7bda
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/blocks.rst
@@ -0,0 +1,142 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Blocks
+------
+
+ext4 allocates storage space in units of “blocks”. A block is a group of
+sectors between 1KiB and 64KiB, and the number of sectors must be an
+integral power of 2. Blocks are in turn grouped into larger units called
+block groups. Block size is specified at mkfs time and typically is
+4KiB. You may experience mounting problems if block size is greater than
+page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory
+pages). By default a filesystem can contain 2^32 blocks; if the '64bit'
+feature is enabled, then a filesystem can have 2^64 blocks.
+
+For 32-bit filesystems, limits are as follows:
+
+.. list-table::
+ :widths: 1 1 1 1 1
+ :header-rows: 1
+
+ * - Item
+ - 1KiB
+ - 2KiB
+ - 4KiB
+ - 64KiB
+ * - Blocks
+ - 2^32
+ - 2^32
+ - 2^32
+ - 2^32
+ * - Inodes
+ - 2^32
+ - 2^32
+ - 2^32
+ - 2^32
+ * - File System Size
+ - 4TiB
+ - 8TiB
+ - 16TiB
+ - 256PiB
+ * - Blocks Per Block Group
+ - 8,192
+ - 16,384
+ - 32,768
+ - 524,288
+ * - Inodes Per Block Group
+ - 8,192
+ - 16,384
+ - 32,768
+ - 524,288
+ * - Block Group Size
+ - 8MiB
+ - 32MiB
+ - 128MiB
+ - 32GiB
+ * - Blocks Per File, Extents
+ - 2^32
+ - 2^32
+ - 2^32
+ - 2^32
+ * - Blocks Per File, Block Maps
+ - 16,843,020
+ - 134,480,396
+ - 1,074,791,436
+ - 4,398,314,962,956 (really 2^32 due to field size limitations)
+ * - File Size, Extents
+ - 4TiB
+ - 8TiB
+ - 16TiB
+ - 256TiB
+ * - File Size, Block Maps
+ - 16GiB
+ - 256GiB
+ - 4TiB
+ - 256TiB
+
+For 64-bit filesystems, limits are as follows:
+
+.. list-table::
+ :widths: 1 1 1 1 1
+ :header-rows: 1
+
+ * - Item
+ - 1KiB
+ - 2KiB
+ - 4KiB
+ - 64KiB
+ * - Blocks
+ - 2^64
+ - 2^64
+ - 2^64
+ - 2^64
+ * - Inodes
+ - 2^32
+ - 2^32
+ - 2^32
+ - 2^32
+ * - File System Size
+ - 16ZiB
+ - 32ZiB
+ - 64ZiB
+ - 1YiB
+ * - Blocks Per Block Group
+ - 8,192
+ - 16,384
+ - 32,768
+ - 524,288
+ * - Inodes Per Block Group
+ - 8,192
+ - 16,384
+ - 32,768
+ - 524,288
+ * - Block Group Size
+ - 8MiB
+ - 32MiB
+ - 128MiB
+ - 32GiB
+ * - Blocks Per File, Extents
+ - 2^32
+ - 2^32
+ - 2^32
+ - 2^32
+ * - Blocks Per File, Block Maps
+ - 16,843,020
+ - 134,480,396
+ - 1,074,791,436
+ - 4,398,314,962,956 (really 2^32 due to field size limitations)
+ * - File Size, Extents
+ - 4TiB
+ - 8TiB
+ - 16TiB
+ - 256TiB
+ * - File Size, Block Maps
+ - 16GiB
+ - 256GiB
+ - 4TiB
+ - 256TiB
+
+Note: Files not using extents (i.e. files using block maps) must be
+placed within the first 2^32 blocks of a filesystem. Files with extents
+must be placed within the first 2^48 blocks of a filesystem. It's not
+clear what happens with larger filesystems.
diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/ondisk/checksums.rst
new file mode 100644
index 000000000000..9d6a793b2e03
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/checksums.rst
@@ -0,0 +1,73 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Checksums
+---------
+
+Starting in early 2012, metadata checksums were added to all major ext4
+and jbd2 data structures. The associated feature flag is metadata\_csum.
+The desired checksum algorithm is indicated in the superblock, though as
+of October 2012 the only supported algorithm is crc32c. Some data
+structures did not have space to fit a full 32-bit checksum, so only the
+lower 16 bits are stored. Enabling the 64bit feature increases the data
+structure size so that full 32-bit checksums can be stored for many data
+structures. However, existing 32-bit filesystems cannot be extended to
+enable 64bit mode, at least not without the experimental resize2fs
+patches to do so.
+
+Existing filesystems can have checksumming added by running
+``tune2fs -O metadata_csum`` against the underlying device. If tune2fs
+encounters directory blocks that lack sufficient empty space to add a
+checksum, it will request that you run ``e2fsck -D`` to have the
+directories rebuilt with checksums. This has the added benefit of
+removing slack space from the directory files and rebalancing the htree
+indexes. If you \_ignore\_ this step, your directories will not be
+protected by a checksum!
+
+The following table describes the data elements that go into each type
+of checksum. The checksum function is whatever the superblock describes
+(crc32c as of October 2013) unless noted otherwise.
+
+.. list-table::
+ :widths: 1 1 4
+ :header-rows: 1
+
+ * - Metadata
+ - Length
+ - Ingredients
+ * - Superblock
+ - \_\_le32
+ - The entire superblock up to the checksum field. The UUID lives inside
+ the superblock.
+ * - MMP
+ - \_\_le32
+ - UUID + the entire MMP block up to the checksum field.
+ * - Extended Attributes
+ - \_\_le32
+ - UUID + the entire extended attribute block. The checksum field is set to
+ zero.
+ * - Directory Entries
+ - \_\_le32
+ - UUID + inode number + inode generation + the directory block up to the
+ fake entry enclosing the checksum field.
+ * - HTREE Nodes
+ - \_\_le32
+ - UUID + inode number + inode generation + all valid extents + HTREE tail.
+ The checksum field is set to zero.
+ * - Extents
+ - \_\_le32
+ - UUID + inode number + inode generation + the entire extent block up to
+ the checksum field.
+ * - Bitmaps
+ - \_\_le32 or \_\_le16
+ - UUID + the entire bitmap. Checksums are stored in the group descriptor,
+ and truncated if the group descriptor size is 32 bytes (i.e. ^64bit)
+ * - Inodes
+ - \_\_le32
+ - UUID + inode number + inode generation + the entire inode. The checksum
+ field is set to zero. Each inode has its own checksum.
+ * - Group Descriptors
+ - \_\_le16
+ - If metadata\_csum, then UUID + group number + the entire descriptor;
+ else if gdt\_csum, then crc16(UUID + group number + the entire
+ descriptor). In all cases, only the lower 16 bits are stored.
+
diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/ondisk/directory.rst
new file mode 100644
index 000000000000..8fcba68c2884
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/directory.rst
@@ -0,0 +1,426 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Directory Entries
+-----------------
+
+In an ext4 filesystem, a directory is more or less a flat file that maps
+an arbitrary byte string (usually ASCII) to an inode number on the
+filesystem. There can be many directory entries across the filesystem
+that reference the same inode number--these are known as hard links, and
+that is why hard links cannot reference files on other filesystems. As
+such, directory entries are found by reading the data block(s)
+associated with a directory file for the particular directory entry that
+is desired.
+
+Linear (Classic) Directories
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, each directory lists its entries in an “almost-linear”
+array. I write “almost” because it's not a linear array in the memory
+sense because directory entries are not split across filesystem blocks.
+Therefore, it is more accurate to say that a directory is a series of
+data blocks and that each block contains a linear array of directory
+entries. The end of each per-block array is signified by reaching the
+end of the block; the last entry in the block has a record length that
+takes it all the way to the end of the block. The end of the entire
+directory is of course signified by reaching the end of the file. Unused
+directory entries are signified by inode = 0. By default the filesystem
+uses ``struct ext4_dir_entry_2`` for directory entries unless the
+“filetype” feature flag is not set, in which case it uses
+``struct ext4_dir_entry``.
+
+The original directory entry format is ``struct ext4_dir_entry``, which
+is at most 263 bytes long, though on disk you'll need to reference
+``dirent.rec_len`` to know for sure.
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - inode
+ - Number of the inode that this directory entry points to.
+ * - 0x4
+ - \_\_le16
+ - rec\_len
+ - Length of this directory entry. Must be a multiple of 4.
+ * - 0x6
+ - \_\_le16
+ - name\_len
+ - Length of the file name.
+ * - 0x8
+ - char
+ - name[EXT4\_NAME\_LEN]
+ - File name.
+
+Since file names cannot be longer than 255 bytes, the new directory
+entry format shortens the rec\_len field and uses the space for a file
+type flag, probably to avoid having to load every inode during directory
+tree traversal. This format is ``ext4_dir_entry_2``, which is at most
+263 bytes long, though on disk you'll need to reference
+``dirent.rec_len`` to know for sure.
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - inode
+ - Number of the inode that this directory entry points to.
+ * - 0x4
+ - \_\_le16
+ - rec\_len
+ - Length of this directory entry.
+ * - 0x6
+ - \_\_u8
+ - name\_len
+ - Length of the file name.
+ * - 0x7
+ - \_\_u8
+ - file\_type
+ - File type code, see ftype_ table below.
+ * - 0x8
+ - char
+ - name[EXT4\_NAME\_LEN]
+ - File name.
+
+.. _ftype:
+
+The directory file type is one of the following values:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x0
+ - Unknown.
+ * - 0x1
+ - Regular file.
+ * - 0x2
+ - Directory.
+ * - 0x3
+ - Character device file.
+ * - 0x4
+ - Block device file.
+ * - 0x5
+ - FIFO.
+ * - 0x6
+ - Socket.
+ * - 0x7
+ - Symbolic link.
+
+In order to add checksums to these classic directory blocks, a phony
+``struct ext4_dir_entry`` is placed at the end of each leaf block to
+hold the checksum. The directory entry is 12 bytes long. The inode
+number and name\_len fields are set to zero to fool old software into
+ignoring an apparently empty directory entry, and the checksum is stored
+in the place where the name normally goes. The structure is
+``struct ext4_dir_entry_tail``:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - det\_reserved\_zero1
+ - Inode number, which must be zero.
+ * - 0x4
+ - \_\_le16
+ - det\_rec\_len
+ - Length of this directory entry, which must be 12.
+ * - 0x6
+ - \_\_u8
+ - det\_reserved\_zero2
+ - Length of the file name, which must be zero.
+ * - 0x7
+ - \_\_u8
+ - det\_reserved\_ft
+ - File type, which must be 0xDE.
+ * - 0x8
+ - \_\_le32
+ - det\_checksum
+ - Directory leaf block checksum.
+
+The leaf directory block checksum is calculated against the FS UUID, the
+directory's inode number, the directory's inode generation number, and
+the entire directory entry block up to (but not including) the fake
+directory entry.
+
+Hash Tree Directories
+~~~~~~~~~~~~~~~~~~~~~
+
+A linear array of directory entries isn't great for performance, so a
+new feature was added to ext3 to provide a faster (but peculiar)
+balanced tree keyed off a hash of the directory entry name. If the
+EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a
+hashed btree (htree) to organize and find directory entries. For
+backwards read-only compatibility with ext2, this tree is actually
+hidden inside the directory file, masquerading as “empty” directory data
+blocks! It was stated previously that the end of the linear directory
+entry table was signified with an entry pointing to inode 0; this is
+(ab)used to fool the old linear-scan algorithm into thinking that the
+rest of the directory block is empty so that it moves on.
+
+The root of the tree always lives in the first data block of the
+directory. By ext2 custom, the '.' and '..' entries must appear at the
+beginning of this first block, so they are put here as two
+``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of
+the root node contains metadata about the tree and finally a hash->block
+map to find nodes that are lower in the htree. If
+``dx_root.info.indirect_levels`` is non-zero then the htree has two
+levels; the data block pointed to by the root node's map is an interior
+node, which is indexed by a minor hash. Interior nodes in this tree
+contains a zeroed out ``struct ext4_dir_entry_2`` followed by a
+minor\_hash->block map to find leafe nodes. Leaf nodes contain a linear
+array of all ``struct ext4_dir_entry_2``; all of these entries
+(presumably) hash to the same value. If there is an overflow, the
+entries simply overflow into the next leaf node, and the
+least-significant bit of the hash (in the interior node map) that gets
+us to this next leaf node is set.
+
+To traverse the directory as a htree, the code calculates the hash of
+the desired file name and uses it to find the corresponding block
+number. If the tree is flat, the block is a linear array of directory
+entries that can be searched; otherwise, the minor hash of the file name
+is computed and used against this second block to find the corresponding
+third block number. That third block number will be a linear array of
+directory entries.
+
+To traverse the directory as a linear array (such as the old code does),
+the code simply reads every data block in the directory. The blocks used
+for the htree will appear to have no entries (aside from '.' and '..')
+and so only the leaf nodes will appear to have any interesting content.
+
+The root of the htree is in ``struct dx_root``, which is the full length
+of a data block:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - dot.inode
+ - inode number of this directory.
+ * - 0x4
+ - \_\_le16
+ - dot.rec\_len
+ - Length of this record, 12.
+ * - 0x6
+ - u8
+ - dot.name\_len
+ - Length of the name, 1.
+ * - 0x7
+ - u8
+ - dot.file\_type
+ - File type of this entry, 0x2 (directory) (if the feature flag is set).
+ * - 0x8
+ - char
+ - dot.name[4]
+ - “.\\0\\0\\0”
+ * - 0xC
+ - \_\_le32
+ - dotdot.inode
+ - inode number of parent directory.
+ * - 0x10
+ - \_\_le16
+ - dotdot.rec\_len
+ - block\_size - 12. The record length is long enough to cover all htree
+ data.
+ * - 0x12
+ - u8
+ - dotdot.name\_len
+ - Length of the name, 2.
+ * - 0x13
+ - u8
+ - dotdot.file\_type
+ - File type of this entry, 0x2 (directory) (if the feature flag is set).
+ * - 0x14
+ - char
+ - dotdot\_name[4]
+ - “..\\0\\0”
+ * - 0x18
+ - \_\_le32
+ - struct dx\_root\_info.reserved\_zero
+ - Zero.
+ * - 0x1C
+ - u8
+ - struct dx\_root\_info.hash\_version
+ - Hash type, see dirhash_ table below.
+ * - 0x1D
+ - u8
+ - struct dx\_root\_info.info\_length
+ - Length of the tree information, 0x8.
+ * - 0x1E
+ - u8
+ - struct dx\_root\_info.indirect\_levels
+ - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR
+ feature is set; cannot be larger than 2 otherwise.
+ * - 0x1F
+ - u8
+ - struct dx\_root\_info.unused\_flags
+ -
+ * - 0x20
+ - \_\_le16
+ - limit
+ - Maximum number of dx\_entries that can follow this header, plus 1 for
+ the header itself.
+ * - 0x22
+ - \_\_le16
+ - count
+ - Actual number of dx\_entries that follow this header, plus 1 for the
+ header itself.
+ * - 0x24
+ - \_\_le32
+ - block
+ - The block number (within the directory file) that goes with hash=0.
+ * - 0x28
+ - struct dx\_entry
+ - entries[0]
+ - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block.
+
+.. _dirhash:
+
+The directory hash is one of the following values:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x0
+ - Legacy.
+ * - 0x1
+ - Half MD4.
+ * - 0x2
+ - Tea.
+ * - 0x3
+ - Legacy, unsigned.
+ * - 0x4
+ - Half MD4, unsigned.
+ * - 0x5
+ - Tea, unsigned.
+
+Interior nodes of an htree are recorded as ``struct dx_node``, which is
+also the full length of a data block:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - fake.inode
+ - Zero, to make it look like this entry is not in use.
+ * - 0x4
+ - \_\_le16
+ - fake.rec\_len
+ - The size of the block, in order to hide all of the dx\_node data.
+ * - 0x6
+ - u8
+ - name\_len
+ - Zero. There is no name for this “unused” directory entry.
+ * - 0x7
+ - u8
+ - file\_type
+ - Zero. There is no file type for this “unused” directory entry.
+ * - 0x8
+ - \_\_le16
+ - limit
+ - Maximum number of dx\_entries that can follow this header, plus 1 for
+ the header itself.
+ * - 0xA
+ - \_\_le16
+ - count
+ - Actual number of dx\_entries that follow this header, plus 1 for the
+ header itself.
+ * - 0xE
+ - \_\_le32
+ - block
+ - The block number (within the directory file) that goes with the lowest
+ hash value of this block. This value is stored in the parent block.
+ * - 0x12
+ - struct dx\_entry
+ - entries[0]
+ - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block.
+
+The hash maps that exist in both ``struct dx_root`` and
+``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes
+long:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - hash
+ - Hash code.
+ * - 0x4
+ - \_\_le32
+ - block
+ - Block number (within the directory file, not filesystem blocks) of the
+ next node in the htree.
+
+(If you think this is all quite clever and peculiar, so does the
+author.)
+
+If metadata checksums are enabled, the last 8 bytes of the directory
+block (precisely the length of one dx\_entry) are used to store a
+``struct dx_tail``, which contains the checksum. The ``limit`` and
+``count`` entries in the dx\_root/dx\_node structures are adjusted as
+necessary to fit the dx\_tail into the block. If there is no space for
+the dx\_tail, the user is notified to run e2fsck -D to rebuild the
+directory index (which will ensure that there's space for the checksum.
+The dx\_tail structure is 8 bytes long and looks like this:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - u32
+ - dt\_reserved
+ - Zero.
+ * - 0x4
+ - \_\_le32
+ - dt\_checksum
+ - Checksum of the htree directory block.
+
+The checksum is calculated against the FS UUID, the htree index header
+(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in
+use, and the tail block (dx\_tail).
diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/ondisk/dynamic.rst
new file mode 100644
index 000000000000..bb0c84333341
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/dynamic.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Dynamic Structures
+==================
+
+Dynamic metadata are created on the fly when files and blocks are
+allocated to files.
+
+.. include:: inodes.rst
+.. include:: ifork.rst
+.. include:: directory.rst
+.. include:: attributes.rst
diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/ondisk/eainode.rst
new file mode 100644
index 000000000000..ecc0d01a0a72
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/eainode.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Large Extended Attribute Values
+-------------------------------
+
+To enable ext4 to store extended attribute values that do not fit in the
+inode or in the single extended attribute block attached to an inode,
+the EA\_INODE feature allows us to store the value in the data blocks of
+a regular file inode. This “EA inode” is linked only from the extended
+attribute name index and must not appear in a directory entry. The
+inode's i\_atime field is used to store a checksum of the xattr value;
+and i\_ctime/i\_version store a 64-bit reference count, which enables
+sharing of large xattr values between multiple owning inodes. For
+backward compatibility with older versions of this feature, the
+i\_mtime/i\_generation *may* store a back-reference to the inode number
+and i\_generation of the **one** owning inode (in cases where the EA
+inode is not referenced by multiple inodes) to verify that the EA inode
+is the correct one being accessed.
diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/ondisk/globals.rst
new file mode 100644
index 000000000000..368bf7662b96
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/globals.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Global Structures
+=================
+
+The filesystem is sharded into a number of block groups, each of which
+have static metadata at fixed locations.
+
+.. include:: super.rst
+.. include:: group_descr.rst
+.. include:: bitmaps.rst
+.. include:: mmp.rst
+.. include:: journal.rst
diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/ondisk/group_descr.rst
new file mode 100644
index 000000000000..759827e5d2cf
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/group_descr.rst
@@ -0,0 +1,170 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Block Group Descriptors
+-----------------------
+
+Each block group on the filesystem has one of these descriptors
+associated with it. As noted in the Layout section above, the group
+descriptors (if present) are the second item in the block group. The
+standard configuration is for each block group to contain a full copy of
+the block group descriptor table unless the sparse\_super feature flag
+is set.
+
+Notice how the group descriptor records the location of both bitmaps and
+the inode table (i.e. they can float). This means that within a block
+group, the only data structures with fixed locations are the superblock
+and the group descriptor table. The flex\_bg mechanism uses this
+property to group several block groups into a flex group and lay out all
+of the groups' bitmaps and inode tables into one long run in the first
+group of the flex group.
+
+If the meta\_bg feature flag is set, then several block groups are
+grouped together into a meta group. Note that in the meta\_bg case,
+however, the first and last two block groups within the larger meta
+group contain only group descriptors for the groups inside the meta
+group.
+
+flex\_bg and meta\_bg do not appear to be mutually exclusive features.
+
+In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the
+block group descriptor was only 32 bytes long and therefore ends at
+bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the
+block group descriptor expands to at least the 64 bytes described below;
+the size is stored in the superblock.
+
+If gdt\_csum is set and metadata\_csum is not set, the block group
+checksum is the crc16 of the FS UUID, the group number, and the group
+descriptor structure. If metadata\_csum is set, then the block group
+checksum is the lower 16 bits of the checksum of the FS UUID, the group
+number, and the group descriptor structure. Both block and inode bitmap
+checksums are calculated against the FS UUID, the group number, and the
+entire bitmap.
+
+The block group descriptor is laid out in ``struct ext4_group_desc``.
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - bg\_block\_bitmap\_lo
+ - Lower 32-bits of location of block bitmap.
+ * - 0x4
+ - \_\_le32
+ - bg\_inode\_bitmap\_lo
+ - Lower 32-bits of location of inode bitmap.
+ * - 0x8
+ - \_\_le32
+ - bg\_inode\_table\_lo
+ - Lower 32-bits of location of inode table.
+ * - 0xC
+ - \_\_le16
+ - bg\_free\_blocks\_count\_lo
+ - Lower 16-bits of free block count.
+ * - 0xE
+ - \_\_le16
+ - bg\_free\_inodes\_count\_lo
+ - Lower 16-bits of free inode count.
+ * - 0x10
+ - \_\_le16
+ - bg\_used\_dirs\_count\_lo
+ - Lower 16-bits of directory count.
+ * - 0x12
+ - \_\_le16
+ - bg\_flags
+ - Block group flags. See the bgflags_ table below.
+ * - 0x14
+ - \_\_le32
+ - bg\_exclude\_bitmap\_lo
+ - Lower 32-bits of location of snapshot exclusion bitmap.
+ * - 0x18
+ - \_\_le16
+ - bg\_block\_bitmap\_csum\_lo
+ - Lower 16-bits of the block bitmap checksum.
+ * - 0x1A
+ - \_\_le16
+ - bg\_inode\_bitmap\_csum\_lo
+ - Lower 16-bits of the inode bitmap checksum.
+ * - 0x1C
+ - \_\_le16
+ - bg\_itable\_unused\_lo
+ - Lower 16-bits of unused inode count. If set, we needn't scan past the
+ ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the
+ inode table for this group.
+ * - 0x1E
+ - \_\_le16
+ - bg\_checksum
+ - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the
+ RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) &
+ 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set.
+ * -
+ -
+ -
+ - These fields only exist if the 64bit feature is enabled and s_desc_size
+ > 32.
+ * - 0x20
+ - \_\_le32
+ - bg\_block\_bitmap\_hi
+ - Upper 32-bits of location of block bitmap.
+ * - 0x24
+ - \_\_le32
+ - bg\_inode\_bitmap\_hi
+ - Upper 32-bits of location of inodes bitmap.
+ * - 0x28
+ - \_\_le32
+ - bg\_inode\_table\_hi
+ - Upper 32-bits of location of inodes table.
+ * - 0x2C
+ - \_\_le16
+ - bg\_free\_blocks\_count\_hi
+ - Upper 16-bits of free block count.
+ * - 0x2E
+ - \_\_le16
+ - bg\_free\_inodes\_count\_hi
+ - Upper 16-bits of free inode count.
+ * - 0x30
+ - \_\_le16
+ - bg\_used\_dirs\_count\_hi
+ - Upper 16-bits of directory count.
+ * - 0x32
+ - \_\_le16
+ - bg\_itable\_unused\_hi
+ - Upper 16-bits of unused inode count.
+ * - 0x34
+ - \_\_le32
+ - bg\_exclude\_bitmap\_hi
+ - Upper 32-bits of location of snapshot exclusion bitmap.
+ * - 0x38
+ - \_\_le16
+ - bg\_block\_bitmap\_csum\_hi
+ - Upper 16-bits of the block bitmap checksum.
+ * - 0x3A
+ - \_\_le16
+ - bg\_inode\_bitmap\_csum\_hi
+ - Upper 16-bits of the inode bitmap checksum.
+ * - 0x3C
+ - \_\_u32
+ - bg\_reserved
+ - Padding to 64 bytes.
+
+.. _bgflags:
+
+Block group flags can be any combination of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT).
+ * - 0x2
+ - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT).
+ * - 0x4
+ - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED).
diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ondisk/ifork.rst
new file mode 100644
index 000000000000..5dbe3b2b121a
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/ifork.rst
@@ -0,0 +1,194 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+The Contents of inode.i\_block
+------------------------------
+
+Depending on the type of file an inode describes, the 60 bytes of
+storage in ``inode.i_block`` can be used in different ways. In general,
+regular files and directories will use it for file block indexing
+information, and special files will use it for special purposes.
+
+Symbolic Links
+~~~~~~~~~~~~~~
+
+The target of a symbolic link will be stored in this field if the target
+string is less than 60 bytes long. Otherwise, either extents or block
+maps will be used to allocate data blocks to store the link target.
+
+Direct/Indirect Block Addressing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In ext2/3, file block numbers were mapped to logical block numbers by
+means of an (up to) three level 1-1 block map. To find the logical block
+that stores a particular file block, the code would navigate through
+this increasingly complicated structure. Notice that there is neither a
+magic number nor a checksum to provide any level of confidence that the
+block isn't full of garbage.
+
+.. ifconfig:: builder != 'latex'
+
+ .. include:: blockmap.rst
+
+.. ifconfig:: builder == 'latex'
+
+ [Table omitted because LaTeX doesn't support nested tables.]
+
+Note that with this block mapping scheme, it is necessary to fill out a
+lot of mapping data even for a large contiguous file! This inefficiency
+led to the creation of the extent mapping scheme, discussed below.
+
+Notice also that a file using this mapping scheme cannot be placed
+higher than 2^32 blocks.
+
+Extent Tree
+~~~~~~~~~~~
+
+In ext4, the file to logical block map has been replaced with an extent
+tree. Under the old scheme, allocating a contiguous run of 1,000 blocks
+requires an indirect block to map all 1,000 entries; with extents, the
+mapping is reduced to a single ``struct ext4_extent`` with
+``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate
+very large files with a single extent, at a considerable reduction in
+metadata block use, and some improvement in disk efficiency. The inode
+must have the extents flag (0x80000) flag set for this feature to be in
+use.
+
+Extents are arranged as a tree. Each node of the tree begins with a
+``struct ext4_extent_header``. If the node is an interior node
+(``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries``
+instances of ``struct ext4_extent_idx``; each of these index entries
+points to a block containing more nodes in the extent tree. If the node
+is a leaf node (``eh.eh_depth == 0``), then the header is followed by
+``eh.eh_entries`` instances of ``struct ext4_extent``; these instances
+point to the file's data blocks. The root node of the extent tree is
+stored in ``inode.i_block``, which allows for the first four extents to
+be recorded without the use of extra metadata blocks.
+
+The extent tree header is recorded in ``struct ext4_extent_header``,
+which is 12 bytes long:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le16
+ - eh\_magic
+ - Magic number, 0xF30A.
+ * - 0x2
+ - \_\_le16
+ - eh\_entries
+ - Number of valid entries following the header.
+ * - 0x4
+ - \_\_le16
+ - eh\_max
+ - Maximum number of entries that could follow the header.
+ * - 0x6
+ - \_\_le16
+ - eh\_depth
+ - Depth of this extent node in the extent tree. 0 = this extent node
+ points to data blocks; otherwise, this extent node points to other
+ extent nodes. The extent tree can be at most 5 levels deep: a logical
+ block number can be at most ``2^32``, and the smallest ``n`` that
+ satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5.
+ * - 0x8
+ - \_\_le32
+ - eh\_generation
+ - Generation of the tree. (Used by Lustre, but not standard ext4).
+
+Internal nodes of the extent tree, also known as index nodes, are
+recorded as ``struct ext4_extent_idx``, and are 12 bytes long:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - ei\_block
+ - This index node covers file blocks from 'block' onward.
+ * - 0x4
+ - \_\_le32
+ - ei\_leaf\_lo
+ - Lower 32-bits of the block number of the extent node that is the next
+ level lower in the tree. The tree node pointed to can be either another
+ internal node or a leaf node, described below.
+ * - 0x8
+ - \_\_le16
+ - ei\_leaf\_hi
+ - Upper 16-bits of the previous field.
+ * - 0xA
+ - \_\_u16
+ - ei\_unused
+ -
+
+Leaf nodes of the extent tree are recorded as ``struct ext4_extent``,
+and are also 12 bytes long:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - ee\_block
+ - First file block number that this extent covers.
+ * - 0x4
+ - \_\_le16
+ - ee\_len
+ - Number of blocks covered by extent. If the value of this field is <=
+ 32768, the extent is initialized. If the value of the field is > 32768,
+ the extent is uninitialized and the actual extent length is ``ee_len`` -
+ 32768. Therefore, the maximum length of a initialized extent is 32768
+ blocks, and the maximum length of an uninitialized extent is 32767.
+ * - 0x6
+ - \_\_le16
+ - ee\_start\_hi
+ - Upper 16-bits of the block number to which this extent points.
+ * - 0x8
+ - \_\_le32
+ - ee\_start\_lo
+ - Lower 32-bits of the block number to which this extent points.
+
+Prior to the introduction of metadata checksums, the extent header +
+extent entries always left at least 4 bytes of unallocated space at the
+end of each extent tree data block (because (2^x % 12) >= 4). Therefore,
+the 32-bit checksum is inserted into this space. The 4 extents in the
+inode do not need checksumming, since the inode is already checksummed.
+The checksum is calculated against the FS UUID, the inode number, the
+inode generation, and the entire extent block leading up to (but not
+including) the checksum itself.
+
+``struct ext4_extent_tail`` is 4 bytes long:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - eb\_checksum
+ - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock)
+
+Inline Data
+~~~~~~~~~~~
+
+If the inline data feature is enabled for the filesystem and the flag is
+set for the inode, it is possible that the first 60 bytes of the file
+data are stored here.
diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst
new file mode 100644
index 000000000000..f7d082c3a435
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/index.rst
@@ -0,0 +1,9 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Data Structures and Algorithms
+==============================
+.. include:: about.rst
+.. include:: overview.rst
+.. include:: globals.rst
+.. include:: dynamic.rst
diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/ondisk/inlinedata.rst
new file mode 100644
index 000000000000..d1075178ce0b
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/inlinedata.rst
@@ -0,0 +1,37 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Inline Data
+-----------
+
+The inline data feature was designed to handle the case that a file's
+data is so tiny that it readily fits inside the inode, which
+(theoretically) reduces disk block consumption and reduces seeks. If the
+file is smaller than 60 bytes, then the data are stored inline in
+``inode.i_block``. If the rest of the file would fit inside the extended
+attribute space, then it might be found as an extended attribute
+“system.data” within the inode body (“ibody EA”). This of course
+constrains the amount of extended attributes one can attach to an inode.
+If the data size increases beyond i\_block + ibody EA, a regular block
+is allocated and the contents moved to that block.
+
+Pending a change to compact the extended attribute key used to store
+inline data, one ought to be able to store 160 bytes of data in a
+256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to
+that, the limit was 156 bytes due to inefficient use of inode space.
+
+The inline data feature requires the presence of an extended attribute
+for “system.data”, even if the attribute value is zero length.
+
+Inline Directories
+~~~~~~~~~~~~~~~~~~
+
+The first four bytes of i\_block are the inode number of the parent
+directory. Following that is a 56-byte space for an array of directory
+entries; see ``struct ext4_dir_entry``. If there is a “system.data”
+attribute in the inode body, the EA value is an array of
+``struct ext4_dir_entry`` as well. Note that for inline directories, the
+i\_block and EA space are treated as separate dirent blocks; directory
+entries cannot span the two.
+
+Inline directory entries are not checksummed, as the inode checksum
+should protect all inline data contents.
diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/ondisk/inodes.rst
new file mode 100644
index 000000000000..655ce898f3f5
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/inodes.rst
@@ -0,0 +1,575 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Index Nodes
+-----------
+
+In a regular UNIX filesystem, the inode stores all the metadata
+pertaining to the file (time stamps, block maps, extended attributes,
+etc), not the directory entry. To find the information associated with a
+file, one must traverse the directory files to find the directory entry
+associated with a file, then load the inode to find the metadata for
+that file. ext4 appears to cheat (for performance reasons) a little bit
+by storing a copy of the file type (normally stored in the inode) in the
+directory entry. (Compare all this to FAT, which stores all the file
+information directly in the directory entry, but does not support hard
+links and is in general more seek-happy than ext4 due to its simpler
+block allocator and extensive use of linked lists.)
+
+The inode table is a linear array of ``struct ext4_inode``. The table is
+sized to have enough blocks to store at least
+``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the
+block group containing an inode can be calculated as
+``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the
+group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There
+is no inode 0.
+
+The inode checksum is calculated against the FS UUID, the inode number,
+and the inode structure itself.
+
+The inode table entry is laid out in ``struct ext4_inode``.
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le16
+ - i\_mode
+ - File mode. See the table i_mode_ below.
+ * - 0x2
+ - \_\_le16
+ - i\_uid
+ - Lower 16-bits of Owner UID.
+ * - 0x4
+ - \_\_le32
+ - i\_size\_lo
+ - Lower 32-bits of size in bytes.
+ * - 0x8
+ - \_\_le32
+ - i\_atime
+ - Last access time, in seconds since the epoch. However, if the EA\_INODE
+ inode flag is set, this inode stores an extended attribute value and
+ this field contains the checksum of the value.
+ * - 0xC
+ - \_\_le32
+ - i\_ctime
+ - Last inode change time, in seconds since the epoch. However, if the
+ EA\_INODE inode flag is set, this inode stores an extended attribute
+ value and this field contains the lower 32 bits of the attribute value's
+ reference count.
+ * - 0x10
+ - \_\_le32
+ - i\_mtime
+ - Last data modification time, in seconds since the epoch. However, if the
+ EA\_INODE inode flag is set, this inode stores an extended attribute
+ value and this field contains the number of the inode that owns the
+ extended attribute.
+ * - 0x14
+ - \_\_le32
+ - i\_dtime
+ - Deletion Time, in seconds since the epoch.
+ * - 0x18
+ - \_\_le16
+ - i\_gid
+ - Lower 16-bits of GID.
+ * - 0x1A
+ - \_\_le16
+ - i\_links\_count
+ - Hard link count. Normally, ext4 does not permit an inode to have more
+ than 65,000 hard links. This applies to files as well as directories,
+ which means that there cannot be more than 64,998 subdirectories in a
+ directory (each subdirectory's '..' entry counts as a hard link, as does
+ the '.' entry in the directory itself). With the DIR\_NLINK feature
+ enabled, ext4 supports more than 64,998 subdirectories by setting this
+ field to 1 to indicate that the number of hard links is not known.
+ * - 0x1C
+ - \_\_le32
+ - i\_blocks\_lo
+ - Lower 32-bits of “block” count. If the huge\_file feature flag is not
+ set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks
+ on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in
+ ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi
+ << 32)`` 512-byte blocks on disk. If huge\_file is set and
+ EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file
+ consumes (``i_blocks_lo + i_blocks_hi`` << 32) filesystem blocks on
+ disk.
+ * - 0x20
+ - \_\_le32
+ - i\_flags
+ - Inode flags. See the table i_flags_ below.
+ * - 0x24
+ - 4 bytes
+ - i\_osd1
+ - See the table i_osd1_ for more details.
+ * - 0x28
+ - 60 bytes
+ - i\_block[EXT4\_N\_BLOCKS=15]
+ - Block map or extent tree. See the section “The Contents of inode.i\_block”.
+ * - 0x64
+ - \_\_le32
+ - i\_generation
+ - File version (for NFS).
+ * - 0x68
+ - \_\_le32
+ - i\_file\_acl\_lo
+ - Lower 32-bits of extended attribute block. ACLs are of course one of
+ many possible extended attributes; I think the name of this field is a
+ result of the first use of extended attributes being for ACLs.
+ * - 0x6C
+ - \_\_le32
+ - i\_size\_high / i\_dir\_acl
+ - Upper 32-bits of file/directory size. In ext2/3 this field was named
+ i\_dir\_acl, though it was usually set to zero and never used.
+ * - 0x70
+ - \_\_le32
+ - i\_obso\_faddr
+ - (Obsolete) fragment address.
+ * - 0x74
+ - 12 bytes
+ - i\_osd2
+ - See the table i_osd2_ for more details.
+ * - 0x80
+ - \_\_le16
+ - i\_extra\_isize
+ - Size of this inode - 128. Alternately, the size of the extended inode
+ fields beyond the original ext2 inode, including this field.
+ * - 0x82
+ - \_\_le16
+ - i\_checksum\_hi
+ - Upper 16-bits of the inode checksum.
+ * - 0x84
+ - \_\_le32
+ - i\_ctime\_extra
+ - Extra change time bits. This provides sub-second precision. See Inode
+ Timestamps section.
+ * - 0x88
+ - \_\_le32
+ - i\_mtime\_extra
+ - Extra modification time bits. This provides sub-second precision.
+ * - 0x8C
+ - \_\_le32
+ - i\_atime\_extra
+ - Extra access time bits. This provides sub-second precision.
+ * - 0x90
+ - \_\_le32
+ - i\_crtime
+ - File creation time, in seconds since the epoch.
+ * - 0x94
+ - \_\_le32
+ - i\_crtime\_extra
+ - Extra file creation time bits. This provides sub-second precision.
+ * - 0x98
+ - \_\_le32
+ - i\_version\_hi
+ - Upper 32-bits for version number.
+ * - 0x9C
+ - \_\_le32
+ - i\_projid
+ - Project ID.
+
+.. _i_mode:
+
+The ``i_mode`` value is a combination of the following flags:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - S\_IXOTH (Others may execute)
+ * - 0x2
+ - S\_IWOTH (Others may write)
+ * - 0x4
+ - S\_IROTH (Others may read)
+ * - 0x8
+ - S\_IXGRP (Group members may execute)
+ * - 0x10
+ - S\_IWGRP (Group members may write)
+ * - 0x20
+ - S\_IRGRP (Group members may read)
+ * - 0x40
+ - S\_IXUSR (Owner may execute)
+ * - 0x80
+ - S\_IWUSR (Owner may write)
+ * - 0x100
+ - S\_IRUSR (Owner may read)
+ * - 0x200
+ - S\_ISVTX (Sticky bit)
+ * - 0x400
+ - S\_ISGID (Set GID)
+ * - 0x800
+ - S\_ISUID (Set UID)
+ * -
+ - These are mutually-exclusive file types:
+ * - 0x1000
+ - S\_IFIFO (FIFO)
+ * - 0x2000
+ - S\_IFCHR (Character device)
+ * - 0x4000
+ - S\_IFDIR (Directory)
+ * - 0x6000
+ - S\_IFBLK (Block device)
+ * - 0x8000
+ - S\_IFREG (Regular file)
+ * - 0xA000
+ - S\_IFLNK (Symbolic link)
+ * - 0xC000
+ - S\_IFSOCK (Socket)
+
+.. _i_flags:
+
+The ``i_flags`` field is a combination of these values:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented)
+ * - 0x2
+ - This file should be preserved, should undeletion be desired
+ (EXT4\_UNRM\_FL). (not implemented)
+ * - 0x4
+ - File is compressed (EXT4\_COMPR\_FL). (not really implemented)
+ * - 0x8
+ - All writes to the file must be synchronous (EXT4\_SYNC\_FL).
+ * - 0x10
+ - File is immutable (EXT4\_IMMUTABLE\_FL).
+ * - 0x20
+ - File can only be appended (EXT4\_APPEND\_FL).
+ * - 0x40
+ - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL).
+ * - 0x80
+ - Do not update access time (EXT4\_NOATIME\_FL).
+ * - 0x100
+ - Dirty compressed file (EXT4\_DIRTY\_FL). (not used)
+ * - 0x200
+ - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used)
+ * - 0x400
+ - Do not compress file (EXT4\_NOCOMPR\_FL). (not used)
+ * - 0x800
+ - Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was
+ EXT4\_ECOMPR\_FL (compression error), which was never used.
+ * - 0x1000
+ - Directory has hashed indexes (EXT4\_INDEX\_FL).
+ * - 0x2000
+ - AFS magic directory (EXT4\_IMAGIC\_FL).
+ * - 0x4000
+ - File data must always be written through the journal
+ (EXT4\_JOURNAL\_DATA\_FL).
+ * - 0x8000
+ - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4)
+ * - 0x10000
+ - All directory entry data should be written synchronously (see
+ ``dirsync``) (EXT4\_DIRSYNC\_FL).
+ * - 0x20000
+ - Top of directory hierarchy (EXT4\_TOPDIR\_FL).
+ * - 0x40000
+ - This is a huge file (EXT4\_HUGE\_FILE\_FL).
+ * - 0x80000
+ - Inode uses extents (EXT4\_EXTENTS\_FL).
+ * - 0x200000
+ - Inode stores a large extended attribute value in its data blocks
+ (EXT4\_EA\_INODE\_FL).
+ * - 0x400000
+ - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL).
+ (deprecated)
+ * - 0x01000000
+ - Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline)
+ * - 0x04000000
+ - Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in
+ mainline)
+ * - 0x08000000
+ - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in
+ mainline)
+ * - 0x10000000
+ - Inode has inline data (EXT4\_INLINE\_DATA\_FL).
+ * - 0x20000000
+ - Create children with the same project ID (EXT4\_PROJINHERIT\_FL).
+ * - 0x80000000
+ - Reserved for ext4 library (EXT4\_RESERVED\_FL).
+ * -
+ - Aggregate flags:
+ * - 0x4BDFFF
+ - User-visible flags.
+ * - 0x4B80FF
+ - User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and
+ EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's
+ EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of
+ these flags in a special manner and they are masked out of the set of
+ flags that are saved directly to i\_flags.
+
+.. _i_osd1:
+
+The ``osd1`` field has multiple meanings depending on the creator:
+
+Linux:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - l\_i\_version
+ - Inode version. However, if the EA\_INODE inode flag is set, this inode
+ stores an extended attribute value and this field contains the upper 32
+ bits of the attribute value's reference count.
+
+Hurd:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - h\_i\_translator
+ - ??
+
+Masix:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - m\_i\_reserved
+ - ??
+
+.. _i_osd2:
+
+The ``osd2`` field has multiple meanings depending on the filesystem creator:
+
+Linux:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le16
+ - l\_i\_blocks\_high
+ - Upper 16-bits of the block count. Please see the note attached to
+ i\_blocks\_lo.
+ * - 0x2
+ - \_\_le16
+ - l\_i\_file\_acl\_high
+ - Upper 16-bits of the extended attribute block (historically, the file
+ ACL location). See the Extended Attributes section below.
+ * - 0x4
+ - \_\_le16
+ - l\_i\_uid\_high
+ - Upper 16-bits of the Owner UID.
+ * - 0x6
+ - \_\_le16
+ - l\_i\_gid\_high
+ - Upper 16-bits of the GID.
+ * - 0x8
+ - \_\_le16
+ - l\_i\_checksum\_lo
+ - Lower 16-bits of the inode checksum.
+ * - 0xA
+ - \_\_le16
+ - l\_i\_reserved
+ - Unused.
+
+Hurd:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le16
+ - h\_i\_reserved1
+ - ??
+ * - 0x2
+ - \_\_u16
+ - h\_i\_mode\_high
+ - Upper 16-bits of the file mode.
+ * - 0x4
+ - \_\_le16
+ - h\_i\_uid\_high
+ - Upper 16-bits of the Owner UID.
+ * - 0x6
+ - \_\_le16
+ - h\_i\_gid\_high
+ - Upper 16-bits of the GID.
+ * - 0x8
+ - \_\_u32
+ - h\_i\_author
+ - Author code?
+
+Masix:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le16
+ - h\_i\_reserved1
+ - ??
+ * - 0x2
+ - \_\_u16
+ - m\_i\_file\_acl\_high
+ - Upper 16-bits of the extended attribute block (historically, the file
+ ACL location).
+ * - 0x4
+ - \_\_u32
+ - m\_i\_reserved2[2]
+ - ??
+
+Inode Size
+~~~~~~~~~~
+
+In ext2 and ext3, the inode structure size was fixed at 128 bytes
+(``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of
+128 bytes. Starting with ext4, it is possible to allocate a larger
+on-disk inode at format time for all inodes in the filesystem to provide
+space beyond the end of the original ext2 inode. The on-disk inode
+record size is recorded in the superblock as ``s_inode_size``. The
+number of bytes actually used by struct ext4\_inode beyond the original
+128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each
+inode, which allows struct ext4\_inode to grow for a new kernel without
+having to upgrade all of the on-disk inodes. Access to fields beyond
+EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within
+``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as
+of October 2013) the inode structure is 156 bytes
+(``i_extra_isize = 28``). The extra space between the end of the inode
+structure and the end of the inode record can be used to store extended
+attributes. Each inode record can be as large as the filesystem block
+size, though this is not terribly efficient.
+
+Finding an Inode
+~~~~~~~~~~~~~~~~
+
+Each block group contains ``sb->s_inodes_per_group`` inodes. Because
+inode 0 is defined not to exist, this formula can be used to find the
+block group that an inode lives in:
+``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode
+can be found within the block group's inode table at
+``index = (inode_num - 1) % sb->s_inodes_per_group``. To get the byte
+address within the inode table, use
+``offset = index * sb->s_inode_size``.
+
+Inode Timestamps
+~~~~~~~~~~~~~~~~
+
+Four timestamps are recorded in the lower 128 bytes of the inode
+structure -- inode change time (ctime), access time (atime), data
+modification time (mtime), and deletion time (dtime). The four fields
+are 32-bit signed integers that represent seconds since the Unix epoch
+(1970-01-01 00:00:00 GMT), which means that the fields will overflow in
+January 2038. For inodes that are not linked from any directory but are
+still open (orphan inodes), the dtime field is overloaded for use with
+the orphan list. The superblock field ``s_last_orphan`` points to the
+first inode in the orphan list; dtime is then the number of the next
+orphaned inode, or zero if there are no more orphans.
+
+If the inode structure size ``sb->s_inode_size`` is larger than 128
+bytes and the ``i_inode_extra`` field is large enough to encompass the
+respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime
+inode fields are widened to 64 bits. Within this “extra” 32-bit field,
+the lower two bits are used to extend the 32-bit seconds field to be 34
+bit wide; the upper 30 bits are used to provide nanosecond timestamp
+accuracy. Therefore, timestamps should not overflow until May 2446.
+dtime was not widened. There is also a fifth timestamp to record inode
+creation time (crtime); this field is 64-bits wide and decoded in the
+same manner as 64-bit [cma]time. Neither crtime nor dtime are accessible
+through the regular stat() interface, though debugfs will report them.
+
+We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)).
+In other words:
+
+.. list-table::
+ :widths: 20 20 20 20 20
+ :header-rows: 1
+
+ * - Extra epoch bits
+ - MSB of 32-bit time
+ - Adjustment for signed 32-bit to 64-bit tv\_sec
+ - Decoded 64-bit tv\_sec
+ - valid time range
+ * - 0 0
+ - 1
+ - 0
+ - ``-0x80000000 - -0x00000001``
+ - 1901-12-13 to 1969-12-31
+ * - 0 0
+ - 0
+ - 0
+ - ``0x000000000 - 0x07fffffff``
+ - 1970-01-01 to 2038-01-19
+ * - 0 1
+ - 1
+ - 0x100000000
+ - ``0x080000000 - 0x0ffffffff``
+ - 2038-01-19 to 2106-02-07
+ * - 0 1
+ - 0
+ - 0x100000000
+ - ``0x100000000 - 0x17fffffff``
+ - 2106-02-07 to 2174-02-25
+ * - 1 0
+ - 1
+ - 0x200000000
+ - ``0x180000000 - 0x1ffffffff``
+ - 2174-02-25 to 2242-03-16
+ * - 1 0
+ - 0
+ - 0x200000000
+ - ``0x200000000 - 0x27fffffff``
+ - 2242-03-16 to 2310-04-04
+ * - 1 1
+ - 1
+ - 0x300000000
+ - ``0x280000000 - 0x2ffffffff``
+ - 2310-04-04 to 2378-04-22
+ * - 1 1
+ - 0
+ - 0x300000000
+ - ``0x300000000 - 0x37fffffff``
+ - 2378-04-22 to 2446-05-10
+
+This is a somewhat odd encoding since there are effectively seven times
+as many positive values as negative values. There have also been
+long-standing bugs decoding and encoding dates beyond 2038, which don't
+seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels
+incorrectly use the extra epoch bits 1,1 for dates between 1901 and
+1970. At some point the kernel will be fixed and e2fsck will fix this
+situation, assuming that it is run before 2310.
diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/ondisk/journal.rst
new file mode 100644
index 000000000000..e7031af86876
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/journal.rst
@@ -0,0 +1,611 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Journal (jbd2)
+--------------
+
+Introduced in ext3, the ext4 filesystem employs a journal to protect the
+filesystem against corruption in the case of a system crash. A small
+continuous region of disk (default 128MiB) is reserved inside the
+filesystem as a place to land “important” data writes on-disk as quickly
+as possible. Once the important data transaction is fully written to the
+disk and flushed from the disk write cache, a record of the data being
+committed is also written to the journal. At some later point in time,
+the journal code writes the transactions to their final locations on
+disk (this could involve a lot of seeking or a lot of small
+read-write-erases) before erasing the commit record. Should the system
+crash during the second slow write, the journal can be replayed all the
+way to the latest commit record, guaranteeing the atomicity of whatever
+gets written through the journal to the disk. The effect of this is to
+guarantee that the filesystem does not become stuck midway through a
+metadata update.
+
+For performance reasons, ext4 by default only writes filesystem metadata
+through the journal. This means that file data blocks are /not/
+guaranteed to be in any consistent state after a crash. If this default
+guarantee level (``data=ordered``) is not satisfactory, there is a mount
+option to control journal behavior. If ``data=journal``, all data and
+metadata are written to disk through the journal. This is slower but
+safest. If ``data=writeback``, dirty data blocks are not flushed to the
+disk before the metadata are written to disk through the journal.
+
+The journal inode is typically inode 8. The first 68 bytes of the
+journal inode are replicated in the ext4 superblock. The journal itself
+is normal (but hidden) file within the filesystem. The file usually
+consumes an entire block group, though mke2fs tries to put it in the
+middle of the disk.
+
+All fields in jbd2 are written to disk in big-endian order. This is the
+opposite of ext4.
+
+NOTE: Both ext4 and ocfs2 use jbd2.
+
+The maximum size of a journal embedded in an ext4 filesystem is 2^32
+blocks. jbd2 itself does not seem to care.
+
+Layout
+~~~~~~
+
+Generally speaking, the journal has this format:
+
+.. list-table::
+ :widths: 1 1 78
+ :header-rows: 1
+
+ * - Superblock
+ - descriptor\_block (data\_blocks or revocation\_block) [more data or
+ revocations] commmit\_block
+ - [more transactions...]
+ * -
+ - One transaction
+ -
+
+Notice that a transaction begins with either a descriptor and some data,
+or a block revocation list. A finished transaction always ends with a
+commit. If there is no commit record (or the checksums don't match), the
+transaction will be discarded during replay.
+
+External Journal
+~~~~~~~~~~~~~~~~
+
+Optionally, an ext4 filesystem can be created with an external journal
+device (as opposed to an internal journal, which uses a reserved inode).
+In this case, on the filesystem device, ``s_journal_inum`` should be
+zero and ``s_journal_uuid`` should be set. On the journal device there
+will be an ext4 super block in the usual place, with a matching UUID.
+The journal superblock will be in the next full block after the
+superblock.
+
+.. list-table::
+ :widths: 1 1 1 1 76
+ :header-rows: 1
+
+ * - 1024 bytes of padding
+ - ext4 Superblock
+ - Journal Superblock
+ - descriptor\_block (data\_blocks or revocation\_block) [more data or
+ revocations] commmit\_block
+ - [more transactions...]
+ * -
+ -
+ -
+ - One transaction
+ -
+
+Block Header
+~~~~~~~~~~~~
+
+Every block in the journal starts with a common 12-byte header
+``struct journal_header_s``:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_be32
+ - h\_magic
+ - jbd2 magic number, 0xC03B3998.
+ * - 0x4
+ - \_\_be32
+ - h\_blocktype
+ - Description of what this block contains. See the jbd2_blocktype_ table
+ below.
+ * - 0x8
+ - \_\_be32
+ - h\_sequence
+ - The transaction ID that goes with this block.
+
+.. _jbd2_blocktype:
+
+The journal block type can be any one of:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 1
+ - Descriptor. This block precedes a series of data blocks that were
+ written through the journal during a transaction.
+ * - 2
+ - Block commit record. This block signifies the completion of a
+ transaction.
+ * - 3
+ - Journal superblock, v1.
+ * - 4
+ - Journal superblock, v2.
+ * - 5
+ - Block revocation records. This speeds up recovery by enabling the
+ journal to skip writing blocks that were subsequently rewritten.
+
+Super Block
+~~~~~~~~~~~
+
+The super block for the journal is much simpler as compared to ext4's.
+The key data kept within are size of the journal, and where to find the
+start of the log of transactions.
+
+The journal superblock is recorded as ``struct journal_superblock_s``,
+which is 1024 bytes long:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * -
+ -
+ -
+ - Static information describing the journal.
+ * - 0x0
+ - journal\_header\_t (12 bytes)
+ - s\_header
+ - Common header identifying this as a superblock.
+ * - 0xC
+ - \_\_be32
+ - s\_blocksize
+ - Journal device block size.
+ * - 0x10
+ - \_\_be32
+ - s\_maxlen
+ - Total number of blocks in this journal.
+ * - 0x14
+ - \_\_be32
+ - s\_first
+ - First block of log information.
+ * -
+ -
+ -
+ - Dynamic information describing the current state of the log.
+ * - 0x18
+ - \_\_be32
+ - s\_sequence
+ - First commit ID expected in log.
+ * - 0x1C
+ - \_\_be32
+ - s\_start
+ - Block number of the start of log. Contrary to the comments, this field
+ being zero does not imply that the journal is clean!
+ * - 0x20
+ - \_\_be32
+ - s\_errno
+ - Error value, as set by jbd2\_journal\_abort().
+ * -
+ -
+ -
+ - The remaining fields are only valid in a v2 superblock.
+ * - 0x24
+ - \_\_be32
+ - s\_feature\_compat;
+ - Compatible feature set. See the table jbd2_compat_ below.
+ * - 0x28
+ - \_\_be32
+ - s\_feature\_incompat
+ - Incompatible feature set. See the table jbd2_incompat_ below.
+ * - 0x2C
+ - \_\_be32
+ - s\_feature\_ro\_compat
+ - Read-only compatible feature set. There aren't any of these currently.
+ * - 0x30
+ - \_\_u8
+ - s\_uuid[16]
+ - 128-bit uuid for journal. This is compared against the copy in the ext4
+ super block at mount time.
+ * - 0x40
+ - \_\_be32
+ - s\_nr\_users
+ - Number of file systems sharing this journal.
+ * - 0x44
+ - \_\_be32
+ - s\_dynsuper
+ - Location of dynamic super block copy. (Not used?)
+ * - 0x48
+ - \_\_be32
+ - s\_max\_transaction
+ - Limit of journal blocks per transaction. (Not used?)
+ * - 0x4C
+ - \_\_be32
+ - s\_max\_trans\_data
+ - Limit of data blocks per transaction. (Not used?)
+ * - 0x50
+ - \_\_u8
+ - s\_checksum\_type
+ - Checksum algorithm used for the journal. See jbd2_checksum_type_ for
+ more info.
+ * - 0x51
+ - \_\_u8[3]
+ - s\_padding2
+ -
+ * - 0x54
+ - \_\_u32
+ - s\_padding[42]
+ -
+ * - 0xFC
+ - \_\_be32
+ - s\_checksum
+ - Checksum of the entire superblock, with this field set to zero.
+ * - 0x100
+ - \_\_u8
+ - s\_users[16\*48]
+ - ids of all file systems sharing the log. e2fsprogs/Linux don't allow
+ shared external journals, but I imagine Lustre (or ocfs2?), which use
+ the jbd2 code, might.
+
+.. _jbd2_compat:
+
+The journal compat features are any combination of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - Journal maintains checksums on the data blocks.
+ (JBD2\_FEATURE\_COMPAT\_CHECKSUM)
+
+.. _jbd2_incompat:
+
+The journal incompat features are any combination of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE)
+ * - 0x2
+ - Journal can deal with 64-bit block numbers.
+ (JBD2\_FEATURE\_INCOMPAT\_64BIT)
+ * - 0x4
+ - Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT)
+ * - 0x8
+ - This journal uses v2 of the checksum on-disk format. Each journal
+ metadata block gets its own checksum, and the block tags in the
+ descriptor table contain checksums for each of the data blocks in the
+ journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2)
+ * - 0x10
+ - This journal uses v3 of the checksum on-disk format. This is the same as
+ v2, but the journal block tag size is fixed regardless of the size of
+ block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3)
+
+.. _jbd2_checksum_type:
+
+Journal checksum type codes are one of the following. crc32 or crc32c are the
+most likely choices.
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 1
+ - CRC32
+ * - 2
+ - MD5
+ * - 3
+ - SHA1
+ * - 4
+ - CRC32C
+
+Descriptor Block
+~~~~~~~~~~~~~~~~
+
+The descriptor block contains an array of journal block tags that
+describe the final locations of the data blocks that follow in the
+journal. Descriptor blocks are open-coded instead of being completely
+described by a data structure, but here is the block structure anyway.
+Descriptor blocks consume at least 36 bytes, but use a full block:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Descriptor
+ * - 0x0
+ - journal\_header\_t
+ - (open coded)
+ - Common block header.
+ * - 0xC
+ - struct journal\_block\_tag\_s
+ - open coded array[]
+ - Enough tags either to fill up the block or to describe all the data
+ blocks that follow this descriptor block.
+
+Journal block tags have any of the following formats, depending on which
+journal feature and block tag flags are set.
+
+If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is
+defined as ``struct journal_block_tag3_s``, which looks like the
+following. The size is 16 or 32 bytes.
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Descriptor
+ * - 0x0
+ - \_\_be32
+ - t\_blocknr
+ - Lower 32-bits of the location of where the corresponding data block
+ should end up on disk.
+ * - 0x4
+ - \_\_be32
+ - t\_flags
+ - Flags that go with the descriptor. See the table jbd2_tag_flags_ for
+ more info.
+ * - 0x8
+ - \_\_be32
+ - t\_blocknr\_high
+ - Upper 32-bits of the location of where the corresponding data block
+ should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is
+ not enabled.
+ * - 0xC
+ - \_\_be32
+ - t\_checksum
+ - Checksum of the journal UUID, the sequence number, and the data block.
+ * -
+ -
+ -
+ - This field appears to be open coded. It always comes at the end of the
+ tag, after t_checksum. This field is not present if the "same UUID" flag
+ is set.
+ * - 0x8 or 0xC
+ - char
+ - uuid[16]
+ - A UUID to go with this tag. This field appears to be copied from the
+ ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that
+ field.
+
+.. _jbd2_tag_flags:
+
+The journal tag flags are any combination of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - On-disk block is escaped. The first four bytes of the data block just
+ happened to match the jbd2 magic number.
+ * - 0x2
+ - This block has the same UUID as previous, therefore the UUID field is
+ omitted.
+ * - 0x4
+ - The data block was deleted by the transaction. (Not used?)
+ * - 0x8
+ - This is the last tag in this descriptor block.
+
+If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag
+is defined as ``struct journal_block_tag_s``, which looks like the
+following. The size is 8, 12, 24, or 28 bytes:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Descriptor
+ * - 0x0
+ - \_\_be32
+ - t\_blocknr
+ - Lower 32-bits of the location of where the corresponding data block
+ should end up on disk.
+ * - 0x4
+ - \_\_be16
+ - t\_checksum
+ - Checksum of the journal UUID, the sequence number, and the data block.
+ Note that only the lower 16 bits are stored.
+ * - 0x6
+ - \_\_be16
+ - t\_flags
+ - Flags that go with the descriptor. See the table jbd2_tag_flags_ for
+ more info.
+ * -
+ -
+ -
+ - This next field is only present if the super block indicates support for
+ 64-bit block numbers.
+ * - 0x8
+ - \_\_be32
+ - t\_blocknr\_high
+ - Upper 32-bits of the location of where the corresponding data block
+ should end up on disk.
+ * -
+ -
+ -
+ - This field appears to be open coded. It always comes at the end of the
+ tag, after t_flags or t_blocknr_high. This field is not present if the
+ "same UUID" flag is set.
+ * - 0x8 or 0xC
+ - char
+ - uuid[16]
+ - A UUID to go with this tag. This field appears to be copied from the
+ ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that
+ field.
+
+If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or
+JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a
+``struct jbd2_journal_block_tail``, which looks like this:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Descriptor
+ * - 0x0
+ - \_\_be32
+ - t\_checksum
+ - Checksum of the journal UUID + the descriptor block, with this field set
+ to zero.
+
+Data Block
+~~~~~~~~~~
+
+In general, the data blocks being written to disk through the journal
+are written verbatim into the journal file after the descriptor block.
+However, if the first four bytes of the block match the jbd2 magic
+number then those four bytes are replaced with zeroes and the “escaped”
+flag is set in the descriptor block tag.
+
+Revocation Block
+~~~~~~~~~~~~~~~~
+
+A revocation block is used to prevent replay of a block in an earlier
+transaction. This is used to mark blocks that were journalled at one
+time but are no longer journalled. Typically this happens if a metadata
+block is freed and re-allocated as a file data block; in this case, a
+journal replay after the file block was written to disk will cause
+corruption.
+
+**NOTE**: This mechanism is NOT used to express “this journal block is
+superseded by this other journal block”, as the author (djwong)
+mistakenly thought. Any block being added to a transaction will cause
+the removal of all existing revocation records for that block.
+
+Revocation blocks are described in
+``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in
+length, but use a full block:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - journal\_header\_t
+ - r\_header
+ - Common block header.
+ * - 0xC
+ - \_\_be32
+ - r\_count
+ - Number of bytes used in this block.
+ * - 0x10
+ - \_\_be32 or \_\_be64
+ - blocks[0]
+ - Blocks to revoke.
+
+After r\_count is a linear array of block numbers that are effectively
+revoked by this transaction. The size of each block number is 8 bytes if
+the superblock advertises 64-bit block number support, or 4 bytes
+otherwise.
+
+If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or
+JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation
+block is a ``struct jbd2_journal_revoke_tail``, which has this format:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_be32
+ - r\_checksum
+ - Checksum of the journal UUID + revocation block
+
+Commit Block
+~~~~~~~~~~~~
+
+The commit block is a sentry that indicates that a transaction has been
+completely written to the journal. Once this commit block reaches the
+journal, the data stored with this transaction can be written to their
+final locations on disk.
+
+The commit block is described by ``struct commit_header``, which is 32
+bytes long (but uses a full block):
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Descriptor
+ * - 0x0
+ - journal\_header\_s
+ - (open coded)
+ - Common block header.
+ * - 0xC
+ - unsigned char
+ - h\_chksum\_type
+ - The type of checksum to use to verify the integrity of the data blocks
+ in the transaction. See jbd2_checksum_type_ for more info.
+ * - 0xD
+ - unsigned char
+ - h\_chksum\_size
+ - The number of bytes used by the checksum. Most likely 4.
+ * - 0xE
+ - unsigned char
+ - h\_padding[2]
+ -
+ * - 0x10
+ - \_\_be32
+ - h\_chksum[JBD2\_CHECKSUM\_BYTES]
+ - 32 bytes of space to store checksums. If
+ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3
+ are set, the first ``__be32`` is the checksum of the journal UUID and
+ the entire commit block, with this field zeroed. If
+ JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the
+ crc32 of all the blocks already written to the transaction.
+ * - 0x30
+ - \_\_be64
+ - h\_commit\_sec
+ - The time that the transaction was committed, in seconds since the epoch.
+ * - 0x38
+ - \_\_be32
+ - h\_commit\_nsec
+ - Nanoseconds component of the above timestamp.
+
diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/ondisk/mmp.rst
new file mode 100644
index 000000000000..b7d7a3137f80
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/mmp.rst
@@ -0,0 +1,77 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Multiple Mount Protection
+-------------------------
+
+Multiple mount protection (MMP) is a feature that protects the
+filesystem against multiple hosts trying to use the filesystem
+simultaneously. When a filesystem is opened (for mounting, or fsck,
+etc.), the MMP code running on the node (call it node A) checks a
+sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the
+open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then
+fsck is (hopefully) running, and open fails immediately. Otherwise, the
+open code will wait for twice the specified MMP check interval and check
+the sequence number again. If the sequence number has changed, then the
+filesystem is active on another machine and the open fails. If the MMP
+code passes all of those checks, a new MMP sequence number is generated
+and written to the MMP block, and the mount proceeds.
+
+While the filesystem is live, the kernel sets up a timer to re-check the
+MMP block at the specified MMP check interval. To perform the re-check,
+the MMP sequence number is re-read; if it does not match the in-memory
+MMP sequence number, then another node (node B) has mounted the
+filesystem, and node A remounts the filesystem read-only. If the
+sequence numbers match, the sequence number is incremented both in
+memory and on disk, and the re-check is complete.
+
+The hostname and device filename are written into the MMP block whenever
+an open operation succeeds. The MMP code does not use these values; they
+are provided purely for informational purposes.
+
+The checksum is calculated against the FS UUID and the MMP structure.
+The MMP structure (``struct mmp_struct``) is as follows:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Type
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - mmp\_magic
+ - Magic number for MMP, 0x004D4D50 (“MMP”).
+ * - 0x4
+ - \_\_le32
+ - mmp\_seq
+ - Sequence number, updated periodically.
+ * - 0x8
+ - \_\_le64
+ - mmp\_time
+ - Time that the MMP block was last updated.
+ * - 0x10
+ - char[64]
+ - mmp\_nodename
+ - Hostname of the node that opened the filesystem.
+ * - 0x50
+ - char[32]
+ - mmp\_bdevname
+ - Block device name of the filesystem.
+ * - 0x70
+ - \_\_le16
+ - mmp\_check\_interval
+ - The MMP re-check interval, in seconds.
+ * - 0x72
+ - \_\_le16
+ - mmp\_pad1
+ - Zero.
+ * - 0x74
+ - \_\_le32[226]
+ - mmp\_pad2
+ - Zero.
+ * - 0x3FC
+ - \_\_le32
+ - mmp\_checksum
+ - Checksum of the MMP block.
diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/ondisk/overview.rst
new file mode 100644
index 000000000000..cbab18baba12
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/overview.rst
@@ -0,0 +1,26 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+High Level Design
+=================
+
+An ext4 file system is split into a series of block groups. To reduce
+performance difficulties due to fragmentation, the block allocator tries
+very hard to keep each file's blocks within the same group, thereby
+reducing seek times. The size of a block group is specified in
+``sb.s_blocks_per_group`` blocks, though it can also calculated as 8 \*
+``block_size_in_bytes``. With the default block size of 4KiB, each group
+will contain 32,768 blocks, for a length of 128MiB. The number of block
+groups is the size of the device divided by the size of a block group.
+
+All fields in ext4 are written to disk in little-endian order. HOWEVER,
+all fields in jbd2 (the journal) are written to disk in big-endian
+order.
+
+.. include:: blocks.rst
+.. include:: blockgroup.rst
+.. include:: special_inodes.rst
+.. include:: allocators.rst
+.. include:: checksums.rst
+.. include:: bigalloc.rst
+.. include:: inlinedata.rst
+.. include:: eainode.rst
diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/ondisk/special_inodes.rst
new file mode 100644
index 000000000000..a82f70c9baeb
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/special_inodes.rst
@@ -0,0 +1,38 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Special inodes
+--------------
+
+ext4 reserves some inode for special features, as follows:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - inode Number
+ - Purpose
+ * - 0
+ - Doesn't exist; there is no inode 0.
+ * - 1
+ - List of defective blocks.
+ * - 2
+ - Root directory.
+ * - 3
+ - User quota.
+ * - 4
+ - Group quota.
+ * - 5
+ - Boot loader.
+ * - 6
+ - Undelete directory.
+ * - 7
+ - Reserved group descriptors inode. (“resize inode”)
+ * - 8
+ - Journal inode.
+ * - 9
+ - The “exclude” inode, for snapshots(?)
+ * - 10
+ - Replica inode, used for some non-upstream feature?
+ * - 11
+ - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock.
+
diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/ondisk/super.rst
new file mode 100644
index 000000000000..5f81dd87e0b9
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/super.rst
@@ -0,0 +1,801 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Super Block
+-----------
+
+The superblock records various information about the enclosing
+filesystem, such as block counts, inode counts, supported features,
+maintenance information, and more.
+
+If the sparse\_super feature flag is set, redundant copies of the
+superblock and group descriptors are kept only in the groups whose group
+number is either 0 or a power of 3, 5, or 7. If the flag is not set,
+redundant copies are kept in all groups.
+
+The superblock checksum is calculated against the superblock structure,
+which includes the FS UUID.
+
+The ext4 superblock is laid out as follows in
+``struct ext4_super_block``:
+
+.. list-table::
+ :widths: 1 1 1 77
+ :header-rows: 1
+
+ * - Offset
+ - Size
+ - Name
+ - Description
+ * - 0x0
+ - \_\_le32
+ - s\_inodes\_count
+ - Total inode count.
+ * - 0x4
+ - \_\_le32
+ - s\_blocks\_count\_lo
+ - Total block count.
+ * - 0x8
+ - \_\_le32
+ - s\_r\_blocks\_count\_lo
+ - This number of blocks can only be allocated by the super-user.
+ * - 0xC
+ - \_\_le32
+ - s\_free\_blocks\_count\_lo
+ - Free block count.
+ * - 0x10
+ - \_\_le32
+ - s\_free\_inodes\_count
+ - Free inode count.
+ * - 0x14
+ - \_\_le32
+ - s\_first\_data\_block
+ - First data block. This must be at least 1 for 1k-block filesystems and
+ is typically 0 for all other block sizes.
+ * - 0x18
+ - \_\_le32
+ - s\_log\_block\_size
+ - Block size is 2 ^ (10 + s\_log\_block\_size).
+ * - 0x1C
+ - \_\_le32
+ - s\_log\_cluster\_size
+ - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is
+ enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size.
+ * - 0x20
+ - \_\_le32
+ - s\_blocks\_per\_group
+ - Blocks per group.
+ * - 0x24
+ - \_\_le32
+ - s\_clusters\_per\_group
+ - Clusters per group, if bigalloc is enabled. Otherwise
+ s\_clusters\_per\_group must equal s\_blocks\_per\_group.
+ * - 0x28
+ - \_\_le32
+ - s\_inodes\_per\_group
+ - Inodes per group.
+ * - 0x2C
+ - \_\_le32
+ - s\_mtime
+ - Mount time, in seconds since the epoch.
+ * - 0x30
+ - \_\_le32
+ - s\_wtime
+ - Write time, in seconds since the epoch.
+ * - 0x34
+ - \_\_le16
+ - s\_mnt\_count
+ - Number of mounts since the last fsck.
+ * - 0x36
+ - \_\_le16
+ - s\_max\_mnt\_count
+ - Number of mounts beyond which a fsck is needed.
+ * - 0x38
+ - \_\_le16
+ - s\_magic
+ - Magic signature, 0xEF53
+ * - 0x3A
+ - \_\_le16
+ - s\_state
+ - File system state. See super_state_ for more info.
+ * - 0x3C
+ - \_\_le16
+ - s\_errors
+ - Behaviour when detecting errors. See super_errors_ for more info.
+ * - 0x3E
+ - \_\_le16
+ - s\_minor\_rev\_level
+ - Minor revision level.
+ * - 0x40
+ - \_\_le32
+ - s\_lastcheck
+ - Time of last check, in seconds since the epoch.
+ * - 0x44
+ - \_\_le32
+ - s\_checkinterval
+ - Maximum time between checks, in seconds.
+ * - 0x48
+ - \_\_le32
+ - s\_creator\_os
+ - Creator OS. See the table super_creator_ for more info.
+ * - 0x4C
+ - \_\_le32
+ - s\_rev\_level
+ - Revision level. See the table super_revision_ for more info.
+ * - 0x50
+ - \_\_le16
+ - s\_def\_resuid
+ - Default uid for reserved blocks.
+ * - 0x52
+ - \_\_le16
+ - s\_def\_resgid
+ - Default gid for reserved blocks.
+ * -
+ -
+ -
+ - These fields are for EXT4_DYNAMIC_REV superblocks only.
+
+ Note: the difference between the compatible feature set and the
+ incompatible feature set is that if there is a bit set in the
+ incompatible feature set that the kernel doesn't know about, it should
+ refuse to mount the filesystem.
+
+ e2fsck's requirements are more strict; if it doesn't know
+ about a feature in either the compatible or incompatible feature set, it
+ must abort and not try to meddle with things it doesn't understand...
+ * - 0x54
+ - \_\_le32
+ - s\_first\_ino
+ - First non-reserved inode.
+ * - 0x58
+ - \_\_le16
+ - s\_inode\_size
+ - Size of inode structure, in bytes.
+ * - 0x5A
+ - \_\_le16
+ - s\_block\_group\_nr
+ - Block group # of this superblock.
+ * - 0x5C
+ - \_\_le32
+ - s\_feature\_compat
+ - Compatible feature set flags. Kernel can still read/write this fs even
+ if it doesn't understand a flag; fsck should not do that. See the
+ super_compat_ table for more info.
+ * - 0x60
+ - \_\_le32
+ - s\_feature\_incompat
+ - Incompatible feature set. If the kernel or fsck doesn't understand one
+ of these bits, it should stop. See the super_incompat_ table for more
+ info.
+ * - 0x64
+ - \_\_le32
+ - s\_feature\_ro\_compat
+ - Readonly-compatible feature set. If the kernel doesn't understand one of
+ these bits, it can still mount read-only. See the super_rocompat_ table
+ for more info.
+ * - 0x68
+ - \_\_u8
+ - s\_uuid[16]
+ - 128-bit UUID for volume.
+ * - 0x78
+ - char
+ - s\_volume\_name[16]
+ - Volume label.
+ * - 0x88
+ - char
+ - s\_last\_mounted[64]
+ - Directory where filesystem was last mounted.
+ * - 0xC8
+ - \_\_le32
+ - s\_algorithm\_usage\_bitmap
+ - For compression (Not used in e2fsprogs/Linux)
+ * -
+ -
+ -
+ - Performance hints. Directory preallocation should only happen if the
+ EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+ * - 0xCC
+ - \_\_u8
+ - s\_prealloc\_blocks
+ - #. of blocks to try to preallocate for ... files? (Not used in
+ e2fsprogs/Linux)
+ * - 0xCD
+ - \_\_u8
+ - s\_prealloc\_dir\_blocks
+ - #. of blocks to preallocate for directories. (Not used in
+ e2fsprogs/Linux)
+ * - 0xCE
+ - \_\_le16
+ - s\_reserved\_gdt\_blocks
+ - Number of reserved GDT entries for future filesystem expansion.
+ * -
+ -
+ -
+ - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is
+ set.
+ * - 0xD0
+ - \_\_u8
+ - s\_journal\_uuid[16]
+ - UUID of journal superblock
+ * - 0xE0
+ - \_\_le32
+ - s\_journal\_inum
+ - inode number of journal file.
+ * - 0xE4
+ - \_\_le32
+ - s\_journal\_dev
+ - Device number of journal file, if the external journal feature flag is
+ set.
+ * - 0xE8
+ - \_\_le32
+ - s\_last\_orphan
+ - Start of list of orphaned inodes to delete.
+ * - 0xEC
+ - \_\_le32
+ - s\_hash\_seed[4]
+ - HTREE hash seed.
+ * - 0xFC
+ - \_\_u8
+ - s\_def\_hash\_version
+ - Default hash algorithm to use for directory hashes. See super_def_hash_
+ for more info.
+ * - 0xFD
+ - \_\_u8
+ - s\_jnl\_backup\_type
+ - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the
+ ``s_jnl_blocks`` field contains a duplicate copy of the inode's
+ ``i_block[]`` array and ``i_size``.
+ * - 0xFE
+ - \_\_le16
+ - s\_desc\_size
+ - Size of group descriptors, in bytes, if the 64bit incompat feature flag
+ is set.
+ * - 0x100
+ - \_\_le32
+ - s\_default\_mount\_opts
+ - Default mount options. See the super_mountopts_ table for more info.
+ * - 0x104
+ - \_\_le32
+ - s\_first\_meta\_bg
+ - First metablock block group, if the meta\_bg feature is enabled.
+ * - 0x108
+ - \_\_le32
+ - s\_mkfs\_time
+ - When the filesystem was created, in seconds since the epoch.
+ * - 0x10C
+ - \_\_le32
+ - s\_jnl\_blocks[17]
+ - Backup copy of the journal inode's ``i_block[]`` array in the first 15
+ elements and i\_size\_high and i\_size in the 16th and 17th elements,
+ respectively.
+ * -
+ -
+ -
+ - 64bit support is valid only if EXT4_FEATURE_COMPAT_64BIT is set.
+ * - 0x150
+ - \_\_le32
+ - s\_blocks\_count\_hi
+ - High 32-bits of the block count.
+ * - 0x154
+ - \_\_le32
+ - s\_r\_blocks\_count\_hi
+ - High 32-bits of the reserved block count.
+ * - 0x158
+ - \_\_le32
+ - s\_free\_blocks\_count\_hi
+ - High 32-bits of the free block count.
+ * - 0x15C
+ - \_\_le16
+ - s\_min\_extra\_isize
+ - All inodes have at least # bytes.
+ * - 0x15E
+ - \_\_le16
+ - s\_want\_extra\_isize
+ - New inodes should reserve # bytes.
+ * - 0x160
+ - \_\_le32
+ - s\_flags
+ - Miscellaneous flags. See the super_flags_ table for more info.
+ * - 0x164
+ - \_\_le16
+ - s\_raid\_stride
+ - RAID stride. This is the number of logical blocks read from or written
+ to the disk before moving to the next disk. This affects the placement
+ of filesystem metadata, which will hopefully make RAID storage faster.
+ * - 0x166
+ - \_\_le16
+ - s\_mmp\_interval
+ - #. seconds to wait in multi-mount prevention (MMP) checking. In theory,
+ MMP is a mechanism to record in the superblock which host and device
+ have mounted the filesystem, in order to prevent multiple mounts. This
+ feature does not seem to be implemented...
+ * - 0x168
+ - \_\_le64
+ - s\_mmp\_block
+ - Block # for multi-mount protection data.
+ * - 0x170
+ - \_\_le32
+ - s\_raid\_stripe\_width
+ - RAID stripe width. This is the number of logical blocks read from or
+ written to the disk before coming back to the current disk. This is used
+ by the block allocator to try to reduce the number of read-modify-write
+ operations in a RAID5/6.
+ * - 0x174
+ - \_\_u8
+ - s\_log\_groups\_per\_flex
+ - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``.
+ * - 0x175
+ - \_\_u8
+ - s\_checksum\_type
+ - Metadata checksum algorithm type. The only valid value is 1 (crc32c).
+ * - 0x176
+ - \_\_le16
+ - s\_reserved\_pad
+ -
+ * - 0x178
+ - \_\_le64
+ - s\_kbytes\_written
+ - Number of KiB written to this filesystem over its lifetime.
+ * - 0x180
+ - \_\_le32
+ - s\_snapshot\_inum
+ - inode number of active snapshot. (Not used in e2fsprogs/Linux.)
+ * - 0x184
+ - \_\_le32
+ - s\_snapshot\_id
+ - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.)
+ * - 0x188
+ - \_\_le64
+ - s\_snapshot\_r\_blocks\_count
+ - Number of blocks reserved for active snapshot's future use. (Not used in
+ e2fsprogs/Linux.)
+ * - 0x190
+ - \_\_le32
+ - s\_snapshot\_list
+ - inode number of the head of the on-disk snapshot list. (Not used in
+ e2fsprogs/Linux.)
+ * - 0x194
+ - \_\_le32
+ - s\_error\_count
+ - Number of errors seen.
+ * - 0x198
+ - \_\_le32
+ - s\_first\_error\_time
+ - First time an error happened, in seconds since the epoch.
+ * - 0x19C
+ - \_\_le32
+ - s\_first\_error\_ino
+ - inode involved in first error.
+ * - 0x1A0
+ - \_\_le64
+ - s\_first\_error\_block
+ - Number of block involved of first error.
+ * - 0x1A8
+ - \_\_u8
+ - s\_first\_error\_func[32]
+ - Name of function where the error happened.
+ * - 0x1C8
+ - \_\_le32
+ - s\_first\_error\_line
+ - Line number where error happened.
+ * - 0x1CC
+ - \_\_le32
+ - s\_last\_error\_time
+ - Time of most recent error, in seconds since the epoch.
+ * - 0x1D0
+ - \_\_le32
+ - s\_last\_error\_ino
+ - inode involved in most recent error.
+ * - 0x1D4
+ - \_\_le32
+ - s\_last\_error\_line
+ - Line number where most recent error happened.
+ * - 0x1D8
+ - \_\_le64
+ - s\_last\_error\_block
+ - Number of block involved in most recent error.
+ * - 0x1E0
+ - \_\_u8
+ - s\_last\_error\_func[32]
+ - Name of function where the most recent error happened.
+ * - 0x200
+ - \_\_u8
+ - s\_mount\_opts[64]
+ - ASCIIZ string of mount options.
+ * - 0x240
+ - \_\_le32
+ - s\_usr\_quota\_inum
+ - Inode number of user `quota <quota>`__ file.
+ * - 0x244
+ - \_\_le32
+ - s\_grp\_quota\_inum
+ - Inode number of group `quota <quota>`__ file.
+ * - 0x248
+ - \_\_le32
+ - s\_overhead\_blocks
+ - Overhead blocks/clusters in fs. (Huh? This field is always zero, which
+ means that the kernel calculates it dynamically.)
+ * - 0x24C
+ - \_\_le32
+ - s\_backup\_bgs[2]
+ - Block groups containing superblock backups (if sparse\_super2)
+ * - 0x254
+ - \_\_u8
+ - s\_encrypt\_algos[4]
+ - Encryption algorithms in use. There can be up to four algorithms in use
+ at any time; valid algorithm codes are given in the super_encrypt_ table
+ below.
+ * - 0x258
+ - \_\_u8
+ - s\_encrypt\_pw\_salt[16]
+ - Salt for the string2key algorithm for encryption.
+ * - 0x268
+ - \_\_le32
+ - s\_lpf\_ino
+ - Inode number of lost+found
+ * - 0x26C
+ - \_\_le32
+ - s\_prj\_quota\_inum
+ - Inode that tracks project quotas.
+ * - 0x270
+ - \_\_le32
+ - s\_checksum\_seed
+ - Checksum seed used for metadata\_csum calculations. This value is
+ crc32c(~0, $orig\_fs\_uuid).
+ * - 0x274
+ - \_\_u8
+ - s\_wtime_hi
+ - Upper 8 bits of the s_wtime field.
+ * - 0x275
+ - \_\_u8
+ - s\_wtime_hi
+ - Upper 8 bits of the s_mtime field.
+ * - 0x276
+ - \_\_u8
+ - s\_mkfs_time_hi
+ - Upper 8 bits of the s_mkfs_time field.
+ * - 0x277
+ - \_\_u8
+ - s\_lastcheck_hi
+ - Upper 8 bits of the s_lastcheck_hi field.
+ * - 0x278
+ - \_\_u8
+ - s\_first_error_time_hi
+ - Upper 8 bits of the s_first_error_time_hi field.
+ * - 0x279
+ - \_\_u8
+ - s\_last_error_time_hi
+ - Upper 8 bits of the s_last_error_time_hi field.
+ * - 0x27A
+ - \_\_u8[2]
+ - s\_pad
+ - Zero padding.
+ * - 0x27C
+ - \_\_le32
+ - s\_reserved[96]
+ - Padding to the end of the block.
+ * - 0x3FC
+ - \_\_le32
+ - s\_checksum
+ - Superblock checksum.
+
+.. _super_state:
+
+The superblock state is some combination of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x0001
+ - Cleanly umounted
+ * - 0x0002
+ - Errors detected
+ * - 0x0004
+ - Orphans being recovered
+
+.. _super_errors:
+
+The superblock error policy is one of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 1
+ - Continue
+ * - 2
+ - Remount read-only
+ * - 3
+ - Panic
+
+.. _super_creator:
+
+The filesystem creator is one of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0
+ - Linux
+ * - 1
+ - Hurd
+ * - 2
+ - Masix
+ * - 3
+ - FreeBSD
+ * - 4
+ - Lites
+
+.. _super_revision:
+
+The superblock revision is one of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0
+ - Original format
+ * - 1
+ - v2 format w/ dynamic inode sizes
+
+Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem.
+
+.. _super_compat:
+
+The superblock compatible features field is a combination of any of the
+following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - Directory preallocation (COMPAT\_DIR\_PREALLOC).
+ * - 0x2
+ - “imagic inodes”. Not clear from the code what this does
+ (COMPAT\_IMAGIC\_INODES).
+ * - 0x4
+ - Has a journal (COMPAT\_HAS\_JOURNAL).
+ * - 0x8
+ - Supports extended attributes (COMPAT\_EXT\_ATTR).
+ * - 0x10
+ - Has reserved GDT blocks for filesystem expansion
+ (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER.
+ * - 0x20
+ - Has directory indices (COMPAT\_DIR\_INDEX).
+ * - 0x40
+ - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized
+ block groups? (COMPAT\_LAZY\_BG)
+ * - 0x80
+ - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE).
+ * - 0x100
+ - “Exclude bitmap”. Seems to be used to indicate the presence of
+ snapshot-related exclude bitmaps? Not defined in kernel or used in
+ e2fsprogs (COMPAT\_EXCLUDE\_BITMAP).
+ * - 0x200
+ - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs
+ points to the two block groups that contain backup superblocks
+ (COMPAT\_SPARSE\_SUPER2).
+
+.. _super_incompat:
+
+The superblock incompatible features field is a combination of any of the
+following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - Compression (INCOMPAT\_COMPRESSION).
+ * - 0x2
+ - Directory entries record the file type. See ext4\_dir\_entry\_2 below
+ (INCOMPAT\_FILETYPE).
+ * - 0x4
+ - Filesystem needs recovery (INCOMPAT\_RECOVER).
+ * - 0x8
+ - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV).
+ * - 0x10
+ - Meta block groups. See the earlier discussion of this feature
+ (INCOMPAT\_META\_BG).
+ * - 0x40
+ - Files in this filesystem use extents (INCOMPAT\_EXTENTS).
+ * - 0x80
+ - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT).
+ * - 0x100
+ - Multiple mount protection. Not implemented (INCOMPAT\_MMP).
+ * - 0x200
+ - Flexible block groups. See the earlier discussion of this feature
+ (INCOMPAT\_FLEX\_BG).
+ * - 0x400
+ - Inodes can be used to store large extended attribute values
+ (INCOMPAT\_EA\_INODE).
+ * - 0x1000
+ - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?)
+ * - 0x2000
+ - Metadata checksum seed is stored in the superblock. This feature enables
+ the administrator to change the UUID of a metadata\_csum filesystem
+ while the filesystem is mounted; without it, the checksum definition
+ requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED).
+ * - 0x4000
+ - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to
+ this feature, directories could not be larger than 4GiB and could not
+ have an htree more than 2 levels deep. If this feature is enabled,
+ directories can be larger than 4GiB and have a maximum htree depth of 3.
+ * - 0x8000
+ - Data in inode (INCOMPAT\_INLINE\_DATA).
+ * - 0x10000
+ - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT).
+
+.. _super_rocompat:
+
+The superblock read-only compatible features field is a combination of any of
+the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x1
+ - Sparse superblocks. See the earlier discussion of this feature
+ (RO\_COMPAT\_SPARSE\_SUPER).
+ * - 0x2
+ - This filesystem has been used to store a file greater than 2GiB
+ (RO\_COMPAT\_LARGE\_FILE).
+ * - 0x4
+ - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR).
+ * - 0x8
+ - This filesystem has files whose sizes are represented in units of
+ logical blocks, not 512-byte sectors. This implies a very large file
+ indeed! (RO\_COMPAT\_HUGE\_FILE)
+ * - 0x10
+ - Group descriptors have checksums. In addition to detecting corruption,
+ this is useful for lazy formatting with uninitialized groups
+ (RO\_COMPAT\_GDT\_CSUM).
+ * - 0x20
+ - Indicates that the old ext3 32,000 subdirectory limit no longer applies
+ (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1
+ if it is incremented past 64,999.
+ * - 0x40
+ - Indicates that large inodes exist on this filesystem
+ (RO\_COMPAT\_EXTRA\_ISIZE).
+ * - 0x80
+ - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT).
+ * - 0x100
+ - `Quota <Quota>`__ (RO\_COMPAT\_QUOTA).
+ * - 0x200
+ - This filesystem supports “bigalloc”, which means that file extents are
+ tracked in units of clusters (of blocks) instead of blocks
+ (RO\_COMPAT\_BIGALLOC).
+ * - 0x400
+ - This filesystem supports metadata checksumming.
+ (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though
+ GDT\_CSUM must not be set)
+ * - 0x800
+ - Filesystem supports replicas. This feature is neither in the kernel nor
+ e2fsprogs. (RO\_COMPAT\_REPLICA)
+ * - 0x1000
+ - Read-only filesystem image; the kernel will not mount this image
+ read-write and most tools will refuse to write to the image.
+ (RO\_COMPAT\_READONLY)
+ * - 0x2000
+ - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT)
+
+.. _super_def_hash:
+
+The ``s_def_hash_version`` field is one of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x0
+ - Legacy.
+ * - 0x1
+ - Half MD4.
+ * - 0x2
+ - Tea.
+ * - 0x3
+ - Legacy, unsigned.
+ * - 0x4
+ - Half MD4, unsigned.
+ * - 0x5
+ - Tea, unsigned.
+
+.. _super_mountopts:
+
+The ``s_default_mount_opts`` field is any combination of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x0001
+ - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG)
+ * - 0x0002
+ - New files take the gid of the containing directory (instead of the fsgid
+ of the current process). (EXT4\_DEFM\_BSDGROUPS)
+ * - 0x0004
+ - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER)
+ * - 0x0008
+ - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL)
+ * - 0x0010
+ - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16)
+ * - 0x0020
+ - All data and metadata are commited to the journal.
+ (EXT4\_DEFM\_JMODE\_DATA)
+ * - 0x0040
+ - All data are flushed to the disk before metadata are committed to the
+ journal. (EXT4\_DEFM\_JMODE\_ORDERED)
+ * - 0x0060
+ - Data ordering is not preserved; data may be written after the metadata
+ has been written. (EXT4\_DEFM\_JMODE\_WBACK)
+ * - 0x0100
+ - Disable write flushes. (EXT4\_DEFM\_NOBARRIER)
+ * - 0x0200
+ - Track which blocks in a filesystem are metadata and therefore should not
+ be used as data blocks. This option will be enabled by default on 3.18,
+ hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY)
+ * - 0x0400
+ - Enable DISCARD support, where the storage device is told about blocks
+ becoming unused. (EXT4\_DEFM\_DISCARD)
+ * - 0x0800
+ - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC)
+
+.. _super_flags:
+
+The ``s_flags`` field is any combination of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0x0001
+ - Signed directory hash in use.
+ * - 0x0002
+ - Unsigned directory hash in use.
+ * - 0x0004
+ - To test development code.
+
+.. _super_encrypt:
+
+The ``s_encrypt_algos`` list can contain any of the following:
+
+.. list-table::
+ :widths: 1 79
+ :header-rows: 1
+
+ * - Value
+ - Description
+ * - 0
+ - Invalid algorithm (ENCRYPTION\_MODE\_INVALID).
+ * - 1
+ - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS).
+ * - 2
+ - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM).
+ * - 3
+ - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC).
+
+Total size of the superblock is 1024 bytes.
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 53b89d0edc15..46d1b1be3a51 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -71,6 +71,39 @@ Other Functions
.. kernel-doc:: fs/block_dev.c
:export:
+.. kernel-doc:: fs/anon_inodes.c
+ :export:
+
+.. kernel-doc:: fs/attr.c
+ :export:
+
+.. kernel-doc:: fs/d_path.c
+ :export:
+
+.. kernel-doc:: fs/dax.c
+ :export:
+
+.. kernel-doc:: fs/direct-io.c
+ :export:
+
+.. kernel-doc:: fs/file_table.c
+ :export:
+
+.. kernel-doc:: fs/libfs.c
+ :export:
+
+.. kernel-doc:: fs/posix_acl.c
+ :export:
+
+.. kernel-doc:: fs/stat.c
+ :export:
+
+.. kernel-doc:: fs/sync.c
+ :export:
+
+.. kernel-doc:: fs/xattr.c
+ :export:
+
The proc filesystem
===================
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 17bb4dc28fae..7b7b845c490a 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -602,3 +602,23 @@ in your dentry operations instead.
dentry separately, and it now has request_mask and query_flags arguments
to specify the fields and sync type requested by statx. Filesystems not
supporting any statx-specific features may ignore the new arguments.
+--
+[mandatory]
+ ->atomic_open() calling conventions have changed. Gone is int *opened,
+ along with FILE_OPENED/FILE_CREATED. In place of those we have
+ FMODE_OPENED/FMODE_CREATED, set in file->f_mode. Additionally, return
+ value for 'called finish_no_open(), open it yourself' case has become
+ 0, not 1. Since finish_no_open() itself is returning 0 now, that part
+ does not need any changes in ->atomic_open() instances.
+--
+[mandatory]
+ alloc_file() has become static now; two wrappers are to be used instead.
+ alloc_file_pseudo(inode, vfsmount, name, flags, ops) is for the cases
+ when dentry needs to be created; that's the majority of old alloc_file()
+ users. Calling conventions: on success a reference to new struct file
+ is returned and callers reference to inode is subsumed by that. On
+ failure, ERR_PTR() is returned and no caller's references are affected,
+ so the caller needs to drop the inode reference it held.
+ alloc_file_clone(file, flags, ops) does not affect any caller's references.
+ On success you get a new struct file sharing the mount/dentry with the
+ original, on failure - ERR_PTR().
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f608180ad59d..85907d5b9c2c 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -386,7 +386,7 @@ struct inode_operations {
ssize_t (*listxattr) (struct dentry *, char *, size_t);
void (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *, struct file *,
- unsigned open_flag, umode_t create_mode, int *opened);
+ unsigned open_flag, umode_t create_mode);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
};
@@ -496,13 +496,15 @@ otherwise noted.
atomic_open: called on the last component of an open. Using this optional
method the filesystem can look up, possibly create and open the file in
- one atomic operation. If it cannot perform this (e.g. the file type
- turned out to be wrong) it may signal this by returning 1 instead of
- usual 0 or -ve . This method is only called if the last component is
- negative or needs lookup. Cached positive dentries are still handled by
- f_op->open(). If the file was created, the FILE_CREATED flag should be
- set in "opened". In case of O_EXCL the method must only succeed if the
- file didn't exist and hence FILE_CREATED shall always be set on success.
+ one atomic operation. If it wants to leave actual opening to the
+ caller (e.g. if the file turned out to be a symlink, device, or just
+ something filesystem won't do atomic open for), it may signal this by
+ returning finish_no_open(file, dentry). This method is only called if
+ the last component is negative or needs lookup. Cached positive dentries
+ are still handled by f_op->open(). If the file was created,
+ FMODE_CREATED flag should be set in file->f_mode. In case of O_EXCL
+ the method must only succeed if the file didn't exist and hence FMODE_CREATED
+ shall always be set on success.
tmpfile: called in the end of O_TMPFILE open(). Optional, equivalent to
atomically creating, opening and unlinking a file in given directory.
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 4d9ff0a7f8e1..a9ae82fb9d13 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -223,8 +223,6 @@ Deprecated Mount Options
Name Removal Schedule
---- ----------------
- barrier no earlier than v4.15
- nobarrier no earlier than v4.15
Removed Mount Options
@@ -236,6 +234,8 @@ Removed Mount Options
ihashsize v4.0
irixsgid v4.0
osyncisdsync/osyncisosync v4.0
+ barrier v4.19
+ nobarrier v4.19
sysctls
diff --git a/Documentation/hwmon/max34440 b/Documentation/hwmon/max34440
index 9ba6587b7657..b2de8fa49273 100644
--- a/Documentation/hwmon/max34440
+++ b/Documentation/hwmon/max34440
@@ -16,6 +16,11 @@ Supported chips:
Prefixes: 'max34446'
Addresses scanned: -
Datasheet: http://datasheets.maximintegrated.com/en/ds/MAX34446.pdf
+ * Maxim MAX34451
+ PMBus 16-Channel V/I Monitor and 12-Channel Sequencer/Marginer
+ Prefixes: 'max34451'
+ Addresses scanned: -
+ Datasheet: http://datasheets.maximintegrated.com/en/ds/MAX34451.pdf
* Maxim MAX34460
PMBus 12-Channel Voltage Monitor & Sequencer
Prefix: 'max34460'
@@ -36,9 +41,10 @@ Description
This driver supports hardware monitoring for Maxim MAX34440 PMBus 6-Channel
Power-Supply Manager, MAX34441 PMBus 5-Channel Power-Supply Manager
and Intelligent Fan Controller, and MAX34446 PMBus Power-Supply Data Logger.
-It also supports the MAX34460 and MAX34461 PMBus Voltage Monitor & Sequencers.
-The MAX34460 supports 12 voltage channels, and the MAX34461 supports 16 voltage
-channels.
+It also supports the MAX34451, MAX34460, and MAX34461 PMBus Voltage Monitor &
+Sequencers. The MAX34451 supports monitoring voltage or current of 12 channels
+based on GIN pins. The MAX34460 supports 12 voltage channels, and the MAX34461
+supports 16 voltage channels.
The driver is a client driver to the core PMBus driver. Please see
Documentation/hwmon/pmbus for details on PMBus client drivers.
@@ -93,7 +99,7 @@ curr[1-6]_max Maximum current. From IOUT_OC_WARN_LIMIT register.
curr[1-6]_crit Critical maximum current. From IOUT_OC_FAULT_LIMIT register.
curr[1-6]_max_alarm Current high alarm. From IOUT_OC_WARNING status.
curr[1-6]_crit_alarm Current critical high alarm. From IOUT_OC_FAULT status.
-curr[1-4]_average Historical average current (MAX34446 only).
+curr[1-4]_average Historical average current (MAX34446/34451 only).
curr[1-6]_highest Historical maximum current.
curr[1-6]_reset_history Write any value to reset history.
@@ -123,5 +129,7 @@ temp[1-8]_reset_history Write any value to reset history.
temp7 and temp8 attributes only exist for MAX34440.
MAX34446 only supports temp[1-3].
+MAX34451 supports attribute groups in[1-16] (or curr[1-16] based on input pins)
+and temp[1-5].
MAX34460 supports attribute groups in[1-12] and temp[1-5].
MAX34461 supports attribute groups in[1-16] and temp[1-5].
diff --git a/Documentation/hwmon/mlxreg-fan b/Documentation/hwmon/mlxreg-fan
new file mode 100644
index 000000000000..fc531c6978d4
--- /dev/null
+++ b/Documentation/hwmon/mlxreg-fan
@@ -0,0 +1,60 @@
+Kernel driver mlxreg-fan
+========================
+
+Provides FAN control for the next Mellanox systems:
+QMB700, equipped with 40x200GbE InfiniBand ports;
+MSN3700, equipped with 32x200GbE or 16x400GbE Ethernet ports;
+MSN3410, equipped with 6x400GbE plus 48x50GbE Ethernet ports;
+MSN3800, equipped with 64x1000GbE Ethernet ports;
+These are the Top of the Rack systems, equipped with Mellanox switch
+board with Mellanox Quantum or Spectrume-2 devices.
+FAN controller is implemented by the programmable device logic.
+
+The default registers offsets set within the programmable device is as
+following:
+- pwm1 0xe3
+- fan1 (tacho1) 0xe4
+- fan2 (tacho2) 0xe5
+- fan3 (tacho3) 0xe6
+- fan4 (tacho4) 0xe7
+- fan5 (tacho5) 0xe8
+- fan6 (tacho6) 0xe9
+- fan7 (tacho7) 0xea
+- fan8 (tacho8) 0xeb
+- fan9 (tacho9) 0xec
+- fan10 (tacho10) 0xed
+- fan11 (tacho11) 0xee
+- fan12 (tacho12) 0xef
+This setup can be re-programmed with other registers.
+
+Author: Vadim Pasternak <vadimp@mellanox.com>
+
+Description
+-----------
+
+The driver implements a simple interface for driving a fan connected to
+a PWM output and tachometer inputs.
+This driver obtains PWM and tachometers registers location according to
+the system configuration and creates FAN/PWM hwmon objects and a cooling
+device. PWM and tachometers are sensed through the on-board programmable
+device, which exports its register map. This device could be attached to
+any bus type, for which register mapping is supported.
+Single instance is created with one PWM control, up to 12 tachometers and
+one cooling device. It could be as many instances as programmable device
+supports.
+The driver exposes the fan to the user space through the hwmon's and
+thermal's sysfs interfaces.
+
+/sys files in hwmon subsystem
+-----------------------------
+
+fan[1-12]_fault - RO files for tachometers TACH1-TACH12 fault indication
+fan[1-12]_input - RO files for tachometers TACH1-TACH12 input (in RPM)
+pwm1 - RW file for fan[1-12] target duty cycle (0..255)
+
+/sys files in thermal subsystem
+-------------------------------
+
+cur_state - RW file for current cooling state of the cooling device
+ (0..max_state)
+max_state - RO file for maximum cooling state of the cooling device
diff --git a/Documentation/hwmon/npcm750-pwm-fan b/Documentation/hwmon/npcm750-pwm-fan
new file mode 100644
index 000000000000..6156ef7398e6
--- /dev/null
+++ b/Documentation/hwmon/npcm750-pwm-fan
@@ -0,0 +1,22 @@
+Kernel driver npcm750-pwm-fan
+=============================
+
+Supported chips:
+ NUVOTON NPCM750/730/715/705
+
+Authors:
+ <tomer.maimon@nuvoton.com>
+
+Description:
+------------
+This driver implements support for NUVOTON NPCM7XX PWM and Fan Tacho
+controller. The PWM controller supports up to 8 PWM outputs. The Fan tacho
+controller supports up to 16 tachometer inputs.
+
+The driver provides the following sensor accesses in sysfs:
+
+fanX_input ro provide current fan rotation value in RPM as reported
+ by the fan to the device.
+
+pwmX rw get or set PWM fan control value. This is an integer
+ value between 0(off) and 255(full speed).
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index fc337c317c67..2b9e1005d88b 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -171,6 +171,13 @@ in[0-*]_label Suggested voltage channel label.
user-space.
RO
+in[0-*]_enable
+ Enable or disable the sensors.
+ When disabled the sensor read will return -ENODATA.
+ 1: Enable
+ 0: Disable
+ RW
+
cpu[0-*]_vid CPU core reference voltage.
Unit: millivolt
RO
@@ -236,6 +243,13 @@ fan[1-*]_label Suggested fan channel label.
In all other cases, the label is provided by user-space.
RO
+fan[1-*]_enable
+ Enable or disable the sensors.
+ When disabled the sensor read will return -ENODATA.
+ 1: Enable
+ 0: Disable
+ RW
+
Also see the Alarms section for status flags associated with fans.
@@ -409,6 +423,13 @@ temp_reset_history
Reset temp_lowest and temp_highest for all sensors
WO
+temp[1-*]_enable
+ Enable or disable the sensors.
+ When disabled the sensor read will return -ENODATA.
+ 1: Enable
+ 0: Disable
+ RW
+
Some chips measure temperature using external thermistors and an ADC, and
report the temperature measurement as a voltage. Converting this voltage
back to a temperature (or the other way around for limits) requires
@@ -468,6 +489,13 @@ curr_reset_history
Reset currX_lowest and currX_highest for all sensors
WO
+curr[1-*]_enable
+ Enable or disable the sensors.
+ When disabled the sensor read will return -ENODATA.
+ 1: Enable
+ 0: Disable
+ RW
+
Also see the Alarms section for status flags associated with currents.
*********
@@ -566,6 +594,13 @@ power[1-*]_crit Critical maximum power.
Unit: microWatt
RW
+power[1-*]_enable Enable or disable the sensors.
+ When disabled the sensor read will return
+ -ENODATA.
+ 1: Enable
+ 0: Disable
+ RW
+
Also see the Alarms section for status flags associated with power readings.
**********
@@ -576,6 +611,12 @@ energy[1-*]_input Cumulative energy use
Unit: microJoule
RO
+energy[1-*]_enable Enable or disable the sensors.
+ When disabled the sensor read will return
+ -ENODATA.
+ 1: Enable
+ 0: Disable
+ RW
************
* Humidity *
@@ -586,6 +627,13 @@ humidity[1-*]_input Humidity
RO
+humidity[1-*]_enable Enable or disable the sensors
+ When disabled the sensor read will return
+ -ENODATA.
+ 1: Enable
+ 0: Disable
+ RW
+
**********
* Alarms *
**********
diff --git a/Documentation/index.rst b/Documentation/index.rst
index fdc585703498..f95ba981f8cd 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -102,6 +102,17 @@ implementation.
sh/index
+Filesystem Documentation
+------------------------
+
+The documentation in this section are provided by specific filesystem
+subprojects.
+
+.. toctree::
+ :maxdepth: 2
+
+ filesystems/ext4/index
+
Korean translations
-------------------
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
index 04d394a2e06c..49df45f90e8a 100644
--- a/Documentation/iostats.txt
+++ b/Documentation/iostats.txt
@@ -31,6 +31,9 @@ Here are examples of these different formats::
3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
3 1 hda1 35486 38030 38030 38030
+ 4.18+ diskstats:
+ 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
+
On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
@@ -101,6 +104,18 @@ Field 11 -- weighted # of milliseconds spent doing I/Os
last update of this field. This can provide an easy measure of both
I/O completion time and the backlog that may be accumulating.
+Field 12 -- # of discards completed
+ This is the total number of discards completed successfully.
+
+Field 13 -- # of discards merged
+ See the description of field 2
+
+Field 14 -- # of sectors discarded
+ This is the total number of sectors discarded successfully.
+
+Field 15 -- # of milliseconds spent discarding
+ This is the total number of milliseconds spent by all discards (as
+ measured from __make_request() to end_that_request_last()).
To avoid introducing performance bottlenecks, no locks are held while
modifying these counters. This implies that minor inaccuracies may be
diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt
index 6c9c69ec3986..114c7ce7b58d 100644
--- a/Documentation/kbuild/kbuild.txt
+++ b/Documentation/kbuild/kbuild.txt
@@ -50,6 +50,11 @@ LDFLAGS_MODULE
--------------------------------------------------
Additional options used for $(LD) when linking modules.
+KBUILD_KCONFIG
+--------------------------------------------------
+Set the top-level Kconfig file to the value of this environment
+variable. The default name is "Kconfig".
+
KBUILD_VERBOSE
--------------------------------------------------
Set the kbuild verbosity. Can be assigned same values as "V=...".
@@ -88,7 +93,8 @@ In most cases the name of the architecture is the same as the
directory name found in the arch/ directory.
But some architectures such as x86 and sparc have aliases.
x86: i386 for 32 bit, x86_64 for 64 bit
-sparc: sparc for 32 bit, sparc64 for 64 bit
+sh: sh for 32 bit, sh64 for 64 bit
+sparc: sparc32 for 32 bit, sparc64 for 64 bit
CROSS_COMPILE
--------------------------------------------------
@@ -148,15 +154,6 @@ stripped after they are installed. If INSTALL_MOD_STRIP is '1', then
the default option --strip-debug will be used. Otherwise,
INSTALL_MOD_STRIP value will be used as the options to the strip command.
-INSTALL_FW_PATH
---------------------------------------------------
-INSTALL_FW_PATH specifies where to install the firmware blobs.
-The default value is:
-
- $(INSTALL_MOD_PATH)/lib/firmware
-
-The value can be overridden in which case the default value is ignored.
-
INSTALL_HDR_PATH
--------------------------------------------------
INSTALL_HDR_PATH specifies where to install user space headers when
diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt
index 7233118f3a05..68c82914c0f3 100644
--- a/Documentation/kbuild/kconfig.txt
+++ b/Documentation/kbuild/kconfig.txt
@@ -2,9 +2,9 @@ This file contains some assistance for using "make *config".
Use "make help" to list all of the possible configuration targets.
-The xconfig ('qconf') and menuconfig ('mconf') programs also
-have embedded help text. Be sure to check it for navigation,
-search, and other general help text.
+The xconfig ('qconf'), menuconfig ('mconf'), and nconfig ('nconf')
+programs also have embedded help text. Be sure to check that for
+navigation, search, and other general help text.
======================================================================
General
@@ -17,13 +17,16 @@ this happens, using a previously working .config file and running
for you, so you may find that you need to see what NEW kernel
symbols have been introduced.
-To see a list of new config symbols when using "make oldconfig", use
+To see a list of new config symbols, use
cp user/some/old.config .config
make listnewconfig
and the config program will list any new symbols, one per line.
+Alternatively, you can use the brute force method:
+
+ make oldconfig
scripts/diffconfig .config.old .config | less
______________________________________________________________________
@@ -160,7 +163,7 @@ Searching in menuconfig:
This lists all config symbols that contain "hotplug",
e.g., HOTPLUG_CPU, MEMORY_HOTPLUG.
- For search help, enter / followed TAB-TAB-TAB (to highlight
+ For search help, enter / followed by TAB-TAB (to highlight
<Help>) and Enter. This will tell you that you can also use
regular expressions (regexes) in the search string, so if you
are not interested in MEMORY_HOTPLUG, you could try
@@ -203,6 +206,39 @@ Example:
======================================================================
+nconfig
+--------------------------------------------------
+
+nconfig is an alternate text-based configurator. It lists function
+keys across the bottom of the terminal (window) that execute commands.
+You can also just use the corresponding numeric key to execute the
+commands unless you are in a data entry window. E.g., instead of F6
+for Save, you can just press 6.
+
+Use F1 for Global help or F3 for the Short help menu.
+
+Searching in nconfig:
+
+ You can search either in the menu entry "prompt" strings
+ or in the configuration symbols.
+
+ Use / to begin a search through the menu entries. This does
+ not support regular expressions. Use <Down> or <Up> for
+ Next hit and Previous hit, respectively. Use <Esc> to
+ terminate the search mode.
+
+ F8 (SymSearch) searches the configuration symbols for the
+ given string or regular expression (regex).
+
+NCONFIG_MODE
+--------------------------------------------------
+This mode shows all sub-menus in one large tree.
+
+Example:
+ make NCONFIG_MODE=single_menu nconfig
+
+
+======================================================================
xconfig
--------------------------------------------------
@@ -230,8 +266,7 @@ gconfig
Searching in gconfig:
- None (gconfig isn't maintained as well as xconfig or menuconfig);
- however, gconfig does have a few more viewing choices than
- xconfig does.
+ There is no search command in gconfig. However, gconfig does
+ have several different viewing choices, modes, and options.
###
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index cb3b0de83fc6..10f4499e677c 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -80,6 +80,26 @@ After the instruction is single-stepped, Kprobes executes the
"post_handler," if any, that is associated with the kprobe.
Execution then continues with the instruction following the probepoint.
+Changing Execution Path
+-----------------------
+
+Since kprobes can probe into a running kernel code, it can change the
+register set, including instruction pointer. This operation requires
+maximum care, such as keeping the stack frame, recovering the execution
+path etc. Since it operates on a running kernel and needs deep knowledge
+of computer architecture and concurrent computing, you can easily shoot
+your foot.
+
+If you change the instruction pointer (and set up other related
+registers) in pre_handler, you must return !0 so that kprobes stops
+single stepping and just returns to the given address.
+This also means post_handler should not be called anymore.
+
+Note that this operation may be harder on some architectures which use
+TOC (Table of Contents) for function call, since you have to setup a new
+TOC for your function in your module, and recover the old one after
+returning from it.
+
Return Probes
-------------
@@ -262,7 +282,7 @@ is optimized, that modification is ignored. Thus, if you want to
tweak the kernel's execution path, you need to suppress optimization,
using one of the following techniques:
-- Specify an empty function for the kprobe's post_handler or break_handler.
+- Specify an empty function for the kprobe's post_handler.
or
@@ -474,7 +494,7 @@ error occurs during registration, all probes in the array, up to
the bad probe, are safely unregistered before the register_*probes
function returns.
-- kps/rps/jps: an array of pointers to ``*probe`` data structures
+- kps/rps: an array of pointers to ``*probe`` data structures
- num: the number of the array entries.
.. note::
@@ -566,12 +586,11 @@ the same handler) may run concurrently on different CPUs.
Kprobes does not use mutexes or allocate memory except during
registration and unregistration.
-Probe handlers are run with preemption disabled. Depending on the
-architecture and optimization state, handlers may also run with
-interrupts disabled (e.g., kretprobe handlers and optimized kprobe
-handlers run without interrupt disabled on x86/x86-64). In any case,
-your handler should not yield the CPU (e.g., by attempting to acquire
-a semaphore).
+Probe handlers are run with preemption disabled or interrupt disabled,
+which depends on the architecture and optimization state. (e.g.,
+kretprobe handlers and optimized kprobe handlers run without interrupt
+disabled on x86/x86-64). In any case, your handler should not yield
+the CPU (e.g., by attempting to acquire a semaphore, or waiting I/O).
Since a return probe is implemented by replacing the return
address with the trampoline's address, stack backtraces and calls
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index a02d6bbfc9d0..0d8d7ef131e9 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -2179,32 +2179,41 @@ or:
event_indicated = 1;
wake_up_process(event_daemon);
-A write memory barrier is implied by wake_up() and co. if and only if they
-wake something up. The barrier occurs before the task state is cleared, and so
-sits between the STORE to indicate the event and the STORE to set TASK_RUNNING:
+A general memory barrier is executed by wake_up() if it wakes something up.
+If it doesn't wake anything up then a memory barrier may or may not be
+executed; you must not rely on it. The barrier occurs before the task state
+is accessed, in particular, it sits between the STORE to indicate the event
+and the STORE to set TASK_RUNNING:
- CPU 1 CPU 2
+ CPU 1 (Sleeper) CPU 2 (Waker)
=============================== ===============================
set_current_state(); STORE event_indicated
smp_store_mb(); wake_up();
- STORE current->state <write barrier>
- <general barrier> STORE current->state
- LOAD event_indicated
+ STORE current->state ...
+ <general barrier> <general barrier>
+ LOAD event_indicated if ((LOAD task->state) & TASK_NORMAL)
+ STORE task->state
-To repeat, this write memory barrier is present if and only if something
-is actually awakened. To see this, consider the following sequence of
-events, where X and Y are both initially zero:
+where "task" is the thread being woken up and it equals CPU 1's "current".
+
+To repeat, a general memory barrier is guaranteed to be executed by wake_up()
+if something is actually awakened, but otherwise there is no such guarantee.
+To see this, consider the following sequence of events, where X and Y are both
+initially zero:
CPU 1 CPU 2
=============================== ===============================
- X = 1; STORE event_indicated
+ X = 1; Y = 1;
smp_mb(); wake_up();
- Y = 1; wait_event(wq, Y == 1);
- wake_up(); load from Y sees 1, no memory barrier
- load from X might see 0
+ LOAD Y LOAD X
+
+If a wakeup does occur, one (at least) of the two loads must see 1. If, on
+the other hand, a wakeup does not occur, both loads might see 0.
-In contrast, if a wakeup does occur, CPU 2's load from X would be guaranteed
-to see 1.
+wake_up_process() always executes a general memory barrier. The barrier again
+occurs before the task state is accessed. In particular, if the wake_up() in
+the previous snippet were replaced by a call to wake_up_process() then one of
+the two loads would be guaranteed to see 1.
The available waker functions include:
@@ -2224,6 +2233,8 @@ The available waker functions include:
wake_up_poll();
wake_up_process();
+In terms of memory ordering, these functions all provide the same guarantees of
+a wake_up() (or stronger).
[!] Note that the memory barriers implied by the sleeper and the waker do _not_
order multiple stores before the wake-up with respect to loads of those stored
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index c13214d073a4..d3e5dd26db12 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -1490,7 +1490,7 @@ To remove an ARP target:
To configure the interval between learning packet transmits:
# echo 12 > /sys/class/net/bond0/bonding/lp_interval
- NOTE: the lp_inteval is the number of seconds between instances where
+ NOTE: the lp_interval is the number of seconds between instances where
the bonding driver sends learning packets to each slaves peer switch. The
default interval is 1 second.
diff --git a/Documentation/networking/dpaa2/overview.rst b/Documentation/networking/dpaa2/overview.rst
index 79fede4447d6..d638b5a8aadd 100644
--- a/Documentation/networking/dpaa2/overview.rst
+++ b/Documentation/networking/dpaa2/overview.rst
@@ -1,5 +1,6 @@
.. include:: <isonum.txt>
+=========================================================
DPAA2 (Data Path Acceleration Architecture Gen2) Overview
=========================================================
diff --git a/Documentation/networking/e100.rst b/Documentation/networking/e100.rst
index 9708f5fa76de..f81111eba9c5 100644
--- a/Documentation/networking/e100.rst
+++ b/Documentation/networking/e100.rst
@@ -47,41 +47,45 @@ Driver Configuration Parameters
The default value for each parameter is generally the recommended setting,
unless otherwise noted.
-Rx Descriptors: Number of receive descriptors. A receive descriptor is a data
+Rx Descriptors:
+ Number of receive descriptors. A receive descriptor is a data
structure that describes a receive buffer and its attributes to the network
controller. The data in the descriptor is used by the controller to write
data from the controller to host memory. In the 3.x.x driver the valid range
for this parameter is 64-256. The default value is 256. This parameter can be
changed using the command::
- ethtool -G eth? rx n
+ ethtool -G eth? rx n
Where n is the number of desired Rx descriptors.
-Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a data
+Tx Descriptors:
+ Number of transmit descriptors. A transmit descriptor is a data
structure that describes a transmit buffer and its attributes to the network
controller. The data in the descriptor is used by the controller to read
data from the host memory to the controller. In the 3.x.x driver the valid
range for this parameter is 64-256. The default value is 128. This parameter
can be changed using the command::
- ethtool -G eth? tx n
+ ethtool -G eth? tx n
Where n is the number of desired Tx descriptors.
-Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by
+Speed/Duplex:
+ The driver auto-negotiates the link speed and duplex settings by
default. The ethtool utility can be used as follows to force speed/duplex.::
- ethtool -s eth? autoneg off speed {10|100} duplex {full|half}
+ ethtool -s eth? autoneg off speed {10|100} duplex {full|half}
NOTE: setting the speed/duplex to incorrect values will cause the link to
fail.
-Event Log Message Level: The driver uses the message level flag to log events
+Event Log Message Level:
+ The driver uses the message level flag to log events
to syslog. The message level can be set at driver load time. It can also be
set using the command::
- ethtool -s eth? msglvl n
+ ethtool -s eth? msglvl n
Additional Configurations
@@ -92,7 +96,7 @@ Configuring the Driver on Different Distributions
Configuring a network driver to load properly when the system is started
is distribution dependent. Typically, the configuration process involves
-adding an alias line to /etc/modprobe.d/*.conf as well as editing other
+adding an alias line to `/etc/modprobe.d/*.conf` as well as editing other
system startup scripts and/or configuration files. Many popular Linux
distributions ship with tools to make these changes for you. To learn
the proper way to configure a network device for your system, refer to
@@ -160,7 +164,10 @@ This results in unbalanced receive traffic.
If you have multiple interfaces in a server, either turn on ARP
filtering by
-(1) entering:: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
+(1) entering::
+
+ echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
+
(this only works if your kernel's version is higher than 2.4.5), or
(2) installing the interfaces in separate broadcast domains (either
diff --git a/Documentation/networking/e1000.rst b/Documentation/networking/e1000.rst
index 144b87eef153..f10dd4086921 100644
--- a/Documentation/networking/e1000.rst
+++ b/Documentation/networking/e1000.rst
@@ -34,7 +34,8 @@ Command Line Parameters
The default value for each parameter is generally the recommended setting,
unless otherwise noted.
-NOTES: For more information about the AutoNeg, Duplex, and Speed
+NOTES:
+ For more information about the AutoNeg, Duplex, and Speed
parameters, see the "Speed and Duplex Configuration" section in
this document.
@@ -45,22 +46,27 @@ NOTES: For more information about the AutoNeg, Duplex, and Speed
AutoNeg
-------
+
(Supported only on adapters with copper connections)
-Valid Range: 0x01-0x0F, 0x20-0x2F
-Default Value: 0x2F
+
+:Valid Range: 0x01-0x0F, 0x20-0x2F
+:Default Value: 0x2F
This parameter is a bit-mask that specifies the speed and duplex settings
advertised by the adapter. When this parameter is used, the Speed and
Duplex parameters must not be specified.
-NOTE: Refer to the Speed and Duplex section of this readme for more
+NOTE:
+ Refer to the Speed and Duplex section of this readme for more
information on the AutoNeg parameter.
Duplex
------
+
(Supported only on adapters with copper connections)
-Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full)
-Default Value: 0
+
+:Valid Range: 0-2 (0=auto-negotiate, 1=half, 2=full)
+:Default Value: 0
This defines the direction in which data is allowed to flow. Can be
either one or two-directional. If both Duplex and the link partner are
@@ -70,18 +76,22 @@ duplex.
FlowControl
-----------
-Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx)
-Default Value: Reads flow control settings from the EEPROM
+
+:Valid Range: 0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx)
+:Default Value: Reads flow control settings from the EEPROM
This parameter controls the automatic generation(Tx) and response(Rx)
to Ethernet PAUSE frames.
InterruptThrottleRate
---------------------
+
(not supported on Intel(R) 82542, 82543 or 82544-based adapters)
-Valid Range: 0,1,3,4,100-100000 (0=off, 1=dynamic, 3=dynamic conservative,
- 4=simplified balancing)
-Default Value: 3
+
+:Valid Range:
+ 0,1,3,4,100-100000 (0=off, 1=dynamic, 3=dynamic conservative,
+ 4=simplified balancing)
+:Default Value: 3
The driver can limit the amount of interrupts per second that the adapter
will generate for incoming packets. It does this by writing a value to the
@@ -135,13 +145,15 @@ Setting InterruptThrottleRate to 0 turns off any interrupt moderation
and may improve small packet latency, but is generally not suitable
for bulk throughput traffic.
-NOTE: InterruptThrottleRate takes precedence over the TxAbsIntDelay and
+NOTE:
+ InterruptThrottleRate takes precedence over the TxAbsIntDelay and
RxAbsIntDelay parameters. In other words, minimizing the receive
and/or transmit absolute delays does not force the controller to
generate more interrupts than what the Interrupt Throttle Rate
allows.
-CAUTION: If you are using the Intel(R) PRO/1000 CT Network Connection
+CAUTION:
+ If you are using the Intel(R) PRO/1000 CT Network Connection
(controller 82547), setting InterruptThrottleRate to a value
greater than 75,000, may hang (stop transmitting) adapters
under certain network conditions. If this occurs a NETDEV
@@ -151,7 +163,8 @@ CAUTION: If you are using the Intel(R) PRO/1000 CT Network Connection
hang, ensure that InterruptThrottleRate is set no greater
than 75,000 and is not set to 0.
-NOTE: When e1000 is loaded with default settings and multiple adapters
+NOTE:
+ When e1000 is loaded with default settings and multiple adapters
are in use simultaneously, the CPU utilization may increase non-
linearly. In order to limit the CPU utilization without impacting
the overall throughput, we recommend that you load the driver as
@@ -168,9 +181,11 @@ NOTE: When e1000 is loaded with default settings and multiple adapters
RxDescriptors
-------------
-Valid Range: 48-256 for 82542 and 82543-based adapters
- 48-4096 for all other supported adapters
-Default Value: 256
+
+:Valid Range:
+ - 48-256 for 82542 and 82543-based adapters
+ - 48-4096 for all other supported adapters
+:Default Value: 256
This value specifies the number of receive buffer descriptors allocated
by the driver. Increasing this value allows the driver to buffer more
@@ -180,15 +195,17 @@ Each descriptor is 16 bytes. A receive buffer is also allocated for each
descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending
on the MTU setting. The maximum MTU size is 16110.
-NOTE: MTU designates the frame size. It only needs to be set for Jumbo
+NOTE:
+ MTU designates the frame size. It only needs to be set for Jumbo
Frames. Depending on the available system resources, the request
for a higher number of receive descriptors may be denied. In this
case, use a lower number.
RxIntDelay
----------
-Valid Range: 0-65535 (0=off)
-Default Value: 0
+
+:Valid Range: 0-65535 (0=off)
+:Default Value: 0
This value delays the generation of receive interrupts in units of 1.024
microseconds. Receive interrupt reduction can improve CPU efficiency if
@@ -198,7 +215,8 @@ of TCP traffic. If the system is reporting dropped receives, this value
may be set too high, causing the driver to run out of available receive
descriptors.
-CAUTION: When setting RxIntDelay to a value other than 0, adapters may
+CAUTION:
+ When setting RxIntDelay to a value other than 0, adapters may
hang (stop transmitting) under certain network conditions. If
this occurs a NETDEV WATCHDOG message is logged in the system
event log. In addition, the controller is automatically reset,
@@ -207,9 +225,11 @@ CAUTION: When setting RxIntDelay to a value other than 0, adapters may
RxAbsIntDelay
-------------
+
(This parameter is supported only on 82540, 82545 and later adapters.)
-Valid Range: 0-65535 (0=off)
-Default Value: 128
+
+:Valid Range: 0-65535 (0=off)
+:Default Value: 128
This value, in units of 1.024 microseconds, limits the delay in which a
receive interrupt is generated. Useful only if RxIntDelay is non-zero,
@@ -220,9 +240,11 @@ conditions.
Speed
-----
+
(This parameter is supported only on adapters with copper connections.)
-Valid Settings: 0, 10, 100, 1000
-Default Value: 0 (auto-negotiate at all supported speeds)
+
+:Valid Settings: 0, 10, 100, 1000
+:Default Value: 0 (auto-negotiate at all supported speeds)
Speed forces the line speed to the specified value in megabits per second
(Mbps). If this parameter is not specified or is set to 0 and the link
@@ -231,22 +253,26 @@ speed. Duplex should also be set when Speed is set to either 10 or 100.
TxDescriptors
-------------
-Valid Range: 48-256 for 82542 and 82543-based adapters
- 48-4096 for all other supported adapters
-Default Value: 256
+
+:Valid Range:
+ - 48-256 for 82542 and 82543-based adapters
+ - 48-4096 for all other supported adapters
+:Default Value: 256
This value is the number of transmit descriptors allocated by the driver.
Increasing this value allows the driver to queue more transmits. Each
descriptor is 16 bytes.
-NOTE: Depending on the available system resources, the request for a
+NOTE:
+ Depending on the available system resources, the request for a
higher number of transmit descriptors may be denied. In this case,
use a lower number.
TxIntDelay
----------
-Valid Range: 0-65535 (0=off)
-Default Value: 8
+
+:Valid Range: 0-65535 (0=off)
+:Default Value: 8
This value delays the generation of transmit interrupts in units of
1.024 microseconds. Transmit interrupt reduction can improve CPU
@@ -256,9 +282,11 @@ causing the driver to run out of available transmit descriptors.
TxAbsIntDelay
-------------
+
(This parameter is supported only on 82540, 82545 and later adapters.)
-Valid Range: 0-65535 (0=off)
-Default Value: 32
+
+:Valid Range: 0-65535 (0=off)
+:Default Value: 32
This value, in units of 1.024 microseconds, limits the delay in which a
transmit interrupt is generated. Useful only if TxIntDelay is non-zero,
@@ -269,18 +297,21 @@ network conditions.
XsumRX
------
+
(This parameter is NOT supported on the 82542-based adapter.)
-Valid Range: 0-1
-Default Value: 1
+
+:Valid Range: 0-1
+:Default Value: 1
A value of '1' indicates that the driver should enable IP checksum
offload for received packets (both UDP and TCP) to the adapter hardware.
Copybreak
---------
-Valid Range: 0-xxxxxxx (0=off)
-Default Value: 256
-Usage: modprobe e1000.ko copybreak=128
+
+:Valid Range: 0-xxxxxxx (0=off)
+:Default Value: 256
+:Usage: modprobe e1000.ko copybreak=128
Driver copies all packets below or equaling this size to a fresh RX
buffer before handing it up the stack.
@@ -292,8 +323,9 @@ it is also available during runtime at
SmartPowerDownEnable
--------------------
-Valid Range: 0-1
-Default Value: 0 (disabled)
+
+:Valid Range: 0-1
+:Default Value: 0 (disabled)
Allows PHY to turn off in lower power states. The user can turn off
this parameter in supported chipsets.
@@ -309,14 +341,14 @@ fiber interface board only links at 1000 Mbps full-duplex.
For copper-based boards, the keywords interact as follows:
- The default operation is auto-negotiate. The board advertises all
+- The default operation is auto-negotiate. The board advertises all
supported speed and duplex combinations, and it links at the highest
common speed and duplex mode IF the link partner is set to auto-negotiate.
- If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps
+- If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps
is advertised (The 1000BaseT spec requires auto-negotiation.)
- If Speed = 10 or 100, then both Speed and Duplex should be set. Auto-
+- If Speed = 10 or 100, then both Speed and Duplex should be set. Auto-
negotiation is disabled, and the AutoNeg parameter is ignored. Partner
SHOULD also be forced.
@@ -328,13 +360,15 @@ process.
The parameter may be specified as either a decimal or hexadecimal value as
determined by the bitmap below.
+============== ====== ====== ======= ======= ====== ====== ======= ======
Bit position 7 6 5 4 3 2 1 0
Decimal Value 128 64 32 16 8 4 2 1
Hex value 80 40 20 10 8 4 2 1
Speed (Mbps) N/A N/A 1000 N/A 100 100 10 10
Duplex Full Full Half Full Half
+============== ====== ====== ======= ======= ====== ====== ======= ======
-Some examples of using AutoNeg:
+Some examples of using AutoNeg::
modprobe e1000 AutoNeg=0x01 (Restricts autonegotiation to 10 Half)
modprobe e1000 AutoNeg=1 (Same as above)
@@ -357,56 +391,59 @@ Additional Configurations
Jumbo Frames
------------
-Jumbo Frames support is enabled by changing the MTU to a value larger
-than the default of 1500. Use the ifconfig command to increase the MTU
-size. For example::
+
+ Jumbo Frames support is enabled by changing the MTU to a value larger than
+ the default of 1500. Use the ifconfig command to increase the MTU size.
+ For example::
ifconfig eth<x> mtu 9000 up
-This setting is not saved across reboots. It can be made permanent if
-you add::
+ This setting is not saved across reboots. It can be made permanent if
+ you add::
MTU=9000
-to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>. This example
-applies to the Red Hat distributions; other distributions may store this
-setting in a different location.
+ to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>. This example
+ applies to the Red Hat distributions; other distributions may store this
+ setting in a different location.
+
+Notes:
+ Degradation in throughput performance may be observed in some Jumbo frames
+ environments. If this is observed, increasing the application's socket buffer
+ size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values may help.
+ See the specific application manual and /usr/src/linux*/Documentation/
+ networking/ip-sysctl.txt for more details.
-Notes: Degradation in throughput performance may be observed in some
-Jumbo frames environments. If this is observed, increasing the
-application's socket buffer size and/or increasing the
-/proc/sys/net/ipv4/tcp_*mem entry values may help. See the specific
-application manual and /usr/src/linux*/Documentation/
-networking/ip-sysctl.txt for more details.
+ - The maximum MTU setting for Jumbo Frames is 16110. This value coincides
+ with the maximum Jumbo Frames size of 16128.
-- The maximum MTU setting for Jumbo Frames is 16110. This value
- coincides with the maximum Jumbo Frames size of 16128.
+ - Using Jumbo frames at 10 or 100 Mbps is not supported and may result in
+ poor performance or loss of link.
-- Using Jumbo frames at 10 or 100 Mbps is not supported and may result
- in poor performance or loss of link.
+ - Adapters based on the Intel(R) 82542 and 82573V/E controller do not
+ support Jumbo Frames. These correspond to the following product names::
-- Adapters based on the Intel(R) 82542 and 82573V/E controller do not
- support Jumbo Frames. These correspond to the following product names:
- Intel(R) PRO/1000 Gigabit Server Adapter Intel(R) PRO/1000 PM Network
- Connection
+ Intel(R) PRO/1000 Gigabit Server Adapter
+ Intel(R) PRO/1000 PM Network Connection
ethtool
-------
-The driver utilizes the ethtool interface for driver configuration and
-diagnostics, as well as displaying statistical information. The ethtool
-version 1.6 or later is required for this functionality.
-The latest release of ethtool can be found from
-https://www.kernel.org/pub/software/network/ethtool/
+ The driver utilizes the ethtool interface for driver configuration and
+ diagnostics, as well as displaying statistical information. The ethtool
+ version 1.6 or later is required for this functionality.
+
+ The latest release of ethtool can be found from
+ https://www.kernel.org/pub/software/network/ethtool/
Enabling Wake on LAN* (WoL)
---------------------------
-WoL is configured through the ethtool* utility.
-WoL will be enabled on the system during the next shut down or reboot.
-For this driver version, in order to enable WoL, the e1000 driver must be
-loaded when shutting down or rebooting the system.
+ WoL is configured through the ethtool* utility.
+ WoL will be enabled on the system during the next shut down or reboot.
+ For this driver version, in order to enable WoL, the e1000 driver must be
+ loaded when shutting down or rebooting the system.
Support
=======
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 921739d00f69..7f01fb1c1084 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -1891,22 +1891,22 @@ Mandatory 배리어들은 SMP 시스템에서도 UP 시스템에서도 SMP 효
/* 소유권을 수정 */
desc->status = DEVICE_OWN;
- /* MMIO 를 통해 디바이스에 공지를 하기 전에 메모리를 동기화 */
- wmb();
-
/* 업데이트된 디스크립터의 디바이스에 공지 */
writel(DESC_NOTIFY, doorbell);
}
dma_rmb() 는 디스크립터로부터 데이터를 읽어오기 전에 디바이스가 소유권을
- 내놓았음을 보장하게 하고, dma_wmb() 는 디바이스가 자신이 소유권을 다시
- 가졌음을 보기 전에 디스크립터에 데이터가 쓰였음을 보장합니다. wmb() 는
- 캐시 일관성이 없는 (cache incoherent) MMIO 영역에 쓰기를 시도하기 전에
- 캐시 일관성이 있는 메모리 (cache coherent memory) 쓰기가 완료되었음을
- 보장해주기 위해 필요합니다.
-
- consistent memory 에 대한 자세한 내용을 위해선 Documentation/DMA-API.txt
- 문서를 참고하세요.
+ 내려놓았을 것을 보장하고, dma_wmb() 는 디바이스가 자신이 소유권을 다시
+ 가졌음을 보기 전에 디스크립터에 데이터가 쓰였을 것을 보장합니다. 참고로,
+ writel() 을 사용하면 캐시 일관성이 있는 메모리 (cache coherent memory)
+ 쓰기가 MMIO 영역에의 쓰기 전에 완료되었을 것을 보장하므로 writel() 앞에
+ wmb() 를 실행할 필요가 없음을 알아두시기 바랍니다. writel() 보다 비용이
+ 저렴한 writel_relaxed() 는 이런 보장을 제공하지 않으므로 여기선 사용되지
+ 않아야 합니다.
+
+ writel_relaxed() 와 같은 완화된 I/O 접근자들에 대한 자세한 내용을 위해서는
+ "커널 I/O 배리어의 효과" 섹션을, consistent memory 에 대한 자세한 내용을
+ 위해선 Documentation/DMA-API.txt 문서를 참고하세요.
MMIO 쓰기 배리어
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d10944e619d3..cb8db4f9d097 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4391,6 +4391,22 @@ all such vmexits.
Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
+7.14 KVM_CAP_S390_HPAGE_1M
+
+Architectures: s390
+Parameters: none
+Returns: 0 on success, -EINVAL if hpage module parameter was not set
+ or cmma is enabled
+
+With this capability the KVM support for memory backing with 1m pages
+through hugetlbfs can be enabled for a VM. After the capability is
+enabled, cmma can't be enabled anymore and pfmfi and the storage key
+interpretation are disabled. If cmma has already been enabled or the
+hpage module parameter is not set to 1, -EINVAL is returned.
+
+While it is generally possible to create a huge page backed VM without
+this capability, the VM will not be able to run.
+
8. Other capabilities.
----------------------
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index a16aa2113840..f662d3c530e5 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -29,7 +29,11 @@ mount options are:
L2 and L3 CDP are controlled seperately.
RDT features are orthogonal. A particular system may support only
-monitoring, only control, or both monitoring and control.
+monitoring, only control, or both monitoring and control. Cache
+pseudo-locking is a unique way of using cache control to "pin" or
+"lock" data in the cache. Details can be found in
+"Cache Pseudo-Locking".
+
The mount succeeds if either of allocation or monitoring is present, but
only those files and directories supported by the system will be created.
@@ -65,6 +69,29 @@ related to allocation:
some platforms support devices that have their
own settings for cache use which can over-ride
these bits.
+"bit_usage": Annotated capacity bitmasks showing how all
+ instances of the resource are used. The legend is:
+ "0" - Corresponding region is unused. When the system's
+ resources have been allocated and a "0" is found
+ in "bit_usage" it is a sign that resources are
+ wasted.
+ "H" - Corresponding region is used by hardware only
+ but available for software use. If a resource
+ has bits set in "shareable_bits" but not all
+ of these bits appear in the resource groups'
+ schematas then the bits appearing in
+ "shareable_bits" but no resource group will
+ be marked as "H".
+ "X" - Corresponding region is available for sharing and
+ used by hardware and software. These are the
+ bits that appear in "shareable_bits" as
+ well as a resource group's allocation.
+ "S" - Corresponding region is used by software
+ and available for sharing.
+ "E" - Corresponding region is used exclusively by
+ one resource group. No sharing allowed.
+ "P" - Corresponding region is pseudo-locked. No
+ sharing allowed.
Memory bandwitdh(MB) subdirectory contains the following files
with respect to allocation:
@@ -151,6 +178,9 @@ All groups contain the following files:
CPUs to/from this group. As with the tasks file a hierarchy is
maintained where MON groups may only include CPUs owned by the
parent CTRL_MON group.
+ When the resouce group is in pseudo-locked mode this file will
+ only be readable, reflecting the CPUs associated with the
+ pseudo-locked region.
"cpus_list":
@@ -163,6 +193,21 @@ When control is enabled all CTRL_MON groups will also contain:
A list of all the resources available to this group.
Each resource has its own line and format - see below for details.
+"size":
+ Mirrors the display of the "schemata" file to display the size in
+ bytes of each allocation instead of the bits representing the
+ allocation.
+
+"mode":
+ The "mode" of the resource group dictates the sharing of its
+ allocations. A "shareable" resource group allows sharing of its
+ allocations while an "exclusive" resource group does not. A
+ cache pseudo-locked region is created by first writing
+ "pseudo-locksetup" to the "mode" file before writing the cache
+ pseudo-locked region's schemata to the resource group's "schemata"
+ file. On successful pseudo-locked region creation the mode will
+ automatically change to "pseudo-locked".
+
When monitoring is enabled all MON groups will also contain:
"mon_data":
@@ -379,6 +424,170 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
+Cache Pseudo-Locking
+--------------------
+CAT enables a user to specify the amount of cache space that an
+application can fill. Cache pseudo-locking builds on the fact that a
+CPU can still read and write data pre-allocated outside its current
+allocated area on a cache hit. With cache pseudo-locking, data can be
+preloaded into a reserved portion of cache that no application can
+fill, and from that point on will only serve cache hits. The cache
+pseudo-locked memory is made accessible to user space where an
+application can map it into its virtual address space and thus have
+a region of memory with reduced average read latency.
+
+The creation of a cache pseudo-locked region is triggered by a request
+from the user to do so that is accompanied by a schemata of the region
+to be pseudo-locked. The cache pseudo-locked region is created as follows:
+- Create a CAT allocation CLOSNEW with a CBM matching the schemata
+ from the user of the cache region that will contain the pseudo-locked
+ memory. This region must not overlap with any current CAT allocation/CLOS
+ on the system and no future overlap with this cache region is allowed
+ while the pseudo-locked region exists.
+- Create a contiguous region of memory of the same size as the cache
+ region.
+- Flush the cache, disable hardware prefetchers, disable preemption.
+- Make CLOSNEW the active CLOS and touch the allocated memory to load
+ it into the cache.
+- Set the previous CLOS as active.
+- At this point the closid CLOSNEW can be released - the cache
+ pseudo-locked region is protected as long as its CBM does not appear in
+ any CAT allocation. Even though the cache pseudo-locked region will from
+ this point on not appear in any CBM of any CLOS an application running with
+ any CLOS will be able to access the memory in the pseudo-locked region since
+ the region continues to serve cache hits.
+- The contiguous region of memory loaded into the cache is exposed to
+ user-space as a character device.
+
+Cache pseudo-locking increases the probability that data will remain
+in the cache via carefully configuring the CAT feature and controlling
+application behavior. There is no guarantee that data is placed in
+cache. Instructions like INVD, WBINVD, CLFLUSH, etc. can still evict
+“locked” data from cache. Power management C-states may shrink or
+power off cache. Deeper C-states will automatically be restricted on
+pseudo-locked region creation.
+
+It is required that an application using a pseudo-locked region runs
+with affinity to the cores (or a subset of the cores) associated
+with the cache on which the pseudo-locked region resides. A sanity check
+within the code will not allow an application to map pseudo-locked memory
+unless it runs with affinity to cores associated with the cache on which the
+pseudo-locked region resides. The sanity check is only done during the
+initial mmap() handling, there is no enforcement afterwards and the
+application self needs to ensure it remains affine to the correct cores.
+
+Pseudo-locking is accomplished in two stages:
+1) During the first stage the system administrator allocates a portion
+ of cache that should be dedicated to pseudo-locking. At this time an
+ equivalent portion of memory is allocated, loaded into allocated
+ cache portion, and exposed as a character device.
+2) During the second stage a user-space application maps (mmap()) the
+ pseudo-locked memory into its address space.
+
+Cache Pseudo-Locking Interface
+------------------------------
+A pseudo-locked region is created using the resctrl interface as follows:
+
+1) Create a new resource group by creating a new directory in /sys/fs/resctrl.
+2) Change the new resource group's mode to "pseudo-locksetup" by writing
+ "pseudo-locksetup" to the "mode" file.
+3) Write the schemata of the pseudo-locked region to the "schemata" file. All
+ bits within the schemata should be "unused" according to the "bit_usage"
+ file.
+
+On successful pseudo-locked region creation the "mode" file will contain
+"pseudo-locked" and a new character device with the same name as the resource
+group will exist in /dev/pseudo_lock. This character device can be mmap()'ed
+by user space in order to obtain access to the pseudo-locked memory region.
+
+An example of cache pseudo-locked region creation and usage can be found below.
+
+Cache Pseudo-Locking Debugging Interface
+---------------------------------------
+The pseudo-locking debugging interface is enabled by default (if
+CONFIG_DEBUG_FS is enabled) and can be found in /sys/kernel/debug/resctrl.
+
+There is no explicit way for the kernel to test if a provided memory
+location is present in the cache. The pseudo-locking debugging interface uses
+the tracing infrastructure to provide two ways to measure cache residency of
+the pseudo-locked region:
+1) Memory access latency using the pseudo_lock_mem_latency tracepoint. Data
+ from these measurements are best visualized using a hist trigger (see
+ example below). In this test the pseudo-locked region is traversed at
+ a stride of 32 bytes while hardware prefetchers and preemption
+ are disabled. This also provides a substitute visualization of cache
+ hits and misses.
+2) Cache hit and miss measurements using model specific precision counters if
+ available. Depending on the levels of cache on the system the pseudo_lock_l2
+ and pseudo_lock_l3 tracepoints are available.
+ WARNING: triggering this measurement uses from two (for just L2
+ measurements) to four (for L2 and L3 measurements) precision counters on
+ the system, if any other measurements are in progress the counters and
+ their corresponding event registers will be clobbered.
+
+When a pseudo-locked region is created a new debugfs directory is created for
+it in debugfs as /sys/kernel/debug/resctrl/<newdir>. A single
+write-only file, pseudo_lock_measure, is present in this directory. The
+measurement on the pseudo-locked region depends on the number, 1 or 2,
+written to this debugfs file. Since the measurements are recorded with the
+tracing infrastructure the relevant tracepoints need to be enabled before the
+measurement is triggered.
+
+Example of latency debugging interface:
+In this example a pseudo-locked region named "newlock" was created. Here is
+how we can measure the latency in cycles of reading from this region and
+visualize this data with a histogram that is available if CONFIG_HIST_TRIGGERS
+is set:
+# :> /sys/kernel/debug/tracing/trace
+# echo 'hist:keys=latency' > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/trigger
+# echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable
+# echo 1 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
+# echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable
+# cat /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/hist
+
+# event histogram
+#
+# trigger info: hist:keys=latency:vals=hitcount:sort=hitcount:size=2048 [active]
+#
+
+{ latency: 456 } hitcount: 1
+{ latency: 50 } hitcount: 83
+{ latency: 36 } hitcount: 96
+{ latency: 44 } hitcount: 174
+{ latency: 48 } hitcount: 195
+{ latency: 46 } hitcount: 262
+{ latency: 42 } hitcount: 693
+{ latency: 40 } hitcount: 3204
+{ latency: 38 } hitcount: 3484
+
+Totals:
+ Hits: 8192
+ Entries: 9
+ Dropped: 0
+
+Example of cache hits/misses debugging:
+In this example a pseudo-locked region named "newlock" was created on the L2
+cache of a platform. Here is how we can obtain details of the cache hits
+and misses using the platform's precision counters.
+
+# :> /sys/kernel/debug/tracing/trace
+# echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable
+# echo 2 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
+# echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable
+# cat /sys/kernel/debug/tracing/trace
+
+# tracer: nop
+#
+# _-----=> irqs-off
+# / _----=> need-resched
+# | / _---=> hardirq/softirq
+# || / _--=> preempt-depth
+# ||| / delay
+# TASK-PID CPU# |||| TIMESTAMP FUNCTION
+# | | | |||| | |
+ pseudo_lock_mea-1672 [002] .... 3132.860500: pseudo_lock_l2: hits=4097 miss=0
+
+
Examples for RDT allocation usage:
Example 1
@@ -502,7 +711,172 @@ siblings and only the real time threads are scheduled on the cores 4-7.
# echo F0 > p0/cpus
-4) Locking between applications
+Example 4
+---------
+
+The resource groups in previous examples were all in the default "shareable"
+mode allowing sharing of their cache allocations. If one resource group
+configures a cache allocation then nothing prevents another resource group
+to overlap with that allocation.
+
+In this example a new exclusive resource group will be created on a L2 CAT
+system with two L2 cache instances that can be configured with an 8-bit
+capacity bitmask. The new exclusive resource group will be configured to use
+25% of each cache instance.
+
+# mount -t resctrl resctrl /sys/fs/resctrl/
+# cd /sys/fs/resctrl
+
+First, we observe that the default group is configured to allocate to all L2
+cache:
+
+# cat schemata
+L2:0=ff;1=ff
+
+We could attempt to create the new resource group at this point, but it will
+fail because of the overlap with the schemata of the default group:
+# mkdir p0
+# echo 'L2:0=0x3;1=0x3' > p0/schemata
+# cat p0/mode
+shareable
+# echo exclusive > p0/mode
+-sh: echo: write error: Invalid argument
+# cat info/last_cmd_status
+schemata overlaps
+
+To ensure that there is no overlap with another resource group the default
+resource group's schemata has to change, making it possible for the new
+resource group to become exclusive.
+# echo 'L2:0=0xfc;1=0xfc' > schemata
+# echo exclusive > p0/mode
+# grep . p0/*
+p0/cpus:0
+p0/mode:exclusive
+p0/schemata:L2:0=03;1=03
+p0/size:L2:0=262144;1=262144
+
+A new resource group will on creation not overlap with an exclusive resource
+group:
+# mkdir p1
+# grep . p1/*
+p1/cpus:0
+p1/mode:shareable
+p1/schemata:L2:0=fc;1=fc
+p1/size:L2:0=786432;1=786432
+
+The bit_usage will reflect how the cache is used:
+# cat info/L2/bit_usage
+0=SSSSSSEE;1=SSSSSSEE
+
+A resource group cannot be forced to overlap with an exclusive resource group:
+# echo 'L2:0=0x1;1=0x1' > p1/schemata
+-sh: echo: write error: Invalid argument
+# cat info/last_cmd_status
+overlaps with exclusive group
+
+Example of Cache Pseudo-Locking
+-------------------------------
+Lock portion of L2 cache from cache id 1 using CBM 0x3. Pseudo-locked
+region is exposed at /dev/pseudo_lock/newlock that can be provided to
+application for argument to mmap().
+
+# mount -t resctrl resctrl /sys/fs/resctrl/
+# cd /sys/fs/resctrl
+
+Ensure that there are bits available that can be pseudo-locked, since only
+unused bits can be pseudo-locked the bits to be pseudo-locked needs to be
+removed from the default resource group's schemata:
+# cat info/L2/bit_usage
+0=SSSSSSSS;1=SSSSSSSS
+# echo 'L2:1=0xfc' > schemata
+# cat info/L2/bit_usage
+0=SSSSSSSS;1=SSSSSS00
+
+Create a new resource group that will be associated with the pseudo-locked
+region, indicate that it will be used for a pseudo-locked region, and
+configure the requested pseudo-locked region capacity bitmask:
+
+# mkdir newlock
+# echo pseudo-locksetup > newlock/mode
+# echo 'L2:1=0x3' > newlock/schemata
+
+On success the resource group's mode will change to pseudo-locked, the
+bit_usage will reflect the pseudo-locked region, and the character device
+exposing the pseudo-locked region will exist:
+
+# cat newlock/mode
+pseudo-locked
+# cat info/L2/bit_usage
+0=SSSSSSSS;1=SSSSSSPP
+# ls -l /dev/pseudo_lock/newlock
+crw------- 1 root root 243, 0 Apr 3 05:01 /dev/pseudo_lock/newlock
+
+/*
+ * Example code to access one page of pseudo-locked cache region
+ * from user space.
+ */
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+/*
+ * It is required that the application runs with affinity to only
+ * cores associated with the pseudo-locked region. Here the cpu
+ * is hardcoded for convenience of example.
+ */
+static int cpuid = 2;
+
+int main(int argc, char *argv[])
+{
+ cpu_set_t cpuset;
+ long page_size;
+ void *mapping;
+ int dev_fd;
+ int ret;
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpuid, &cpuset);
+ ret = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+ if (ret < 0) {
+ perror("sched_setaffinity");
+ exit(EXIT_FAILURE);
+ }
+
+ dev_fd = open("/dev/pseudo_lock/newlock", O_RDWR);
+ if (dev_fd < 0) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+
+ mapping = mmap(0, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ dev_fd, 0);
+ if (mapping == MAP_FAILED) {
+ perror("mmap");
+ close(dev_fd);
+ exit(EXIT_FAILURE);
+ }
+
+ /* Application interacts with pseudo-locked memory @mapping */
+
+ ret = munmap(mapping, page_size);
+ if (ret < 0) {
+ perror("munmap");
+ close(dev_fd);
+ exit(EXIT_FAILURE);
+ }
+
+ close(dev_fd);
+ exit(EXIT_SUCCESS);
+}
+
+Locking between applications
+----------------------------
Certain operations on the resctrl filesystem, composed of read/writes
to/from multiple files, must be atomic.
@@ -510,7 +884,7 @@ to/from multiple files, must be atomic.
As an example, the allocation of an exclusive reservation of L3 cache
involves:
- 1. Read the cbmmasks from each directory
+ 1. Read the cbmmasks from each directory or the per-resource "bit_usage"
2. Find a contiguous set of bits in the global CBM bitmask that is clear
in any of the directory cbmmasks
3. Create a new directory
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 8d109ef67ab6..ad6d2a80cf05 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -92,9 +92,7 @@ APICs
Timing
notsc
- Don't use the CPU time stamp counter to read the wall time.
- This can be used to work around timing problems on multiprocessor systems
- with not properly synchronized CPUs.
+ Deprecated, use tsc=unstable instead.
nohpet
Don't use the HPET timer.
@@ -156,6 +154,10 @@ NUMA
If given as an integer, fills all system RAM with N fake nodes
interleaved over physical nodes.
+ numa=fake=<N>U
+ If given as an integer followed by 'U', it will divide each
+ physical node into N emulated nodes.
+
ACPI
acpi=off Don't enable ACPI