From ad56b738c5dd223a2f66685830f82194025a6138 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Wed, 21 Mar 2018 21:22:47 +0200
Subject: docs/vm: rename documentation files to .rst

Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/kernel-parameters.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'Documentation/admin-guide/kernel-parameters.txt')
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1d1d53f85ddd..5d6e5509c049 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3887,7 +3887,7 @@
 			cache (risks via metadata attacks are mostly
 			unchanged). Debug options disable merging on their
 			own.
-			For more information see Documentation/vm/slub.txt.
+			For more information see Documentation/vm/slub.rst.
 
 	slab_max_order=	[MM, SLAB]
 			Determines the maximum allowed order for slabs.
@@ -3901,7 +3901,7 @@
 			slub_debug can create guard zones around objects and
 			may poison objects when not in use. Also tracks the
 			last alloc / free. For more information see
-			Documentation/vm/slub.txt.
+			Documentation/vm/slub.rst.
 
 	slub_memcg_sysfs=	[MM, SLUB]
 			Determines whether to enable sysfs directories for
@@ -3915,7 +3915,7 @@
 			Determines the maximum allowed order for slabs.
 			A high setting may cause OOMs due to memory
 			fragmentation. For more information see
-			Documentation/vm/slub.txt.
+			Documentation/vm/slub.rst.
 
 	slub_min_objects=	[MM, SLUB]
 			The minimum number of objects per slab. SLUB will
@@ -3924,12 +3924,12 @@
 			the number of objects indicated. The higher the number
 			of objects the smaller the overhead of tracking slabs
 			and the less frequently locks need to be acquired.
-			For more information see Documentation/vm/slub.txt.
+			For more information see Documentation/vm/slub.rst.
 
 	slub_min_order=	[MM, SLUB]
 			Determines the minimum page order for slabs. Must be
 			lower than slub_max_order.
-			For more information see Documentation/vm/slub.txt.
+			For more information see Documentation/vm/slub.rst.
 
 	slub_nomerge	[MM, SLUB]
 			Same with slab_nomerge. This is supported for legacy.
@@ -4285,7 +4285,7 @@
 			Format: [always|madvise|never]
 			Can be used to control the default behavior of the system
 			with respect to transparent hugepages.
-			See Documentation/vm/transhuge.txt for more details.
+			See Documentation/vm/transhuge.rst for more details.
 
 	tsc=		Disable clocksource stability checks for TSC.
 			Format: <string>
-- 
cgit v1.2.3


From 6dddd7a7ec34bd8680ef72de0229cf8a92bd01ab Mon Sep 17 00:00:00 2001
From: Thymo van Beers <thymovanbeers@gmail.com>
Date: Wed, 18 Apr 2018 20:51:39 +0200
Subject: docs: kernel-parameters.txt: Fix whitespace

Some lines used spaces instead of tabs at line start.
This can cause mangled lines in editors due to inconsistency.

Replace spaces with tabs where appropriate.

Signed-off-by: Thymo van Beers <thymovanbeers@gmail.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/kernel-parameters.txt | 136 ++++++++++++------------
 1 file changed, 68 insertions(+), 68 deletions(-)

(limited to 'Documentation/admin-guide/kernel-parameters.txt')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3487be79847c..865a24e4d516 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -106,11 +106,11 @@
 			use by PCI
 			Format: <irq>,<irq>...
 
-	acpi_mask_gpe=  [HW,ACPI]
+	acpi_mask_gpe=	[HW,ACPI]
 			Due to the existence of _Lxx/_Exx, some GPEs triggered
 			by unsupported hardware/firmware features can result in
-                        GPE floodings that cannot be automatically disabled by
-                        the GPE dispatcher.
+			GPE floodings that cannot be automatically disabled by
+			the GPE dispatcher.
 			This facility can be used to prevent such uncontrolled
 			GPE floodings.
 			Format: <int>
@@ -472,10 +472,10 @@
 			for platform specific values (SB1, Loongson3 and
 			others).
 
-	ccw_timeout_log [S390]
+	ccw_timeout_log	[S390]
 			See Documentation/s390/CommonIO for details.
 
-	cgroup_disable= [KNL] Disable a particular controller
+	cgroup_disable=	[KNL] Disable a particular controller
 			Format: {name of the controller(s) to disable}
 			The effects of cgroup_disable=foo are:
 			- foo isn't auto-mounted if you mount all cgroups in
@@ -641,8 +641,8 @@
 		hvc<n>	Use the hypervisor console device <n>. This is for
 			both Xen and PowerPC hypervisors.
 
-                If the device connected to the port is not a TTY but a braille
-                device, prepend "brl," before the device type, for instance
+		If the device connected to the port is not a TTY but a braille
+		device, prepend "brl," before the device type, for instance
 			console=brl,ttyS0
 		For now, only VisioBraille is supported.
 
@@ -662,7 +662,7 @@
 
 	consoleblank=	[KNL] The console blank (screen saver) timeout in
 			seconds. A value of 0 disables the blank timer.
-                       Defaults to 0.
+			Defaults to 0.
 
 	coredump_filter=
 			[KNL] Change the default value for
@@ -730,7 +730,7 @@
 			or memory reserved is below 4G.
 
 	cryptomgr.notests
-                        [KNL] Disable crypto self-tests
+			[KNL] Disable crypto self-tests
 
 	cs89x0_dma=	[HW,NET]
 			Format: <dma>
@@ -746,7 +746,7 @@
 			Format: <port#>,<type>
 			See also Documentation/input/devices/joystick-parport.rst
 
-	ddebug_query=   [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
+	ddebug_query=	[KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
 			time. See
 			Documentation/admin-guide/dynamic-debug-howto.rst for
 			details.  Deprecated, see dyndbg.
@@ -833,7 +833,7 @@
 			causing system reset or hang due to sending
 			INIT from AP to BSP.
 
-	disable_ddw     [PPC/PSERIES]
+	disable_ddw	[PPC/PSERIES]
 			Disable Dynamic DMA Window support. Use this if
 			to workaround buggy firmware.
 
@@ -1188,7 +1188,7 @@
 			parameter will force ia64_sal_cache_flush to call
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
-	forcepae [X86-32]
+	forcepae	[X86-32]
 			Forcefully enable Physical Address Extension (PAE).
 			Many Pentium M systems disable PAE but may have a
 			functionally usable PAE implementation.
@@ -1247,7 +1247,7 @@
 
 	gamma=		[HW,DRM]
 
-	gart_fix_e820=  [X86_64] disable the fix e820 for K8 GART
+	gart_fix_e820=	[X86_64] disable the fix e820 for K8 GART
 			Format: off | on
 			default: on
 
@@ -1341,11 +1341,11 @@
 			x86-64 are 2M (when the CPU supports "pse") and 1G
 			(when the CPU supports the "pdpe1gb" cpuinfo flag).
 
-	hvc_iucv=	[S390] Number of z/VM IUCV hypervisor console (HVC)
-			       terminal devices. Valid values: 0..8
-	hvc_iucv_allow=	[S390] Comma-separated list of z/VM user IDs.
-			       If specified, z/VM IUCV HVC accepts connections
-			       from listed z/VM user IDs only.
+	hvc_iucv=	[S390]	Number of z/VM IUCV hypervisor console (HVC)
+				terminal devices. Valid values: 0..8
+	hvc_iucv_allow=	[S390]	Comma-separated list of z/VM user IDs.
+				If specified, z/VM IUCV HVC accepts connections
+				from listed z/VM user IDs only.
 
 	keep_bootcon	[KNL]
 			Do not unregister boot console at start. This is only
@@ -1353,11 +1353,11 @@
 			between unregistering the boot console and initializing
 			the real console.
 
-	i2c_bus=	[HW] Override the default board specific I2C bus speed
-			     or register an additional I2C bus that is not
-			     registered from board initialization code.
-			     Format:
-			     <bus_id>,<clkrate>
+	i2c_bus=	[HW]	Override the default board specific I2C bus speed
+				or register an additional I2C bus that is not
+				registered from board initialization code.
+				Format:
+				<bus_id>,<clkrate>
 
 	i8042.debug	[HW] Toggle i8042 debug mode
 	i8042.unmask_kbd_data
@@ -1386,7 +1386,7 @@
 			Default: only on s2r transitions on x86; most other
 			architectures force reset to be always executed
 	i8042.unlock	[HW] Unlock (ignore) the keylock
-	i8042.kbdreset  [HW] Reset device connected to KBD port
+	i8042.kbdreset	[HW] Reset device connected to KBD port
 
 	i810=		[HW,DRM]
 
@@ -1548,13 +1548,13 @@
 			programs exec'd, files mmap'd for exec, and all files
 			opened for read by uid=0.
 
-	ima_template=   [IMA]
+	ima_template=	[IMA]
 			Select one of defined IMA measurements template formats.
 			Formats: { "ima" | "ima-ng" | "ima-sig" }
 			Default: "ima-ng"
 
 	ima_template_fmt=
-	                [IMA] Define a custom template format.
+			[IMA] Define a custom template format.
 			Format: { "field1|...|fieldN" }
 
 	ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage
@@ -1597,7 +1597,7 @@
 	inport.irq=	[HW] Inport (ATI XL and Microsoft) busmouse driver
 			Format: <irq>
 
-	int_pln_enable  [x86] Enable power limit notification interrupt
+	int_pln_enable	[x86] Enable power limit notification interrupt
 
 	integrity_audit=[IMA]
 			Format: { "0" | "1" }
@@ -1650,39 +1650,39 @@
 			0	disables intel_idle and fall back on acpi_idle.
 			1 to 9	specify maximum depth of C-state.
 
-	intel_pstate=  [X86]
-		       disable
-		         Do not enable intel_pstate as the default
-		         scaling driver for the supported processors
-		       passive
-			 Use intel_pstate as a scaling driver, but configure it
-			 to work with generic cpufreq governors (instead of
-			 enabling its internal governor).  This mode cannot be
-			 used along with the hardware-managed P-states (HWP)
-			 feature.
-		       force
-			 Enable intel_pstate on systems that prohibit it by default
-			 in favor of acpi-cpufreq. Forcing the intel_pstate driver
-			 instead of acpi-cpufreq may disable platform features, such
-			 as thermal controls and power capping, that rely on ACPI
-			 P-States information being indicated to OSPM and therefore
-			 should be used with caution. This option does not work with
-			 processors that aren't supported by the intel_pstate driver
-			 or on platforms that use pcc-cpufreq instead of acpi-cpufreq.
-		       no_hwp
-		         Do not enable hardware P state control (HWP)
-			 if available.
-		hwp_only
-			Only load intel_pstate on systems which support
-			hardware P state control (HWP) if available.
-		support_acpi_ppc
-			Enforce ACPI _PPC performance limits. If the Fixed ACPI
-			Description Table, specifies preferred power management
-			profile as "Enterprise Server" or "Performance Server",
-			then this feature is turned on by default.
-		per_cpu_perf_limits
-			Allow per-logical-CPU P-State performance control limits using
-			cpufreq sysfs interface
+	intel_pstate=	[X86]
+			disable
+			  Do not enable intel_pstate as the default
+			  scaling driver for the supported processors
+			passive
+			  Use intel_pstate as a scaling driver, but configure it
+			  to work with generic cpufreq governors (instead of
+			  enabling its internal governor).  This mode cannot be
+			  used along with the hardware-managed P-states (HWP)
+			  feature.
+			force
+			  Enable intel_pstate on systems that prohibit it by default
+			  in favor of acpi-cpufreq. Forcing the intel_pstate driver
+			  instead of acpi-cpufreq may disable platform features, such
+			  as thermal controls and power capping, that rely on ACPI
+			  P-States information being indicated to OSPM and therefore
+			  should be used with caution. This option does not work with
+			  processors that aren't supported by the intel_pstate driver
+			  or on platforms that use pcc-cpufreq instead of acpi-cpufreq.
+			no_hwp
+			  Do not enable hardware P state control (HWP)
+			  if available.
+			hwp_only
+			  Only load intel_pstate on systems which support
+			  hardware P state control (HWP) if available.
+			support_acpi_ppc
+			  Enforce ACPI _PPC performance limits. If the Fixed ACPI
+			  Description Table specifies preferred power management
+			  profile as "Enterprise Server" or "Performance Server",
+			  then this feature is turned on by default.
+			per_cpu_perf_limits
+			  Allow per-logical-CPU P-State performance control limits using
+			  cpufreq sysfs interface
 
 	intremap=	[X86-64, Intel-IOMMU]
 			on	enable Interrupt Remapping (default)
@@ -2027,7 +2027,7 @@
 			* [no]ncqtrim: Turn off queued DSM TRIM.
 
 			* nohrst, nosrst, norst: suppress hard, soft
-                          and both resets.
+			  and both resets.
 
 			* rstonce: only attempt one reset during
 			  hot-unplug link recovery
@@ -2215,7 +2215,7 @@
 			[KNL,SH] Allow user to override the default size for
 			per-device physically contiguous DMA buffers.
 
-        memhp_default_state=online/offline
+	memhp_default_state=online/offline
 			[KNL] Set the initial state for the memory hotplug
 			onlining policy. If not specified, the default value is
 			set according to the
@@ -2762,7 +2762,7 @@
 			[X86,PV_OPS] Disable paravirtualized VMware scheduler
 			clock and use the default one.
 
-	no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
+	no-steal-acc	[X86,KVM] Disable paravirtualized steal time accounting.
 			steal time is computed, but won't influence scheduler
 			behaviour
 
@@ -2823,7 +2823,7 @@
 	notsc		[BUGS=X86-32] Disable Time Stamp Counter
 
 	nowatchdog	[KNL] Disable both lockup detectors, i.e.
-                        soft-lockup and NMI watchdog (hard-lockup).
+			soft-lockup and NMI watchdog (hard-lockup).
 
 	nowb		[ARM]
 
@@ -2843,7 +2843,7 @@
 			If the dependencies are under your control, you can
 			turn on cpu0_hotplug.
 
-	nps_mtm_hs_ctr= [KNL,ARC]
+	nps_mtm_hs_ctr=	[KNL,ARC]
 			This parameter sets the maximum duration, in
 			cycles, each HW thread of the CTOP can run
 			without interruptions, before HW switches it.
@@ -2984,7 +2984,7 @@
 
 	pci=option[,option...]	[PCI] various PCI subsystem options:
 		earlydump	[X86] dump PCI config space before the kernel
-			        changes anything
+				changes anything
 		off		[X86] don't probe for the PCI bus
 		bios		[X86-32] force use of PCI BIOS, don't access
 				the hardware directly. Use this if your machine
@@ -3072,7 +3072,7 @@
 				is enabled by default.  If you need to use this,
 				please report a bug.
 		nocrs		[X86] Ignore PCI host bridge windows from ACPI.
-			        If you need to use this, please report a bug.
+				If you need to use this, please report a bug.
 		routeirq	Do IRQ routing for all PCI devices.
 				This is normally done in pci_enable_device(),
 				so this option is a temporary workaround
@@ -4391,7 +4391,7 @@
 
 	usbcore.initial_descriptor_timeout=
 			[USB] Specifies timeout for the initial 64-byte
-                        USB_REQ_GET_DESCRIPTOR request in milliseconds
+			USB_REQ_GET_DESCRIPTOR request in milliseconds
 			(default 5000 = 5.0 seconds).
 
 	usbcore.nousb	[USB] Disable the USB subsystem
-- 
cgit v1.2.3


From 18bcaa4e617c04043e46e70c54753d42cf6728f4 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Date: Mon, 7 May 2018 06:35:44 -0300
Subject: docs: driver-api: add clk documentation

Documentation/clk.txt is already in ReST format, so rename it to
clk.rst and move it to the driver-api guide, where it belongs.

Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/00-INDEX                          |   2 -
 Documentation/admin-guide/kernel-parameters.txt |   2 +-
 Documentation/clk.txt                           | 307 ------------------------
 Documentation/driver-api/clk.rst                | 307 ++++++++++++++++++++++++
 Documentation/driver-api/index.rst              |   1 +
 5 files changed, 309 insertions(+), 310 deletions(-)
 delete mode 100644 Documentation/clk.txt
 create mode 100644 Documentation/driver-api/clk.rst

(limited to 'Documentation/admin-guide/kernel-parameters.txt')

diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 6e141c05f3d2..a50d2380b6fb 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -82,8 +82,6 @@ cgroup-v1/
 	- cgroups v1 features, including cpusets and memory controller.
 cgroup-v2.txt
 	- cgroups v2 features, including cpusets and memory controller.
-clk.txt
-	- info on the common clock framework
 cma/
 	- Continuous Memory Area (CMA) debugfs interface.
 conf.py
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 865a24e4d516..42f3e2884e7c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -518,7 +518,7 @@
 			those clocks in any way. This parameter is useful for
 			debug and development, but should not be needed on a
 			platform with proper driver support.  For more
-			information, see Documentation/clk.txt.
+			information, see Documentation/driver-api/clk.rst.
 
 	clock=		[BUGS=X86-32, HW] gettimeofday clocksource override.
 			[Deprecated]
diff --git a/Documentation/clk.txt b/Documentation/clk.txt
deleted file mode 100644
index 511628bb3d3a..000000000000
--- a/Documentation/clk.txt
+++ /dev/null
@@ -1,307 +0,0 @@
-========================
-The Common Clk Framework
-========================
-
-:Author: Mike Turquette <mturquette@ti.com>
-
-This document endeavours to explain the common clk framework details,
-and how to port a platform over to this framework.  It is not yet a
-detailed explanation of the clock api in include/linux/clk.h, but
-perhaps someday it will include that information.
-
-Introduction and interface split
-================================
-
-The common clk framework is an interface to control the clock nodes
-available on various devices today.  This may come in the form of clock
-gating, rate adjustment, muxing or other operations.  This framework is
-enabled with the CONFIG_COMMON_CLK option.
-
-The interface itself is divided into two halves, each shielded from the
-details of its counterpart.  First is the common definition of struct
-clk which unifies the framework-level accounting and infrastructure that
-has traditionally been duplicated across a variety of platforms.  Second
-is a common implementation of the clk.h api, defined in
-drivers/clk/clk.c.  Finally there is struct clk_ops, whose operations
-are invoked by the clk api implementation.
-
-The second half of the interface is comprised of the hardware-specific
-callbacks registered with struct clk_ops and the corresponding
-hardware-specific structures needed to model a particular clock.  For
-the remainder of this document any reference to a callback in struct
-clk_ops, such as .enable or .set_rate, implies the hardware-specific
-implementation of that code.  Likewise, references to struct clk_foo
-serve as a convenient shorthand for the implementation of the
-hardware-specific bits for the hypothetical "foo" hardware.
-
-Tying the two halves of this interface together is struct clk_hw, which
-is defined in struct clk_foo and pointed to within struct clk_core.  This
-allows for easy navigation between the two discrete halves of the common
-clock interface.
-
-Common data structures and api
-==============================
-
-Below is the common struct clk_core definition from
-drivers/clk/clk.c, modified for brevity::
-
-	struct clk_core {
-		const char		*name;
-		const struct clk_ops	*ops;
-		struct clk_hw		*hw;
-		struct module		*owner;
-		struct clk_core		*parent;
-		const char		**parent_names;
-		struct clk_core		**parents;
-		u8			num_parents;
-		u8			new_parent_index;
-		...
-	};
-
-The members above make up the core of the clk tree topology.  The clk
-api itself defines several driver-facing functions which operate on
-struct clk.  That api is documented in include/linux/clk.h.
-
-Platforms and devices utilizing the common struct clk_core use the struct
-clk_ops pointer in struct clk_core to perform the hardware-specific parts of
-the operations defined in clk-provider.h::
-
-	struct clk_ops {
-		int		(*prepare)(struct clk_hw *hw);
-		void		(*unprepare)(struct clk_hw *hw);
-		int		(*is_prepared)(struct clk_hw *hw);
-		void		(*unprepare_unused)(struct clk_hw *hw);
-		int		(*enable)(struct clk_hw *hw);
-		void		(*disable)(struct clk_hw *hw);
-		int		(*is_enabled)(struct clk_hw *hw);
-		void		(*disable_unused)(struct clk_hw *hw);
-		unsigned long	(*recalc_rate)(struct clk_hw *hw,
-						unsigned long parent_rate);
-		long		(*round_rate)(struct clk_hw *hw,
-						unsigned long rate,
-						unsigned long *parent_rate);
-		int		(*determine_rate)(struct clk_hw *hw,
-						  struct clk_rate_request *req);
-		int		(*set_parent)(struct clk_hw *hw, u8 index);
-		u8		(*get_parent)(struct clk_hw *hw);
-		int		(*set_rate)(struct clk_hw *hw,
-					    unsigned long rate,
-					    unsigned long parent_rate);
-		int		(*set_rate_and_parent)(struct clk_hw *hw,
-					    unsigned long rate,
-					    unsigned long parent_rate,
-					    u8 index);
-		unsigned long	(*recalc_accuracy)(struct clk_hw *hw,
-						unsigned long parent_accuracy);
-		int		(*get_phase)(struct clk_hw *hw);
-		int		(*set_phase)(struct clk_hw *hw, int degrees);
-		void		(*init)(struct clk_hw *hw);
-		int		(*debug_init)(struct clk_hw *hw,
-					      struct dentry *dentry);
-	};
-
-Hardware clk implementations
-============================
-
-The strength of the common struct clk_core comes from its .ops and .hw pointers
-which abstract the details of struct clk from the hardware-specific bits, and
-vice versa.  To illustrate consider the simple gateable clk implementation in
-drivers/clk/clk-gate.c::
-
-	struct clk_gate {
-		struct clk_hw	hw;
-		void __iomem    *reg;
-		u8              bit_idx;
-		...
-	};
-
-struct clk_gate contains struct clk_hw hw as well as hardware-specific
-knowledge about which register and bit controls this clk's gating.
-Nothing about clock topology or accounting, such as enable_count or
-notifier_count, is needed here.  That is all handled by the common
-framework code and struct clk_core.
-
-Let's walk through enabling this clk from driver code::
-
-	struct clk *clk;
-	clk = clk_get(NULL, "my_gateable_clk");
-
-	clk_prepare(clk);
-	clk_enable(clk);
-
-The call graph for clk_enable is very simple::
-
-	clk_enable(clk);
-		clk->ops->enable(clk->hw);
-		[resolves to...]
-			clk_gate_enable(hw);
-			[resolves struct clk gate with to_clk_gate(hw)]
-				clk_gate_set_bit(gate);
-
-And the definition of clk_gate_set_bit::
-
-	static void clk_gate_set_bit(struct clk_gate *gate)
-	{
-		u32 reg;
-
-		reg = __raw_readl(gate->reg);
-		reg |= BIT(gate->bit_idx);
-		writel(reg, gate->reg);
-	}
-
-Note that to_clk_gate is defined as::
-
-	#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
-
-This pattern of abstraction is used for every clock hardware
-representation.
-
-Supporting your own clk hardware
-================================
-
-When implementing support for a new type of clock it is only necessary to
-include the following header::
-
-	#include <linux/clk-provider.h>
-
-To construct a clk hardware structure for your platform you must define
-the following::
-
-	struct clk_foo {
-		struct clk_hw hw;
-		... hardware specific data goes here ...
-	};
-
-To take advantage of your data you'll need to support valid operations
-for your clk::
-
-	struct clk_ops clk_foo_ops {
-		.enable		= &clk_foo_enable;
-		.disable	= &clk_foo_disable;
-	};
-
-Implement the above functions using container_of::
-
-	#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
-
-	int clk_foo_enable(struct clk_hw *hw)
-	{
-		struct clk_foo *foo;
-
-		foo = to_clk_foo(hw);
-
-		... perform magic on foo ...
-
-		return 0;
-	};
-
-Below is a matrix detailing which clk_ops are mandatory based upon the
-hardware capabilities of that clock.  A cell marked as "y" means
-mandatory, a cell marked as "n" implies that either including that
-callback is invalid or otherwise unnecessary.  Empty cells are either
-optional or must be evaluated on a case-by-case basis.
-
-.. table:: clock hardware characteristics
-
-   +----------------+------+-------------+---------------+-------------+------+
-   |                | gate | change rate | single parent | multiplexer | root |
-   +================+======+=============+===============+=============+======+
-   |.prepare        |      |             |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   |.unprepare      |      |             |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   +----------------+------+-------------+---------------+-------------+------+
-   |.enable         | y    |             |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   |.disable        | y    |             |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   |.is_enabled     | y    |             |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   +----------------+------+-------------+---------------+-------------+------+
-   |.recalc_rate    |      | y           |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   |.round_rate     |      | y [1]_      |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   |.determine_rate |      | y [1]_      |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   |.set_rate       |      | y           |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   +----------------+------+-------------+---------------+-------------+------+
-   |.set_parent     |      |             | n             | y           | n    |
-   +----------------+------+-------------+---------------+-------------+------+
-   |.get_parent     |      |             | n             | y           | n    |
-   +----------------+------+-------------+---------------+-------------+------+
-   +----------------+------+-------------+---------------+-------------+------+
-   |.recalc_accuracy|      |             |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-   +----------------+------+-------------+---------------+-------------+------+
-   |.init           |      |             |               |             |      |
-   +----------------+------+-------------+---------------+-------------+------+
-
-.. [1] either one of round_rate or determine_rate is required.
-
-Finally, register your clock at run-time with a hardware-specific
-registration function.  This function simply populates struct clk_foo's
-data and then passes the common struct clk parameters to the framework
-with a call to::
-
-	clk_register(...)
-
-See the basic clock types in ``drivers/clk/clk-*.c`` for examples.
-
-Disabling clock gating of unused clocks
-=======================================
-
-Sometimes during development it can be useful to be able to bypass the
-default disabling of unused clocks. For example, if drivers aren't enabling
-clocks properly but rely on them being on from the bootloader, bypassing
-the disabling means that the driver will remain functional while the issues
-are sorted out.
-
-To bypass this disabling, include "clk_ignore_unused" in the bootargs to the
-kernel.
-
-Locking
-=======
-
-The common clock framework uses two global locks, the prepare lock and the
-enable lock.
-
-The enable lock is a spinlock and is held across calls to the .enable,
-.disable operations. Those operations are thus not allowed to sleep,
-and calls to the clk_enable(), clk_disable() API functions are allowed in
-atomic context.
-
-For clk_is_enabled() API, it is also designed to be allowed to be used in
-atomic context. However, it doesn't really make any sense to hold the enable
-lock in core, unless you want to do something else with the information of
-the enable state with that lock held. Otherwise, seeing if a clk is enabled is
-a one-shot read of the enabled state, which could just as easily change after
-the function returns because the lock is released. Thus the user of this API
-needs to handle synchronizing the read of the state with whatever they're
-using it for to make sure that the enable state doesn't change during that
-time.
-
-The prepare lock is a mutex and is held across calls to all other operations.
-All those operations are allowed to sleep, and calls to the corresponding API
-functions are not allowed in atomic context.
-
-This effectively divides operations in two groups from a locking perspective.
-
-Drivers don't need to manually protect resources shared between the operations
-of one group, regardless of whether those resources are shared by multiple
-clocks or not. However, access to resources that are shared between operations
-of the two groups needs to be protected by the drivers. An example of such a
-resource would be a register that controls both the clock rate and the clock
-enable/disable state.
-
-The clock framework is reentrant, in that a driver is allowed to call clock
-framework functions from within its implementation of clock operations. This
-can for instance cause a .set_rate operation of one clock being called from
-within the .set_rate operation of another clock. This case must be considered
-in the driver implementations, but the code flow is usually controlled by the
-driver in that case.
-
-Note that locking must also be considered when code outside of the common
-clock framework needs to access resources used by the clock operations. This
-is considered out of scope of this document.
diff --git a/Documentation/driver-api/clk.rst b/Documentation/driver-api/clk.rst
new file mode 100644
index 000000000000..511628bb3d3a
--- /dev/null
+++ b/Documentation/driver-api/clk.rst
@@ -0,0 +1,307 @@
+========================
+The Common Clk Framework
+========================
+
+:Author: Mike Turquette <mturquette@ti.com>
+
+This document endeavours to explain the common clk framework details,
+and how to port a platform over to this framework.  It is not yet a
+detailed explanation of the clock api in include/linux/clk.h, but
+perhaps someday it will include that information.
+
+Introduction and interface split
+================================
+
+The common clk framework is an interface to control the clock nodes
+available on various devices today.  This may come in the form of clock
+gating, rate adjustment, muxing or other operations.  This framework is
+enabled with the CONFIG_COMMON_CLK option.
+
+The interface itself is divided into two halves, each shielded from the
+details of its counterpart.  First is the common definition of struct
+clk which unifies the framework-level accounting and infrastructure that
+has traditionally been duplicated across a variety of platforms.  Second
+is a common implementation of the clk.h api, defined in
+drivers/clk/clk.c.  Finally there is struct clk_ops, whose operations
+are invoked by the clk api implementation.
+
+The second half of the interface is comprised of the hardware-specific
+callbacks registered with struct clk_ops and the corresponding
+hardware-specific structures needed to model a particular clock.  For
+the remainder of this document any reference to a callback in struct
+clk_ops, such as .enable or .set_rate, implies the hardware-specific
+implementation of that code.  Likewise, references to struct clk_foo
+serve as a convenient shorthand for the implementation of the
+hardware-specific bits for the hypothetical "foo" hardware.
+
+Tying the two halves of this interface together is struct clk_hw, which
+is defined in struct clk_foo and pointed to within struct clk_core.  This
+allows for easy navigation between the two discrete halves of the common
+clock interface.
+
+Common data structures and api
+==============================
+
+Below is the common struct clk_core definition from
+drivers/clk/clk.c, modified for brevity::
+
+	struct clk_core {
+		const char		*name;
+		const struct clk_ops	*ops;
+		struct clk_hw		*hw;
+		struct module		*owner;
+		struct clk_core		*parent;
+		const char		**parent_names;
+		struct clk_core		**parents;
+		u8			num_parents;
+		u8			new_parent_index;
+		...
+	};
+
+The members above make up the core of the clk tree topology.  The clk
+api itself defines several driver-facing functions which operate on
+struct clk.  That api is documented in include/linux/clk.h.
+
+Platforms and devices utilizing the common struct clk_core use the struct
+clk_ops pointer in struct clk_core to perform the hardware-specific parts of
+the operations defined in clk-provider.h::
+
+	struct clk_ops {
+		int		(*prepare)(struct clk_hw *hw);
+		void		(*unprepare)(struct clk_hw *hw);
+		int		(*is_prepared)(struct clk_hw *hw);
+		void		(*unprepare_unused)(struct clk_hw *hw);
+		int		(*enable)(struct clk_hw *hw);
+		void		(*disable)(struct clk_hw *hw);
+		int		(*is_enabled)(struct clk_hw *hw);
+		void		(*disable_unused)(struct clk_hw *hw);
+		unsigned long	(*recalc_rate)(struct clk_hw *hw,
+						unsigned long parent_rate);
+		long		(*round_rate)(struct clk_hw *hw,
+						unsigned long rate,
+						unsigned long *parent_rate);
+		int		(*determine_rate)(struct clk_hw *hw,
+						  struct clk_rate_request *req);
+		int		(*set_parent)(struct clk_hw *hw, u8 index);
+		u8		(*get_parent)(struct clk_hw *hw);
+		int		(*set_rate)(struct clk_hw *hw,
+					    unsigned long rate,
+					    unsigned long parent_rate);
+		int		(*set_rate_and_parent)(struct clk_hw *hw,
+					    unsigned long rate,
+					    unsigned long parent_rate,
+					    u8 index);
+		unsigned long	(*recalc_accuracy)(struct clk_hw *hw,
+						unsigned long parent_accuracy);
+		int		(*get_phase)(struct clk_hw *hw);
+		int		(*set_phase)(struct clk_hw *hw, int degrees);
+		void		(*init)(struct clk_hw *hw);
+		int		(*debug_init)(struct clk_hw *hw,
+					      struct dentry *dentry);
+	};
+
+Hardware clk implementations
+============================
+
+The strength of the common struct clk_core comes from its .ops and .hw pointers
+which abstract the details of struct clk from the hardware-specific bits, and
+vice versa.  To illustrate consider the simple gateable clk implementation in
+drivers/clk/clk-gate.c::
+
+	struct clk_gate {
+		struct clk_hw	hw;
+		void __iomem    *reg;
+		u8              bit_idx;
+		...
+	};
+
+struct clk_gate contains struct clk_hw hw as well as hardware-specific
+knowledge about which register and bit controls this clk's gating.
+Nothing about clock topology or accounting, such as enable_count or
+notifier_count, is needed here.  That is all handled by the common
+framework code and struct clk_core.
+
+Let's walk through enabling this clk from driver code::
+
+	struct clk *clk;
+	clk = clk_get(NULL, "my_gateable_clk");
+
+	clk_prepare(clk);
+	clk_enable(clk);
+
+The call graph for clk_enable is very simple::
+
+	clk_enable(clk);
+		clk->ops->enable(clk->hw);
+		[resolves to...]
+			clk_gate_enable(hw);
+			[resolves struct clk gate with to_clk_gate(hw)]
+				clk_gate_set_bit(gate);
+
+And the definition of clk_gate_set_bit::
+
+	static void clk_gate_set_bit(struct clk_gate *gate)
+	{
+		u32 reg;
+
+		reg = __raw_readl(gate->reg);
+		reg |= BIT(gate->bit_idx);
+		writel(reg, gate->reg);
+	}
+
+Note that to_clk_gate is defined as::
+
+	#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
+
+This pattern of abstraction is used for every clock hardware
+representation.
+
+Supporting your own clk hardware
+================================
+
+When implementing support for a new type of clock it is only necessary to
+include the following header::
+
+	#include <linux/clk-provider.h>
+
+To construct a clk hardware structure for your platform you must define
+the following::
+
+	struct clk_foo {
+		struct clk_hw hw;
+		... hardware specific data goes here ...
+	};
+
+To take advantage of your data you'll need to support valid operations
+for your clk::
+
+	struct clk_ops clk_foo_ops = {
+		.enable		= &clk_foo_enable,
+		.disable	= &clk_foo_disable,
+	};
+
+Implement the above functions using container_of::
+
+	#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
+
+	int clk_foo_enable(struct clk_hw *hw)
+	{
+		struct clk_foo *foo;
+
+		foo = to_clk_foo(hw);
+
+		... perform magic on foo ...
+
+		return 0;
+	};
+
+Below is a matrix detailing which clk_ops are mandatory based upon the
+hardware capabilities of that clock.  A cell marked as "y" means
+mandatory, a cell marked as "n" implies that either including that
+callback is invalid or otherwise unnecessary.  Empty cells are either
+optional or must be evaluated on a case-by-case basis.
+
+.. table:: clock hardware characteristics
+
+   +----------------+------+-------------+---------------+-------------+------+
+   |                | gate | change rate | single parent | multiplexer | root |
+   +================+======+=============+===============+=============+======+
+   |.prepare        |      |             |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   |.unprepare      |      |             |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   +----------------+------+-------------+---------------+-------------+------+
+   |.enable         | y    |             |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   |.disable        | y    |             |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   |.is_enabled     | y    |             |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   +----------------+------+-------------+---------------+-------------+------+
+   |.recalc_rate    |      | y           |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   |.round_rate     |      | y [1]_      |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   |.determine_rate |      | y [1]_      |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   |.set_rate       |      | y           |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   +----------------+------+-------------+---------------+-------------+------+
+   |.set_parent     |      |             | n             | y           | n    |
+   +----------------+------+-------------+---------------+-------------+------+
+   |.get_parent     |      |             | n             | y           | n    |
+   +----------------+------+-------------+---------------+-------------+------+
+   +----------------+------+-------------+---------------+-------------+------+
+   |.recalc_accuracy|      |             |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+   +----------------+------+-------------+---------------+-------------+------+
+   |.init           |      |             |               |             |      |
+   +----------------+------+-------------+---------------+-------------+------+
+
+.. [1] either one of round_rate or determine_rate is required.
+
+Finally, register your clock at run-time with a hardware-specific
+registration function.  This function simply populates struct clk_foo's
+data and then passes the common struct clk parameters to the framework
+with a call to::
+
+	clk_register(...)
+
+See the basic clock types in ``drivers/clk/clk-*.c`` for examples.
+
+Disabling clock gating of unused clocks
+=======================================
+
+Sometimes during development it can be useful to be able to bypass the
+default disabling of unused clocks. For example, if drivers aren't enabling
+clocks properly but rely on them being on from the bootloader, bypassing
+the disabling means that the driver will remain functional while the issues
+are sorted out.
+
+To bypass this disabling, include "clk_ignore_unused" in the bootargs to the
+kernel.
+
+Locking
+=======
+
+The common clock framework uses two global locks, the prepare lock and the
+enable lock.
+
+The enable lock is a spinlock and is held across calls to the .enable,
+.disable operations. Those operations are thus not allowed to sleep,
+and calls to the clk_enable(), clk_disable() API functions are allowed in
+atomic context.
+
+For clk_is_enabled() API, it is also designed to be allowed to be used in
+atomic context. However, it doesn't really make any sense to hold the enable
+lock in core, unless you want to do something else with the information of
+the enable state with that lock held. Otherwise, seeing if a clk is enabled is
+a one-shot read of the enabled state, which could just as easily change after
+the function returns because the lock is released. Thus the user of this API
+needs to handle synchronizing the read of the state with whatever they're
+using it for to make sure that the enable state doesn't change during that
+time.
+
+The prepare lock is a mutex and is held across calls to all other operations.
+All those operations are allowed to sleep, and calls to the corresponding API
+functions are not allowed in atomic context.
+
+This effectively divides operations in two groups from a locking perspective.
+
+Drivers don't need to manually protect resources shared between the operations
+of one group, regardless of whether those resources are shared by multiple
+clocks or not. However, access to resources that are shared between operations
+of the two groups needs to be protected by the drivers. An example of such a
+resource would be a register that controls both the clock rate and the clock
+enable/disable state.
+
+The clock framework is reentrant, in that a driver is allowed to call clock
+framework functions from within its implementation of clock operations. This
+can for instance cause a .set_rate operation of one clock being called from
+within the .set_rate operation of another clock. This case must be considered
+in the driver implementations, but the code flow is usually controlled by the
+driver in that case.
+
+Note that locking must also be considered when code outside of the common
+clock framework needs to access resources used by the clock operations. This
+is considered out of scope of this document.
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 3ac51c94f97b..5d04296f5ce0 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -17,6 +17,7 @@ available subsections can be seen below.
    basics
    infrastructure
    pm/index
+   clk
    device-io
    device_connection
    dma-buf
-- 
cgit v1.2.3


From 45c9a74f648a76e1118cf8024d11cba54bd64e37 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Mon, 14 May 2018 11:13:40 +0300
Subject: docs/vm: transhuge: split userspace bits to admin-guide/mm/transhuge

Now that the administrative information for transparent huge pages is
nicely separated, move it to its own page under the admin guide.

Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/kernel-parameters.txt |   3 +-
 Documentation/admin-guide/mm/index.rst          |   1 +
 Documentation/admin-guide/mm/transhuge.rst      | 418 ++++++++++++++++++++++++
 Documentation/vm/transhuge.rst                  | 414 +----------------------
 4 files changed, 423 insertions(+), 413 deletions(-)
 create mode 100644 Documentation/admin-guide/mm/transhuge.rst

(limited to 'Documentation/admin-guide/kernel-parameters.txt')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 42f3e2884e7c..8d24270644a1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4313,7 +4313,8 @@
 			Format: [always|madvise|never]
 			Can be used to control the default behavior of the system
 			with respect to transparent hugepages.
-			See Documentation/vm/transhuge.rst for more details.
+			See Documentation/admin-guide/mm/transhuge.rst
+			for more details.
 
 	tsc=		Disable clocksource stability checks for TSC.
 			Format: <string>
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index a69aa69af255..8454be638108 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -27,4 +27,5 @@ the Linux memory management.
    numa_memory_policy
    pagemap
    soft-dirty
+   transhuge
    userfaultfd
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
new file mode 100644
index 000000000000..7ab93a8404b9
--- /dev/null
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -0,0 +1,418 @@
+.. _admin_guide_transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+Objective
+=========
+
+Performance critical computing applications dealing with large memory
+working sets are already running on top of libhugetlbfs and in turn
+hugetlbfs. Transparent HugePage Support (THP) is an alternative means of
+using huge pages for the backing of virtual memory with huge pages
+that supports the automatic promotion and demotion of page sizes and
+without the shortcomings of hugetlbfs.
+
+Currently THP only works for anonymous memory mappings and tmpfs/shmem.
+But in the future it can expand to other filesystems.
+
+.. note::
+   in the examples below we presume that the basic page size is 4K and
+   the huge page size is 2M, although the actual numbers may vary
+   depending on the CPU architecture.
+
+The reason applications are running faster is because of two
+factors. The first factor is almost completely irrelevant and it's not
+of significant interest because it'll also have the downside of
+requiring larger clear-page copy-page in page faults which is a
+potentially negative effect. The first factor consists in taking a
+single page fault for each 2M virtual region touched by userland (so
+reducing the enter/exit kernel frequency by a 512 times factor). This
+only matters the first time the memory is accessed for the lifetime of
+a memory mapping. The second long lasting and much more important
+factor will affect all subsequent accesses to the memory for the whole
+runtime of the application. The second factor consists of two
+components:
+
+1) the TLB miss will run faster (especially with virtualization using
+   nested pagetables but almost always also on bare metal without
+   virtualization)
+
+2) a single TLB entry will be mapping a much larger amount of virtual
+   memory in turn reducing the number of TLB misses. With
+   virtualization and nested pagetables the TLB can be mapped of
+   larger size only if both KVM and the Linux guest are using
+   hugepages but a significant speedup already happens if only one of
+   the two is using hugepages just because of the fact the TLB miss is
+   going to run faster.
+
+THP can be enabled system wide or restricted to certain tasks or even
+memory ranges inside a task's address space. Unless THP is completely
+disabled, there is a ``khugepaged`` daemon that scans memory and
+collapses sequences of basic pages into huge pages.
+
+The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
+interface and using madvise(2) and prctl(2) system calls.
+
+Transparent Hugepage Support maximizes the usefulness of free memory
+if compared to the reservation approach of hugetlbfs by allowing all
+unused memory to be used as cache or other movable (or even unmovable)
+entities. It doesn't require reservation to prevent hugepage
+allocation failures to be noticeable from userland. It allows paging
+and all other advanced VM features to be available on the
+hugepages. It requires no modifications for applications to take
+advantage of it.
+
+Applications however can be further optimized to take advantage of
+this feature, like for example they've been optimized before to avoid
+a flood of mmap system calls for every malloc(4k). Optimizing userland
+is by far not mandatory and khugepaged already can take care of long
+lived page allocations even for hugepage unaware applications that
+deal with large amounts of memory.
+
+In certain cases when hugepages are enabled system wide, application
+may end up allocating more memory resources. An application may mmap a
+large region but only touch 1 byte of it, in that case a 2M page might
+be allocated instead of a 4k page for no good. This is why it's
+possible to disable hugepages system-wide and to only have them inside
+MADV_HUGEPAGE madvise regions.
+
+Embedded systems should enable hugepages only inside madvise regions
+to eliminate any risk of wasting any precious byte of memory and to
+only run faster.
+
+Applications that get a lot of benefit from hugepages and that don't
+risk to lose memory by using hugepages, should use
+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
+
+.. _thp_sysfs:
+
+sysfs
+=====
+
+Global THP controls
+-------------------
+
+Transparent Hugepage Support for anonymous memory can be entirely disabled
+(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
+regions (to avoid the risk of consuming more memory resources) or enabled
+system wide. This can be achieved with one of::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/enabled
+	echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
+	echo never >/sys/kernel/mm/transparent_hugepage/enabled
+
+It's also possible to limit defrag efforts in the VM to generate
+anonymous hugepages in case they're not immediately free to madvise
+regions or to never try to defrag memory and simply fallback to regular
+pages unless hugepages are immediately available. Clearly if we spend CPU
+time to defrag memory, we would expect to gain even more by the fact we
+use hugepages later instead of regular pages. This isn't always
+guaranteed, but it may be more likely in case the allocation is for a
+MADV_HUGEPAGE region.
+
+::
+
+	echo always >/sys/kernel/mm/transparent_hugepage/defrag
+	echo defer >/sys/kernel/mm/transparent_hugepage/defrag
+	echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
+	echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
+	echo never >/sys/kernel/mm/transparent_hugepage/defrag
+
+always
+	means that an application requesting THP will stall on
+	allocation failure and directly reclaim pages and compact
+	memory in an effort to allocate a THP immediately. This may be
+	desirable for virtual machines that benefit heavily from THP
+	use and are willing to delay the VM start to utilise them.
+
+defer
+	means that an application will wake kswapd in the background
+	to reclaim pages and wake kcompactd to compact memory so that
+	THP is available in the near future. It's the responsibility
+	of khugepaged to then install the THP pages later.
+
+defer+madvise
+	will enter direct reclaim and compaction like ``always``, but
+	only for regions that have used madvise(MADV_HUGEPAGE); all
+	other regions will wake kswapd in the background to reclaim
+	pages and wake kcompactd to compact memory so that THP is
+	available in the near future.
+
+madvise
+	will enter direct reclaim like ``always`` but only for regions
+	that have used madvise(MADV_HUGEPAGE). This is the default
+	behaviour.
+
+never
+	should be self-explanatory.
+
+By default kernel tries to use huge zero page on read page fault to
+anonymous mapping. It's possible to disable huge zero page by writing 0
+or enable it back by writing 1::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+	echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
+
+Some userspace (such as a test program, or an optimized memory allocation
+library) may want to know the size (in bytes) of a transparent hugepage::
+
+	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
+
+khugepaged will be automatically started when
+transparent_hugepage/enabled is set to "always" or "madvise", and it'll
+be automatically shutdown if it's set to "never".
+
+Khugepaged controls
+-------------------
+
+khugepaged runs usually at low frequency so while one may not want to
+invoke defrag algorithms synchronously during the page faults, it
+should be worth invoking defrag at least in khugepaged. However it's
+also possible to disable defrag in khugepaged by writing 0 or enable
+defrag in khugepaged by writing 1::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+	echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
+
+You can also control how many pages khugepaged should scan at each
+pass::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
+
+and how many milliseconds to wait in khugepaged between each pass (you
+can set this to 0 to run khugepaged at 100% utilization of one core)::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
+
+and how many milliseconds to wait in khugepaged if there's a hugepage
+allocation failure to throttle the next allocation attempt::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
+
+The khugepaged progress can be seen in the number of pages collapsed::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
+
+for each pass::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
+
+``max_ptes_none`` specifies how many extra small pages (that are
+not already mapped) can be allocated when collapsing a group
+of small pages into one large page::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
+
+A higher value causes programs to use additional memory.
+A lower value yields less THP performance gain. The value of
+max_ptes_none has very little effect on CPU time, so that cost
+can be ignored.
+
+``max_ptes_swap`` specifies how many pages can be brought in from
+swap when collapsing a group of pages into a transparent huge page::
+
+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
+
+A higher value can cause excessive swap IO and waste
+memory. A lower value can prevent THPs from being
+collapsed, resulting in fewer pages being collapsed into
+THPs, and lower memory access performance.
+
+Boot parameter
+==============
+
+You can change the sysfs boot time defaults of Transparent Hugepage
+Support by passing the parameter ``transparent_hugepage=always`` or
+``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
+to the kernel command line.
+
+Hugepages in tmpfs/shmem
+========================
+
+You can control hugepage allocation policy in tmpfs with mount option
+``huge=``. It can have following values:
+
+always
+    Attempt to allocate huge pages every time we need a new page;
+
+never
+    Do not allocate huge pages;
+
+within_size
+    Only allocate huge page if it will be fully within i_size.
+    Also respect fadvise()/madvise() hints;
+
+advise
+    Only allocate huge pages if requested with fadvise()/madvise();
+
+The default policy is ``never``.
+
+``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
+``huge=never`` will not attempt to break up huge pages at all, just stop more
+from being allocated.
+
+There's also a sysfs knob to control hugepage allocation policy for internal
+shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
+is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
+MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
+
+In addition to policies listed above, shmem_enabled allows two further
+values:
+
+deny
+    For use in emergencies, to force the huge option off from
+    all mounts;
+force
+    Force the huge option on for all - very useful for testing;
+
+Need of application restart
+===========================
+
+The transparent_hugepage/enabled values and tmpfs mount option only affect
+future behavior. So to make them effective you need to restart any
+application that could have been using hugepages. This also applies to the
+regions registered in khugepaged.
+
+Monitoring usage
+================
+
+The number of anonymous transparent huge pages currently used by the
+system is available by reading the AnonHugePages field in ``/proc/meminfo``.
+To identify what applications are using anonymous transparent huge pages,
+it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields
+for each mapping.
+
+The number of file transparent huge pages mapped to userspace is available
+by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
+To identify what applications are mapping file transparent huge pages, it
+is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
+for each mapping.
+
+Note that reading the smaps file is expensive and reading it
+frequently will incur overhead.
+
+There are a number of counters in ``/proc/vmstat`` that may be used to
+monitor how successfully the system is providing huge pages for use.
+
+thp_fault_alloc
+	is incremented every time a huge page is successfully
+	allocated to handle a page fault. This applies to both the
+	first time a page is faulted and for COW faults.
+
+thp_collapse_alloc
+	is incremented by khugepaged when it has found
+	a range of pages to collapse into one huge page and has
+	successfully allocated a new huge page to store the data.
+
+thp_fault_fallback
+	is incremented if a page fault fails to allocate
+	a huge page and instead falls back to using small pages.
+
+thp_collapse_alloc_failed
+	is incremented if khugepaged found a range
+	of pages that should be collapsed into one huge page but failed
+	the allocation.
+
+thp_file_alloc
+	is incremented every time a file huge page is successfully
+	allocated.
+
+thp_file_mapped
+	is incremented every time a file huge page is mapped into
+	user address space.
+
+thp_split_page
+	is incremented every time a huge page is split into base
+	pages. This can happen for a variety of reasons but a common
+	reason is that a huge page is old and is being reclaimed.
+	This action implies splitting all PMD the page mapped with.
+
+thp_split_page_failed
+	is incremented if kernel fails to split huge
+	page. This can happen if the page was pinned by somebody.
+
+thp_deferred_split_page
+	is incremented when a huge page is put onto split
+	queue. This happens when a huge page is partially unmapped and
+	splitting it would free up some memory. Pages on split queue are
+	going to be split under memory pressure.
+
+thp_split_pmd
+	is incremented every time a PMD is split into a table of PTEs.
+	This can happen, for instance, when application calls mprotect() or
+	munmap() on part of huge page. It doesn't split huge page, only
+	page table entry.
+
+thp_zero_page_alloc
+	is incremented every time a huge zero page is
+	successfully allocated. It includes allocations which were
+	dropped due to a race with another allocation. Note, it doesn't count
+	every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed
+	is incremented if kernel fails to allocate
+	huge zero page and falls back to using small pages.
+
+thp_swpout
+	is incremented every time a huge page is swapped out in one
+	piece without splitting.
+
+thp_swpout_fallback
+	is incremented if a huge page has to be split before swapout.
+	Usually because the kernel failed to allocate some continuous swap space
+	for the huge page.
+
+As the system ages, allocating huge pages may be expensive as the
+system uses memory compaction to copy data around memory to free a
+huge page for use. There are some counters in ``/proc/vmstat`` to help
+monitor this overhead.
+
+compact_stall
+	is incremented every time a process stalls to run
+	memory compaction so that a huge page is free for use.
+
+compact_success
+	is incremented if the system compacted memory and
+	freed a huge page for use.
+
+compact_fail
+	is incremented if the system tries to compact memory
+	but failed.
+
+compact_pages_moved
+	is incremented each time a page is moved. If
+	this value is increasing rapidly, it implies that the system
+	is copying a lot of data to satisfy the huge page allocation.
+	It is possible that the cost of copying exceeds any savings
+	from reduced TLB misses.
+
+compact_pagemigrate_failed
+	is incremented when the underlying mechanism
+	for moving a page failed.
+
+compact_blocks_moved
+	is incremented each time memory compaction examines
+	a huge page aligned range of pages.
+
+It is possible to establish how long the stalls were using the function
+tracer to record how long was spent in __alloc_pages_nodemask and
+using the mm_page_alloc tracepoint to identify which allocations were
+for huge pages.
+
+Optimizing the applications
+===========================
+
+To be guaranteed that the kernel will map a 2M page immediately in any
+memory region, the mmap region has to be hugepage naturally
+aligned. posix_memalign() can provide that guarantee.
+
+Hugetlbfs
+=========
+
+You can use hugetlbfs on a kernel that has transparent hugepage
+support enabled just fine as always. No difference can be noted in
+hugetlbfs other than there will be less overall fragmentation. All
+usual features belonging to hugetlbfs are preserved and
+unaffected. libhugetlbfs will also work fine as usual.
diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
index 47c7e4742bc2..a8cf6809e36e 100644
--- a/Documentation/vm/transhuge.rst
+++ b/Documentation/vm/transhuge.rst
@@ -4,418 +4,8 @@
 Transparent Hugepage Support
 ============================
 
-Objective
-=========
-
-Performance critical computing applications dealing with large memory
-working sets are already running on top of libhugetlbfs and in turn
-hugetlbfs. Transparent HugePage Support (THP) is an alternative mean of
-using huge pages for the backing of virtual memory with huge pages
-that supports the automatic promotion and demotion of page sizes and
-without the shortcomings of hugetlbfs.
-
-Currently THP only works for anonymous memory mappings and tmpfs/shmem.
-But in the future it can expand to other filesystems.
-
-.. note::
-   in the examples below we presume that the basic page size is 4K and
-   the huge page size is 2M, although the actual numbers may vary
-   depending on the CPU architecture.
-
-The reason applications are running faster is because of two
-factors. The first factor is almost completely irrelevant and it's not
-of significant interest because it'll also have the downside of
-requiring larger clear-page copy-page in page faults which is a
-potentially negative effect. The first factor consists in taking a
-single page fault for each 2M virtual region touched by userland (so
-reducing the enter/exit kernel frequency by a 512 times factor). This
-only matters the first time the memory is accessed for the lifetime of
-a memory mapping. The second long lasting and much more important
-factor will affect all subsequent accesses to the memory for the whole
-runtime of the application. The second factor consist of two
-components:
-
-1) the TLB miss will run faster (especially with virtualization using
-   nested pagetables but almost always also on bare metal without
-   virtualization)
-
-2) a single TLB entry will be mapping a much larger amount of virtual
-   memory in turn reducing the number of TLB misses. With
-   virtualization and nested pagetables the TLB can be mapped of
-   larger size only if both KVM and the Linux guest are using
-   hugepages but a significant speedup already happens if only one of
-   the two is using hugepages just because of the fact the TLB miss is
-   going to run faster.
-
-THP can be enabled system wide or restricted to certain tasks or even
-memory ranges inside task's address space. Unless THP is completely
-disabled, there is ``khugepaged`` daemon that scans memory and
-collapses sequences of basic pages into huge pages.
-
-The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
-interface and using madivse(2) and prctl(2) system calls.
-
-Transparent Hugepage Support maximizes the usefulness of free memory
-if compared to the reservation approach of hugetlbfs by allowing all
-unused memory to be used as cache or other movable (or even unmovable
-entities). It doesn't require reservation to prevent hugepage
-allocation failures to be noticeable from userland. It allows paging
-and all other advanced VM features to be available on the
-hugepages. It requires no modifications for applications to take
-advantage of it.
-
-Applications however can be further optimized to take advantage of
-this feature, like for example they've been optimized before to avoid
-a flood of mmap system calls for every malloc(4k). Optimizing userland
-is by far not mandatory and khugepaged already can take care of long
-lived page allocations even for hugepage unaware applications that
-deals with large amounts of memory.
-
-In certain cases when hugepages are enabled system wide, application
-may end up allocating more memory resources. An application may mmap a
-large region but only touch 1 byte of it, in that case a 2M page might
-be allocated instead of a 4k page for no good. This is why it's
-possible to disable hugepages system-wide and to only have them inside
-MADV_HUGEPAGE madvise regions.
-
-Embedded systems should enable hugepages only inside madvise regions
-to eliminate any risk of wasting any precious byte of memory and to
-only run faster.
-
-Applications that gets a lot of benefit from hugepages and that don't
-risk to lose memory by using hugepages, should use
-madvise(MADV_HUGEPAGE) on their critical mmapped regions.
-
-.. _thp_sysfs:
-
-sysfs
-=====
-
-Global THP controls
--------------------
-
-Transparent Hugepage Support for anonymous memory can be entirely disabled
-(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
-regions (to avoid the risk of consuming more memory resources) or enabled
-system wide. This can be achieved with one of::
-
-	echo always >/sys/kernel/mm/transparent_hugepage/enabled
-	echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
-	echo never >/sys/kernel/mm/transparent_hugepage/enabled
-
-It's also possible to limit defrag efforts in the VM to generate
-anonymous hugepages in case they're not immediately free to madvise
-regions or to never try to defrag memory and simply fallback to regular
-pages unless hugepages are immediately available. Clearly if we spend CPU
-time to defrag memory, we would expect to gain even more by the fact we
-use hugepages later instead of regular pages. This isn't always
-guaranteed, but it may be more likely in case the allocation is for a
-MADV_HUGEPAGE region.
-
-::
-
-	echo always >/sys/kernel/mm/transparent_hugepage/defrag
-	echo defer >/sys/kernel/mm/transparent_hugepage/defrag
-	echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
-	echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
-	echo never >/sys/kernel/mm/transparent_hugepage/defrag
-
-always
-	means that an application requesting THP will stall on
-	allocation failure and directly reclaim pages and compact
-	memory in an effort to allocate a THP immediately. This may be
-	desirable for virtual machines that benefit heavily from THP
-	use and are willing to delay the VM start to utilise them.
-
-defer
-	means that an application will wake kswapd in the background
-	to reclaim pages and wake kcompactd to compact memory so that
-	THP is available in the near future. It's the responsibility
-	of khugepaged to then install the THP pages later.
-
-defer+madvise
-	will enter direct reclaim and compaction like ``always``, but
-	only for regions that have used madvise(MADV_HUGEPAGE); all
-	other regions will wake kswapd in the background to reclaim
-	pages and wake kcompactd to compact memory so that THP is
-	available in the near future.
-
-madvise
-	will enter direct reclaim like ``always`` but only for regions
-	that are have used madvise(MADV_HUGEPAGE). This is the default
-	behaviour.
-
-never
-	should be self-explanatory.
-
-By default kernel tries to use huge zero page on read page fault to
-anonymous mapping. It's possible to disable huge zero page by writing 0
-or enable it back by writing 1::
-
-	echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
-	echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
-
-Some userspace (such as a test program, or an optimized memory allocation
-library) may want to know the size (in bytes) of a transparent hugepage::
-
-	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
-
-khugepaged will be automatically started when
-transparent_hugepage/enabled is set to "always" or "madvise, and it'll
-be automatically shutdown if it's set to "never".
-
-Khugepaged controls
--------------------
-
-khugepaged runs usually at low frequency so while one may not want to
-invoke defrag algorithms synchronously during the page faults, it
-should be worth invoking defrag at least in khugepaged. However it's
-also possible to disable defrag in khugepaged by writing 0 or enable
-defrag in khugepaged by writing 1::
-
-	echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
-	echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
-
-You can also control how many pages khugepaged should scan at each
-pass::
-
-	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
-
-and how many milliseconds to wait in khugepaged between each pass (you
-can set this to 0 to run khugepaged at 100% utilization of one core)::
-
-	/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
-
-and how many milliseconds to wait in khugepaged if there's an hugepage
-allocation failure to throttle the next allocation attempt::
-
-	/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
-
-The khugepaged progress can be seen in the number of pages collapsed::
-
-	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
-
-for each pass::
-
-	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
-
-``max_ptes_none`` specifies how many extra small pages (that are
-not already mapped) can be allocated when collapsing a group
-of small pages into one large page::
-
-	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
-
-A higher value leads to use additional memory for programs.
-A lower value leads to gain less thp performance. Value of
-max_ptes_none can waste cpu time very little, you can
-ignore it.
-
-``max_ptes_swap`` specifies how many pages can be brought in from
-swap when collapsing a group of pages into a transparent huge page::
-
-	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
-
-A higher value can cause excessive swap IO and waste
-memory. A lower value can prevent THPs from being
-collapsed, resulting fewer pages being collapsed into
-THPs, and lower memory access performance.
-
-Boot parameter
-==============
-
-You can change the sysfs boot time defaults of Transparent Hugepage
-Support by passing the parameter ``transparent_hugepage=always`` or
-``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
-to the kernel command line.
-
-Hugepages in tmpfs/shmem
-========================
-
-You can control hugepage allocation policy in tmpfs with mount option
-``huge=``. It can have following values:
-
-always
-    Attempt to allocate huge pages every time we need a new page;
-
-never
-    Do not allocate huge pages;
-
-within_size
-    Only allocate huge page if it will be fully within i_size.
-    Also respect fadvise()/madvise() hints;
-
-advise
-    Only allocate huge pages if requested with fadvise()/madvise();
-
-The default policy is ``never``.
-
-``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
-``huge=never`` will not attempt to break up huge pages at all, just stop more
-from being allocated.
-
-There's also sysfs knob to control hugepage allocation policy for internal
-shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
-is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
-MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
-
-In addition to policies listed above, shmem_enabled allows two further
-values:
-
-deny
-    For use in emergencies, to force the huge option off from
-    all mounts;
-force
-    Force the huge option on for all - very useful for testing;
-
-Need of application restart
-===========================
-
-The transparent_hugepage/enabled values and tmpfs mount option only affect
-future behavior. So to make them effective you need to restart any
-application that could have been using hugepages. This also applies to the
-regions registered in khugepaged.
-
-Monitoring usage
-================
-
-The number of anonymous transparent huge pages currently used by the
-system is available by reading the AnonHugePages field in ``/proc/meminfo``.
-To identify what applications are using anonymous transparent huge pages,
-it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields
-for each mapping.
-
-The number of file transparent huge pages mapped to userspace is available
-by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
-To identify what applications are mapping file transparent huge pages, it
-is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
-for each mapping.
-
-Note that reading the smaps file is expensive and reading it
-frequently will incur overhead.
-
-There are a number of counters in ``/proc/vmstat`` that may be used to
-monitor how successfully the system is providing huge pages for use.
-
-thp_fault_alloc
-	is incremented every time a huge page is successfully
-	allocated to handle a page fault. This applies to both the
-	first time a page is faulted and for COW faults.
-
-thp_collapse_alloc
-	is incremented by khugepaged when it has found
-	a range of pages to collapse into one huge page and has
-	successfully allocated a new huge page to store the data.
-
-thp_fault_fallback
-	is incremented if a page fault fails to allocate
-	a huge page and instead falls back to using small pages.
-
-thp_collapse_alloc_failed
-	is incremented if khugepaged found a range
-	of pages that should be collapsed into one huge page but failed
-	the allocation.
-
-thp_file_alloc
-	is incremented every time a file huge page is successfully
-	allocated.
-
-thp_file_mapped
-	is incremented every time a file huge page is mapped into
-	user address space.
-
-thp_split_page
-	is incremented every time a huge page is split into base
-	pages. This can happen for a variety of reasons but a common
-	reason is that a huge page is old and is being reclaimed.
-	This action implies splitting all PMD the page mapped with.
-
-thp_split_page_failed
-	is incremented if kernel fails to split huge
-	page. This can happen if the page was pinned by somebody.
-
-thp_deferred_split_page
-	is incremented when a huge page is put onto split
-	queue. This happens when a huge page is partially unmapped and
-	splitting it would free up some memory. Pages on split queue are
-	going to be split under memory pressure.
-
-thp_split_pmd
-	is incremented every time a PMD split into table of PTEs.
-	This can happen, for instance, when application calls mprotect() or
-	munmap() on part of huge page. It doesn't split huge page, only
-	page table entry.
-
-thp_zero_page_alloc
-	is incremented every time a huge zero page is
-	successfully allocated. It includes allocations which where
-	dropped due race with other allocation. Note, it doesn't count
-	every map of the huge zero page, only its allocation.
-
-thp_zero_page_alloc_failed
-	is incremented if kernel fails to allocate
-	huge zero page and falls back to using small pages.
-
-thp_swpout
-	is incremented every time a huge page is swapout in one
-	piece without splitting.
-
-thp_swpout_fallback
-	is incremented if a huge page has to be split before swapout.
-	Usually because failed to allocate some continuous swap space
-	for the huge page.
-
-As the system ages, allocating huge pages may be expensive as the
-system uses memory compaction to copy data around memory to free a
-huge page for use. There are some counters in ``/proc/vmstat`` to help
-monitor this overhead.
-
-compact_stall
-	is incremented every time a process stalls to run
-	memory compaction so that a huge page is free for use.
-
-compact_success
-	is incremented if the system compacted memory and
-	freed a huge page for use.
-
-compact_fail
-	is incremented if the system tries to compact memory
-	but failed.
-
-compact_pages_moved
-	is incremented each time a page is moved. If
-	this value is increasing rapidly, it implies that the system
-	is copying a lot of data to satisfy the huge page allocation.
-	It is possible that the cost of copying exceeds any savings
-	from reduced TLB misses.
-
-compact_pagemigrate_failed
-	is incremented when the underlying mechanism
-	for moving a page failed.
-
-compact_blocks_moved
-	is incremented each time memory compaction examines
-	a huge page aligned range of pages.
-
-It is possible to establish how long the stalls were using the function
-tracer to record how long was spent in __alloc_pages_nodemask and
-using the mm_page_alloc tracepoint to identify which allocations were
-for huge pages.
-
-Optimizing the applications
-===========================
-
-To be guaranteed that the kernel will map a 2M page immediately in any
-memory region, the mmap region has to be hugepage naturally
-aligned. posix_memalign() can provide that guarantee.
-
-Hugetlbfs
-=========
-
-You can use hugetlbfs on a kernel that has transparent hugepage
-support enabled just fine as always. No difference can be noted in
-hugetlbfs other than there will be less overall fragmentation. All
-usual features belonging to hugetlbfs are preserved and
-unaffected. libhugetlbfs will also work fine as usual.
+This document describes design principles of Transparent Hugepage (THP)
+Support and its interaction with other parts of the memory management.
 
 Design principles
 =================
-- 
cgit v1.2.3


From a49d9c0ae46e149a22aefa8251d07dddd5611851 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 21 May 2018 11:18:17 -0700
Subject: Documentation: document hung_task_panic kernel parameter

This parameter has been around since commit e162b39a368f ("softlockup:
decouple hung tasks check from softlockup detection") in 2009 but was
never documented.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/kernel-parameters.txt | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'Documentation/admin-guide/kernel-parameters.txt')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8d24270644a1..5385af53a8ca 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1341,12 +1341,21 @@
 			x86-64 are 2M (when the CPU supports "pse") and 1G
 			(when the CPU supports the "pdpe1gb" cpuinfo flag).
 
+	hung_task_panic=
+			[KNL] Should the hung task detector generate panics.
+			Format: <integer>
+
+			A nonzero value instructs the kernel to panic when a
+			hung task is detected. The default value is controlled
+			by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time
+			option. The value selected by this boot parameter can
+			be changed later by the kernel.hung_task_panic sysctl.
+
 	hvc_iucv=	[S390]	Number of z/VM IUCV hypervisor console (HVC)
 				terminal devices. Valid values: 0..8
 	hvc_iucv_allow=	[S390]	Comma-separated list of z/VM user IDs.
 				If specified, z/VM IUCV HVC accepts connections
 				from listed z/VM user IDs only.
-
 	keep_bootcon	[KNL]
 			Do not unregister boot console at start. This is only
 			useful for debugging when something happens in the window
-- 
cgit v1.2.3