From ad56b738c5dd223a2f66685830f82194025a6138 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 21 Mar 2018 21:22:47 +0200 Subject: docs/vm: rename documentation files to .rst Signed-off-by: Mike Rapoport Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1d1d53f85ddd..5d6e5509c049 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3887,7 +3887,7 @@ cache (risks via metadata attacks are mostly unchanged). Debug options disable merging on their own. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. slab_max_order= [MM, SLAB] Determines the maximum allowed order for slabs. @@ -3901,7 +3901,7 @@ slub_debug can create guard zones around objects and may poison objects when not in use. Also tracks the last alloc / free. For more information see - Documentation/vm/slub.txt. + Documentation/vm/slub.rst. slub_memcg_sysfs= [MM, SLUB] Determines whether to enable sysfs directories for @@ -3915,7 +3915,7 @@ Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory fragmentation. For more information see - Documentation/vm/slub.txt. + Documentation/vm/slub.rst. slub_min_objects= [MM, SLUB] The minimum number of objects per slab. SLUB will @@ -3924,12 +3924,12 @@ the number of objects indicated. The higher the number of objects the smaller the overhead of tracking slabs and the less frequently locks need to be acquired. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. slub_min_order= [MM, SLUB] Determines the minimum page order for slabs. Must be lower than slub_max_order. - For more information see Documentation/vm/slub.txt. + For more information see Documentation/vm/slub.rst. slub_nomerge [MM, SLUB] Same with slab_nomerge. This is supported for legacy. @@ -4285,7 +4285,7 @@ Format: [always|madvise|never] Can be used to control the default behavior of the system with respect to transparent hugepages. - See Documentation/vm/transhuge.txt for more details. + See Documentation/vm/transhuge.rst for more details. tsc= Disable clocksource stability checks for TSC. Format: -- cgit v1.2.3 From 6dddd7a7ec34bd8680ef72de0229cf8a92bd01ab Mon Sep 17 00:00:00 2001 From: Thymo van Beers Date: Wed, 18 Apr 2018 20:51:39 +0200 Subject: docs: kernel-parameters.txt: Fix whitespace Some lines used spaces instead of tabs at line start. This can cause mangled lines in editors due to inconsistency. Replace spaces for tabs where appropriate. Signed-off-by: Thymo van Beers Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 136 ++++++++++++------------ 1 file changed, 68 insertions(+), 68 deletions(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3487be79847c..865a24e4d516 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -106,11 +106,11 @@ use by PCI Format: ,... - acpi_mask_gpe= [HW,ACPI] + acpi_mask_gpe= [HW,ACPI] Due to the existence of _Lxx/_Exx, some GPEs triggered by unsupported hardware/firmware features can result in - GPE floodings that cannot be automatically disabled by - the GPE dispatcher. + GPE floodings that cannot be automatically disabled by + the GPE dispatcher. This facility can be used to prevent such uncontrolled GPE floodings. Format: @@ -472,10 +472,10 @@ for platform specific values (SB1, Loongson3 and others). - ccw_timeout_log [S390] + ccw_timeout_log [S390] See Documentation/s390/CommonIO for details. - cgroup_disable= [KNL] Disable a particular controller + cgroup_disable= [KNL] Disable a particular controller Format: {name of the controller(s) to disable} The effects of cgroup_disable=foo are: - foo isn't auto-mounted if you mount all cgroups in @@ -641,8 +641,8 @@ hvc Use the hypervisor console device . This is for both Xen and PowerPC hypervisors. - If the device connected to the port is not a TTY but a braille - device, prepend "brl," before the device type, for instance + If the device connected to the port is not a TTY but a braille + device, prepend "brl," before the device type, for instance console=brl,ttyS0 For now, only VisioBraille is supported. @@ -662,7 +662,7 @@ consoleblank= [KNL] The console blank (screen saver) timeout in seconds. A value of 0 disables the blank timer. - Defaults to 0. + Defaults to 0. coredump_filter= [KNL] Change the default value for @@ -730,7 +730,7 @@ or memory reserved is below 4G. cryptomgr.notests - [KNL] Disable crypto self-tests + [KNL] Disable crypto self-tests cs89x0_dma= [HW,NET] Format: @@ -746,7 +746,7 @@ Format: , See also Documentation/input/devices/joystick-parport.rst - ddebug_query= [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot + ddebug_query= [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot time. See Documentation/admin-guide/dynamic-debug-howto.rst for details. Deprecated, see dyndbg. @@ -833,7 +833,7 @@ causing system reset or hang due to sending INIT from AP to BSP. - disable_ddw [PPC/PSERIES] + disable_ddw [PPC/PSERIES] Disable Dynamic DMA Window support. Use this if to workaround buggy firmware. @@ -1188,7 +1188,7 @@ parameter will force ia64_sal_cache_flush to call ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. - forcepae [X86-32] + forcepae [X86-32] Forcefully enable Physical Address Extension (PAE). Many Pentium M systems disable PAE but may have a functionally usable PAE implementation. @@ -1247,7 +1247,7 @@ gamma= [HW,DRM] - gart_fix_e820= [X86_64] disable the fix e820 for K8 GART + gart_fix_e820= [X86_64] disable the fix e820 for K8 GART Format: off | on default: on @@ -1341,11 +1341,11 @@ x86-64 are 2M (when the CPU supports "pse") and 1G (when the CPU supports the "pdpe1gb" cpuinfo flag). - hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) - terminal devices. Valid values: 0..8 - hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs. - If specified, z/VM IUCV HVC accepts connections - from listed z/VM user IDs only. + hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) + terminal devices. Valid values: 0..8 + hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs. + If specified, z/VM IUCV HVC accepts connections + from listed z/VM user IDs only. keep_bootcon [KNL] Do not unregister boot console at start. This is only @@ -1353,11 +1353,11 @@ between unregistering the boot console and initializing the real console. - i2c_bus= [HW] Override the default board specific I2C bus speed - or register an additional I2C bus that is not - registered from board initialization code. - Format: - , + i2c_bus= [HW] Override the default board specific I2C bus speed + or register an additional I2C bus that is not + registered from board initialization code. + Format: + , i8042.debug [HW] Toggle i8042 debug mode i8042.unmask_kbd_data @@ -1386,7 +1386,7 @@ Default: only on s2r transitions on x86; most other architectures force reset to be always executed i8042.unlock [HW] Unlock (ignore) the keylock - i8042.kbdreset [HW] Reset device connected to KBD port + i8042.kbdreset [HW] Reset device connected to KBD port i810= [HW,DRM] @@ -1548,13 +1548,13 @@ programs exec'd, files mmap'd for exec, and all files opened for read by uid=0. - ima_template= [IMA] + ima_template= [IMA] Select one of defined IMA measurements template formats. Formats: { "ima" | "ima-ng" | "ima-sig" } Default: "ima-ng" ima_template_fmt= - [IMA] Define a custom template format. + [IMA] Define a custom template format. Format: { "field1|...|fieldN" } ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage @@ -1597,7 +1597,7 @@ inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver Format: - int_pln_enable [x86] Enable power limit notification interrupt + int_pln_enable [x86] Enable power limit notification interrupt integrity_audit=[IMA] Format: { "0" | "1" } @@ -1650,39 +1650,39 @@ 0 disables intel_idle and fall back on acpi_idle. 1 to 9 specify maximum depth of C-state. - intel_pstate= [X86] - disable - Do not enable intel_pstate as the default - scaling driver for the supported processors - passive - Use intel_pstate as a scaling driver, but configure it - to work with generic cpufreq governors (instead of - enabling its internal governor). This mode cannot be - used along with the hardware-managed P-states (HWP) - feature. - force - Enable intel_pstate on systems that prohibit it by default - in favor of acpi-cpufreq. Forcing the intel_pstate driver - instead of acpi-cpufreq may disable platform features, such - as thermal controls and power capping, that rely on ACPI - P-States information being indicated to OSPM and therefore - should be used with caution. This option does not work with - processors that aren't supported by the intel_pstate driver - or on platforms that use pcc-cpufreq instead of acpi-cpufreq. - no_hwp - Do not enable hardware P state control (HWP) - if available. - hwp_only - Only load intel_pstate on systems which support - hardware P state control (HWP) if available. - support_acpi_ppc - Enforce ACPI _PPC performance limits. If the Fixed ACPI - Description Table, specifies preferred power management - profile as "Enterprise Server" or "Performance Server", - then this feature is turned on by default. - per_cpu_perf_limits - Allow per-logical-CPU P-State performance control limits using - cpufreq sysfs interface + intel_pstate= [X86] + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors + passive + Use intel_pstate as a scaling driver, but configure it + to work with generic cpufreq governors (instead of + enabling its internal governor). This mode cannot be + used along with the hardware-managed P-states (HWP) + feature. + force + Enable intel_pstate on systems that prohibit it by default + in favor of acpi-cpufreq. Forcing the intel_pstate driver + instead of acpi-cpufreq may disable platform features, such + as thermal controls and power capping, that rely on ACPI + P-States information being indicated to OSPM and therefore + should be used with caution. This option does not work with + processors that aren't supported by the intel_pstate driver + or on platforms that use pcc-cpufreq instead of acpi-cpufreq. + no_hwp + Do not enable hardware P state control (HWP) + if available. + hwp_only + Only load intel_pstate on systems which support + hardware P state control (HWP) if available. + support_acpi_ppc + Enforce ACPI _PPC performance limits. If the Fixed ACPI + Description Table, specifies preferred power management + profile as "Enterprise Server" or "Performance Server", + then this feature is turned on by default. + per_cpu_perf_limits + Allow per-logical-CPU P-State performance control limits using + cpufreq sysfs interface intremap= [X86-64, Intel-IOMMU] on enable Interrupt Remapping (default) @@ -2027,7 +2027,7 @@ * [no]ncqtrim: Turn off queued DSM TRIM. * nohrst, nosrst, norst: suppress hard, soft - and both resets. + and both resets. * rstonce: only attempt one reset during hot-unplug link recovery @@ -2215,7 +2215,7 @@ [KNL,SH] Allow user to override the default size for per-device physically contiguous DMA buffers. - memhp_default_state=online/offline + memhp_default_state=online/offline [KNL] Set the initial state for the memory hotplug onlining policy. If not specified, the default value is set according to the @@ -2762,7 +2762,7 @@ [X86,PV_OPS] Disable paravirtualized VMware scheduler clock and use the default one. - no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting. + no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting. steal time is computed, but won't influence scheduler behaviour @@ -2823,7 +2823,7 @@ notsc [BUGS=X86-32] Disable Time Stamp Counter nowatchdog [KNL] Disable both lockup detectors, i.e. - soft-lockup and NMI watchdog (hard-lockup). + soft-lockup and NMI watchdog (hard-lockup). nowb [ARM] @@ -2843,7 +2843,7 @@ If the dependencies are under your control, you can turn on cpu0_hotplug. - nps_mtm_hs_ctr= [KNL,ARC] + nps_mtm_hs_ctr= [KNL,ARC] This parameter sets the maximum duration, in cycles, each HW thread of the CTOP can run without interruptions, before HW switches it. @@ -2984,7 +2984,7 @@ pci=option[,option...] [PCI] various PCI subsystem options: earlydump [X86] dump PCI config space before the kernel - changes anything + changes anything off [X86] don't probe for the PCI bus bios [X86-32] force use of PCI BIOS, don't access the hardware directly. Use this if your machine @@ -3072,7 +3072,7 @@ is enabled by default. If you need to use this, please report a bug. nocrs [X86] Ignore PCI host bridge windows from ACPI. - If you need to use this, please report a bug. + If you need to use this, please report a bug. routeirq Do IRQ routing for all PCI devices. This is normally done in pci_enable_device(), so this option is a temporary workaround @@ -4391,7 +4391,7 @@ usbcore.initial_descriptor_timeout= [USB] Specifies timeout for the initial 64-byte - USB_REQ_GET_DESCRIPTOR request in milliseconds + USB_REQ_GET_DESCRIPTOR request in milliseconds (default 5000 = 5.0 seconds). usbcore.nousb [USB] Disable the USB subsystem -- cgit v1.2.3 From 18bcaa4e617c04043e46e70c54753d42cf6728f4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 7 May 2018 06:35:44 -0300 Subject: docs: driver-api: add clk documentation The clk.rst is already in ReST format. So, move it to the driver-api guide, where it belongs. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/00-INDEX | 2 - Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/clk.txt | 307 ------------------------ Documentation/driver-api/clk.rst | 307 ++++++++++++++++++++++++ Documentation/driver-api/index.rst | 1 + 5 files changed, 309 insertions(+), 310 deletions(-) delete mode 100644 Documentation/clk.txt create mode 100644 Documentation/driver-api/clk.rst (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 6e141c05f3d2..a50d2380b6fb 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -82,8 +82,6 @@ cgroup-v1/ - cgroups v1 features, including cpusets and memory controller. cgroup-v2.txt - cgroups v2 features, including cpusets and memory controller. -clk.txt - - info on the common clock framework cma/ - Continuous Memory Area (CMA) debugfs interface. conf.py diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 865a24e4d516..42f3e2884e7c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -518,7 +518,7 @@ those clocks in any way. This parameter is useful for debug and development, but should not be needed on a platform with proper driver support. For more - information, see Documentation/clk.txt. + information, see Documentation/driver-api/clk.rst. clock= [BUGS=X86-32, HW] gettimeofday clocksource override. [Deprecated] diff --git a/Documentation/clk.txt b/Documentation/clk.txt deleted file mode 100644 index 511628bb3d3a..000000000000 --- a/Documentation/clk.txt +++ /dev/null @@ -1,307 +0,0 @@ -======================== -The Common Clk Framework -======================== - -:Author: Mike Turquette - -This document endeavours to explain the common clk framework details, -and how to port a platform over to this framework. It is not yet a -detailed explanation of the clock api in include/linux/clk.h, but -perhaps someday it will include that information. - -Introduction and interface split -================================ - -The common clk framework is an interface to control the clock nodes -available on various devices today. This may come in the form of clock -gating, rate adjustment, muxing or other operations. This framework is -enabled with the CONFIG_COMMON_CLK option. - -The interface itself is divided into two halves, each shielded from the -details of its counterpart. First is the common definition of struct -clk which unifies the framework-level accounting and infrastructure that -has traditionally been duplicated across a variety of platforms. Second -is a common implementation of the clk.h api, defined in -drivers/clk/clk.c. Finally there is struct clk_ops, whose operations -are invoked by the clk api implementation. - -The second half of the interface is comprised of the hardware-specific -callbacks registered with struct clk_ops and the corresponding -hardware-specific structures needed to model a particular clock. For -the remainder of this document any reference to a callback in struct -clk_ops, such as .enable or .set_rate, implies the hardware-specific -implementation of that code. Likewise, references to struct clk_foo -serve as a convenient shorthand for the implementation of the -hardware-specific bits for the hypothetical "foo" hardware. - -Tying the two halves of this interface together is struct clk_hw, which -is defined in struct clk_foo and pointed to within struct clk_core. This -allows for easy navigation between the two discrete halves of the common -clock interface. - -Common data structures and api -============================== - -Below is the common struct clk_core definition from -drivers/clk/clk.c, modified for brevity:: - - struct clk_core { - const char *name; - const struct clk_ops *ops; - struct clk_hw *hw; - struct module *owner; - struct clk_core *parent; - const char **parent_names; - struct clk_core **parents; - u8 num_parents; - u8 new_parent_index; - ... - }; - -The members above make up the core of the clk tree topology. The clk -api itself defines several driver-facing functions which operate on -struct clk. That api is documented in include/linux/clk.h. - -Platforms and devices utilizing the common struct clk_core use the struct -clk_ops pointer in struct clk_core to perform the hardware-specific parts of -the operations defined in clk-provider.h:: - - struct clk_ops { - int (*prepare)(struct clk_hw *hw); - void (*unprepare)(struct clk_hw *hw); - int (*is_prepared)(struct clk_hw *hw); - void (*unprepare_unused)(struct clk_hw *hw); - int (*enable)(struct clk_hw *hw); - void (*disable)(struct clk_hw *hw); - int (*is_enabled)(struct clk_hw *hw); - void (*disable_unused)(struct clk_hw *hw); - unsigned long (*recalc_rate)(struct clk_hw *hw, - unsigned long parent_rate); - long (*round_rate)(struct clk_hw *hw, - unsigned long rate, - unsigned long *parent_rate); - int (*determine_rate)(struct clk_hw *hw, - struct clk_rate_request *req); - int (*set_parent)(struct clk_hw *hw, u8 index); - u8 (*get_parent)(struct clk_hw *hw); - int (*set_rate)(struct clk_hw *hw, - unsigned long rate, - unsigned long parent_rate); - int (*set_rate_and_parent)(struct clk_hw *hw, - unsigned long rate, - unsigned long parent_rate, - u8 index); - unsigned long (*recalc_accuracy)(struct clk_hw *hw, - unsigned long parent_accuracy); - int (*get_phase)(struct clk_hw *hw); - int (*set_phase)(struct clk_hw *hw, int degrees); - void (*init)(struct clk_hw *hw); - int (*debug_init)(struct clk_hw *hw, - struct dentry *dentry); - }; - -Hardware clk implementations -============================ - -The strength of the common struct clk_core comes from its .ops and .hw pointers -which abstract the details of struct clk from the hardware-specific bits, and -vice versa. To illustrate consider the simple gateable clk implementation in -drivers/clk/clk-gate.c:: - - struct clk_gate { - struct clk_hw hw; - void __iomem *reg; - u8 bit_idx; - ... - }; - -struct clk_gate contains struct clk_hw hw as well as hardware-specific -knowledge about which register and bit controls this clk's gating. -Nothing about clock topology or accounting, such as enable_count or -notifier_count, is needed here. That is all handled by the common -framework code and struct clk_core. - -Let's walk through enabling this clk from driver code:: - - struct clk *clk; - clk = clk_get(NULL, "my_gateable_clk"); - - clk_prepare(clk); - clk_enable(clk); - -The call graph for clk_enable is very simple:: - - clk_enable(clk); - clk->ops->enable(clk->hw); - [resolves to...] - clk_gate_enable(hw); - [resolves struct clk gate with to_clk_gate(hw)] - clk_gate_set_bit(gate); - -And the definition of clk_gate_set_bit:: - - static void clk_gate_set_bit(struct clk_gate *gate) - { - u32 reg; - - reg = __raw_readl(gate->reg); - reg |= BIT(gate->bit_idx); - writel(reg, gate->reg); - } - -Note that to_clk_gate is defined as:: - - #define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw) - -This pattern of abstraction is used for every clock hardware -representation. - -Supporting your own clk hardware -================================ - -When implementing support for a new type of clock it is only necessary to -include the following header:: - - #include - -To construct a clk hardware structure for your platform you must define -the following:: - - struct clk_foo { - struct clk_hw hw; - ... hardware specific data goes here ... - }; - -To take advantage of your data you'll need to support valid operations -for your clk:: - - struct clk_ops clk_foo_ops { - .enable = &clk_foo_enable; - .disable = &clk_foo_disable; - }; - -Implement the above functions using container_of:: - - #define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw) - - int clk_foo_enable(struct clk_hw *hw) - { - struct clk_foo *foo; - - foo = to_clk_foo(hw); - - ... perform magic on foo ... - - return 0; - }; - -Below is a matrix detailing which clk_ops are mandatory based upon the -hardware capabilities of that clock. A cell marked as "y" means -mandatory, a cell marked as "n" implies that either including that -callback is invalid or otherwise unnecessary. Empty cells are either -optional or must be evaluated on a case-by-case basis. - -.. table:: clock hardware characteristics - - +----------------+------+-------------+---------------+-------------+------+ - | | gate | change rate | single parent | multiplexer | root | - +================+======+=============+===============+=============+======+ - |.prepare | | | | | | - +----------------+------+-------------+---------------+-------------+------+ - |.unprepare | | | | | | - +----------------+------+-------------+---------------+-------------+------+ - +----------------+------+-------------+---------------+-------------+------+ - |.enable | y | | | | | - +----------------+------+-------------+---------------+-------------+------+ - |.disable | y | | | | | - +----------------+------+-------------+---------------+-------------+------+ - |.is_enabled | y | | | | | - +----------------+------+-------------+---------------+-------------+------+ - +----------------+------+-------------+---------------+-------------+------+ - |.recalc_rate | | y | | | | - +----------------+------+-------------+---------------+-------------+------+ - |.round_rate | | y [1]_ | | | | - +----------------+------+-------------+---------------+-------------+------+ - |.determine_rate | | y [1]_ | | | | - +----------------+------+-------------+---------------+-------------+------+ - |.set_rate | | y | | | | - +----------------+------+-------------+---------------+-------------+------+ - +----------------+------+-------------+---------------+-------------+------+ - |.set_parent | | | n | y | n | - +----------------+------+-------------+---------------+-------------+------+ - |.get_parent | | | n | y | n | - +----------------+------+-------------+---------------+-------------+------+ - +----------------+------+-------------+---------------+-------------+------+ - |.recalc_accuracy| | | | | | - +----------------+------+-------------+---------------+-------------+------+ - +----------------+------+-------------+---------------+-------------+------+ - |.init | | | | | | - +----------------+------+-------------+---------------+-------------+------+ - -.. [1] either one of round_rate or determine_rate is required. - -Finally, register your clock at run-time with a hardware-specific -registration function. This function simply populates struct clk_foo's -data and then passes the common struct clk parameters to the framework -with a call to:: - - clk_register(...) - -See the basic clock types in ``drivers/clk/clk-*.c`` for examples. - -Disabling clock gating of unused clocks -======================================= - -Sometimes during development it can be useful to be able to bypass the -default disabling of unused clocks. For example, if drivers aren't enabling -clocks properly but rely on them being on from the bootloader, bypassing -the disabling means that the driver will remain functional while the issues -are sorted out. - -To bypass this disabling, include "clk_ignore_unused" in the bootargs to the -kernel. - -Locking -======= - -The common clock framework uses two global locks, the prepare lock and the -enable lock. - -The enable lock is a spinlock and is held across calls to the .enable, -.disable operations. Those operations are thus not allowed to sleep, -and calls to the clk_enable(), clk_disable() API functions are allowed in -atomic context. - -For clk_is_enabled() API, it is also designed to be allowed to be used in -atomic context. However, it doesn't really make any sense to hold the enable -lock in core, unless you want to do something else with the information of -the enable state with that lock held. Otherwise, seeing if a clk is enabled is -a one-shot read of the enabled state, which could just as easily change after -the function returns because the lock is released. Thus the user of this API -needs to handle synchronizing the read of the state with whatever they're -using it for to make sure that the enable state doesn't change during that -time. - -The prepare lock is a mutex and is held across calls to all other operations. -All those operations are allowed to sleep, and calls to the corresponding API -functions are not allowed in atomic context. - -This effectively divides operations in two groups from a locking perspective. - -Drivers don't need to manually protect resources shared between the operations -of one group, regardless of whether those resources are shared by multiple -clocks or not. However, access to resources that are shared between operations -of the two groups needs to be protected by the drivers. An example of such a -resource would be a register that controls both the clock rate and the clock -enable/disable state. - -The clock framework is reentrant, in that a driver is allowed to call clock -framework functions from within its implementation of clock operations. This -can for instance cause a .set_rate operation of one clock being called from -within the .set_rate operation of another clock. This case must be considered -in the driver implementations, but the code flow is usually controlled by the -driver in that case. - -Note that locking must also be considered when code outside of the common -clock framework needs to access resources used by the clock operations. This -is considered out of scope of this document. diff --git a/Documentation/driver-api/clk.rst b/Documentation/driver-api/clk.rst new file mode 100644 index 000000000000..511628bb3d3a --- /dev/null +++ b/Documentation/driver-api/clk.rst @@ -0,0 +1,307 @@ +======================== +The Common Clk Framework +======================== + +:Author: Mike Turquette + +This document endeavours to explain the common clk framework details, +and how to port a platform over to this framework. It is not yet a +detailed explanation of the clock api in include/linux/clk.h, but +perhaps someday it will include that information. + +Introduction and interface split +================================ + +The common clk framework is an interface to control the clock nodes +available on various devices today. This may come in the form of clock +gating, rate adjustment, muxing or other operations. This framework is +enabled with the CONFIG_COMMON_CLK option. + +The interface itself is divided into two halves, each shielded from the +details of its counterpart. First is the common definition of struct +clk which unifies the framework-level accounting and infrastructure that +has traditionally been duplicated across a variety of platforms. Second +is a common implementation of the clk.h api, defined in +drivers/clk/clk.c. Finally there is struct clk_ops, whose operations +are invoked by the clk api implementation. + +The second half of the interface is comprised of the hardware-specific +callbacks registered with struct clk_ops and the corresponding +hardware-specific structures needed to model a particular clock. For +the remainder of this document any reference to a callback in struct +clk_ops, such as .enable or .set_rate, implies the hardware-specific +implementation of that code. Likewise, references to struct clk_foo +serve as a convenient shorthand for the implementation of the +hardware-specific bits for the hypothetical "foo" hardware. + +Tying the two halves of this interface together is struct clk_hw, which +is defined in struct clk_foo and pointed to within struct clk_core. This +allows for easy navigation between the two discrete halves of the common +clock interface. + +Common data structures and api +============================== + +Below is the common struct clk_core definition from +drivers/clk/clk.c, modified for brevity:: + + struct clk_core { + const char *name; + const struct clk_ops *ops; + struct clk_hw *hw; + struct module *owner; + struct clk_core *parent; + const char **parent_names; + struct clk_core **parents; + u8 num_parents; + u8 new_parent_index; + ... + }; + +The members above make up the core of the clk tree topology. The clk +api itself defines several driver-facing functions which operate on +struct clk. That api is documented in include/linux/clk.h. + +Platforms and devices utilizing the common struct clk_core use the struct +clk_ops pointer in struct clk_core to perform the hardware-specific parts of +the operations defined in clk-provider.h:: + + struct clk_ops { + int (*prepare)(struct clk_hw *hw); + void (*unprepare)(struct clk_hw *hw); + int (*is_prepared)(struct clk_hw *hw); + void (*unprepare_unused)(struct clk_hw *hw); + int (*enable)(struct clk_hw *hw); + void (*disable)(struct clk_hw *hw); + int (*is_enabled)(struct clk_hw *hw); + void (*disable_unused)(struct clk_hw *hw); + unsigned long (*recalc_rate)(struct clk_hw *hw, + unsigned long parent_rate); + long (*round_rate)(struct clk_hw *hw, + unsigned long rate, + unsigned long *parent_rate); + int (*determine_rate)(struct clk_hw *hw, + struct clk_rate_request *req); + int (*set_parent)(struct clk_hw *hw, u8 index); + u8 (*get_parent)(struct clk_hw *hw); + int (*set_rate)(struct clk_hw *hw, + unsigned long rate, + unsigned long parent_rate); + int (*set_rate_and_parent)(struct clk_hw *hw, + unsigned long rate, + unsigned long parent_rate, + u8 index); + unsigned long (*recalc_accuracy)(struct clk_hw *hw, + unsigned long parent_accuracy); + int (*get_phase)(struct clk_hw *hw); + int (*set_phase)(struct clk_hw *hw, int degrees); + void (*init)(struct clk_hw *hw); + int (*debug_init)(struct clk_hw *hw, + struct dentry *dentry); + }; + +Hardware clk implementations +============================ + +The strength of the common struct clk_core comes from its .ops and .hw pointers +which abstract the details of struct clk from the hardware-specific bits, and +vice versa. To illustrate consider the simple gateable clk implementation in +drivers/clk/clk-gate.c:: + + struct clk_gate { + struct clk_hw hw; + void __iomem *reg; + u8 bit_idx; + ... + }; + +struct clk_gate contains struct clk_hw hw as well as hardware-specific +knowledge about which register and bit controls this clk's gating. +Nothing about clock topology or accounting, such as enable_count or +notifier_count, is needed here. That is all handled by the common +framework code and struct clk_core. + +Let's walk through enabling this clk from driver code:: + + struct clk *clk; + clk = clk_get(NULL, "my_gateable_clk"); + + clk_prepare(clk); + clk_enable(clk); + +The call graph for clk_enable is very simple:: + + clk_enable(clk); + clk->ops->enable(clk->hw); + [resolves to...] + clk_gate_enable(hw); + [resolves struct clk gate with to_clk_gate(hw)] + clk_gate_set_bit(gate); + +And the definition of clk_gate_set_bit:: + + static void clk_gate_set_bit(struct clk_gate *gate) + { + u32 reg; + + reg = __raw_readl(gate->reg); + reg |= BIT(gate->bit_idx); + writel(reg, gate->reg); + } + +Note that to_clk_gate is defined as:: + + #define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw) + +This pattern of abstraction is used for every clock hardware +representation. + +Supporting your own clk hardware +================================ + +When implementing support for a new type of clock it is only necessary to +include the following header:: + + #include + +To construct a clk hardware structure for your platform you must define +the following:: + + struct clk_foo { + struct clk_hw hw; + ... hardware specific data goes here ... + }; + +To take advantage of your data you'll need to support valid operations +for your clk:: + + struct clk_ops clk_foo_ops { + .enable = &clk_foo_enable; + .disable = &clk_foo_disable; + }; + +Implement the above functions using container_of:: + + #define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw) + + int clk_foo_enable(struct clk_hw *hw) + { + struct clk_foo *foo; + + foo = to_clk_foo(hw); + + ... perform magic on foo ... + + return 0; + }; + +Below is a matrix detailing which clk_ops are mandatory based upon the +hardware capabilities of that clock. A cell marked as "y" means +mandatory, a cell marked as "n" implies that either including that +callback is invalid or otherwise unnecessary. Empty cells are either +optional or must be evaluated on a case-by-case basis. + +.. table:: clock hardware characteristics + + +----------------+------+-------------+---------------+-------------+------+ + | | gate | change rate | single parent | multiplexer | root | + +================+======+=============+===============+=============+======+ + |.prepare | | | | | | + +----------------+------+-------------+---------------+-------------+------+ + |.unprepare | | | | | | + +----------------+------+-------------+---------------+-------------+------+ + +----------------+------+-------------+---------------+-------------+------+ + |.enable | y | | | | | + +----------------+------+-------------+---------------+-------------+------+ + |.disable | y | | | | | + +----------------+------+-------------+---------------+-------------+------+ + |.is_enabled | y | | | | | + +----------------+------+-------------+---------------+-------------+------+ + +----------------+------+-------------+---------------+-------------+------+ + |.recalc_rate | | y | | | | + +----------------+------+-------------+---------------+-------------+------+ + |.round_rate | | y [1]_ | | | | + +----------------+------+-------------+---------------+-------------+------+ + |.determine_rate | | y [1]_ | | | | + +----------------+------+-------------+---------------+-------------+------+ + |.set_rate | | y | | | | + +----------------+------+-------------+---------------+-------------+------+ + +----------------+------+-------------+---------------+-------------+------+ + |.set_parent | | | n | y | n | + +----------------+------+-------------+---------------+-------------+------+ + |.get_parent | | | n | y | n | + +----------------+------+-------------+---------------+-------------+------+ + +----------------+------+-------------+---------------+-------------+------+ + |.recalc_accuracy| | | | | | + +----------------+------+-------------+---------------+-------------+------+ + +----------------+------+-------------+---------------+-------------+------+ + |.init | | | | | | + +----------------+------+-------------+---------------+-------------+------+ + +.. [1] either one of round_rate or determine_rate is required. + +Finally, register your clock at run-time with a hardware-specific +registration function. This function simply populates struct clk_foo's +data and then passes the common struct clk parameters to the framework +with a call to:: + + clk_register(...) + +See the basic clock types in ``drivers/clk/clk-*.c`` for examples. + +Disabling clock gating of unused clocks +======================================= + +Sometimes during development it can be useful to be able to bypass the +default disabling of unused clocks. For example, if drivers aren't enabling +clocks properly but rely on them being on from the bootloader, bypassing +the disabling means that the driver will remain functional while the issues +are sorted out. + +To bypass this disabling, include "clk_ignore_unused" in the bootargs to the +kernel. + +Locking +======= + +The common clock framework uses two global locks, the prepare lock and the +enable lock. + +The enable lock is a spinlock and is held across calls to the .enable, +.disable operations. Those operations are thus not allowed to sleep, +and calls to the clk_enable(), clk_disable() API functions are allowed in +atomic context. + +For clk_is_enabled() API, it is also designed to be allowed to be used in +atomic context. However, it doesn't really make any sense to hold the enable +lock in core, unless you want to do something else with the information of +the enable state with that lock held. Otherwise, seeing if a clk is enabled is +a one-shot read of the enabled state, which could just as easily change after +the function returns because the lock is released. Thus the user of this API +needs to handle synchronizing the read of the state with whatever they're +using it for to make sure that the enable state doesn't change during that +time. + +The prepare lock is a mutex and is held across calls to all other operations. +All those operations are allowed to sleep, and calls to the corresponding API +functions are not allowed in atomic context. + +This effectively divides operations in two groups from a locking perspective. + +Drivers don't need to manually protect resources shared between the operations +of one group, regardless of whether those resources are shared by multiple +clocks or not. However, access to resources that are shared between operations +of the two groups needs to be protected by the drivers. An example of such a +resource would be a register that controls both the clock rate and the clock +enable/disable state. + +The clock framework is reentrant, in that a driver is allowed to call clock +framework functions from within its implementation of clock operations. This +can for instance cause a .set_rate operation of one clock being called from +within the .set_rate operation of another clock. This case must be considered +in the driver implementations, but the code flow is usually controlled by the +driver in that case. + +Note that locking must also be considered when code outside of the common +clock framework needs to access resources used by the clock operations. This +is considered out of scope of this document. diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 3ac51c94f97b..5d04296f5ce0 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -17,6 +17,7 @@ available subsections can be seen below. basics infrastructure pm/index + clk device-io device_connection dma-buf -- cgit v1.2.3 From 45c9a74f648a76e1118cf8024d11cba54bd64e37 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 May 2018 11:13:40 +0300 Subject: docs/vm: transhuge: split userspace bits to admin-guide/mm/transhuge Now that the administrative information for transparent huge pages is nicely separated, move it to its own page under the admin guide. Signed-off-by: Mike Rapoport Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 3 +- Documentation/admin-guide/mm/index.rst | 1 + Documentation/admin-guide/mm/transhuge.rst | 418 ++++++++++++++++++++++++ Documentation/vm/transhuge.rst | 414 +---------------------- 4 files changed, 423 insertions(+), 413 deletions(-) create mode 100644 Documentation/admin-guide/mm/transhuge.rst (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 42f3e2884e7c..8d24270644a1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4313,7 +4313,8 @@ Format: [always|madvise|never] Can be used to control the default behavior of the system with respect to transparent hugepages. - See Documentation/vm/transhuge.rst for more details. + See Documentation/admin-guide/mm/transhuge.rst + for more details. tsc= Disable clocksource stability checks for TSC. Format: diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index a69aa69af255..8454be638108 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -27,4 +27,5 @@ the Linux memory management. numa_memory_policy pagemap soft-dirty + transhuge userfaultfd diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst new file mode 100644 index 000000000000..7ab93a8404b9 --- /dev/null +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -0,0 +1,418 @@ +.. _admin_guide_transhuge: + +============================ +Transparent Hugepage Support +============================ + +Objective +========= + +Performance critical computing applications dealing with large memory +working sets are already running on top of libhugetlbfs and in turn +hugetlbfs. Transparent HugePage Support (THP) is an alternative mean of +using huge pages for the backing of virtual memory with huge pages +that supports the automatic promotion and demotion of page sizes and +without the shortcomings of hugetlbfs. + +Currently THP only works for anonymous memory mappings and tmpfs/shmem. +But in the future it can expand to other filesystems. + +.. note:: + in the examples below we presume that the basic page size is 4K and + the huge page size is 2M, although the actual numbers may vary + depending on the CPU architecture. + +The reason applications are running faster is because of two +factors. The first factor is almost completely irrelevant and it's not +of significant interest because it'll also have the downside of +requiring larger clear-page copy-page in page faults which is a +potentially negative effect. The first factor consists in taking a +single page fault for each 2M virtual region touched by userland (so +reducing the enter/exit kernel frequency by a 512 times factor). This +only matters the first time the memory is accessed for the lifetime of +a memory mapping. The second long lasting and much more important +factor will affect all subsequent accesses to the memory for the whole +runtime of the application. The second factor consist of two +components: + +1) the TLB miss will run faster (especially with virtualization using + nested pagetables but almost always also on bare metal without + virtualization) + +2) a single TLB entry will be mapping a much larger amount of virtual + memory in turn reducing the number of TLB misses. With + virtualization and nested pagetables the TLB can be mapped of + larger size only if both KVM and the Linux guest are using + hugepages but a significant speedup already happens if only one of + the two is using hugepages just because of the fact the TLB miss is + going to run faster. + +THP can be enabled system wide or restricted to certain tasks or even +memory ranges inside task's address space. Unless THP is completely +disabled, there is ``khugepaged`` daemon that scans memory and +collapses sequences of basic pages into huge pages. + +The THP behaviour is controlled via :ref:`sysfs ` +interface and using madivse(2) and prctl(2) system calls. + +Transparent Hugepage Support maximizes the usefulness of free memory +if compared to the reservation approach of hugetlbfs by allowing all +unused memory to be used as cache or other movable (or even unmovable +entities). It doesn't require reservation to prevent hugepage +allocation failures to be noticeable from userland. It allows paging +and all other advanced VM features to be available on the +hugepages. It requires no modifications for applications to take +advantage of it. + +Applications however can be further optimized to take advantage of +this feature, like for example they've been optimized before to avoid +a flood of mmap system calls for every malloc(4k). Optimizing userland +is by far not mandatory and khugepaged already can take care of long +lived page allocations even for hugepage unaware applications that +deals with large amounts of memory. + +In certain cases when hugepages are enabled system wide, application +may end up allocating more memory resources. An application may mmap a +large region but only touch 1 byte of it, in that case a 2M page might +be allocated instead of a 4k page for no good. This is why it's +possible to disable hugepages system-wide and to only have them inside +MADV_HUGEPAGE madvise regions. + +Embedded systems should enable hugepages only inside madvise regions +to eliminate any risk of wasting any precious byte of memory and to +only run faster. + +Applications that gets a lot of benefit from hugepages and that don't +risk to lose memory by using hugepages, should use +madvise(MADV_HUGEPAGE) on their critical mmapped regions. + +.. _thp_sysfs: + +sysfs +===== + +Global THP controls +------------------- + +Transparent Hugepage Support for anonymous memory can be entirely disabled +(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE +regions (to avoid the risk of consuming more memory resources) or enabled +system wide. This can be achieved with one of:: + + echo always >/sys/kernel/mm/transparent_hugepage/enabled + echo madvise >/sys/kernel/mm/transparent_hugepage/enabled + echo never >/sys/kernel/mm/transparent_hugepage/enabled + +It's also possible to limit defrag efforts in the VM to generate +anonymous hugepages in case they're not immediately free to madvise +regions or to never try to defrag memory and simply fallback to regular +pages unless hugepages are immediately available. Clearly if we spend CPU +time to defrag memory, we would expect to gain even more by the fact we +use hugepages later instead of regular pages. This isn't always +guaranteed, but it may be more likely in case the allocation is for a +MADV_HUGEPAGE region. + +:: + + echo always >/sys/kernel/mm/transparent_hugepage/defrag + echo defer >/sys/kernel/mm/transparent_hugepage/defrag + echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag + echo madvise >/sys/kernel/mm/transparent_hugepage/defrag + echo never >/sys/kernel/mm/transparent_hugepage/defrag + +always + means that an application requesting THP will stall on + allocation failure and directly reclaim pages and compact + memory in an effort to allocate a THP immediately. This may be + desirable for virtual machines that benefit heavily from THP + use and are willing to delay the VM start to utilise them. + +defer + means that an application will wake kswapd in the background + to reclaim pages and wake kcompactd to compact memory so that + THP is available in the near future. It's the responsibility + of khugepaged to then install the THP pages later. + +defer+madvise + will enter direct reclaim and compaction like ``always``, but + only for regions that have used madvise(MADV_HUGEPAGE); all + other regions will wake kswapd in the background to reclaim + pages and wake kcompactd to compact memory so that THP is + available in the near future. + +madvise + will enter direct reclaim like ``always`` but only for regions + that are have used madvise(MADV_HUGEPAGE). This is the default + behaviour. + +never + should be self-explanatory. + +By default kernel tries to use huge zero page on read page fault to +anonymous mapping. It's possible to disable huge zero page by writing 0 +or enable it back by writing 1:: + + echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page + echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page + +Some userspace (such as a test program, or an optimized memory allocation +library) may want to know the size (in bytes) of a transparent hugepage:: + + cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size + +khugepaged will be automatically started when +transparent_hugepage/enabled is set to "always" or "madvise, and it'll +be automatically shutdown if it's set to "never". + +Khugepaged controls +------------------- + +khugepaged runs usually at low frequency so while one may not want to +invoke defrag algorithms synchronously during the page faults, it +should be worth invoking defrag at least in khugepaged. However it's +also possible to disable defrag in khugepaged by writing 0 or enable +defrag in khugepaged by writing 1:: + + echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag + echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag + +You can also control how many pages khugepaged should scan at each +pass:: + + /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan + +and how many milliseconds to wait in khugepaged between each pass (you +can set this to 0 to run khugepaged at 100% utilization of one core):: + + /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs + +and how many milliseconds to wait in khugepaged if there's an hugepage +allocation failure to throttle the next allocation attempt:: + + /sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs + +The khugepaged progress can be seen in the number of pages collapsed:: + + /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed + +for each pass:: + + /sys/kernel/mm/transparent_hugepage/khugepaged/full_scans + +``max_ptes_none`` specifies how many extra small pages (that are +not already mapped) can be allocated when collapsing a group +of small pages into one large page:: + + /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none + +A higher value leads to use additional memory for programs. +A lower value leads to gain less thp performance. Value of +max_ptes_none can waste cpu time very little, you can +ignore it. + +``max_ptes_swap`` specifies how many pages can be brought in from +swap when collapsing a group of pages into a transparent huge page:: + + /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap + +A higher value can cause excessive swap IO and waste +memory. A lower value can prevent THPs from being +collapsed, resulting fewer pages being collapsed into +THPs, and lower memory access performance. + +Boot parameter +============== + +You can change the sysfs boot time defaults of Transparent Hugepage +Support by passing the parameter ``transparent_hugepage=always`` or +``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` +to the kernel command line. + +Hugepages in tmpfs/shmem +======================== + +You can control hugepage allocation policy in tmpfs with mount option +``huge=``. It can have following values: + +always + Attempt to allocate huge pages every time we need a new page; + +never + Do not allocate huge pages; + +within_size + Only allocate huge page if it will be fully within i_size. + Also respect fadvise()/madvise() hints; + +advise + Only allocate huge pages if requested with fadvise()/madvise(); + +The default policy is ``never``. + +``mount -o remount,huge= /mountpoint`` works fine after mount: remounting +``huge=never`` will not attempt to break up huge pages at all, just stop more +from being allocated. + +There's also sysfs knob to control hugepage allocation policy for internal +shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount +is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or +MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. + +In addition to policies listed above, shmem_enabled allows two further +values: + +deny + For use in emergencies, to force the huge option off from + all mounts; +force + Force the huge option on for all - very useful for testing; + +Need of application restart +=========================== + +The transparent_hugepage/enabled values and tmpfs mount option only affect +future behavior. So to make them effective you need to restart any +application that could have been using hugepages. This also applies to the +regions registered in khugepaged. + +Monitoring usage +================ + +The number of anonymous transparent huge pages currently used by the +system is available by reading the AnonHugePages field in ``/proc/meminfo``. +To identify what applications are using anonymous transparent huge pages, +it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields +for each mapping. + +The number of file transparent huge pages mapped to userspace is available +by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. +To identify what applications are mapping file transparent huge pages, it +is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields +for each mapping. + +Note that reading the smaps file is expensive and reading it +frequently will incur overhead. + +There are a number of counters in ``/proc/vmstat`` that may be used to +monitor how successfully the system is providing huge pages for use. + +thp_fault_alloc + is incremented every time a huge page is successfully + allocated to handle a page fault. This applies to both the + first time a page is faulted and for COW faults. + +thp_collapse_alloc + is incremented by khugepaged when it has found + a range of pages to collapse into one huge page and has + successfully allocated a new huge page to store the data. + +thp_fault_fallback + is incremented if a page fault fails to allocate + a huge page and instead falls back to using small pages. + +thp_collapse_alloc_failed + is incremented if khugepaged found a range + of pages that should be collapsed into one huge page but failed + the allocation. + +thp_file_alloc + is incremented every time a file huge page is successfully + allocated. + +thp_file_mapped + is incremented every time a file huge page is mapped into + user address space. + +thp_split_page + is incremented every time a huge page is split into base + pages. This can happen for a variety of reasons but a common + reason is that a huge page is old and is being reclaimed. + This action implies splitting all PMD the page mapped with. + +thp_split_page_failed + is incremented if kernel fails to split huge + page. This can happen if the page was pinned by somebody. + +thp_deferred_split_page + is incremented when a huge page is put onto split + queue. This happens when a huge page is partially unmapped and + splitting it would free up some memory. Pages on split queue are + going to be split under memory pressure. + +thp_split_pmd + is incremented every time a PMD split into table of PTEs. + This can happen, for instance, when application calls mprotect() or + munmap() on part of huge page. It doesn't split huge page, only + page table entry. + +thp_zero_page_alloc + is incremented every time a huge zero page is + successfully allocated. It includes allocations which where + dropped due race with other allocation. Note, it doesn't count + every map of the huge zero page, only its allocation. + +thp_zero_page_alloc_failed + is incremented if kernel fails to allocate + huge zero page and falls back to using small pages. + +thp_swpout + is incremented every time a huge page is swapout in one + piece without splitting. + +thp_swpout_fallback + is incremented if a huge page has to be split before swapout. + Usually because failed to allocate some continuous swap space + for the huge page. + +As the system ages, allocating huge pages may be expensive as the +system uses memory compaction to copy data around memory to free a +huge page for use. There are some counters in ``/proc/vmstat`` to help +monitor this overhead. + +compact_stall + is incremented every time a process stalls to run + memory compaction so that a huge page is free for use. + +compact_success + is incremented if the system compacted memory and + freed a huge page for use. + +compact_fail + is incremented if the system tries to compact memory + but failed. + +compact_pages_moved + is incremented each time a page is moved. If + this value is increasing rapidly, it implies that the system + is copying a lot of data to satisfy the huge page allocation. + It is possible that the cost of copying exceeds any savings + from reduced TLB misses. + +compact_pagemigrate_failed + is incremented when the underlying mechanism + for moving a page failed. + +compact_blocks_moved + is incremented each time memory compaction examines + a huge page aligned range of pages. + +It is possible to establish how long the stalls were using the function +tracer to record how long was spent in __alloc_pages_nodemask and +using the mm_page_alloc tracepoint to identify which allocations were +for huge pages. + +Optimizing the applications +=========================== + +To be guaranteed that the kernel will map a 2M page immediately in any +memory region, the mmap region has to be hugepage naturally +aligned. posix_memalign() can provide that guarantee. + +Hugetlbfs +========= + +You can use hugetlbfs on a kernel that has transparent hugepage +support enabled just fine as always. No difference can be noted in +hugetlbfs other than there will be less overall fragmentation. All +usual features belonging to hugetlbfs are preserved and +unaffected. libhugetlbfs will also work fine as usual. diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst index 47c7e4742bc2..a8cf6809e36e 100644 --- a/Documentation/vm/transhuge.rst +++ b/Documentation/vm/transhuge.rst @@ -4,418 +4,8 @@ Transparent Hugepage Support ============================ -Objective -========= - -Performance critical computing applications dealing with large memory -working sets are already running on top of libhugetlbfs and in turn -hugetlbfs. Transparent HugePage Support (THP) is an alternative mean of -using huge pages for the backing of virtual memory with huge pages -that supports the automatic promotion and demotion of page sizes and -without the shortcomings of hugetlbfs. - -Currently THP only works for anonymous memory mappings and tmpfs/shmem. -But in the future it can expand to other filesystems. - -.. note:: - in the examples below we presume that the basic page size is 4K and - the huge page size is 2M, although the actual numbers may vary - depending on the CPU architecture. - -The reason applications are running faster is because of two -factors. The first factor is almost completely irrelevant and it's not -of significant interest because it'll also have the downside of -requiring larger clear-page copy-page in page faults which is a -potentially negative effect. The first factor consists in taking a -single page fault for each 2M virtual region touched by userland (so -reducing the enter/exit kernel frequency by a 512 times factor). This -only matters the first time the memory is accessed for the lifetime of -a memory mapping. The second long lasting and much more important -factor will affect all subsequent accesses to the memory for the whole -runtime of the application. The second factor consist of two -components: - -1) the TLB miss will run faster (especially with virtualization using - nested pagetables but almost always also on bare metal without - virtualization) - -2) a single TLB entry will be mapping a much larger amount of virtual - memory in turn reducing the number of TLB misses. With - virtualization and nested pagetables the TLB can be mapped of - larger size only if both KVM and the Linux guest are using - hugepages but a significant speedup already happens if only one of - the two is using hugepages just because of the fact the TLB miss is - going to run faster. - -THP can be enabled system wide or restricted to certain tasks or even -memory ranges inside task's address space. Unless THP is completely -disabled, there is ``khugepaged`` daemon that scans memory and -collapses sequences of basic pages into huge pages. - -The THP behaviour is controlled via :ref:`sysfs ` -interface and using madivse(2) and prctl(2) system calls. - -Transparent Hugepage Support maximizes the usefulness of free memory -if compared to the reservation approach of hugetlbfs by allowing all -unused memory to be used as cache or other movable (or even unmovable -entities). It doesn't require reservation to prevent hugepage -allocation failures to be noticeable from userland. It allows paging -and all other advanced VM features to be available on the -hugepages. It requires no modifications for applications to take -advantage of it. - -Applications however can be further optimized to take advantage of -this feature, like for example they've been optimized before to avoid -a flood of mmap system calls for every malloc(4k). Optimizing userland -is by far not mandatory and khugepaged already can take care of long -lived page allocations even for hugepage unaware applications that -deals with large amounts of memory. - -In certain cases when hugepages are enabled system wide, application -may end up allocating more memory resources. An application may mmap a -large region but only touch 1 byte of it, in that case a 2M page might -be allocated instead of a 4k page for no good. This is why it's -possible to disable hugepages system-wide and to only have them inside -MADV_HUGEPAGE madvise regions. - -Embedded systems should enable hugepages only inside madvise regions -to eliminate any risk of wasting any precious byte of memory and to -only run faster. - -Applications that gets a lot of benefit from hugepages and that don't -risk to lose memory by using hugepages, should use -madvise(MADV_HUGEPAGE) on their critical mmapped regions. - -.. _thp_sysfs: - -sysfs -===== - -Global THP controls -------------------- - -Transparent Hugepage Support for anonymous memory can be entirely disabled -(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE -regions (to avoid the risk of consuming more memory resources) or enabled -system wide. This can be achieved with one of:: - - echo always >/sys/kernel/mm/transparent_hugepage/enabled - echo madvise >/sys/kernel/mm/transparent_hugepage/enabled - echo never >/sys/kernel/mm/transparent_hugepage/enabled - -It's also possible to limit defrag efforts in the VM to generate -anonymous hugepages in case they're not immediately free to madvise -regions or to never try to defrag memory and simply fallback to regular -pages unless hugepages are immediately available. Clearly if we spend CPU -time to defrag memory, we would expect to gain even more by the fact we -use hugepages later instead of regular pages. This isn't always -guaranteed, but it may be more likely in case the allocation is for a -MADV_HUGEPAGE region. - -:: - - echo always >/sys/kernel/mm/transparent_hugepage/defrag - echo defer >/sys/kernel/mm/transparent_hugepage/defrag - echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag - echo madvise >/sys/kernel/mm/transparent_hugepage/defrag - echo never >/sys/kernel/mm/transparent_hugepage/defrag - -always - means that an application requesting THP will stall on - allocation failure and directly reclaim pages and compact - memory in an effort to allocate a THP immediately. This may be - desirable for virtual machines that benefit heavily from THP - use and are willing to delay the VM start to utilise them. - -defer - means that an application will wake kswapd in the background - to reclaim pages and wake kcompactd to compact memory so that - THP is available in the near future. It's the responsibility - of khugepaged to then install the THP pages later. - -defer+madvise - will enter direct reclaim and compaction like ``always``, but - only for regions that have used madvise(MADV_HUGEPAGE); all - other regions will wake kswapd in the background to reclaim - pages and wake kcompactd to compact memory so that THP is - available in the near future. - -madvise - will enter direct reclaim like ``always`` but only for regions - that are have used madvise(MADV_HUGEPAGE). This is the default - behaviour. - -never - should be self-explanatory. - -By default kernel tries to use huge zero page on read page fault to -anonymous mapping. It's possible to disable huge zero page by writing 0 -or enable it back by writing 1:: - - echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page - echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page - -Some userspace (such as a test program, or an optimized memory allocation -library) may want to know the size (in bytes) of a transparent hugepage:: - - cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size - -khugepaged will be automatically started when -transparent_hugepage/enabled is set to "always" or "madvise, and it'll -be automatically shutdown if it's set to "never". - -Khugepaged controls -------------------- - -khugepaged runs usually at low frequency so while one may not want to -invoke defrag algorithms synchronously during the page faults, it -should be worth invoking defrag at least in khugepaged. However it's -also possible to disable defrag in khugepaged by writing 0 or enable -defrag in khugepaged by writing 1:: - - echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag - echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag - -You can also control how many pages khugepaged should scan at each -pass:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan - -and how many milliseconds to wait in khugepaged between each pass (you -can set this to 0 to run khugepaged at 100% utilization of one core):: - - /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs - -and how many milliseconds to wait in khugepaged if there's an hugepage -allocation failure to throttle the next allocation attempt:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs - -The khugepaged progress can be seen in the number of pages collapsed:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed - -for each pass:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/full_scans - -``max_ptes_none`` specifies how many extra small pages (that are -not already mapped) can be allocated when collapsing a group -of small pages into one large page:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none - -A higher value leads to use additional memory for programs. -A lower value leads to gain less thp performance. Value of -max_ptes_none can waste cpu time very little, you can -ignore it. - -``max_ptes_swap`` specifies how many pages can be brought in from -swap when collapsing a group of pages into a transparent huge page:: - - /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap - -A higher value can cause excessive swap IO and waste -memory. A lower value can prevent THPs from being -collapsed, resulting fewer pages being collapsed into -THPs, and lower memory access performance. - -Boot parameter -============== - -You can change the sysfs boot time defaults of Transparent Hugepage -Support by passing the parameter ``transparent_hugepage=always`` or -``transparent_hugepage=madvise`` or ``transparent_hugepage=never`` -to the kernel command line. - -Hugepages in tmpfs/shmem -======================== - -You can control hugepage allocation policy in tmpfs with mount option -``huge=``. It can have following values: - -always - Attempt to allocate huge pages every time we need a new page; - -never - Do not allocate huge pages; - -within_size - Only allocate huge page if it will be fully within i_size. - Also respect fadvise()/madvise() hints; - -advise - Only allocate huge pages if requested with fadvise()/madvise(); - -The default policy is ``never``. - -``mount -o remount,huge= /mountpoint`` works fine after mount: remounting -``huge=never`` will not attempt to break up huge pages at all, just stop more -from being allocated. - -There's also sysfs knob to control hugepage allocation policy for internal -shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount -is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or -MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. - -In addition to policies listed above, shmem_enabled allows two further -values: - -deny - For use in emergencies, to force the huge option off from - all mounts; -force - Force the huge option on for all - very useful for testing; - -Need of application restart -=========================== - -The transparent_hugepage/enabled values and tmpfs mount option only affect -future behavior. So to make them effective you need to restart any -application that could have been using hugepages. This also applies to the -regions registered in khugepaged. - -Monitoring usage -================ - -The number of anonymous transparent huge pages currently used by the -system is available by reading the AnonHugePages field in ``/proc/meminfo``. -To identify what applications are using anonymous transparent huge pages, -it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields -for each mapping. - -The number of file transparent huge pages mapped to userspace is available -by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. -To identify what applications are mapping file transparent huge pages, it -is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields -for each mapping. - -Note that reading the smaps file is expensive and reading it -frequently will incur overhead. - -There are a number of counters in ``/proc/vmstat`` that may be used to -monitor how successfully the system is providing huge pages for use. - -thp_fault_alloc - is incremented every time a huge page is successfully - allocated to handle a page fault. This applies to both the - first time a page is faulted and for COW faults. - -thp_collapse_alloc - is incremented by khugepaged when it has found - a range of pages to collapse into one huge page and has - successfully allocated a new huge page to store the data. - -thp_fault_fallback - is incremented if a page fault fails to allocate - a huge page and instead falls back to using small pages. - -thp_collapse_alloc_failed - is incremented if khugepaged found a range - of pages that should be collapsed into one huge page but failed - the allocation. - -thp_file_alloc - is incremented every time a file huge page is successfully - allocated. - -thp_file_mapped - is incremented every time a file huge page is mapped into - user address space. - -thp_split_page - is incremented every time a huge page is split into base - pages. This can happen for a variety of reasons but a common - reason is that a huge page is old and is being reclaimed. - This action implies splitting all PMD the page mapped with. - -thp_split_page_failed - is incremented if kernel fails to split huge - page. This can happen if the page was pinned by somebody. - -thp_deferred_split_page - is incremented when a huge page is put onto split - queue. This happens when a huge page is partially unmapped and - splitting it would free up some memory. Pages on split queue are - going to be split under memory pressure. - -thp_split_pmd - is incremented every time a PMD split into table of PTEs. - This can happen, for instance, when application calls mprotect() or - munmap() on part of huge page. It doesn't split huge page, only - page table entry. - -thp_zero_page_alloc - is incremented every time a huge zero page is - successfully allocated. It includes allocations which where - dropped due race with other allocation. Note, it doesn't count - every map of the huge zero page, only its allocation. - -thp_zero_page_alloc_failed - is incremented if kernel fails to allocate - huge zero page and falls back to using small pages. - -thp_swpout - is incremented every time a huge page is swapout in one - piece without splitting. - -thp_swpout_fallback - is incremented if a huge page has to be split before swapout. - Usually because failed to allocate some continuous swap space - for the huge page. - -As the system ages, allocating huge pages may be expensive as the -system uses memory compaction to copy data around memory to free a -huge page for use. There are some counters in ``/proc/vmstat`` to help -monitor this overhead. - -compact_stall - is incremented every time a process stalls to run - memory compaction so that a huge page is free for use. - -compact_success - is incremented if the system compacted memory and - freed a huge page for use. - -compact_fail - is incremented if the system tries to compact memory - but failed. - -compact_pages_moved - is incremented each time a page is moved. If - this value is increasing rapidly, it implies that the system - is copying a lot of data to satisfy the huge page allocation. - It is possible that the cost of copying exceeds any savings - from reduced TLB misses. - -compact_pagemigrate_failed - is incremented when the underlying mechanism - for moving a page failed. - -compact_blocks_moved - is incremented each time memory compaction examines - a huge page aligned range of pages. - -It is possible to establish how long the stalls were using the function -tracer to record how long was spent in __alloc_pages_nodemask and -using the mm_page_alloc tracepoint to identify which allocations were -for huge pages. - -Optimizing the applications -=========================== - -To be guaranteed that the kernel will map a 2M page immediately in any -memory region, the mmap region has to be hugepage naturally -aligned. posix_memalign() can provide that guarantee. - -Hugetlbfs -========= - -You can use hugetlbfs on a kernel that has transparent hugepage -support enabled just fine as always. No difference can be noted in -hugetlbfs other than there will be less overall fragmentation. All -usual features belonging to hugetlbfs are preserved and -unaffected. libhugetlbfs will also work fine as usual. +This document describes design principles Transparent Hugepage (THP) +Support and its interaction with other parts of the memory management. Design principles ================= -- cgit v1.2.3 From a49d9c0ae46e149a22aefa8251d07dddd5611851 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 21 May 2018 11:18:17 -0700 Subject: Documentation: document hung_task_panic kernel parameter This parameter has been around since commit e162b39a368f ("softlockup: decouple hung tasks check from softlockup detection") in 2009 but was never documented. Signed-off-by: Omar Sandoval Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'Documentation/admin-guide/kernel-parameters.txt') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8d24270644a1..5385af53a8ca 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1341,12 +1341,21 @@ x86-64 are 2M (when the CPU supports "pse") and 1G (when the CPU supports the "pdpe1gb" cpuinfo flag). + hung_task_panic= + [KNL] Should the hung task detector generate panics. + Format: + + A nonzero value instructs the kernel to panic when a + hung task is detected. The default value is controlled + by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time + option. The value selected by this boot parameter can + be changed later by the kernel.hung_task_panic sysctl. + hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) terminal devices. Valid values: 0..8 hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs. If specified, z/VM IUCV HVC accepts connections from listed z/VM user IDs only. - keep_bootcon [KNL] Do not unregister boot console at start. This is only useful for debugging when something happens in the window -- cgit v1.2.3