diff options
Diffstat (limited to 'Documentation')
313 files changed, 11870 insertions, 8341 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 923fe2001472..d404603c6b52 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -137,7 +137,8 @@ Description: Discover cpuidle policy and mechanism current_governor: (RW) displays current idle policy. Users can switch the governor at runtime by writing to this file. - See files in Documentation/cpuidle/ for more information. + See Documentation/admin-guide/pm/cpuidle.rst and + Documentation/driver-api/pm/cpuidle.rst for more information. What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/name diff --git a/Documentation/ABI/testing/sysfs-kernel-uids b/Documentation/ABI/testing/sysfs-kernel-uids index 28f14695a852..4182b7061816 100644 --- a/Documentation/ABI/testing/sysfs-kernel-uids +++ b/Documentation/ABI/testing/sysfs-kernel-uids @@ -11,4 +11,4 @@ Description: example would be, if User A has shares = 1024 and user B has shares = 2048, User B will get twice the CPU bandwidth user A will. For more details refer - Documentation/scheduler/sched-design-CFS.txt + Documentation/scheduler/sched-design-CFS.rst diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 0076150fdccb..e47c63bd4887 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -198,7 +198,7 @@ call to set the mask to the value returned. :: size_t - dma_direct_max_mapping_size(struct device *dev); + dma_max_mapping_size(struct device *dev); Returns the maximum size of a mapping for the device. 
The size parameter of the mapping functions like dma_map_single(), dma_map_page() and diff --git a/Documentation/EDID/HOWTO.txt b/Documentation/EDID/howto.rst index 539871c3b785..725fd49a88ca 100644 --- a/Documentation/EDID/HOWTO.txt +++ b/Documentation/EDID/howto.rst @@ -1,3 +1,9 @@ +:orphan: + +==== +EDID +==== + In the good old days when graphics parameters were configured explicitly in a file called xorg.conf, even broken hardware could be managed. @@ -34,16 +40,19 @@ Makefile. Please note that the EDID data structure expects the timing values in a different way as compared to the standard X11 format. X11: -HTimings: hdisp hsyncstart hsyncend htotal -VTimings: vdisp vsyncstart vsyncend vtotal - -EDID: -#define XPIX hdisp -#define XBLANK htotal-hdisp -#define XOFFSET hsyncstart-hdisp -#define XPULSE hsyncend-hsyncstart - -#define YPIX vdisp -#define YBLANK vtotal-vdisp -#define YOFFSET vsyncstart-vdisp -#define YPULSE vsyncend-vsyncstart + HTimings: + hdisp hsyncstart hsyncend htotal + VTimings: + vdisp vsyncstart vsyncend vtotal + +EDID:: + + #define XPIX hdisp + #define XBLANK htotal-hdisp + #define XOFFSET hsyncstart-hdisp + #define XPULSE hsyncend-hsyncstart + + #define YPIX vdisp + #define YBLANK vtotal-vdisp + #define YOFFSET vsyncstart-vdisp + #define YPULSE vsyncend-vsyncstart diff --git a/Documentation/Kconfig b/Documentation/Kconfig new file mode 100644 index 000000000000..66046fa1c341 --- /dev/null +++ b/Documentation/Kconfig @@ -0,0 +1,13 @@ +config WARN_MISSING_DOCUMENTS + + bool "Warn if there's a missing documentation file" + depends on COMPILE_TEST + help + It is not uncommon that a document gets renamed. + This option makes the Kernel to check for missing dependencies, + warning when something is missing. Works only if the Kernel + is built from a git tree. + + If unsure, select 'N'. 
+ + diff --git a/Documentation/Makefile b/Documentation/Makefile index e889e7cb8511..e145e4db508b 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -4,6 +4,11 @@ subdir-y := devicetree/bindings/ +# Check for broken documentation file references +ifeq ($(CONFIG_WARN_MISSING_DOCUMENTS),y) +$(shell $(srctree)/scripts/documentation-file-ref-check --warn) +endif + # You can set these variables from the command line. SPHINXBUILD = sphinx-build SPHINXOPTS = @@ -23,11 +28,13 @@ ifeq ($(HAVE_SPHINX),0) .DEFAULT: $(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.) @echo - @./scripts/sphinx-pre-install + @$(srctree)/scripts/sphinx-pre-install @echo " SKIP Sphinx $@ target." else # HAVE_SPHINX +export SPHINXOPTS = $(shell perl -e 'open IN,"sphinx-build --version 2>&1 |"; while (<IN>) { if (m/([\d\.]+)/) { print "-jauto" if ($$1 >= "1.7") } ;} close IN') + # User-friendly check for pdflatex and latexmk HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi) HAVE_LATEXMK := $(shell if which latexmk >/dev/null 2>&1; then echo 1; else echo 0; fi) @@ -70,12 +77,14 @@ quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) $(abspath $(BUILDDIR)/$3/$4) htmldocs: + @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var))) linkcheckdocs: @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var))) latexdocs: + @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var))) ifeq ($(HAVE_PDFLATEX),0) @@ -87,14 +96,17 @@ pdfdocs: else # HAVE_PDFLATEX pdfdocs: latexdocs + @$(srctree)/scripts/sphinx-pre-install --version-check $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C 
$(BUILDDIR)/$(var)/latex || exit;) endif # HAVE_PDFLATEX epubdocs: + @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var))) xmldocs: + @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var))) endif # HAVE_SPHINX diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.rst index 53bde717017b..e26dda27430c 100644 --- a/Documentation/RCU/UP.txt +++ b/Documentation/RCU/UP.rst @@ -1,17 +1,19 @@ -RCU on Uniprocessor Systems +.. _up_doc: +RCU on Uniprocessor Systems +=========================== A common misconception is that, on UP systems, the call_rcu() primitive may immediately invoke its function. The basis of this misconception is that since there is only one CPU, it should not be necessary to wait for anything else to get done, since there are no other CPUs for -anything else to be happening on. Although this approach will -sort- -of- +anything else to be happening on. Although this approach will *sort of* work a surprising amount of the time, it is a very bad idea in general. This document presents three examples that demonstrate exactly how bad an idea this is. - Example 1: softirq Suicide +-------------------------- Suppose that an RCU-based algorithm scans a linked list containing elements A, B, and C in process context, and can delete elements from @@ -28,8 +30,8 @@ your kernel. This same problem can occur if call_rcu() is invoked from a hardware interrupt handler. 
- Example 2: Function-Call Fatality +--------------------------------- Of course, one could avert the suicide described in the preceding example by having call_rcu() directly invoke its arguments only if it was called @@ -46,11 +48,13 @@ its arguments would cause it to fail to make the fundamental guarantee underlying RCU, namely that call_rcu() defers invoking its arguments until all RCU read-side critical sections currently executing have completed. -Quick Quiz #1: why is it -not- legal to invoke synchronize_rcu() in - this case? +Quick Quiz #1: + Why is it *not* legal to invoke synchronize_rcu() in this case? +:ref:`Answers to Quick Quiz <answer_quick_quiz_up>` Example 3: Death by Deadlock +---------------------------- Suppose that call_rcu() is invoked while holding a lock, and that the callback function must acquire this same lock. In this case, if @@ -76,25 +80,30 @@ there are cases where this can be quite ugly: If call_rcu() directly invokes the callback, painful locking restrictions or API changes would be required. -Quick Quiz #2: What locking restriction must RCU callbacks respect? +Quick Quiz #2: + What locking restriction must RCU callbacks respect? +:ref:`Answers to Quick Quiz <answer_quick_quiz_up>` Summary +------- Permitting call_rcu() to immediately invoke its arguments breaks RCU, even on a UP system. So do not do it! Even on a UP system, the RCU -infrastructure -must- respect grace periods, and -must- invoke callbacks +infrastructure *must* respect grace periods, and *must* invoke callbacks from a known environment in which no locks are held. -Note that it -is- safe for synchronize_rcu() to return immediately on -UP systems, including !PREEMPT SMP builds running on UP systems. +Note that it *is* safe for synchronize_rcu() to return immediately on +UP systems, including !PREEMPT SMP builds running on UP systems. -Quick Quiz #3: Why can't synchronize_rcu() return immediately on - UP systems running preemptable RCU? 
+Quick Quiz #3: + Why can't synchronize_rcu() return immediately on UP systems running + preemptable RCU? +.. _answer_quick_quiz_up: Answer to Quick Quiz #1: - Why is it -not- legal to invoke synchronize_rcu() in this case? + Why is it *not* legal to invoke synchronize_rcu() in this case? Because the calling function is scanning an RCU-protected linked list, and is therefore within an RCU read-side critical section. @@ -104,12 +113,13 @@ Answer to Quick Quiz #1: Answer to Quick Quiz #2: What locking restriction must RCU callbacks respect? - Any lock that is acquired within an RCU callback must be - acquired elsewhere using an _irq variant of the spinlock - primitive. For example, if "mylock" is acquired by an - RCU callback, then a process-context acquisition of this - lock must use something like spin_lock_irqsave() to - acquire the lock. + Any lock that is acquired within an RCU callback must be acquired + elsewhere using an _bh variant of the spinlock primitive. + For example, if "mylock" is acquired by an RCU callback, then + a process-context acquisition of this lock must use something + like spin_lock_bh() to acquire the lock. Please note that + it is also OK to use _irq variants of spinlocks, for example, + spin_lock_irqsave(). If the process-context code were to simply use spin_lock(), then, since RCU callbacks can be invoked from softirq context, @@ -119,7 +129,7 @@ Answer to Quick Quiz #2: This restriction might seem gratuitous, since very few RCU callbacks acquire locks directly. However, a great many RCU - callbacks do acquire locks -indirectly-, for example, via + callbacks do acquire locks *indirectly*, for example, via the kfree() primitive. Answer to Quick Quiz #3: diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst new file mode 100644 index 000000000000..340a9725676c --- /dev/null +++ b/Documentation/RCU/index.rst @@ -0,0 +1,19 @@ +.. _rcu_concepts: + +============ +RCU concepts +============ + +.. 
toctree:: + :maxdepth: 1 + + rcu + listRCU + UP + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.rst index adb5a3782846..7956ff33042b 100644 --- a/Documentation/RCU/listRCU.txt +++ b/Documentation/RCU/listRCU.rst @@ -1,5 +1,7 @@ -Using RCU to Protect Read-Mostly Linked Lists +.. _list_rcu_doc: +Using RCU to Protect Read-Mostly Linked Lists +============================================= One of the best applications of RCU is to protect read-mostly linked lists ("struct list_head" in list.h). One big advantage of this approach @@ -7,8 +9,8 @@ is that all of the required memory barriers are included for you in the list macros. This document describes several applications of RCU, with the best fits first. - Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates +---------------------------------------------------------------------- The best applications are cases where, if reader-writer locking were used, the read-side lock would be dropped before taking any action @@ -24,7 +26,7 @@ added or deleted, rather than being modified in place. A straightforward example of this use of RCU may be found in the system-call auditing support. For example, a reader-writer locked -implementation of audit_filter_task() might be as follows: +implementation of audit_filter_task() might be as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -48,7 +50,7 @@ the corresponding value is returned. By the time that this value is acted on, the list may well have been modified. This makes sense, since if you are turning auditing off, it is OK to audit a few extra system calls. -This means that RCU can be easily applied to the read side, as follows: +This means that RCU can be easily applied to the read side, as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -73,7 +75,7 @@ become list_for_each_entry_rcu(). 
The _rcu() list-traversal primitives insert the read-side memory barriers that are required on DEC Alpha CPUs. The changes to the update side are also straightforward. A reader-writer -lock might be used as follows for deletion and insertion: +lock might be used as follows for deletion and insertion:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) @@ -106,7 +108,7 @@ lock might be used as follows for deletion and insertion: return 0; } -Following are the RCU equivalents for these two functions: +Following are the RCU equivalents for these two functions:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) @@ -154,13 +156,13 @@ otherwise cause concurrent readers to fail spectacularly. So, when readers can tolerate stale data and when entries are either added or deleted, without in-place modification, it is very easy to use RCU! - Example 2: Handling In-Place Updates +------------------------------------ The system-call auditing code does not update auditing rules in place. However, if it did, reader-writer-locked code to do so might look as follows (presumably, the field_count is only permitted to decrease, -otherwise, the added fields would need to be filled in): +otherwise, the added fields would need to be filled in):: static inline int audit_upd_rule(struct audit_rule *rule, struct list_head *list, @@ -187,7 +189,7 @@ otherwise, the added fields would need to be filled in): The RCU version creates a copy, updates the copy, then replaces the old entry with the newly updated entry. This sequence of actions, allowing concurrent reads while doing a copy to perform an update, is what gives -RCU ("read-copy update") its name. The RCU code is as follows: +RCU ("read-copy update") its name. The RCU code is as follows:: static inline int audit_upd_rule(struct audit_rule *rule, struct list_head *list, @@ -216,8 +218,8 @@ RCU ("read-copy update") its name. 
The RCU code is as follows: Again, this assumes that the caller holds audit_netlink_sem. Normally, the reader-writer lock would become a spinlock in this sort of code. - Example 3: Eliminating Stale Data +--------------------------------- The auditing examples above tolerate stale data, as do most algorithms that are tracking external state. Because there is a delay from the @@ -231,13 +233,16 @@ per-entry spinlock, and, if the "deleted" flag is set, pretends that the entry does not exist. For this to be helpful, the search function must return holding the per-entry spinlock, as ipc_lock() does in fact do. -Quick Quiz: Why does the search function need to return holding the - per-entry lock for this deleted-flag technique to be helpful? +Quick Quiz: + Why does the search function need to return holding the per-entry lock for + this deleted-flag technique to be helpful? + +:ref:`Answer to Quick Quiz <answer_quick_quiz_list>` If the system-call audit module were to ever need to reject stale data, one way to accomplish this would be to add a "deleted" flag and a "lock" spinlock to the audit_entry structure, and modify audit_filter_task() -as follows: +as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -268,7 +273,7 @@ audit_upd_rule() would need additional memory barriers to ensure that the list_add_rcu() was really executed before the list_del_rcu(). The audit_del_rule() function would need to set the "deleted" -flag under the spinlock as follows: +flag under the spinlock as follows:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) @@ -290,8 +295,8 @@ flag under the spinlock as follows: return -EFAULT; /* No matching rule */ } - Summary +------- Read-mostly list-based data structures that can tolerate stale data are the most amenable to use of RCU. 
The simplest case is where entries are @@ -302,8 +307,9 @@ If stale data cannot be tolerated, then a "deleted" flag may be used in conjunction with a per-entry spinlock in order to allow the search function to reject newly deleted data. +.. _answer_quick_quiz_list: -Answer to Quick Quiz +Answer to Quick Quiz: Why does the search function need to return holding the per-entry lock for this deleted-flag technique to be helpful? diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst new file mode 100644 index 000000000000..8dfb437dacc3 --- /dev/null +++ b/Documentation/RCU/rcu.rst @@ -0,0 +1,92 @@ +.. _rcu_doc: + +RCU Concepts +============ + +The basic idea behind RCU (read-copy update) is to split destructive +operations into two parts, one that prevents anyone from seeing the data +item being destroyed, and one that actually carries out the destruction. +A "grace period" must elapse between the two parts, and this grace period +must be long enough that any readers accessing the item being deleted have +since dropped their references. For example, an RCU-protected deletion +from a linked list would first remove the item from the list, wait for +a grace period to elapse, then free the element. See the +Documentation/RCU/listRCU.rst file for more information on using RCU with +linked lists. + +Frequently Asked Questions +-------------------------- + +- Why would anyone want to use RCU? + + The advantage of RCU's two-part approach is that RCU readers need + not acquire any locks, perform any atomic instructions, write to + shared memory, or (on CPUs other than Alpha) execute any memory + barriers. The fact that these operations are quite expensive + on modern CPUs is what gives RCU its performance advantages + in read-mostly situations. The fact that RCU readers need not + acquire locks can also greatly simplify deadlock-avoidance code. + +- How can the updater tell when a grace period has completed + if the RCU readers give no indication when they are done? 
+ + Just as with spinlocks, RCU readers are not permitted to + block, switch to user-mode execution, or enter the idle loop. + Therefore, as soon as a CPU is seen passing through any of these + three states, we know that that CPU has exited any previous RCU + read-side critical sections. So, if we remove an item from a + linked list, and then wait until all CPUs have switched context, + executed in user mode, or executed in the idle loop, we can + safely free up that item. + + Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the + same effect, but require that the readers manipulate CPU-local + counters. These counters allow limited types of blocking within + RCU read-side critical sections. SRCU also uses CPU-local + counters, and permits general blocking within RCU read-side + critical sections. These variants of RCU detect grace periods + by sampling these counters. + +- If I am running on a uniprocessor kernel, which can only do one + thing at a time, why should I wait for a grace period? + + See the Documentation/RCU/UP.rst file for more information. + +- How can I see where RCU is currently used in the Linux kernel? + + Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu", + "rcu_read_lock_bh", "rcu_read_unlock_bh", "srcu_read_lock", + "srcu_read_unlock", "synchronize_rcu", "synchronize_net", + "synchronize_srcu", and the other RCU primitives. Or grab one + of the cscope databases from: + + (http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html). + +- What guidelines should I follow when writing code that uses RCU? + + See the checklist.txt file in this directory. + +- Why the name "RCU"? + + "RCU" stands for "read-copy update". The file Documentation/RCU/listRCU.rst + has more information on where this name came from, search for + "read-copy update" to find it. + +- I hear that RCU is patented? What is with that? + + Yes, it is. There are several known patents related to RCU, + search for the string "Patent" in RTFP.txt to find them. 
+ Of these, one was allowed to lapse by the assignee, and the + others have been contributed to the Linux kernel under GPL. + There are now also LGPL implementations of user-level RCU + available (http://liburcu.org/). + +- I hear that RCU needs work in order to support realtime kernels? + + Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU + kernel configuration parameter. + +- Where can I find more information on RCU? + + See the RTFP.txt file in this directory. + Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/). diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt deleted file mode 100644 index c818cf65c5a9..000000000000 --- a/Documentation/RCU/rcu.txt +++ /dev/null @@ -1,89 +0,0 @@ -RCU Concepts - - -The basic idea behind RCU (read-copy update) is to split destructive -operations into two parts, one that prevents anyone from seeing the data -item being destroyed, and one that actually carries out the destruction. -A "grace period" must elapse between the two parts, and this grace period -must be long enough that any readers accessing the item being deleted have -since dropped their references. For example, an RCU-protected deletion -from a linked list would first remove the item from the list, wait for -a grace period to elapse, then free the element. See the listRCU.txt -file for more information on using RCU with linked lists. - - -Frequently Asked Questions - -o Why would anyone want to use RCU? - - The advantage of RCU's two-part approach is that RCU readers need - not acquire any locks, perform any atomic instructions, write to - shared memory, or (on CPUs other than Alpha) execute any memory - barriers. The fact that these operations are quite expensive - on modern CPUs is what gives RCU its performance advantages - in read-mostly situations. The fact that RCU readers need not - acquire locks can also greatly simplify deadlock-avoidance code. 
- -o How can the updater tell when a grace period has completed - if the RCU readers give no indication when they are done? - - Just as with spinlocks, RCU readers are not permitted to - block, switch to user-mode execution, or enter the idle loop. - Therefore, as soon as a CPU is seen passing through any of these - three states, we know that that CPU has exited any previous RCU - read-side critical sections. So, if we remove an item from a - linked list, and then wait until all CPUs have switched context, - executed in user mode, or executed in the idle loop, we can - safely free up that item. - - Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the - same effect, but require that the readers manipulate CPU-local - counters. These counters allow limited types of blocking within - RCU read-side critical sections. SRCU also uses CPU-local - counters, and permits general blocking within RCU read-side - critical sections. These variants of RCU detect grace periods - by sampling these counters. - -o If I am running on a uniprocessor kernel, which can only do one - thing at a time, why should I wait for a grace period? - - See the UP.txt file in this directory. - -o How can I see where RCU is currently used in the Linux kernel? - - Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu", - "rcu_read_lock_bh", "rcu_read_unlock_bh", "srcu_read_lock", - "srcu_read_unlock", "synchronize_rcu", "synchronize_net", - "synchronize_srcu", and the other RCU primitives. Or grab one - of the cscope databases from: - - http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html - -o What guidelines should I follow when writing code that uses RCU? - - See the checklist.txt file in this directory. - -o Why the name "RCU"? - - "RCU" stands for "read-copy update". The file listRCU.txt has - more information on where this name came from, search for - "read-copy update" to find it. - -o I hear that RCU is patented? What is with that? - - Yes, it is. 
There are several known patents related to RCU, - search for the string "Patent" in RTFP.txt to find them. - Of these, one was allowed to lapse by the assignee, and the - others have been contributed to the Linux kernel under GPL. - There are now also LGPL implementations of user-level RCU - available (http://liburcu.org/). - -o I hear that RCU needs work in order to support realtime kernels? - - Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU - kernel configuration parameter. - -o Where can I find more information on RCU? - - See the RTFP.txt file in this directory. - Or point your browser at http://www.rdrop.com/users/paulmck/RCU/. diff --git a/Documentation/accelerators/ocxl.rst b/Documentation/accelerators/ocxl.rst index 14cefc020e2d..b1cea19a90f5 100644 --- a/Documentation/accelerators/ocxl.rst +++ b/Documentation/accelerators/ocxl.rst @@ -1,3 +1,5 @@ +:orphan: + ======================================================== OpenCAPI (Open Coherent Accelerator Processor Interface) ======================================================== diff --git a/Documentation/acpi/dsd/leds.txt b/Documentation/acpi/dsd/leds.txt index 81a63af42ed2..cc58b1a574c5 100644 --- a/Documentation/acpi/dsd/leds.txt +++ b/Documentation/acpi/dsd/leds.txt @@ -96,4 +96,4 @@ where <URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>, referenced 2019-02-21. -[7] Documentation/acpi/dsd/data-node-reference.txt +[7] Documentation/firmware-guide/acpi/dsd/data-node-references.rst diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index a582c780c3bd..cc6151fc0845 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -227,7 +227,7 @@ Configuring the kernel "make tinyconfig" Configure the tiniest possible kernel. You can find more information on using the Linux kernel config tools - in Documentation/kbuild/kconfig.txt. + in Documentation/kbuild/kconfig.rst. 
- NOTES on ``make config``: diff --git a/Documentation/filesystems/binderfs.rst b/Documentation/admin-guide/binderfs.rst index c009671f8434..c009671f8434 100644 --- a/Documentation/filesystems/binderfs.rst +++ b/Documentation/admin-guide/binderfs.rst diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst index f278b289e260..b761aa2a51d2 100644 --- a/Documentation/admin-guide/bug-hunting.rst +++ b/Documentation/admin-guide/bug-hunting.rst @@ -90,7 +90,7 @@ the disk is not available then you have three options: run a null modem to a second machine and capture the output there using your favourite communication program. Minicom works well. -(3) Use Kdump (see Documentation/kdump/kdump.txt), +(3) Use Kdump (see Documentation/kdump/kdump.rst), extract the kernel ring buffer from old memory with using dmesg gdbmacro in Documentation/kdump/gdbmacros.txt. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index ffc064c1ec68..49311f3da6f2 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -9,5 +9,6 @@ are configurable at compile, boot or run time. .. toctree:: :maxdepth: 1 + spectre l1tf mds diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst new file mode 100644 index 000000000000..25f3b2532198 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -0,0 +1,697 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Spectre Side Channels +===================== + +Spectre is a class of side channel attacks that exploit branch prediction +and speculative execution on modern CPUs to read memory, possibly +bypassing access controls. Speculative execution side channel exploits +do not modify memory but attempt to infer privileged data in the memory. + +This document covers Spectre variant 1 and Spectre variant 2. 
+ +Affected processors +------------------- + +Speculative execution side channel methods affect a wide range of modern +high performance processors, since most modern high speed processors +use branch prediction and speculative execution. + +The following CPUs are vulnerable: + + - Intel Core, Atom, Pentium, and Xeon processors + + - AMD Phenom, EPYC, and Zen processors + + - IBM POWER and zSeries processors + + - Higher end ARM processors + + - Apple CPUs + + - Higher end MIPS CPUs + + - Likely most other high performance CPUs. Contact your CPU vendor for details. + +Whether a processor is affected or not can be read out from the Spectre +vulnerability files in sysfs. See :ref:`spectre_sys_info`. + +Related CVEs +------------ + +The following CVE entries describe Spectre variants: + + ============= ======================= ================= + CVE-2017-5753 Bounds check bypass Spectre variant 1 + CVE-2017-5715 Branch target injection Spectre variant 2 + ============= ======================= ================= + +Problem +------- + +CPUs use speculative operations to improve performance. That may leave +traces of memory accesses or computations in the processor's caches, +buffers, and branch predictors. Malicious software may be able to +influence the speculative execution paths, and then use the side effects +of the speculative execution in the CPUs' caches and buffers to infer +privileged data touched during the speculative execution. + +Spectre variant 1 attacks take advantage of speculative execution of +conditional branches, while Spectre variant 2 attacks use speculative +execution of indirect branches to leak privileged memory. +See :ref:`[1] <spec_ref1>` :ref:`[5] <spec_ref5>` :ref:`[7] <spec_ref7>` +:ref:`[10] <spec_ref10>` :ref:`[11] <spec_ref11>`. 
+ +Spectre variant 1 (Bounds Check Bypass) +--------------------------------------- + +The bounds check bypass attack :ref:`[2] <spec_ref2>` takes advantage +of speculative execution that bypasses conditional branch instructions +used for memory access bounds check (e.g. checking if the index of an +array results in memory access within a valid range). This results in +memory accesses to invalid memory (with out-of-bound index) that are +done speculatively before validation checks resolve. Such speculative +memory accesses can leave side effects, creating side channels which +leak information to the attacker. + +There are some extensions of Spectre variant 1 attacks for reading data +over the network, see :ref:`[12] <spec_ref12>`. However such attacks +are difficult, low bandwidth, fragile, and are considered low risk. + +Spectre variant 2 (Branch Target Injection) +------------------------------------------- + +The branch target injection attack takes advantage of speculative +execution of indirect branches :ref:`[3] <spec_ref3>`. The indirect +branch predictors inside the processor used to guess the target of +indirect branches can be influenced by an attacker, causing gadget code +to be speculatively executed, thus exposing sensitive data touched by +the victim. The side effects left in the CPU's caches during speculative +execution can be measured to infer data values. + +.. _poison_btb: + +In Spectre variant 2 attacks, the attacker can steer speculative indirect +branches in the victim to gadget code by poisoning the branch target +buffer of a CPU used for predicting indirect branch addresses. Such +poisoning could be done by indirect branching into existing code, +with the address offset of the indirect branch under the attacker's +control. 
Since the branch prediction on impacted hardware does not +fully disambiguate branch address and uses the offset for prediction, +this could cause privileged code's indirect branch to jump to a gadget +code with the same offset. + +The most useful gadgets take an attacker-controlled input parameter (such +as a register value) so that the memory read can be controlled. Gadgets +without input parameters might be possible, but the attacker would have +very little control over what memory can be read, reducing the risk of +the attack revealing useful data. + +One other variant 2 attack vector is for the attacker to poison the +return stack buffer (RSB) :ref:`[13] <spec_ref13>` to cause speculative +subroutine return instruction execution to go to a gadget. An attacker's +imbalanced subroutine call instructions might "poison" entries in the +return stack buffer which are later consumed by a victim's subroutine +return instructions. This attack can be mitigated by flushing the return +stack buffer on context switch, or virtual machine (VM) exit. + +On systems with simultaneous multi-threading (SMT), attacks are possible +from the sibling thread, as level 1 cache and branch target buffer +(BTB) may be shared between hardware threads in a CPU core. A malicious +program running on the sibling thread may influence its peer's BTB to +steer its indirect branch speculations to gadget code, and measure the +speculative execution's side effects left in level 1 cache to infer the +victim's data. + +Attack scenarios +---------------- + +The following list of attack scenarios has been anticipated, but may +not cover all possible attack vectors. + +1. A user process attacking the kernel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + The attacker passes a parameter to the kernel via a register or + via a known address in memory during a syscall. Such a parameter may + be used later by the kernel as an index to an array or to derive + a pointer for a Spectre variant 1 attack.
The index or pointer + is invalid, but bound checks are bypassed in the code branch taken + for speculative execution. This could cause privileged memory to be + accessed and leaked. + + For kernel code that has been identified where data pointers could + potentially be influenced for Spectre attacks, new "nospec" accessor + macros are used to prevent speculative loading of data. + + Spectre variant 2 attacker can :ref:`poison <poison_btb>` the branch + target buffer (BTB) before issuing syscall to launch an attack. + After entering the kernel, the kernel could use the poisoned branch + target buffer on indirect jump and jump to gadget code in speculative + execution. + + If an attacker tries to control the memory addresses leaked during + speculative execution, he would also need to pass a parameter to the + gadget, either through a register or a known address in memory. After + the gadget has executed, he can measure the side effect. + + The kernel can protect itself against consuming poisoned branch + target buffer entries by using return trampolines (also known as + "retpoline") :ref:`[3] <spec_ref3>` :ref:`[9] <spec_ref9>` for all + indirect branches. Return trampolines trap speculative execution paths + to prevent jumping to gadget code during speculative execution. + x86 CPUs with Enhanced Indirect Branch Restricted Speculation + (Enhanced IBRS) available in hardware should use the feature to + mitigate Spectre variant 2 instead of retpoline. Enhanced IBRS is + more efficient than retpoline. + + There may be gadget code in firmware which could be exploited with + Spectre variant 2 attack by a rogue user process. To mitigate such + attacks on x86, Indirect Branch Restricted Speculation (IBRS) feature + is turned on before the kernel invokes any firmware code. + +2. 
A user process attacking another user process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + A malicious user process can try to attack another user process, + either via a context switch on the same hardware thread, or from the + sibling hyperthread sharing a physical processor core on simultaneous + multi-threading (SMT) system. + + Spectre variant 1 attacks generally require passing parameters + between the processes, which needs a data passing relationship, such + as remote procedure calls (RPC). Those parameters are used in gadget + code to derive invalid data pointers accessing privileged memory in + the attacked process. + + Spectre variant 2 attacks can be launched from a rogue process by + :ref:`poisoning <poison_btb>` the branch target buffer. This can + influence the indirect branch targets for a victim process that either + runs later on the same hardware thread, or running concurrently on + a sibling hardware thread sharing the same physical core. + + A user process can protect itself against Spectre variant 2 attacks + by using the prctl() syscall to disable indirect branch speculation + for itself. An administrator can also cordon off an unsafe process + from polluting the branch target buffer by disabling the process's + indirect branch speculation. This comes with a performance cost + from not using indirect branch speculation and clearing the branch + target buffer. When SMT is enabled on x86, for a process that has + indirect branch speculation disabled, Single Threaded Indirect Branch + Predictors (STIBP) :ref:`[4] <spec_ref4>` are turned on to prevent the + sibling thread from controlling branch target buffer. In addition, + the Indirect Branch Prediction Barrier (IBPB) is issued to clear the + branch target buffer when context switching to and from such process. + + On x86, the return stack buffer is stuffed on context switch. 
+ This prevents the branch target buffer from being used for branch + prediction when the return stack buffer underflows while switching to + a deeper call stack. Any poisoned entries in the return stack buffer + left by the previous process will also be cleared. + + User programs should use address space randomization to make attacks + more difficult (Set /proc/sys/kernel/randomize_va_space = 1 or 2). + +3. A virtualized guest attacking the host +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + The attack mechanism is similar to how user processes attack the + kernel. The kernel is entered via hyper-calls or other virtualization + exit paths. + + For Spectre variant 1 attacks, rogue guests can pass parameters + (e.g. in registers) via hyper-calls to derive invalid pointers to + speculate into privileged memory after entering the kernel. For places + where such kernel code has been identified, nospec accessor macros + are used to stop speculative memory access. + + For Spectre variant 2 attacks, rogue guests can :ref:`poison + <poison_btb>` the branch target buffer or return stack buffer, causing + the kernel to jump to gadget code in the speculative execution paths. + + To mitigate variant 2, the host kernel can use return trampolines + for indirect branches to bypass the poisoned branch target buffer, + and flushing the return stack buffer on VM exit. This prevents rogue + guests from affecting indirect branching in the host kernel. + + To protect host processes from rogue guests, host processes can have + indirect branch speculation disabled via prctl(). The branch target + buffer is cleared before context switching to such processes. + +4. A virtualized guest attacking other guest +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + A rogue guest may attack another guest to get data accessible by the + other guest. + + Spectre variant 1 attacks are possible if parameters can be passed + between guests. 
This may be done via mechanisms such as shared memory + or message passing. Such parameters could be used to derive data + pointers to privileged data in guest. The privileged data could be + accessed by gadget code in the victim's speculation paths. + + Spectre variant 2 attacks can be launched from a rogue guest by + :ref:`poisoning <poison_btb>` the branch target buffer or the return + stack buffer. Such poisoned entries could be used to influence + speculation execution paths in the victim guest. + + Linux kernel mitigates attacks to other guests running in the same + CPU hardware thread by flushing the return stack buffer on VM exit, + and clearing the branch target buffer before switching to a new guest. + + If SMT is used, Spectre variant 2 attacks from an untrusted guest + in the sibling hyperthread can be mitigated by the administrator, + by turning off the unsafe guest's indirect branch speculation via + prctl(). A guest can also protect itself by turning on microcode + based mitigations (such as IBPB or STIBP on x86) within the guest. + +.. _spectre_sys_info: + +Spectre system information +-------------------------- + +The Linux kernel provides a sysfs interface to enumerate the current +mitigation status of the system for Spectre: whether the system is +vulnerable, and which mitigations are active. + +The sysfs file showing Spectre variant 1 mitigation status is: + + /sys/devices/system/cpu/vulnerabilities/spectre_v1 + +The possible values in this file are: + + ======================================= ================================= + 'Mitigation: __user pointer sanitation' Protection in kernel on a case by + case base with explicit pointer + sanitation. + ======================================= ================================= + +However, the protections are put in place on a case by case basis, +and there is no guarantee that all possible attack vectors for Spectre +variant 1 are covered. 
+ +The spectre_v2 kernel file reports if the kernel has been compiled with +retpoline mitigation or if the CPU has hardware mitigation, and if the +CPU has support for additional process-specific mitigation. + +This file also reports CPU features enabled by microcode to mitigate +attack between user processes: + +1. Indirect Branch Prediction Barrier (IBPB) to add additional + isolation between processes of different users. +2. Single Thread Indirect Branch Predictors (STIBP) to add additional + isolation between CPU threads running on the same core. + +These CPU features may impact performance when used and can be enabled +per process on a case-by-case base. + +The sysfs file showing Spectre variant 2 mitigation status is: + + /sys/devices/system/cpu/vulnerabilities/spectre_v2 + +The possible values in this file are: + + - Kernel status: + + ==================================== ================================= + 'Not affected' The processor is not vulnerable + 'Vulnerable' Vulnerable, no mitigation + 'Mitigation: Full generic retpoline' Software-focused mitigation + 'Mitigation: Full AMD retpoline' AMD-specific software mitigation + 'Mitigation: Enhanced IBRS' Hardware-focused mitigation + ==================================== ================================= + + - Firmware status: Show if Indirect Branch Restricted Speculation (IBRS) is + used to protect against Spectre variant 2 attacks when calling firmware (x86 only). + + ========== ============================================================= + 'IBRS_FW' Protection against user program attacks when calling firmware + ========== ============================================================= + + - Indirect branch prediction barrier (IBPB) status for protection between + processes of different users. This feature can be controlled through + prctl() per process, or through kernel command line options. This is + an x86 only feature. For more details see below. 
+ + =================== ======================================================== + 'IBPB: disabled' IBPB unused + 'IBPB: always-on' Use IBPB on all tasks + 'IBPB: conditional' Use IBPB on SECCOMP or indirect branch restricted tasks + =================== ======================================================== + + - Single threaded indirect branch prediction (STIBP) status for protection + between different hyper threads. This feature can be controlled through + prctl() per process, or through kernel command line options. This is an x86 + only feature. For more details see below. + + ==================== ======================================================== + 'STIBP: disabled' STIBP unused + 'STIBP: forced' Use STIBP on all tasks + 'STIBP: conditional' Use STIBP on SECCOMP or indirect branch restricted tasks + ==================== ======================================================== + + - Return stack buffer (RSB) protection status: + + ============= =========================================== + 'RSB filling' Protection of RSB on context switch enabled + ============= =========================================== + +Full mitigation might require a microcode update from the CPU +vendor. When the necessary microcode is not available, the kernel will +report vulnerability. + +Turning on mitigation for Spectre variant 1 and Spectre variant 2 +----------------------------------------------------------------- + +1. Kernel mitigation +^^^^^^^^^^^^^^^^^^^^ + + For the Spectre variant 1, vulnerable kernel code (as determined + by code audit or scanning tools) is annotated on a case by case + basis to use nospec accessor macros for bounds clipping :ref:`[2] + <spec_ref2>` to avoid any usable disclosure gadgets. However, it may + not cover all attack vectors for Spectre variant 1.
+ + For Spectre variant 2 mitigation, the compiler turns indirect calls or + jumps in the kernel into equivalent return trampolines (retpolines) + :ref:`[3] <spec_ref3>` :ref:`[9] <spec_ref9>` to go to the target + addresses. Speculative execution paths under retpolines are trapped + in an infinite loop to prevent any speculative execution jumping to + a gadget. + + To turn on retpoline mitigation on a vulnerable CPU, the kernel + needs to be compiled with a gcc compiler that supports the + -mindirect-branch=thunk-extern -mindirect-branch-register options. + If the kernel is compiled with a Clang compiler, the compiler needs + to support -mretpoline-external-thunk option. The kernel config + CONFIG_RETPOLINE needs to be turned on, and the CPU needs to run with + the latest updated microcode. + + On Intel Skylake-era systems the mitigation covers most, but not all, + cases. See :ref:`[3] <spec_ref3>` for more details. + + On CPUs with hardware mitigation for Spectre variant 2 (e.g. Enhanced + IBRS on x86), retpoline is automatically disabled at run time. + + The retpoline mitigation is turned on by default on vulnerable + CPUs. It can be forced on or off by the administrator + via the kernel command line and sysfs control files. See + :ref:`spectre_mitigation_control_command_line`. + + On x86, indirect branch restricted speculation is turned on by default + before invoking any firmware code to prevent Spectre variant 2 exploits + using the firmware. + + Using kernel address space randomization (CONFIG_RANDOMIZE_BASE=y + and CONFIG_SLAB_FREELIST_RANDOM=y in the kernel configuration) makes + attacks on the kernel generally more difficult. + +2. User program mitigation +^^^^^^^^^^^^^^^^^^^^^^^^^^ + + User programs can mitigate Spectre variant 1 using LFENCE or "bounds + clipping". For more details see :ref:`[2] <spec_ref2>`. + + For Spectre variant 2 mitigation, individual user programs + can be compiled with return trampolines for indirect branches.
+ This protects them from consuming poisoned entries in the branch + target buffer left by malicious software. Alternatively, the + programs can disable their indirect branch speculation via prctl() + (See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`). + On x86, this will turn on STIBP to guard against attacks from the + sibling thread when the user program is running, and use IBPB to + flush the branch target buffer when switching to/from the program. + + Restricting indirect branch speculation on a user program will + also prevent the program from launching a variant 2 attack + on x86. All sand-boxed SECCOMP programs have indirect branch + speculation restricted by default. Administrators can change + that behavior via the kernel command line and sysfs control files. + See :ref:`spectre_mitigation_control_command_line`. + + Programs that disable their indirect branch speculation will have + more overhead and run slower. + + User programs should use address space randomization + (/proc/sys/kernel/randomize_va_space = 1 or 2) to make attacks more + difficult. + +3. VM mitigation +^^^^^^^^^^^^^^^^ + + Within the kernel, Spectre variant 1 attacks from rogue guests are + mitigated on a case by case basis in VM exit paths. Vulnerable code + uses nospec accessor macros for "bounds clipping", to avoid any + usable disclosure gadgets. However, this may not cover all variant + 1 attack vectors. + + For Spectre variant 2 attacks from rogue guests to the kernel, the + Linux kernel uses retpoline or Enhanced IBRS to prevent consumption of + poisoned entries in branch target buffer left by rogue guests. It also + flushes the return stack buffer on every VM exit to prevent a return + stack buffer underflow so poisoned branch target buffer could be used, + or attacker guests leaving poisoned entries in the return stack buffer. 
+ + To mitigate guest-to-guest attacks in the same CPU hardware thread, + the branch target buffer is sanitized by flushing before switching + to a new guest on a CPU. + + The above mitigations are turned on by default on vulnerable CPUs. + + To mitigate guest-to-guest attacks from sibling thread when SMT is + in use, an untrusted guest running in the sibling thread can have + its indirect branch speculation disabled by administrator via prctl(). + + The kernel also allows guests to use any microcode based mitigation + they choose to use (such as IBPB or STIBP on x86) to protect themselves. + +.. _spectre_mitigation_control_command_line: + +Mitigation control on the kernel command line +--------------------------------------------- + +Spectre variant 2 mitigation can be disabled or force enabled at the +kernel command line. + + nospectre_v2 + + [X86] Disable all mitigations for the Spectre variant 2 + (indirect branch prediction) vulnerability. System may + allow data leaks with this option, which is equivalent + to spectre_v2=off. + + + spectre_v2= + + [X86] Control mitigation of Spectre variant 2 + (indirect branch speculation) vulnerability. + The default operation protects the kernel from + user space attacks. + + on + unconditionally enable, implies + spectre_v2_user=on + off + unconditionally disable, implies + spectre_v2_user=off + auto + kernel detects whether your CPU model is + vulnerable + + Selecting 'on' will, and 'auto' may, choose a + mitigation method at run time according to the + CPU, the available microcode, the setting of the + CONFIG_RETPOLINE configuration option, and the + compiler with which the kernel was built. + + Selecting 'on' will also enable the mitigation + against user space to user space task attacks. + + Selecting 'off' will disable both the kernel and + the user space protections. 
+ + Specific mitigations can also be selected manually: + + retpoline + replace indirect branches + retpoline,generic + google's original retpoline + retpoline,amd + AMD-specific minimal thunk + + Not specifying this option is equivalent to + spectre_v2=auto. + +For user space mitigation: + + spectre_v2_user= + + [X86] Control mitigation of Spectre variant 2 + (indirect branch speculation) vulnerability between + user space tasks + + on + Unconditionally enable mitigations. Is + enforced by spectre_v2=on + + off + Unconditionally disable mitigations. Is + enforced by spectre_v2=off + + prctl + Indirect branch speculation is enabled, + but mitigation can be enabled via prctl + per thread. The mitigation control state + is inherited on fork. + + prctl,ibpb + Like "prctl" above, but only STIBP is + controlled per thread. IBPB is issued + always when switching between different user + space processes. + + seccomp + Same as "prctl" above, but all seccomp + threads will enable the mitigation unless + they explicitly opt out. + + seccomp,ibpb + Like "seccomp" above, but only STIBP is + controlled per thread. IBPB is issued + always when switching between different + user space processes. + + auto + Kernel selects the mitigation depending on + the available CPU features and vulnerability. + + Default mitigation: + If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl" + + Not specifying this option is equivalent to + spectre_v2_user=auto. + + In general the kernel by default selects + reasonable mitigations for the current CPU. To + disable Spectre variant 2 mitigations, boot with + spectre_v2=off. Spectre variant 1 mitigations + cannot be disabled. + +Mitigation selection guide +-------------------------- + +1. Trusted userspace +^^^^^^^^^^^^^^^^^^^^ + + If all userspace applications are from trusted sources and do not + execute externally supplied untrusted code, then the mitigations can + be disabled. + +2. 
Protect sensitive programs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + For security-sensitive programs that have secrets (e.g. crypto + keys), protection against Spectre variant 2 can be put in place by + disabling indirect branch speculation when the program is running + (See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`). + +3. Sandbox untrusted programs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + Untrusted programs that could be a source of attacks can be cordoned + off by disabling their indirect branch speculation when they are run + (See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`). + This prevents untrusted programs from polluting the branch target + buffer. All programs running in SECCOMP sandboxes have indirect + branch speculation restricted by default. This behavior can be + changed via the kernel command line and sysfs control files. See + :ref:`spectre_mitigation_control_command_line`. + +4. High security mode +^^^^^^^^^^^^^^^^^^^^^ + + All Spectre variant 2 mitigations can be forced on + at boot time for all programs (See the "on" option in + :ref:`spectre_mitigation_control_command_line`). This will add + overhead as indirect branch speculations for all programs will be + restricted. + + On x86, branch target buffer will be flushed with IBPB when switching + to a new program. STIBP is left on all the time to protect programs + against variant 2 attacks originating from programs running on + sibling threads. + + Alternatively, STIBP can be used only when running programs + whose indirect branch speculation is explicitly disabled, + while IBPB is still used all the time when switching to a new + program to clear the branch target buffer (See "ibpb" option in + :ref:`spectre_mitigation_control_command_line`). This "ibpb" option + has less performance cost than the "on" option, which leaves STIBP + on all the time. + +References on Spectre +--------------------- + +Intel white papers: + +..
_spec_ref1: + +[1] `Intel analysis of speculative execution side channels <https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/Intel-Analysis-of-Speculative-Execution-Side-Channels.pdf>`_. + +.. _spec_ref2: + +[2] `Bounds check bypass <https://software.intel.com/security-software-guidance/software-guidance/bounds-check-bypass>`_. + +.. _spec_ref3: + +[3] `Deep dive: Retpoline: A branch target injection mitigation <https://software.intel.com/security-software-guidance/insights/deep-dive-retpoline-branch-target-injection-mitigation>`_. + +.. _spec_ref4: + +[4] `Deep Dive: Single Thread Indirect Branch Predictors <https://software.intel.com/security-software-guidance/insights/deep-dive-single-thread-indirect-branch-predictors>`_. + +AMD white papers: + +.. _spec_ref5: + +[5] `AMD64 technology indirect branch control extension <https://developer.amd.com/wp-content/resources/Architecture_Guidelines_Update_Indirect_Branch_Control.pdf>`_. + +.. _spec_ref6: + +[6] `Software techniques for managing speculation on AMD processors <https://developer.amd.com/wp-content/resources/90343-B_SoftwareTechniquesforManagingSpeculation_WP_7-18Update_FNL.pdf>`_. + +ARM white papers: + +.. _spec_ref7: + +[7] `Cache speculation side-channels <https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability/download-the-whitepaper>`_. + +.. _spec_ref8: + +[8] `Cache speculation issues update <https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability/latest-updates/cache-speculation-issues-update>`_. + +Google white paper: + +.. _spec_ref9: + +[9] `Retpoline: a software construct for preventing branch-target-injection <https://support.google.com/faqs/answer/7625886>`_. + +MIPS white paper: + +.. _spec_ref10: + +[10] `MIPS: response on speculative execution and side channel vulnerabilities <https://www.mips.com/blog/mips-response-on-speculative-execution-and-side-channel-vulnerabilities/>`_. + +Academic papers: + +.. 
_spec_ref11: + +[11] `Spectre Attacks: Exploiting Speculative Execution <https://spectreattack.com/spectre.pdf>`_. + +.. _spec_ref12: + +[12] `NetSpectre: Read Arbitrary Memory over Network <https://arxiv.org/abs/1807.10535>`_. + +.. _spec_ref13: + +[13] `Spectre Returns! Speculation Attacks using the Return Stack Buffer <https://www.usenix.org/system/files/conference/woot18/woot18-paper-koruyeh.pdf>`_. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 8001917ee012..24fbe0568eff 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -70,6 +70,7 @@ configure specific aspects of kernel behavior to your liking. ras bcache ext4 + binderfs pm/index thunderbolt LSM/index diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 0124980dca2d..5d29ba5ad88c 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -9,11 +9,11 @@ and sorted into English Dictionary order (defined as ignoring all punctuation and sorting digits before letters in a case insensitive manner), and with descriptions where known. -The kernel parses parameters from the kernel command line up to "--"; +The kernel parses parameters from the kernel command line up to "``--``"; if it doesn't recognize a parameter and it doesn't contain a '.', the parameter gets passed to init: parameters with '=' go into init's environment, others are passed as command line arguments to init. -Everything after "--" is passed as an argument to init. +Everything after "``--``" is passed as an argument to init. Module parameters can be specified in two ways: via the kernel command line with a module name prefix, or via modprobe, e.g.:: @@ -167,7 +167,7 @@ parameter is applicable:: X86-32 X86-32, aka i386 architecture is enabled. X86-64 X86-64 architecture is enabled. 
More X86-64 boot options can be found in - Documentation/x86/x86_64/boot-options.txt . + Documentation/x86/x86_64/boot-options.rst. X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) X86_UV SGI UV support is enabled. XEN Xen support is enabled @@ -181,10 +181,10 @@ In addition, the following text indicates that the option:: Parameters denoted with BOOT are actually interpreted by the boot loader, and have no meaning to the kernel directly. Do not modify the syntax of boot loader parameters without extreme -need or coordination with <Documentation/x86/boot.txt>. +need or coordination with <Documentation/x86/boot.rst>. There are also arch-specific kernel-parameters not documented here. -See for example <Documentation/x86/x86_64/boot-options.txt>. +See for example <Documentation/x86/x86_64/boot-options.rst>. Note that ALL kernel parameters listed below are CASE SENSITIVE, and that a trailing = on the name of any parameter states that that parameter will diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 74d28efa1c40..f1c433daef6b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -53,7 +53,7 @@ ACPI_DEBUG_PRINT statements, e.g., ACPI_DEBUG_PRINT((ACPI_DB_INFO, ... The debug_level mask defaults to "info". See - Documentation/acpi/debug.txt for more information about + Documentation/firmware-guide/acpi/debug.rst for more information about debug layers and levels. Enable processor driver info messages: @@ -708,14 +708,14 @@ [KNL, x86_64] select a region under 4G first, and fall back to reserve region above 4G when '@offset' hasn't been specified. - See Documentation/kdump/kdump.txt for further details. + See Documentation/kdump/kdump.rst for further details. crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system. 
The syntax of range is start-[end] where start and end are both a memory unit (amount[KMG]). See also - Documentation/kdump/kdump.txt for an example. + Documentation/kdump/kdump.rst for an example. crashkernel=size[KMG],high [KNL, x86_64] range could be above 4G. Allow kernel @@ -932,7 +932,7 @@ edid/1680x1050.bin, or edid/1920x1080.bin is given and no file with the same name exists. Details and instructions how to build your own EDID data are - available in Documentation/EDID/HOWTO.txt. An EDID + available in Documentation/EDID/howto.rst. An EDID data set will only be used for a particular connector, if its name and a colon are prepended to the EDID name. Each connector may use a unique EDID data @@ -963,7 +963,7 @@ for details. nompx [X86] Disables Intel Memory Protection Extensions. - See Documentation/x86/intel_mpx.txt for more + See Documentation/x86/intel_mpx.rst for more information about the feature. nopku [X86] Disable Memory Protection Keys CPU feature found @@ -1189,7 +1189,7 @@ that is to be dynamically loaded by Linux. If there are multiple variables with the same name but with different vendor GUIDs, all of them will be loaded. See - Documentation/acpi/ssdt-overlays.txt for details. + Documentation/admin-guide/acpi/ssdt-overlays.rst for details. eisa_irq_edge= [PARISC,HW] @@ -1209,7 +1209,7 @@ Specifies physical address of start of kernel core image elf header and optionally the size. Generally kexec loader will pass this option to capture kernel. - See Documentation/kdump/kdump.txt for details. + See Documentation/kdump/kdump.rst for details. enable_mtrr_cleanup [X86] The kernel tries to adjust MTRR layout from continuous @@ -1388,9 +1388,6 @@ Valid parameters: "on", "off" Default: "on" - hisax= [HW,ISDN] - See Documentation/isdn/README.HiSax. 
- hlt [BUGS=ARM,SH] hpet= [X86-32,HPET] option to control HPET usage @@ -1507,7 +1504,7 @@ Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr .cdrom .chs .ignore_cable are additional options - See Documentation/ide/ide.txt. + See Documentation/ide/ide.rst. ide-generic.probe-mask= [HW] (E)IDE subsystem Format: <int> @@ -2383,7 +2380,7 @@ mce [X86-32] Machine Check Exception - mce=option [X86-64] See Documentation/x86/x86_64/boot-options.txt + mce=option [X86-64] See Documentation/x86/x86_64/boot-options.rst md= [HW] RAID subsystems devices and level See Documentation/admin-guide/md.rst. @@ -2439,7 +2436,7 @@ set according to the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config option. - See Documentation/memory-hotplug.txt. + See Documentation/admin-guide/mm/memory-hotplug.rst. memmap=exactmap [KNL,X86] Enable setting of an exact E820 memory map, as specified by the user. @@ -2528,7 +2525,7 @@ mem_encrypt=on: Activate SME mem_encrypt=off: Do not activate SME - Refer to Documentation/x86/amd-memory-encryption.txt + Refer to Documentation/virtual/kvm/amd-memory-encryption.rst for details on when memory encryption can be activated. mem_sleep_default= [SUSPEND] Default system suspend mode: @@ -2836,8 +2833,9 @@ 0 - turn hardlockup detector in nmi_watchdog off 1 - turn hardlockup detector in nmi_watchdog on When panic is specified, panic when an NMI watchdog - timeout occurs (or 'nopanic' to override the opposite - default). To disable both hard and soft lockup detectors, + timeout occurs (or 'nopanic' to not panic on an NMI + watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set) + To disable both hard and soft lockup detectors, please see 'nowatchdog'. This is useful when you use a panic=... timeout and need the box quickly up again. @@ -3528,7 +3526,7 @@ See Documentation/blockdev/paride.txt. pirq= [SMP,APIC] Manual mp-table setup - See Documentation/x86/i386/IO-APIC.txt. 
+ See Documentation/x86/i386/IO-APIC.rst. plip= [PPT,NET] Parallel port network link Format: { parport<nr> | timid | 0 } @@ -5032,7 +5030,7 @@ vector=percpu: enable percpu vector domain video= [FB] Frame buffer configuration - See Documentation/fb/modedb.txt. + See Documentation/fb/modedb.rst. video.brightness_switch_enabled= [0,1] If set to 1, on receiving an ACPI notify event @@ -5060,7 +5058,7 @@ Can be used multiple times for multiple devices. vga= [BOOT,X86-32] Select a particular video mode - See Documentation/x86/boot.txt and + See Documentation/x86/boot.rst and Documentation/svga.txt. Use vga=ask for menu. This is actually a boot loader parameter; the value is @@ -5167,7 +5165,7 @@ Default: 3 = cyan. watchdog timers [HW,WDT] For information on watchdog timers, - see Documentation/watchdog/watchdog-parameters.txt + see Documentation/watchdog/watchdog-parameters.rst or other driver-specific files in the Documentation/watchdog/ directory. diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst index c067ed145158..a80c3c37226e 100644 --- a/Documentation/admin-guide/mm/numaperf.rst +++ b/Documentation/admin-guide/mm/numaperf.rst @@ -165,5 +165,6 @@ write-through caching. ======== See Also ======== -.. [1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf - Section 5.2.27 + +[1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf +- Section 5.2.27 diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index c7495e42e6f4..2b20f5f7380d 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -199,7 +199,7 @@ Architecture (MCA)\ [#f3]_. mode). .. [#f3] For more details about the Machine Check Architecture (MCA), - please read Documentation/x86/x86_64/machinecheck at the Kernel tree. + please read Documentation/x86/x86_64/machinecheck.rst at the Kernel tree. 
EDAC - Error Detection And Correction ************************************* diff --git a/Documentation/aoe/aoe.txt b/Documentation/aoe/aoe.rst index c71487d399d1..58747ecec71d 100644 --- a/Documentation/aoe/aoe.txt +++ b/Documentation/aoe/aoe.rst @@ -1,3 +1,6 @@ +Introduction +============ + ATA over Ethernet is a network protocol that provides simple access to block storage on the LAN. @@ -22,7 +25,8 @@ document the use of the driver and are not necessary if you install the aoetools. -CREATING DEVICE NODES +Creating Device Nodes +===================== Users of udev should find the block device nodes created automatically, but to create all the necessary device nodes, use the @@ -38,7 +42,8 @@ CREATING DEVICE NODES confusing when an AoE device is not present the first time a command is run but appears a second later. -USING DEVICE NODES +Using Device Nodes +================== "cat /dev/etherd/err" blocks, waiting for error diagnostic output, like any retransmitted packets. @@ -55,7 +60,7 @@ USING DEVICE NODES by sysfs counterparts. Using the commands in aoetools insulates users from these implementation details. - The block devices are named like this: + The block devices are named like this:: e{shelf}.{slot} e{shelf}.{slot}p{part} @@ -64,7 +69,8 @@ USING DEVICE NODES first shelf (shelf address zero). That's the whole disk. The first partition on that disk would be "e0.2p1". -USING SYSFS +Using sysfs +=========== Each aoe block device in /sys/block has the extra attributes of state, mac, and netif. The state attribute is "up" when the device @@ -78,29 +84,29 @@ USING SYSFS There is a script in this directory that formats this information in a convenient way. Users with aoetools should use the aoe-stat - command.
- - root@makki root# sh Documentation/aoe/status.sh - e10.0 eth3 up - e10.1 eth3 up - e10.2 eth3 up - e10.3 eth3 up - e10.4 eth3 up - e10.5 eth3 up - e10.6 eth3 up - e10.7 eth3 up - e10.8 eth3 up - e10.9 eth3 up - e4.0 eth1 up - e4.1 eth1 up - e4.2 eth1 up - e4.3 eth1 up - e4.4 eth1 up - e4.5 eth1 up - e4.6 eth1 up - e4.7 eth1 up - e4.8 eth1 up - e4.9 eth1 up + command:: + + root@makki root# sh Documentation/aoe/status.sh + e10.0 eth3 up + e10.1 eth3 up + e10.2 eth3 up + e10.3 eth3 up + e10.4 eth3 up + e10.5 eth3 up + e10.6 eth3 up + e10.7 eth3 up + e10.8 eth3 up + e10.9 eth3 up + e4.0 eth1 up + e4.1 eth1 up + e4.2 eth1 up + e4.3 eth1 up + e4.4 eth1 up + e4.5 eth1 up + e4.6 eth1 up + e4.7 eth1 up + e4.8 eth1 up + e4.9 eth1 up Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver option discussed below) instead of /dev/etherd/interfaces to limit @@ -113,12 +119,13 @@ USING SYSFS for this purpose. You can also directly use the /dev/etherd/discover special file described above. -DRIVER OPTIONS +Driver Options +============== There is a boot option for the built-in aoe driver and a corresponding module parameter, aoe_iflist. Without this option, all network interfaces may be used for ATA over Ethernet. Here is a - usage example for the module parameter. + usage example for the module parameter:: modprobe aoe_iflist="eth1 eth3" diff --git a/Documentation/aoe/examples.rst b/Documentation/aoe/examples.rst new file mode 100644 index 000000000000..91f3198e52c1 --- /dev/null +++ b/Documentation/aoe/examples.rst @@ -0,0 +1,23 @@ +Example of udev rules +--------------------- + + .. include:: udev.txt + :literal: + +Example of udev install rules script +------------------------------------ + + .. literalinclude:: udev-install.sh + :language: shell + +Example script to get status +---------------------------- + + .. literalinclude:: status.sh + :language: shell + +Example of AoE autoload script +------------------------------ + + .. 
literalinclude:: autoload.sh + :language: shell diff --git a/Documentation/aoe/index.rst b/Documentation/aoe/index.rst new file mode 100644 index 000000000000..4394b9b7913c --- /dev/null +++ b/Documentation/aoe/index.rst @@ -0,0 +1,19 @@ +:orphan: + +======================= +ATA over Ethernet (AoE) +======================= + +.. toctree:: + :maxdepth: 1 + + aoe + todo + examples + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/aoe/todo.txt b/Documentation/aoe/todo.rst index c09dfad4aed8..dea8db5a33e1 100644 --- a/Documentation/aoe/todo.txt +++ b/Documentation/aoe/todo.rst @@ -1,3 +1,6 @@ +TODO +==== + There is a potential for deadlock when allocating a struct sk_buff for data that needs to be written out to aoe storage. If the data is being written from a dirty page in order to free that page, and if diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt index 1f06daf03f5b..54feda5a0772 100644 --- a/Documentation/aoe/udev.txt +++ b/Documentation/aoe/udev.txt @@ -11,7 +11,7 @@ # udev_rules="/etc/udev/rules.d/" # bash# ls /etc/udev/rules.d/ # 10-wacom.rules 50-udev.rules -# bash# cp /path/to/linux-2.6.xx/Documentation/aoe/udev.txt \ +# bash# cp /path/to/linux/Documentation/aoe/udev.txt \ # /etc/udev/rules.d/60-aoe.rules # diff --git a/Documentation/arm/mem_alignment b/Documentation/arm/mem_alignment index 6335fcacbba9..e110e2781039 100644 --- a/Documentation/arm/mem_alignment +++ b/Documentation/arm/mem_alignment @@ -1,4 +1,4 @@ -Too many problems poped up because of unnoticed misaligned memory access in +Too many problems popped up because of unnoticed misaligned memory access in kernel code lately. Therefore the alignment fixup is now unconditionally configured in for SA11x0 based targets. 
According to Alan Cox, this is a bad idea to configure it out, but Russell King has some good reasons for diff --git a/Documentation/arm/stm32/overview.rst b/Documentation/arm/stm32/overview.rst index 85cfc8410798..f7e734153860 100644 --- a/Documentation/arm/stm32/overview.rst +++ b/Documentation/arm/stm32/overview.rst @@ -1,3 +1,5 @@ +:orphan: + ======================== STM32 ARM Linux Overview ======================== diff --git a/Documentation/arm/stm32/stm32f429-overview.rst b/Documentation/arm/stm32/stm32f429-overview.rst index 18feda97f483..65bbb1c3b423 100644 --- a/Documentation/arm/stm32/stm32f429-overview.rst +++ b/Documentation/arm/stm32/stm32f429-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32F429 Overview ================== diff --git a/Documentation/arm/stm32/stm32f746-overview.rst b/Documentation/arm/stm32/stm32f746-overview.rst index b5f4b6ce7656..42d593085015 100644 --- a/Documentation/arm/stm32/stm32f746-overview.rst +++ b/Documentation/arm/stm32/stm32f746-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32F746 Overview ================== diff --git a/Documentation/arm/stm32/stm32f769-overview.rst b/Documentation/arm/stm32/stm32f769-overview.rst index 228656ced2fe..f6adac862b17 100644 --- a/Documentation/arm/stm32/stm32f769-overview.rst +++ b/Documentation/arm/stm32/stm32f769-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32F769 Overview ================== diff --git a/Documentation/arm/stm32/stm32h743-overview.rst b/Documentation/arm/stm32/stm32h743-overview.rst index 3458dc00095d..c525835e7473 100644 --- a/Documentation/arm/stm32/stm32h743-overview.rst +++ b/Documentation/arm/stm32/stm32h743-overview.rst @@ -1,3 +1,5 @@ +:orphan: + STM32H743 Overview ================== diff --git a/Documentation/arm/stm32/stm32mp157-overview.rst b/Documentation/arm/stm32/stm32mp157-overview.rst index 62e176d47ca7..2c52cd020601 100644 --- a/Documentation/arm/stm32/stm32mp157-overview.rst +++ b/Documentation/arm/stm32/stm32mp157-overview.rst @@ -1,3 +1,5 @@ +:orphan: + 
STM32MP157 Overview =================== diff --git a/Documentation/arm64/acpi_object_usage.txt b/Documentation/arm64/acpi_object_usage.rst index c77010c5c1f0..d51b69dc624d 100644 --- a/Documentation/arm64/acpi_object_usage.txt +++ b/Documentation/arm64/acpi_object_usage.rst @@ -1,5 +1,7 @@ +=========== ACPI Tables ------------ +=========== + The expectations of individual ACPI tables are discussed in the list that follows. @@ -11,54 +13,71 @@ outside of the UEFI Forum (see Section 5.2.6 of the specification). For ACPI on arm64, tables also fall into the following categories: - -- Required: DSDT, FADT, GTDT, MADT, MCFG, RSDP, SPCR, XSDT + - Required: DSDT, FADT, GTDT, MADT, MCFG, RSDP, SPCR, XSDT - -- Recommended: BERT, EINJ, ERST, HEST, PCCT, SSDT + - Recommended: BERT, EINJ, ERST, HEST, PCCT, SSDT - -- Optional: BGRT, CPEP, CSRT, DBG2, DRTM, ECDT, FACS, FPDT, IORT, + - Optional: BGRT, CPEP, CSRT, DBG2, DRTM, ECDT, FACS, FPDT, IORT, MCHI, MPST, MSCT, NFIT, PMTT, RASF, SBST, SLIT, SPMI, SRAT, STAO, TCPA, TPM2, UEFI, XENV - -- Not supported: BOOT, DBGP, DMAR, ETDT, HPET, IBFT, IVRS, LPIT, + - Not supported: BOOT, DBGP, DMAR, ETDT, HPET, IBFT, IVRS, LPIT, MSDM, OEMx, PSDT, RSDT, SLIC, WAET, WDAT, WDRT, WPBT +====== ======================================================================== Table Usage for ARMv8 Linux ------ ---------------------------------------------------------------- +====== ======================================================================== BERT Section 18.3 (signature == "BERT") - == Boot Error Record Table == + + **Boot Error Record Table** + Must be supplied if RAS support is provided by the platform. It is recommended this table be supplied. BOOT Signature Reserved (signature == "BOOT") - == simple BOOT flag table == + + **simple BOOT flag table** + Microsoft only table, will not be supported. 
BGRT Section 5.2.22 (signature == "BGRT") - == Boot Graphics Resource Table == + + **Boot Graphics Resource Table** + Optional, not currently supported, with no real use-case for an ARM server. CPEP Section 5.2.18 (signature == "CPEP") - == Corrected Platform Error Polling table == + + **Corrected Platform Error Polling table** + Optional, not currently supported, and not recommended until such time as ARM-compatible hardware is available, and the specification suitably modified. CSRT Signature Reserved (signature == "CSRT") - == Core System Resources Table == + + **Core System Resources Table** + Optional, not currently supported. DBG2 Signature Reserved (signature == "DBG2") - == DeBuG port table 2 == + + **DeBuG port table 2** + License has changed and should be usable. Optional if used instead of earlycon=<device> on the command line. DBGP Signature Reserved (signature == "DBGP") - == DeBuG Port table == + + **DeBuG Port table** + Microsoft only table, will not be supported. DSDT Section 5.2.11.1 (signature == "DSDT") - == Differentiated System Description Table == + + **Differentiated System Description Table** + A DSDT is required; see also SSDT. ACPI tables contain only one DSDT but can contain one or more SSDTs, @@ -66,22 +85,30 @@ DSDT Section 5.2.11.1 (signature == "DSDT") but cannot modify or replace anything in the DSDT. DMAR Signature Reserved (signature == "DMAR") - == DMA Remapping table == + + **DMA Remapping table** + x86 only table, will not be supported. DRTM Signature Reserved (signature == "DRTM") - == Dynamic Root of Trust for Measurement table == + + **Dynamic Root of Trust for Measurement table** + Optional, not currently supported. 
ECDT Section 5.2.16 (signature == "ECDT") - == Embedded Controller Description Table == + + **Embedded Controller Description Table** + Optional, not currently supported, but could be used on ARM if and only if one uses the GPE_BIT field to represent an IRQ number, since there are no GPE blocks defined in hardware reduced mode. This would need to be modified in the ACPI specification. EINJ Section 18.6 (signature == "EINJ") - == Error Injection table == + + **Error Injection table** + This table is very useful for testing platform response to error conditions; it allows one to inject an error into the system as if it had actually occurred. However, this table should not be @@ -89,27 +116,35 @@ EINJ Section 18.6 (signature == "EINJ") and executed with the ACPICA tools only during testing. ERST Section 18.5 (signature == "ERST") - == Error Record Serialization Table == + + **Error Record Serialization Table** + On a platform that supports RAS, this table must be supplied if it is not UEFI-based; if it is UEFI-based, this table may be supplied. When this table is not present, UEFI run time service will be utilized to save and retrieve hardware error information to and from a persistent store. ETDT Signature Reserved (signature == "ETDT") - == Event Timer Description Table == + + **Event Timer Description Table** + Obsolete table, will not be supported. FACS Section 5.2.10 (signature == "FACS") - == Firmware ACPI Control Structure == + + **Firmware ACPI Control Structure** + It is unlikely that this table will be terribly useful. If it is provided, the Global Lock will NOT be used since it is not part of the hardware reduced profile, and only 64-bit address fields will be considered valid. FADT Section 5.2.9 (signature == "FACP") - == Fixed ACPI Description Table == + + **Fixed ACPI Description Table** Required for arm64. + The HW_REDUCED_ACPI flag must be set. All of the fields that are to be ignored when HW_REDUCED_ACPI is set are expected to be set to zero.
@@ -118,22 +153,28 @@ FADT Section 5.2.9 (signature == "FACP") used, not FIRMWARE_CTRL. If PSCI is used (as is recommended), make sure that ARM_BOOT_ARCH is - filled in properly -- that the PSCI_COMPLIANT flag is set and that + filled in properly - that the PSCI_COMPLIANT flag is set and that PSCI_USE_HVC is set or unset as needed (see table 5-37). For the DSDT that is also required, the X_DSDT field is to be used, not the DSDT field. FPDT Section 5.2.23 (signature == "FPDT") - == Firmware Performance Data Table == + + **Firmware Performance Data Table** + Optional, not currently supported. GTDT Section 5.2.24 (signature == "GTDT") - == Generic Timer Description Table == + + **Generic Timer Description Table** + Required for arm64. HEST Section 18.3.2 (signature == "HEST") - == Hardware Error Source Table == + + **Hardware Error Source Table** + ARM-specific error sources have been defined; please use those or the PCI types such as type 6 (AER Root Port), 7 (AER Endpoint), or 8 (AER Bridge), or use type 9 (Generic Hardware Error Source). Firmware first @@ -144,122 +185,174 @@ HEST Section 18.3.2 (signature == "HEST") is recommended this table be supplied. HPET Signature Reserved (signature == "HPET") - == High Precision Event timer Table == + + **High Precision Event timer Table** + x86 only table, will not be supported. IBFT Signature Reserved (signature == "IBFT") - == iSCSI Boot Firmware Table == + + **iSCSI Boot Firmware Table** + Microsoft defined table, support TBD. IORT Signature Reserved (signature == "IORT") - == Input Output Remapping Table == + + **Input Output Remapping Table** + arm64 only table, required in order to describe IO topology, SMMUs, and GIC ITSs, and how those various components are connected together, such as identifying which components are behind which SMMUs/ITSs. 
This table will only be required on certain SBSA platforms (e.g., - when using GICv3-ITS and an SMMU); on SBSA Level 0 platforms, it + when using GICv3-ITS and an SMMU); on SBSA Level 0 platforms, it remains optional. IVRS Signature Reserved (signature == "IVRS") - == I/O Virtualization Reporting Structure == + + **I/O Virtualization Reporting Structure** + x86_64 (AMD) only table, will not be supported. LPIT Signature Reserved (signature == "LPIT") - == Low Power Idle Table == + + **Low Power Idle Table** + x86 only table as of ACPI 5.1; starting with ACPI 6.0, processor descriptions and power states on ARM platforms should use the DSDT and define processor container devices (_HID ACPI0010, Section 8.4, and more specifically 8.4.3 and 8.4.4). MADT Section 5.2.12 (signature == "APIC") - == Multiple APIC Description Table == + + **Multiple APIC Description Table** + Required for arm64. Only the GIC interrupt controller structures should be used (types 0xA - 0xF). MCFG Signature Reserved (signature == "MCFG") - == Memory-mapped ConFiGuration space == + + **Memory-mapped ConFiGuration space** + If the platform supports PCI/PCIe, an MCFG table is required. MCHI Signature Reserved (signature == "MCHI") - == Management Controller Host Interface table == + + **Management Controller Host Interface table** + Optional, not currently supported. MPST Section 5.2.21 (signature == "MPST") - == Memory Power State Table == + + **Memory Power State Table** + Optional, not currently supported. MSCT Section 5.2.19 (signature == "MSCT") - == Maximum System Characteristic Table == + + **Maximum System Characteristic Table** + Optional, not currently supported. MSDM Signature Reserved (signature == "MSDM") - == Microsoft Data Management table == + + **Microsoft Data Management table** + Microsoft only table, will not be supported.
NFIT Section 5.2.25 (signature == "NFIT") - == NVDIMM Firmware Interface Table == + + **NVDIMM Firmware Interface Table** + Optional, not currently supported. OEMx Signature of "OEMx" only - == OEM Specific Tables == + + **OEM Specific Tables** + All tables starting with a signature of "OEM" are reserved for OEM use. Since these are not meant to be of general use but are limited to very specific end users, they are not recommended for use and are not supported by the kernel for arm64. PCCT Section 14.1 (signature == "PCCT") - == Platform Communications Channel Table == + + **Platform Communications Channel Table** + Recommended for use on arm64; use of PCC is recommended when using CPPC to control performance and power for platform processors. PMTT Section 5.2.21.12 (signature == "PMTT") - == Platform Memory Topology Table == + + **Platform Memory Topology Table** + Optional, not currently supported. PSDT Section 5.2.11.3 (signature == "PSDT") - == Persistent System Description Table == + + **Persistent System Description Table** + Obsolete table, will not be supported. RASF Section 5.2.20 (signature == "RASF") - == RAS Feature table == + + **RAS Feature table** + Optional, not currently supported. RSDP Section 5.2.5 (signature == "RSD PTR") - == Root System Description PoinTeR == + + **Root System Description PoinTeR** + Required for arm64. RSDT Section 5.2.7 (signature == "RSDT") - == Root System Description Table == + + **Root System Description Table** + Since this table can only provide 32-bit addresses, it is deprecated on arm64, and will not be used. If provided, it will be ignored. SBST Section 5.2.14 (signature == "SBST") - == Smart Battery Subsystem Table == + + **Smart Battery Subsystem Table** + Optional, not currently supported. SLIC Signature Reserved (signature == "SLIC") - == Software LIcensing table == + + **Software LIcensing table** + Microsoft only table, will not be supported.
SLIT Section 5.2.17 (signature == "SLIT") - == System Locality distance Information Table == + + **System Locality distance Information Table** + Optional in general, but required for NUMA systems. SPCR Signature Reserved (signature == "SPCR") - == Serial Port Console Redirection table == + + **Serial Port Console Redirection table** + Required for arm64. SPMI Signature Reserved (signature == "SPMI") - == Server Platform Management Interface table == + + **Server Platform Management Interface table** + Optional, not currently supported. SRAT Section 5.2.16 (signature == "SRAT") - == System Resource Affinity Table == + + **System Resource Affinity Table** + Optional, but if used, only the GICC Affinity structures are read. To support arm64 NUMA, this table is required. SSDT Section 5.2.11.2 (signature == "SSDT") - == Secondary System Description Table == + + **Secondary System Description Table** + These tables are a continuation of the DSDT; these are recommended for use with devices that can be added to a running system, but can also serve the purpose of dividing up device descriptions into more @@ -272,49 +365,69 @@ SSDT Section 5.2.11.2 (signature == "SSDT") one DSDT but can contain many SSDTs. STAO Signature Reserved (signature == "STAO") - == _STA Override table == + + **_STA Override table** + Optional, but only necessary in virtualized environments in order to hide devices from guest OSs. TCPA Signature Reserved (signature == "TCPA") - == Trusted Computing Platform Alliance table == + + **Trusted Computing Platform Alliance table** + Optional, not currently supported, and may need changes to fully interoperate with arm64. TPM2 Signature Reserved (signature == "TPM2") - == Trusted Platform Module 2 table == + + **Trusted Platform Module 2 table** + Optional, not currently supported, and may need changes to fully interoperate with arm64. 
UEFI Signature Reserved (signature == "UEFI") - == UEFI ACPI data table == + + **UEFI ACPI data table** + Optional, not currently supported. No known use case for arm64, at present. WAET Signature Reserved (signature == "WAET") - == Windows ACPI Emulated devices Table == + + **Windows ACPI Emulated devices Table** + Microsoft only table, will not be supported. WDAT Signature Reserved (signature == "WDAT") - == Watch Dog Action Table == + + **Watch Dog Action Table** + Microsoft only table, will not be supported. WDRT Signature Reserved (signature == "WDRT") - == Watch Dog Resource Table == + + **Watch Dog Resource Table** + Microsoft only table, will not be supported. WPBT Signature Reserved (signature == "WPBT") - == Windows Platform Binary Table == + + **Windows Platform Binary Table** + Microsoft only table, will not be supported. XENV Signature Reserved (signature == "XENV") - == Xen project table == + + **Xen project table** + Optional, used only by Xen at present. XSDT Section 5.2.8 (signature == "XSDT") - == eXtended System Description Table == - Required for arm64. + **eXtended System Description Table** + + Required for arm64. +====== ======================================================================== ACPI Objects ------------ @@ -323,10 +436,11 @@ shown in the list that follows; any object not explicitly mentioned below should be used as needed for a particular platform or particular subsystem, such as power management or PCI. +===== ================ ======================================================== Name Section Usage for ARMv8 Linux ----- ------------ ------------------------------------------------- +===== ================ ======================================================== _CCA 6.2.17 This method must be defined for all bus masters - on arm64 -- there are no assumptions made about + on arm64 - there are no assumptions made about whether such devices are cache coherent or not. 
The _CCA value is inherited by all descendants of these devices so it does not need to be repeated. @@ -422,8 +536,8 @@ _OSC 6.2.11 This method can be a global method in ACPI (i.e., by the kernel community, then register it with the UEFI Forum. -\_OSI 5.7.2 Deprecated on ARM64. As far as ACPI firmware is - concerned, _OSI is not to be used to determine what +\_OSI 5.7.2 Deprecated on ARM64. As far as ACPI firmware is + concerned, _OSI is not to be used to determine what sort of system is being used or what functionality is provided. The _OSC method is to be used instead. @@ -447,7 +561,7 @@ _PSx 7.3.2-5 Use as needed; power management specific. If _PS0 is usage, change them in these methods. _RDI 8.4.4.4 Recommended for use with processor definitions (_HID - ACPI0010) on arm64. This should only be used in + ACPI0010) on arm64. This should only be used in conjunction with _LPI. \_REV 5.7.4 Always returns the latest version of ACPI supported. @@ -476,6 +590,7 @@ _SWS 7.4.3 Use as needed; power management specific; this may _UID 6.1.12 Recommended for distinguishing devices of the same class; define it if at all possible. +===== ================ ======================================================== @@ -488,7 +603,7 @@ platforms, ACPI events must be signaled differently. There are two options: GPIO-signaled interrupts (Section 5.6.5), and interrupt-signaled events (Section 5.6.9). Interrupt-signaled events are a -new feature in the ACPI 6.1 specification. Either -- or both -- can be used +new feature in the ACPI 6.1 specification. Either - or both - can be used on a given platform, and which to use may be dependent of limitations in any given SoC. If possible, interrupt-signaled events are recommended. @@ -564,39 +679,40 @@ supported. 
The following classes of objects are not supported: - -- Section 9.2: ambient light sensor devices + - Section 9.2: ambient light sensor devices - -- Section 9.3: battery devices + - Section 9.3: battery devices - -- Section 9.4: lids (e.g., laptop lids) + - Section 9.4: lids (e.g., laptop lids) - -- Section 9.8.2: IDE controllers + - Section 9.8.2: IDE controllers - -- Section 9.9: floppy controllers + - Section 9.9: floppy controllers - -- Section 9.10: GPE block devices + - Section 9.10: GPE block devices - -- Section 9.15: PC/AT RTC/CMOS devices + - Section 9.15: PC/AT RTC/CMOS devices - -- Section 9.16: user presence detection devices + - Section 9.16: user presence detection devices - -- Section 9.17: I/O APIC devices; all GICs must be enumerable via MADT + - Section 9.17: I/O APIC devices; all GICs must be enumerable via MADT - -- Section 9.18: time and alarm devices (see 9.15) + - Section 9.18: time and alarm devices (see 9.15) - -- Section 10: power source and power meter devices + - Section 10: power source and power meter devices - -- Section 11: thermal management + - Section 11: thermal management - -- Section 12: embedded controllers interface + - Section 12: embedded controllers interface - -- Section 13: SMBus interfaces + - Section 13: SMBus interfaces This also means that there is no support for the following objects: +==== =========================== ==== ========== Name Section Name Section ----- ------------ ---- ------------ +==== =========================== ==== ========== _ALC 9.3.4 _FDM 9.10.3 _ALI 9.3.2 _FIX 6.2.7 _ALP 9.3.6 _GAI 10.4.5 @@ -619,4 +735,4 @@ _DCK 6.5.2 _UPD 9.16.1 _EC 12.12 _UPP 9.16.2 _FDE 9.10.1 _WPC 10.5.2 _FDI 9.10.2 _WPP 10.5.3 - +==== =========================== ==== ========== diff --git a/Documentation/arm64/arm-acpi.txt b/Documentation/arm64/arm-acpi.rst index 1a74a041a443..872dbbc73d4a 100644 --- a/Documentation/arm64/arm-acpi.txt +++ b/Documentation/arm64/arm-acpi.rst @@ -1,5 +1,7 @@ +===================== ACPI on 
ARMv8 Servers ---------------------- +===================== + ACPI can be used for ARMv8 general purpose servers designed to follow the ARM SBSA (Server Base System Architecture) [0] and SBBR (Server Base Boot Requirements) [1] specifications. Please note that the SBBR @@ -34,28 +36,28 @@ of the summary text almost directly, to be honest. The short form of the rationale for ACPI on ARM is: --- ACPI’s byte code (AML) allows the platform to encode hardware behavior, +- ACPI’s byte code (AML) allows the platform to encode hardware behavior, while DT explicitly does not support this. For hardware vendors, being able to encode behavior is a key tool used in supporting operating system releases on new hardware. --- ACPI’s OSPM defines a power management model that constrains what the +- ACPI’s OSPM defines a power management model that constrains what the platform is allowed to do into a specific model, while still providing flexibility in hardware design. --- In the enterprise server environment, ACPI has established bindings (such +- In the enterprise server environment, ACPI has established bindings (such as for RAS) which are currently used in production systems. DT does not. Such bindings could be defined in DT at some point, but doing so means ARM and x86 would end up using completely different code paths in both firmware and the kernel. --- Choosing a single interface to describe the abstraction between a platform +- Choosing a single interface to describe the abstraction between a platform and an OS is important. Hardware vendors would not be required to implement both DT and ACPI if they want to support multiple operating systems. And, agreeing on a single interface instead of being fragmented into per OS interfaces makes for better interoperability overall. --- The new ACPI governance process works well and Linux is now at the same +- The new ACPI governance process works well and Linux is now at the same table as hardware vendors and other OS vendors. 
In fact, there is no longer any reason to feel that ACPI only belongs to Windows or that Linux is in any way secondary to Microsoft in this arena. The move of @@ -169,31 +171,31 @@ For the ACPI core to operate properly, and in turn provide the information the kernel needs to configure devices, it expects to find the following tables (all section numbers refer to the ACPI 6.1 specification): - -- RSDP (Root System Description Pointer), section 5.2.5 + - RSDP (Root System Description Pointer), section 5.2.5 - -- XSDT (eXtended System Description Table), section 5.2.8 + - XSDT (eXtended System Description Table), section 5.2.8 - -- FADT (Fixed ACPI Description Table), section 5.2.9 + - FADT (Fixed ACPI Description Table), section 5.2.9 - -- DSDT (Differentiated System Description Table), section + - DSDT (Differentiated System Description Table), section 5.2.11.1 - -- MADT (Multiple APIC Description Table), section 5.2.12 + - MADT (Multiple APIC Description Table), section 5.2.12 - -- GTDT (Generic Timer Description Table), section 5.2.24 + - GTDT (Generic Timer Description Table), section 5.2.24 - -- If PCI is supported, the MCFG (Memory mapped ConFiGuration + - If PCI is supported, the MCFG (Memory mapped ConFiGuration Table), section 5.2.6, specifically Table 5-31. - -- If booting without a console=<device> kernel parameter is + - If booting without a console=<device> kernel parameter is supported, the SPCR (Serial Port Console Redirection table), section 5.2.6, specifically Table 5-31. - -- If necessary to describe the I/O topology, SMMUs and GIC ITSs, + - If necessary to describe the I/O topology, SMMUs and GIC ITSs, the IORT (Input Output Remapping Table, section 5.2.6, specifically Table 5-31). - -- If NUMA is supported, the SRAT (System Resource Affinity Table) + - If NUMA is supported, the SRAT (System Resource Affinity Table) and SLIT (System Locality distance Information Table), sections 5.2.16 and 5.2.17, respectively. 
@@ -269,9 +271,9 @@ describes how to define the structure of an object returned via _DSD, and how specific data structures are defined by specific UUIDs. Linux should only use the _DSD Device Properties UUID [5]: - -- UUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 + - UUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 - -- http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf + - http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf The UEFI Forum provides a mechanism for registering device properties [4] so that they may be used across all operating systems supporting ACPI. @@ -327,10 +329,10 @@ turning a device full off. There are two options for using those Power Resources. They can: - -- be managed in a _PSx method which gets called on entry to power + - be managed in a _PSx method which gets called on entry to power state Dx. - -- be declared separately as power resources with their own _ON and _OFF + - be declared separately as power resources with their own _ON and _OFF methods. They are then tied back to D-states for a particular device via _PRx which specifies which power resources a device needs to be on while in Dx. Kernel then tracks number of devices using a power resource @@ -339,16 +341,16 @@ There are two options for using those Power Resources. They can: The kernel ACPI code will also assume that the _PSx methods follow the normal ACPI rules for such methods: - -- If either _PS0 or _PS3 is implemented, then the other method must also + - If either _PS0 or _PS3 is implemented, then the other method must also be implemented. - -- If a device requires usage or setup of a power resource when on, the ASL + - If a device requires usage or setup of a power resource when on, the ASL should organize that it is allocated/enabled using the _PS0 method. 
- -- Resources allocated or enabled in the _PS0 method should be disabled + - Resources allocated or enabled in the _PS0 method should be disabled or de-allocated in the _PS3 method. - -- Firmware will leave the resources in a reasonable state before handing + - Firmware will leave the resources in a reasonable state before handing over control to the kernel. Such code in _PSx methods will of course be very platform specific. But, @@ -394,52 +396,52 @@ else must be discovered by the driver probe function. Then, have the rest of the driver operate off of the contents of that struct. Doing so should allow most divergence between ACPI and DT functionality to be kept local to the probe function instead of being scattered throughout the driver. For -example: - -static int device_probe_dt(struct platform_device *pdev) -{ - /* DT specific functionality */ - ... -} - -static int device_probe_acpi(struct platform_device *pdev) -{ - /* ACPI specific functionality */ - ... -} - -static int device_probe(struct platform_device *pdev) -{ - ... - struct device_node node = pdev->dev.of_node; - ... - - if (node) - ret = device_probe_dt(pdev); - else if (ACPI_HANDLE(&pdev->dev)) - ret = device_probe_acpi(pdev); - else - /* other initialization */ - ... - /* Continue with any generic probe operations */ - ... -} +example:: + + static int device_probe_dt(struct platform_device *pdev) + { + /* DT specific functionality */ + ... + } + + static int device_probe_acpi(struct platform_device *pdev) + { + /* ACPI specific functionality */ + ... + } + + static int device_probe(struct platform_device *pdev) + { + ... + struct device_node node = pdev->dev.of_node; + ... + + if (node) + ret = device_probe_dt(pdev); + else if (ACPI_HANDLE(&pdev->dev)) + ret = device_probe_acpi(pdev); + else + /* other initialization */ + ... + /* Continue with any generic probe operations */ + ... 
+ } DO keep the MODULE_DEVICE_TABLE entries together in the driver to make it clear the different names the driver is probed for, both from DT and from -ACPI: +ACPI:: -static struct of_device_id virtio_mmio_match[] = { - { .compatible = "virtio,mmio", }, - { } -}; -MODULE_DEVICE_TABLE(of, virtio_mmio_match); + static struct of_device_id virtio_mmio_match[] = { + { .compatible = "virtio,mmio", }, + { } + }; + MODULE_DEVICE_TABLE(of, virtio_mmio_match); -static const struct acpi_device_id virtio_mmio_acpi_match[] = { - { "LNRO0005", }, - { } -}; -MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match); + static const struct acpi_device_id virtio_mmio_acpi_match[] = { + { "LNRO0005", }, + { } + }; + MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match); ASWG @@ -471,7 +473,8 @@ Linux Code Individual items specific to Linux on ARM, contained in the Linux source code, are in the list that follows: -ACPI_OS_NAME This macro defines the string to be returned when +ACPI_OS_NAME + This macro defines the string to be returned when an ACPI method invokes the _OS method. On ARM64 systems, this macro will be "Linux" by default. The command line parameter acpi_os=<string> @@ -482,38 +485,44 @@ ACPI_OS_NAME This macro defines the string to be returned when ACPI Objects ------------ Detailed expectations for ACPI tables and objects are listed in the file -Documentation/arm64/acpi_object_usage.txt. +Documentation/arm64/acpi_object_usage.rst.
References ---------- -[0] http://silver.arm.com -- document ARM-DEN-0029, or newer +[0] http://silver.arm.com + document ARM-DEN-0029, or newer: "Server Base System Architecture", version 2.3, dated 27 Mar 2014 [1] http://infocenter.arm.com/help/topic/com.arm.doc.den0044a/Server_Base_Boot_Requirements.pdf Document ARM-DEN-0044A, or newer: "Server Base Boot Requirements, System Software on ARM Platforms", dated 16 Aug 2014 -[2] http://www.secretlab.ca/archives/151, 10 Jan 2015, Copyright (c) 2015, +[2] http://www.secretlab.ca/archives/151, + 10 Jan 2015, Copyright (c) 2015, Linaro Ltd., written by Grant Likely. -[3] AMD ACPI for Seattle platform documentation: +[3] AMD ACPI for Seattle platform documentation http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/Seattle_ACPI_Guide.pdf -[4] http://www.uefi.org/acpi -- please see the link for the "ACPI _DSD Device + +[4] http://www.uefi.org/acpi + please see the link for the "ACPI _DSD Device Property Registry Instructions" -[5] http://www.uefi.org/acpi -- please see the link for the "_DSD (Device +[5] http://www.uefi.org/acpi + please see the link for the "_DSD (Device Specific Data) Implementation Guide" -[6] Kernel code for the unified device property interface can be found in +[6] Kernel code for the unified device + property interface can be found in include/linux/property.h and drivers/base/property.c. Authors ------- -Al Stone <al.stone@linaro.org> -Graeme Gregory <graeme.gregory@linaro.org> -Hanjun Guo <hanjun.guo@linaro.org> +- Al Stone <al.stone@linaro.org> +- Graeme Gregory <graeme.gregory@linaro.org> +- Hanjun Guo <hanjun.guo@linaro.org> -Grant Likely <grant.likely@linaro.org>, for the "Why ACPI on ARM?" section +- Grant Likely <grant.likely@linaro.org>, for the "Why ACPI on ARM?" 
section diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.rst index fbab7e21d116..3d041d0d16e8 100644 --- a/Documentation/arm64/booting.txt +++ b/Documentation/arm64/booting.rst @@ -1,7 +1,9 @@ - Booting AArch64 Linux - ===================== +===================== +Booting AArch64 Linux +===================== Author: Will Deacon <will.deacon@arm.com> + Date : 07 September 2012 This document is based on the ARM booting document by Russell King and @@ -12,7 +14,7 @@ The AArch64 exception model is made up of a number of exception levels counterpart. EL2 is the hypervisor level and exists only in non-secure mode. EL3 is the highest priority level and exists only in secure mode. -For the purposes of this document, we will use the term `boot loader' +For the purposes of this document, we will use the term `boot loader` simply to define all software that executes on the CPU(s) before control is passed to the Linux kernel. This may include secure monitor and hypervisor code, or it may just be a handful of instructions for @@ -70,7 +72,7 @@ Image target is available instead. Requirement: MANDATORY -The decompressed kernel image contains a 64-byte header as follows: +The decompressed kernel image contains a 64-byte header as follows:: u32 code0; /* Executable code */ u32 code1; /* Executable code */ @@ -103,19 +105,26 @@ Header notes: - The flags field (introduced in v3.17) is a little-endian 64-bit field composed as follows: - Bit 0: Kernel endianness. 1 if BE, 0 if LE. - Bit 1-2: Kernel Page size. - 0 - Unspecified. - 1 - 4K - 2 - 16K - 3 - 64K - Bit 3: Kernel physical placement - 0 - 2MB aligned base should be as close as possible - to the base of DRAM, since memory below it is not - accessible via the linear mapping - 1 - 2MB aligned base may be anywhere in physical - memory - Bits 4-63: Reserved. + + ============= =============================================================== + Bit 0 Kernel endianness. 1 if BE, 0 if LE. 
+ Bit 1-2 Kernel Page size. + + * 0 - Unspecified. + * 1 - 4K + * 2 - 16K + * 3 - 64K + Bit 3 Kernel physical placement + + 0 + 2MB aligned base should be as close as possible + to the base of DRAM, since memory below it is not + accessible via the linear mapping + 1 + 2MB aligned base may be anywhere in physical + memory + Bits 4-63 Reserved. + ============= =============================================================== - When image_size is zero, a bootloader should attempt to keep as much memory as possible free for use by the kernel immediately after the @@ -147,19 +156,22 @@ Before jumping into the kernel, the following conditions must be met: corrupted by bogus network packets or disk data. This will save you many hours of debug. -- Primary CPU general-purpose register settings - x0 = physical address of device tree blob (dtb) in system RAM. - x1 = 0 (reserved for future use) - x2 = 0 (reserved for future use) - x3 = 0 (reserved for future use) +- Primary CPU general-purpose register settings: + + - x0 = physical address of device tree blob (dtb) in system RAM. + - x1 = 0 (reserved for future use) + - x2 = 0 (reserved for future use) + - x3 = 0 (reserved for future use) - CPU mode + All forms of interrupts must be masked in PSTATE.DAIF (Debug, SError, IRQ and FIQ). The CPU must be in either EL2 (RECOMMENDED in order to have access to the virtualisation extensions) or non-secure EL1. - Caches, MMUs + The MMU must be off. Instruction cache may be on or off. The address range corresponding to the loaded kernel image must be @@ -172,18 +184,21 @@ Before jumping into the kernel, the following conditions must be met: operations (not recommended) must be configured and disabled. - Architected timers + CNTFRQ must be programmed with the timer frequency and CNTVOFF must be programmed with a consistent value on all CPUs. If entering the kernel at EL1, CNTHCTL_EL2 must have EL1PCTEN (bit 0) set where available. 
- Coherency + All CPUs to be booted by the kernel must be part of the same coherency domain on entry to the kernel. This may require IMPLEMENTATION DEFINED initialisation to enable the receiving of maintenance operations on each CPU. - System registers + All writable architected system registers at the exception level where the kernel image will be entered must be initialised by software at a higher exception level to prevent execution in an UNKNOWN state. @@ -195,28 +210,40 @@ Before jumping into the kernel, the following conditions must be met: For systems with a GICv3 interrupt controller to be used in v3 mode: - If EL3 is present: - ICC_SRE_EL3.Enable (bit 3) must be initialised to 0b1. - ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1. + + - ICC_SRE_EL3.Enable (bit 3) must be initialised to 0b1. + - ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1. + - If the kernel is entered at EL1: - ICC_SRE_EL2.Enable (bit 3) must be initialised to 0b1 - ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b1. + + - ICC_SRE_EL2.Enable (bit 3) must be initialised to 0b1 + - ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b1. + - The DT or ACPI tables must describe a GICv3 interrupt controller. For systems with a GICv3 interrupt controller to be used in compatibility (v2) mode: + - If EL3 is present: - ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b0. + + ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b0. + - If the kernel is entered at EL1: - ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b0. + + ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b0. + - The DT or ACPI tables must describe a GICv2 interrupt controller.
For CPUs with pointer authentication functionality: - If EL3 is present: - SCR_EL3.APK (bit 16) must be initialised to 0b1 - SCR_EL3.API (bit 17) must be initialised to 0b1 + + - SCR_EL3.APK (bit 16) must be initialised to 0b1 + - SCR_EL3.API (bit 17) must be initialised to 0b1 + - If the kernel is entered at EL1: - HCR_EL2.APK (bit 40) must be initialised to 0b1 - HCR_EL2.API (bit 41) must be initialised to 0b1 + + - HCR_EL2.APK (bit 40) must be initialised to 0b1 + - HCR_EL2.API (bit 41) must be initialised to 0b1 The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must diff --git a/Documentation/arm64/cpu-feature-registers.txt b/Documentation/arm64/cpu-feature-registers.rst index 684a0da39378..2955287e9acc 100644 --- a/Documentation/arm64/cpu-feature-registers.txt +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -1,5 +1,6 @@ - ARM64 CPU Feature Registers - =========================== +=========================== +ARM64 CPU Feature Registers +=========================== Author: Suzuki K Poulose <suzuki.poulose@arm.com> @@ -9,7 +10,7 @@ registers to userspace. The availability of this ABI is advertised via the HWCAP_CPUID in HWCAPs. 1. Motivation ---------------- +------------- The ARM architecture defines a set of feature registers, which describe the capabilities of the CPU/system. Access to these system registers is @@ -33,9 +34,10 @@ there are some issues with their usage. 2. Requirements ------------------ +--------------- + + a) Safety: - a) Safety : Applications should be able to use the information provided by the infrastructure to run safely across the system. This has greater implications on a system with heterogeneous CPUs. @@ -47,7 +49,8 @@ there are some issues with their usage. Otherwise an application could crash when scheduled on the CPU which doesn't support CRC32. 
- b) Security : + b) Security: + Applications should only be able to receive information that is relevant to the normal operation in userspace. Hence, some of the fields are masked out(i.e, made invisible) and their values are set to @@ -58,10 +61,12 @@ there are some issues with their usage. (even when the CPU provides it). c) Implementation Defined Features + The infrastructure doesn't expose any register which is IMPLEMENTATION DEFINED as per ARMv8-A Architecture. - d) CPU Identification : + d) CPU Identification: + MIDR_EL1 is exposed to help identify the processor. On a heterogeneous system, this could be racy (just like getcpu()). The process could be migrated to another CPU by the time it uses the @@ -70,7 +75,7 @@ there are some issues with their usage. currently executing on. The REVIDR is not exposed due to this constraint, as REVIDR makes sense only in conjunction with the MIDR. Alternately, MIDR_EL1 and REVIDR_EL1 are exposed via sysfs - at: + at:: /sys/devices/system/cpu/cpu$ID/regs/identification/ \- midr @@ -85,7 +90,8 @@ exception and ends up in SIGILL being delivered to the process. The infrastructure hooks into the exception handler and emulates the operation if the source belongs to the supported system register space. 
-The infrastructure emulates only the following system register space: +The infrastructure emulates only the following system register space:: + Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7 (See Table C5-6 'System instruction encodings for non-Debug System @@ -107,73 +113,76 @@ infrastructure: ------------------------------------------- 1) ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0 - x--------------------------------------------------x + + +------------------------------+---------+---------+ | Name | bits | visible | - |--------------------------------------------------| + +------------------------------+---------+---------+ | TS | [55-52] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | FHM | [51-48] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | DP | [47-44] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SM4 | [43-40] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SM3 | [39-36] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SHA3 | [35-32] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | RDM | [31-28] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | ATOMICS | [23-20] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | CRC32 | [19-16] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SHA2 | [15-12] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SHA1 | [11-8] | y | - 
|--------------------------------------------------| + +------------------------------+---------+---------+ | AES | [7-4] | y | - x--------------------------------------------------x + +------------------------------+---------+---------+ 2) ID_AA64PFR0_EL1 - Processor Feature Register 0 - x--------------------------------------------------x + + +------------------------------+---------+---------+ | Name | bits | visible | - |--------------------------------------------------| + +------------------------------+---------+---------+ | DIT | [51-48] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SVE | [35-32] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | GIC | [27-24] | n | - |--------------------------------------------------| + +------------------------------+---------+---------+ | AdvSIMD | [23-20] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | FP | [19-16] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | EL3 | [15-12] | n | - |--------------------------------------------------| + +------------------------------+---------+---------+ | EL2 | [11-8] | n | - |--------------------------------------------------| + +------------------------------+---------+---------+ | EL1 | [7-4] | n | - |--------------------------------------------------| + +------------------------------+---------+---------+ | EL0 | [3-0] | n | - x--------------------------------------------------x + +------------------------------+---------+---------+ 3) MIDR_EL1 - Main ID Register - x--------------------------------------------------x + + +------------------------------+---------+---------+ | Name | bits | visible | - |--------------------------------------------------| + 
+------------------------------+---------+---------+ | Implementer | [31-24] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | Variant | [23-20] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | Architecture | [19-16] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | PartNum | [15-4] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | Revision | [3-0] | y | - x--------------------------------------------------x + +------------------------------+---------+---------+ NOTE: The 'visible' fields of MIDR_EL1 will contain the value as available on the CPU where it is fetched and is not a system @@ -181,90 +190,92 @@ infrastructure: 4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1 - x--------------------------------------------------x + +------------------------------+---------+---------+ | Name | bits | visible | - |--------------------------------------------------| + +------------------------------+---------+---------+ | GPI | [31-28] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | GPA | [27-24] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | LRCPC | [23-20] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | FCMA | [19-16] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | JSCVT | [15-12] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | API | [11-8] | y | - |--------------------------------------------------| + 
+------------------------------+---------+---------+ | APA | [7-4] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | DPB | [3-0] | y | - x--------------------------------------------------x + +------------------------------+---------+---------+ 5) ID_AA64MMFR2_EL1 - Memory model feature register 2 - x--------------------------------------------------x + +------------------------------+---------+---------+ | Name | bits | visible | - |--------------------------------------------------| + +------------------------------+---------+---------+ | AT | [35-32] | y | - x--------------------------------------------------x + +------------------------------+---------+---------+ 6) ID_AA64ZFR0_EL1 - SVE feature ID register 0 - x--------------------------------------------------x + +------------------------------+---------+---------+ | Name | bits | visible | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SM4 | [43-40] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SHA3 | [35-32] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | BitPerm | [19-16] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | AES | [7-4] | y | - |--------------------------------------------------| + +------------------------------+---------+---------+ | SVEVer | [3-0] | y | - x--------------------------------------------------x + +------------------------------+---------+---------+ Appendix I: Example ---------------------------- - -/* - * Sample program to demonstrate the MRS emulation ABI. 
- * - * Copyright (C) 2015-2016, ARM Ltd - * - * Author: Suzuki K Poulose <suzuki.poulose@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include <asm/hwcap.h> -#include <stdio.h> -#include <sys/auxv.h> - -#define get_cpu_ftr(id) ({ \ +------------------- + +:: + + /* + * Sample program to demonstrate the MRS emulation ABI. + * + * Copyright (C) 2015-2016, ARM Ltd + * + * Author: Suzuki K Poulose <suzuki.poulose@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + + #include <asm/hwcap.h> + #include <stdio.h> + #include <sys/auxv.h> + + #define get_cpu_ftr(id) ({ \ unsigned long __val; \ asm("mrs %0, "#id : "=r" (__val)); \ printf("%-20s: 0x%016lx\n", #id, __val); \ }) -int main(void) -{ + int main(void) + { if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { fputs("CPUID registers unavailable\n", stderr); @@ -284,13 +295,10 @@ int main(void) get_cpu_ftr(MPIDR_EL1); get_cpu_ftr(REVIDR_EL1); -#if 0 + #if 0 /* Unexposed register access causes SIGILL */ get_cpu_ftr(ID_MMFR0_EL1); -#endif + #endif return 0; -} - - - + } diff --git a/Documentation/arm64/elf_hwcaps.txt b/Documentation/arm64/elf_hwcaps.rst index 5ae2ef2c12f3..91f79529c58c 100644 --- a/Documentation/arm64/elf_hwcaps.txt +++ b/Documentation/arm64/elf_hwcaps.rst @@ -1,3 +1,4 @@ +================ ARM64 ELF hwcaps ================ @@ -15,16 +16,16 @@ of flags called hwcaps, exposed in the auxilliary vector. Userspace software can test for features by acquiring the AT_HWCAP or AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant -flags are set, e.g. +flags are set, e.g.:: -bool floating_point_is_present(void) -{ - unsigned long hwcaps = getauxval(AT_HWCAP); - if (hwcaps & HWCAP_FP) - return true; + bool floating_point_is_present(void) + { + unsigned long hwcaps = getauxval(AT_HWCAP); + if (hwcaps & HWCAP_FP) + return true; - return false; -} + return false; + } Where software relies on a feature described by a hwcap, it should check the relevant hwcap flag to verify that the feature is present before @@ -45,7 +46,7 @@ userspace code at EL0. 
These hwcaps are defined in terms of ID register fields, and should be interpreted with reference to the definition of these fields in the ARM Architecture Reference Manual (ARM ARM). -Such hwcaps are described below in the form: +Such hwcaps are described below in the form:: Functionality implied by idreg.field == val. @@ -64,75 +65,58 @@ reference to ID registers, and may refer to other documentation. --------------------------------- HWCAP_FP - Functionality implied by ID_AA64PFR0_EL1.FP == 0b0000. HWCAP_ASIMD - Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0000. HWCAP_EVTSTRM - The generic timer is configured to generate events at a frequency of approximately 100KHz. HWCAP_AES - Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001. HWCAP_PMULL - Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0010. HWCAP_SHA1 - Functionality implied by ID_AA64ISAR0_EL1.SHA1 == 0b0001. HWCAP_SHA2 - Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0001. HWCAP_CRC32 - Functionality implied by ID_AA64ISAR0_EL1.CRC32 == 0b0001. HWCAP_ATOMICS - Functionality implied by ID_AA64ISAR0_EL1.Atomic == 0b0010. HWCAP_FPHP - Functionality implied by ID_AA64PFR0_EL1.FP == 0b0001. HWCAP_ASIMDHP - Functionality implied by ID_AA64PFR0_EL1.AdvSIMD == 0b0001. HWCAP_CPUID - EL0 access to certain ID registers is available, to the extent - described by Documentation/arm64/cpu-feature-registers.txt. + described by Documentation/arm64/cpu-feature-registers.rst. These ID registers may imply the availability of features. HWCAP_ASIMDRDM - Functionality implied by ID_AA64ISAR0_EL1.RDM == 0b0001. HWCAP_JSCVT - Functionality implied by ID_AA64ISAR1_EL1.JSCVT == 0b0001. HWCAP_FCMA - Functionality implied by ID_AA64ISAR1_EL1.FCMA == 0b0001. HWCAP_LRCPC - Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0001. HWCAP_DCPOP - Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001. HWCAP2_DCPODP @@ -140,27 +124,21 @@ HWCAP2_DCPODP Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010. 
HWCAP_SHA3 - Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001. HWCAP_SM3 - Functionality implied by ID_AA64ISAR0_EL1.SM3 == 0b0001. HWCAP_SM4 - Functionality implied by ID_AA64ISAR0_EL1.SM4 == 0b0001. HWCAP_ASIMDDP - Functionality implied by ID_AA64ISAR0_EL1.DP == 0b0001. HWCAP_SHA512 - Functionality implied by ID_AA64ISAR0_EL1.SHA2 == 0b0010. HWCAP_SVE - Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001. HWCAP2_SVE2 @@ -188,23 +166,18 @@ HWCAP2_SVESM4 Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001. HWCAP_ASIMDFHM - Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001. HWCAP_DIT - Functionality implied by ID_AA64PFR0_EL1.DIT == 0b0001. HWCAP_USCAT - Functionality implied by ID_AA64MMFR2_EL1.AT == 0b0001. HWCAP_ILRCPC - Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0010. HWCAP_FLAGM - Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001. HWCAP2_FLAGM2 @@ -212,20 +185,17 @@ HWCAP2_FLAGM2 Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0010. HWCAP_SSBS - Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010. HWCAP_PACA - Functionality implied by ID_AA64ISAR1_EL1.APA == 0b0001 or ID_AA64ISAR1_EL1.API == 0b0001, as described by - Documentation/arm64/pointer-authentication.txt. + Documentation/arm64/pointer-authentication.rst. HWCAP_PACG - Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or ID_AA64ISAR1_EL1.GPI == 0b0001, as described by - Documentation/arm64/pointer-authentication.txt. + Documentation/arm64/pointer-authentication.rst. HWCAP2_FRINT diff --git a/Documentation/arm64/hugetlbpage.txt b/Documentation/arm64/hugetlbpage.rst index cfae87dc653b..b44f939e5210 100644 --- a/Documentation/arm64/hugetlbpage.txt +++ b/Documentation/arm64/hugetlbpage.rst @@ -1,3 +1,4 @@ +==================== HugeTLBpage on ARM64 ==================== @@ -31,8 +32,10 @@ and level of the page table. 
The following hugepage sizes are supported - - CONT PTE PMD CONT PMD PUD - -------- --- -------- --- + ====== ======== ==== ======== === + - CONT PTE PMD CONT PMD PUD + ====== ======== ==== ======== === 4K: 64K 2M 32M 1G 16K: 2M 32M 1G 64K: 2M 512M 16G + ====== ======== ==== ======== === diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst new file mode 100644 index 000000000000..018b7836ecb7 --- /dev/null +++ b/Documentation/arm64/index.rst @@ -0,0 +1,28 @@ +:orphan: + +================== +ARM64 Architecture +================== + +.. toctree:: + :maxdepth: 1 + + acpi_object_usage + arm-acpi + booting + cpu-feature-registers + elf_hwcaps + hugetlbpage + legacy_instructions + memory + pointer-authentication + silicon-errata + sve + tagged-pointers + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/arm64/legacy_instructions.txt b/Documentation/arm64/legacy_instructions.rst index 01bf3d9fac85..54401b22cb8f 100644 --- a/Documentation/arm64/legacy_instructions.txt +++ b/Documentation/arm64/legacy_instructions.rst @@ -1,3 +1,7 @@ +=================== +Legacy instructions +=================== + The arm64 port of the Linux kernel provides infrastructure to support emulation of instructions which have been deprecated, or obsoleted in the architecture. The infrastructure code uses undefined instruction @@ -9,19 +13,22 @@ The emulation mode can be controlled by writing to sysctl nodes behaviours and the corresponding values of the sysctl nodes - * Undef - Value: 0 + Value: 0 + Generates undefined instruction abort. Default for instructions that have been obsoleted in the architecture, e.g., SWP * Emulate - Value: 1 + Value: 1 + Uses software emulation. To aid migration of software, in this mode usage of emulated instruction is traced as well as rate limited warnings are issued. 
This is the default for deprecated instructions, .e.g., CP15 barriers * Hardware Execution - Value: 2 + Value: 2 + Although marked as deprecated, some implementations may support the enabling/disabling of hardware support for the execution of these instructions. Using hardware execution generally provides better @@ -38,20 +45,24 @@ individual instruction notes for further information. Supported legacy instructions ----------------------------- * SWP{B} -Node: /proc/sys/abi/swp -Status: Obsolete -Default: Undef (0) + +:Node: /proc/sys/abi/swp +:Status: Obsolete +:Default: Undef (0) * CP15 Barriers -Node: /proc/sys/abi/cp15_barrier -Status: Deprecated -Default: Emulate (1) + +:Node: /proc/sys/abi/cp15_barrier +:Status: Deprecated +:Default: Emulate (1) * SETEND -Node: /proc/sys/abi/setend -Status: Deprecated -Default: Emulate (1)* -Note: All the cpus on the system must have mixed endian support at EL0 -for this feature to be enabled. If a new CPU - which doesn't support mixed -endian - is hotplugged in after this feature has been enabled, there could -be unexpected results in the application. + +:Node: /proc/sys/abi/setend +:Status: Deprecated +:Default: Emulate (1)* + + Note: All the cpus on the system must have mixed endian support at EL0 + for this feature to be enabled. If a new CPU - which doesn't support mixed + endian - is hotplugged in after this feature has been enabled, there could + be unexpected results in the application. diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst new file mode 100644 index 000000000000..464b880fc4b7 --- /dev/null +++ b/Documentation/arm64/memory.rst @@ -0,0 +1,98 @@ +============================== +Memory Layout on AArch64 Linux +============================== + +Author: Catalin Marinas <catalin.marinas@arm.com> + +This document describes the virtual memory layout used by the AArch64 +Linux kernel. 
The architecture allows up to 4 levels of translation +tables with a 4KB page size and up to 3 levels with a 64KB page size. + +AArch64 Linux uses either 3 levels or 4 levels of translation tables +with the 4KB page configuration, allowing 39-bit (512GB) or 48-bit +(256TB) virtual addresses, respectively, for both user and kernel. With +64KB pages, only 2 levels of translation tables, allowing 42-bit (4TB) +virtual address, are used but the memory layout is the same. + +User addresses have bits 63:48 set to 0 while the kernel addresses have +the same bits set to 1. TTBRx selection is given by bit 63 of the +virtual address. The swapper_pg_dir contains only kernel (global) +mappings while the user pgd contains only user (non-global) mappings. +The swapper_pg_dir address is written to TTBR1 and never written to +TTBR0. + + +AArch64 Linux memory layout with 4KB pages + 3 levels:: + + Start End Size Use + ----------------------------------------------------------------------- + 0000000000000000 0000007fffffffff 512GB user + ffffff8000000000 ffffffffffffffff 512GB kernel + + +AArch64 Linux memory layout with 4KB pages + 4 levels:: + + Start End Size Use + ----------------------------------------------------------------------- + 0000000000000000 0000ffffffffffff 256TB user + ffff000000000000 ffffffffffffffff 256TB kernel + + +AArch64 Linux memory layout with 64KB pages + 2 levels:: + + Start End Size Use + ----------------------------------------------------------------------- + 0000000000000000 000003ffffffffff 4TB user + fffffc0000000000 ffffffffffffffff 4TB kernel + + +AArch64 Linux memory layout with 64KB pages + 3 levels:: + + Start End Size Use + ----------------------------------------------------------------------- + 0000000000000000 0000ffffffffffff 256TB user + ffff000000000000 ffffffffffffffff 256TB kernel + + +For details of the virtual kernel memory layout please see the kernel +booting log. 
+ + +Translation table lookup with 4KB pages:: + + +--------+--------+--------+--------+--------+--------+--------+--------+ + |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| + +--------+--------+--------+--------+--------+--------+--------+--------+ + | | | | | | + | | | | | v + | | | | | [11:0] in-page offset + | | | | +-> [20:12] L3 index + | | | +-----------> [29:21] L2 index + | | +---------------------> [38:30] L1 index + | +-------------------------------> [47:39] L0 index + +-------------------------------------------------> [63] TTBR0/1 + + +Translation table lookup with 64KB pages:: + + +--------+--------+--------+--------+--------+--------+--------+--------+ + |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| + +--------+--------+--------+--------+--------+--------+--------+--------+ + | | | | | + | | | | v + | | | | [15:0] in-page offset + | | | +----------> [28:16] L3 index + | | +--------------------------> [41:29] L2 index + | +-------------------------------> [47:42] L1 index + +-------------------------------------------------> [63] TTBR0/1 + + +When using KVM without the Virtualization Host Extensions, the +hypervisor maps kernel pages in EL2 at a fixed (and potentially +random) offset from the linear mapping. See the kern_hyp_va macro and +kvm_update_va_mask function for more details. MMIO devices such as +GICv2 gets mapped next to the HYP idmap page, as do vectors when +ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs. + +When using KVM with the Virtualization Host Extensions, no additional +mappings are created, since the host kernel runs directly in EL2. 
diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt deleted file mode 100644 index c5dab30d3389..000000000000 --- a/Documentation/arm64/memory.txt +++ /dev/null @@ -1,97 +0,0 @@ - Memory Layout on AArch64 Linux - ============================== - -Author: Catalin Marinas <catalin.marinas@arm.com> - -This document describes the virtual memory layout used by the AArch64 -Linux kernel. The architecture allows up to 4 levels of translation -tables with a 4KB page size and up to 3 levels with a 64KB page size. - -AArch64 Linux uses either 3 levels or 4 levels of translation tables -with the 4KB page configuration, allowing 39-bit (512GB) or 48-bit -(256TB) virtual addresses, respectively, for both user and kernel. With -64KB pages, only 2 levels of translation tables, allowing 42-bit (4TB) -virtual address, are used but the memory layout is the same. - -User addresses have bits 63:48 set to 0 while the kernel addresses have -the same bits set to 1. TTBRx selection is given by bit 63 of the -virtual address. The swapper_pg_dir contains only kernel (global) -mappings while the user pgd contains only user (non-global) mappings. -The swapper_pg_dir address is written to TTBR1 and never written to -TTBR0. 
- - -AArch64 Linux memory layout with 4KB pages + 3 levels: - -Start End Size Use ------------------------------------------------------------------------ -0000000000000000 0000007fffffffff 512GB user -ffffff8000000000 ffffffffffffffff 512GB kernel - - -AArch64 Linux memory layout with 4KB pages + 4 levels: - -Start End Size Use ------------------------------------------------------------------------ -0000000000000000 0000ffffffffffff 256TB user -ffff000000000000 ffffffffffffffff 256TB kernel - - -AArch64 Linux memory layout with 64KB pages + 2 levels: - -Start End Size Use ------------------------------------------------------------------------ -0000000000000000 000003ffffffffff 4TB user -fffffc0000000000 ffffffffffffffff 4TB kernel - - -AArch64 Linux memory layout with 64KB pages + 3 levels: - -Start End Size Use ------------------------------------------------------------------------ -0000000000000000 0000ffffffffffff 256TB user -ffff000000000000 ffffffffffffffff 256TB kernel - - -For details of the virtual kernel memory layout please see the kernel -booting log. 
- - -Translation table lookup with 4KB pages: - -+--------+--------+--------+--------+--------+--------+--------+--------+ -|63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| -+--------+--------+--------+--------+--------+--------+--------+--------+ - | | | | | | - | | | | | v - | | | | | [11:0] in-page offset - | | | | +-> [20:12] L3 index - | | | +-----------> [29:21] L2 index - | | +---------------------> [38:30] L1 index - | +-------------------------------> [47:39] L0 index - +-------------------------------------------------> [63] TTBR0/1 - - -Translation table lookup with 64KB pages: - -+--------+--------+--------+--------+--------+--------+--------+--------+ -|63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0| -+--------+--------+--------+--------+--------+--------+--------+--------+ - | | | | | - | | | | v - | | | | [15:0] in-page offset - | | | +----------> [28:16] L3 index - | | +--------------------------> [41:29] L2 index - | +-------------------------------> [47:42] L1 index - +-------------------------------------------------> [63] TTBR0/1 - - -When using KVM without the Virtualization Host Extensions, the -hypervisor maps kernel pages in EL2 at a fixed (and potentially -random) offset from the linear mapping. See the kern_hyp_va macro and -kvm_update_va_mask function for more details. MMIO devices such as -GICv2 gets mapped next to the HYP idmap page, as do vectors when -ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs. - -When using KVM with the Virtualization Host Extensions, no additional -mappings are created, since the host kernel runs directly in EL2. 
diff --git a/Documentation/arm64/pointer-authentication.txt b/Documentation/arm64/pointer-authentication.rst index fc71b33de87e..30b2ab06526b 100644 --- a/Documentation/arm64/pointer-authentication.txt +++ b/Documentation/arm64/pointer-authentication.rst @@ -1,7 +1,9 @@ +======================================= Pointer authentication in AArch64 Linux ======================================= Author: Mark Rutland <mark.rutland@arm.com> + Date: 2017-07-19 This document briefly describes the provision of pointer authentication diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.rst index 2735462d5958..c792774be59e 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.rst @@ -1,7 +1,9 @@ - Silicon Errata and Software Workarounds - ======================================= +======================================= +Silicon Errata and Software Workarounds +======================================= Author: Will Deacon <will.deacon@arm.com> + Date : 27 November 2015 It is an unfortunate fact of life that hardware is often produced with @@ -9,11 +11,13 @@ so-called "errata", which can cause it to deviate from the architecture under specific circumstances. For hardware produced by ARM, these errata are broadly classified into the following categories: - Category A: A critical error without a viable workaround. - Category B: A significant or critical error with an acceptable + ========== ======================================================== + Category A A critical error without a viable workaround. + Category B A significant or critical error with an acceptable workaround. - Category C: A minor error that is not expected to occur under normal + Category C A minor error that is not expected to occur under normal operation. 
+ ========== ======================================================== For more information, consult one of the "Software Developers Errata Notice" documents available on infocenter.arm.com (registration @@ -42,47 +46,86 @@ file acts as a registry of software workarounds in the Linux Kernel and will be updated when new workarounds are committed and backported to stable kernels. -| Implementor | Component | Erratum ID | Kconfig | +----------------+-----------------+-----------------+-----------------------------+ +| Implementor | Component | Erratum ID | Kconfig | ++================+=================+=================+=============================+ | Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 | -| | | | | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #824069 | ARM64_ERRATUM_824069 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #819472 | ARM64_ERRATUM_819472 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #845719 | ARM64_ERRATUM_845719 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A53 | #843419 | ARM64_ERRATUM_843419 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A57 | #852523 | N/A | 
++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A72 | #853709 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A55 | #1024718 | ARM64_ERRATUM_1024718 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1188873,1418040| ARM64_ERRATUM_1418040 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1463225 | ARM64_ERRATUM_1463225 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | MMU-500 | #841119,826419 | N/A | -| | | | | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX ITS | #22375,24313 | CAVIUM_ERRATUM_22375 | ++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX ITS | #23144 | CAVIUM_ERRATUM_23144 | ++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | 
++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | ++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 | ++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX SMMUv2 | #27704 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX2 SMMUv3| #74 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX2 SMMUv3| #126 | N/A | -| | | | | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | -| | | | | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 | ++----------------+-----------------+-----------------+-----------------------------+ | Hisilicon | Hip0{6,7} | #161010701 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ | Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 | ++----------------+-----------------+-----------------+-----------------------------+ | Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A | -| | | | | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | ++----------------+-----------------+-----------------+-----------------------------+ | Qualcomm Tech. 
| Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | ++----------------+-----------------+-----------------+-----------------------------+ | Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 | ++----------------+-----------------+-----------------+-----------------------------+ | Qualcomm Tech. | Falkor v{1,2} | E1041 | QCOM_FALKOR_ERRATUM_1041 | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 | ++----------------+-----------------+-----------------+-----------------------------+ diff --git a/Documentation/arm64/sve.txt b/Documentation/arm64/sve.rst index 5689fc9a976a..5689c74c8082 100644 --- a/Documentation/arm64/sve.txt +++ b/Documentation/arm64/sve.rst @@ -1,7 +1,9 @@ - Scalable Vector Extension support for AArch64 Linux - =================================================== +=================================================== +Scalable Vector Extension support for AArch64 Linux +=================================================== Author: Dave Martin <Dave.Martin@arm.com> + Date: 4 August 2017 This document outlines briefly the interface provided to userspace by Linux in @@ -442,7 +444,7 @@ In A64 state, SVE adds the following: * FPSR and FPCR are retained from ARMv8-A, and interact with SVE floating-point operations in a similar way to the way in which they interact with ARMv8 - floating-point operations. 
+ floating-point operations:: 8VL-1 128 0 bit index +---- //// -----------------+ @@ -499,6 +501,8 @@ ARMv8-A defines the following floating-point / SIMD register state: * 32 128-bit vector registers V0..V31 * 2 32-bit status/control registers FPSR, FPCR +:: + 127 0 bit index +---------------+ V0 | | @@ -533,7 +537,7 @@ References [2] arch/arm64/include/uapi/asm/ptrace.h AArch64 Linux ptrace ABI definitions -[3] Documentation/arm64/cpu-feature-registers.txt +[3] Documentation/arm64/cpu-feature-registers.rst [4] ARM IHI0055C http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf diff --git a/Documentation/arm64/tagged-pointers.txt b/Documentation/arm64/tagged-pointers.rst index a25a99e82bb1..2acdec3ebbeb 100644 --- a/Documentation/arm64/tagged-pointers.txt +++ b/Documentation/arm64/tagged-pointers.rst @@ -1,7 +1,9 @@ - Tagged virtual addresses in AArch64 Linux - ========================================= +========================================= +Tagged virtual addresses in AArch64 Linux +========================================= Author: Will Deacon <will.deacon@arm.com> + Date : 12 June 2013 This document briefly describes the provision of tagged virtual diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 35d83e24dbdb..4d565d202ce3 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -151,6 +151,7 @@ for the type. The maximum value of ``BTF_INT_BITS()`` is 128. The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values for this int. For example, a bitfield struct member has: + * btf member bit offset 100 from the start of the structure, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` @@ -160,6 +161,7 @@ from bits ``100 + 2 = 102``. 
Alternatively, the bitfield struct member can be the following to access the same bits as the above: + * btf member bit offset 102, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4`` diff --git a/Documentation/cdrom/Makefile b/Documentation/cdrom/Makefile deleted file mode 100644 index a19e321928e1..000000000000 --- a/Documentation/cdrom/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -LATEXFILE = cdrom-standard - -all: - make clean - latex $(LATEXFILE) - latex $(LATEXFILE) - @if [ -x `which gv` ]; then \ - `dvips -q -t letter -o $(LATEXFILE).ps $(LATEXFILE).dvi` ;\ - `gv -antialias -media letter -nocenter $(LATEXFILE).ps` ;\ - else \ - `xdvi $(LATEXFILE).dvi &` ;\ - fi - make sortofclean - -clean: - rm -f $(LATEXFILE).ps $(LATEXFILE).dvi $(LATEXFILE).aux $(LATEXFILE).log - -sortofclean: - rm -f $(LATEXFILE).aux $(LATEXFILE).log - - diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst new file mode 100644 index 000000000000..dde4f7f7fdbf --- /dev/null +++ b/Documentation/cdrom/cdrom-standard.rst @@ -0,0 +1,1063 @@ +======================= +A Linux CD-ROM standard +======================= + +:Author: David van Leeuwen <david@ElseWare.cistron.nl> +:Date: 12 March 1999 +:Updated by: Erik Andersen (andersee@debian.org) +:Updated by: Jens Axboe (axboe@image.dk) + + +Introduction +============ + +Linux is probably the Unix-like operating system that supports +the widest variety of hardware devices. The reasons for this are +presumably + +- The large list of hardware devices available for the many platforms + that Linux now supports (i.e., i386-PCs, Sparc Suns, etc.) +- The open design of the operating system, such that anybody can write a + driver for Linux. +- There is plenty of source code around as examples of how to write a driver. + +The openness of Linux, and the many different types of available +hardware has allowed Linux to support many different hardware devices. 
+Unfortunately, the very openness that has allowed Linux to support +all these different devices has also allowed the behavior of each +device driver to differ significantly from one device to another. +This divergence of behavior has been very significant for CD-ROM +devices; the way a particular drive reacts to a `standard` *ioctl()* +call varies greatly from one device driver to another. To avoid making +their drivers totally inconsistent, the writers of Linux CD-ROM +drivers generally created new device drivers by understanding, copying, +and then changing an existing one. Unfortunately, this practice did not +maintain uniform behavior across all the Linux CD-ROM drivers. + +This document describes an effort to establish Uniform behavior across +all the different CD-ROM device drivers for Linux. This document also +defines the various *ioctl()'s*, and how the low-level CD-ROM device +drivers should implement them. Currently (as of the Linux 2.1.\ *x* +development kernels) several low-level CD-ROM device drivers, including +both IDE/ATAPI and SCSI, now use this Uniform interface. + +When the CD-ROM was developed, the interface between the CD-ROM drive +and the computer was not specified in the standards. As a result, many +different CD-ROM interfaces were developed. Some of them had their +own proprietary design (Sony, Mitsumi, Panasonic, Philips), other +manufacturers adopted an existing electrical interface and changed +the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply +adapted their drives to one or more of the already existing electrical +interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and +most of the `NoName` manufacturers). In cases where a new drive really +brought its own interface or used its own command set and flow control +scheme, either a separate driver had to be written, or an existing +driver had to be enhanced. History has delivered us CD-ROM support for +many of these different interfaces. 
Nowadays, almost all new CD-ROM +drives are either IDE/ATAPI or SCSI, and it is very unlikely that any +manufacturer will create a new interface. Even finding drives for the +old proprietary interfaces is getting difficult. + +When (in the 1.3.70's) I looked at the existing software interface, +which was expressed through `cdrom.h`, it appeared to be a rather wild +set of commands and data formats [#f1]_. It seemed that many +features of the software interface had been added to accommodate the +capabilities of a particular drive, in an *ad hoc* manner. More +importantly, it appeared that the behavior of the `standard` commands +was different for most of the different drivers: e. g., some drivers +close the tray if an *open()* call occurs when the tray is open, while +others do not. Some drivers lock the door upon opening the device, to +prevent an incoherent file system, but others don't, to allow software +ejection. Undoubtedly, the capabilities of the different drives vary, +but even when two drives have the same capability their drivers' +behavior was usually different. + +.. [#f1] + I cannot recollect what kernel version I looked at, then, + presumably 1.2.13 and 1.3.34 --- the latest kernel that I was + indirectly involved in. + +I decided to start a discussion on how to make all the Linux CD-ROM +drivers behave more uniformly. I began by contacting the developers of +the many CD-ROM drivers found in the Linux kernel. Their reactions +encouraged me to write the Uniform CD-ROM Driver which this document is +intended to describe. The implementation of the Uniform CD-ROM Driver is +in the file `cdrom.c`. This driver is intended to be an additional software +layer that sits on top of the low-level device drivers for each CD-ROM drive. +By adding this additional layer, it is possible to have all the different +CD-ROM devices behave **exactly** the same (insofar as the underlying +hardware will allow). 
+ +The goal of the Uniform CD-ROM Driver is **not** to alienate driver developers +who have not yet taken steps to support this effort. The goal of the Uniform CD-ROM +Driver is simply to give people writing application programs for CD-ROM drives +**one** Linux CD-ROM interface with consistent behavior for all +CD-ROM devices. In addition, this also provides a consistent interface +between the low-level device driver code and the Linux kernel. Care +is taken that 100% compatibility exists with the data structures and +programmer's interface defined in `cdrom.h`. This guide was written to +help CD-ROM driver developers adapt their code to use the Uniform CD-ROM +Driver code defined in `cdrom.c`. + +Personally, I think that the most important hardware interfaces are +the IDE/ATAPI drives and, of course, the SCSI drives, but as prices +of hardware drop continuously, it is also likely that people may have +more than one CD-ROM drive, possibly of mixed types. It is important +that these drives behave in the same way. In December 1994, one of the +cheapest CD-ROM drives was a Philips cm206, a double-speed proprietary +drive. In the months that I was busy writing a Linux driver for it, +proprietary drives became obsolete and IDE/ATAPI drives became the +standard. At the time of the last update to this document (November +1997) it is becoming difficult to even **find** anything less than a +16 speed CD-ROM drive, and 24 speed drives are common. + +.. _cdrom_api: + +Standardizing through another software level +============================================ + +At the time this document was conceived, all drivers directly +implemented the CD-ROM *ioctl()* calls through their own routines. This +led to the danger of different drivers forgetting to do important things +like checking that the user was giving the driver valid data. More +importantly, this led to the divergence of behavior, which has already +been discussed.
+ +For this reason, the Uniform CD-ROM Driver was created to enforce consistent +CD-ROM drive behavior, and to provide a common set of services to the various +low-level CD-ROM device drivers. The Uniform CD-ROM Driver now provides another +software-level, that separates the *ioctl()* and *open()* implementation +from the actual hardware implementation. Note that this effort has +made few changes which will affect a user's application programs. The +greatest change involved moving the contents of the various low-level +CD-ROM drivers' header files to the kernel's cdrom directory. This was +done to help ensure that the user is presented with only one cdrom +interface, the interface defined in `cdrom.h`. + +CD-ROM drives are specific enough (i. e., different from other +block-devices such as floppy or hard disc drives), to define a set +of common **CD-ROM device operations**, *<cdrom-device>_dops*. +These operations are different from the classical block-device file +operations, *<block-device>_fops*. + +The routines for the Uniform CD-ROM Driver interface level are implemented +in the file `cdrom.c`. In this file, the Uniform CD-ROM Driver interfaces +with the kernel as a block device by registering the following general +*struct file_operations*:: + + struct file_operations cdrom_fops = { + NULL, /* lseek */ + block_read, /* read—general block-dev read */ + block_write, /* write—general block-dev write */ + NULL, /* readdir */ + NULL, /* select */ + cdrom_ioctl, /* ioctl */ + NULL, /* mmap */ + cdrom_open, /* open */ + cdrom_release, /* release */ + NULL, /* fsync */ + NULL, /* fasync */ + cdrom_media_changed, /* media change */ + NULL /* revalidate */ + }; + +Every active CD-ROM device shares this *struct*. The routines +declared above are all implemented in `cdrom.c`, since this file is the +place where the behavior of all CD-ROM-devices is defined and +standardized.
The actual interface to the various types of CD-ROM +hardware is still performed by various low-level CD-ROM-device +drivers. These routines simply implement certain **capabilities** +that are common to all CD-ROM (and really, all removable-media) +devices. + +Registration of a low-level CD-ROM device driver is now done through +the general routines in `cdrom.c`, not through the Virtual File System +(VFS) any more. The interface implemented in `cdrom.c` is carried out +through two general structures that contain information about the +capabilities of the driver, and the specific drives on which the +driver operates. The structures are: + +cdrom_device_ops + This structure contains information about the low-level driver for a + CD-ROM device. This structure is conceptually connected to the major + number of the device (although some drivers may have different + major numbers, as is the case for the IDE driver). + +cdrom_device_info + This structure contains information about a particular CD-ROM drive, + such as its device name, speed, etc. This structure is conceptually + connected to the minor number of the device. + +Registering a particular CD-ROM drive with the Uniform CD-ROM Driver +is done by the low-level device driver through a call to:: + + register_cdrom(struct cdrom_device_info * <device>_info) + +The device information structure, *<device>_info*, contains all the +information needed for the kernel to interface with the low-level +CD-ROM device driver. One of the most important entries in this +structure is a pointer to the *cdrom_device_ops* structure of the +low-level driver. + +The device operations structure, *cdrom_device_ops*, contains a list +of pointers to the functions which are implemented in the low-level +device driver. When `cdrom.c` accesses a CD-ROM device, it does it +through the functions in this structure.
It is impossible to know all +the capabilities of future CD-ROM drives, so it is expected that this +list may need to be expanded from time to time as new technologies are +developed. For example, CD-R and CD-R/W drives are beginning to become +popular, and support will soon need to be added for them. For now, the +current *struct* is:: + + struct cdrom_device_ops { + int (*open)(struct cdrom_device_info *, int); + void (*release)(struct cdrom_device_info *); + int (*drive_status)(struct cdrom_device_info *, int); + unsigned int (*check_events)(struct cdrom_device_info *, + unsigned int, int); + int (*media_changed)(struct cdrom_device_info *, int); + int (*tray_move)(struct cdrom_device_info *, int); + int (*lock_door)(struct cdrom_device_info *, int); + int (*select_speed)(struct cdrom_device_info *, int); + int (*select_disc)(struct cdrom_device_info *, int); + int (*get_last_session) (struct cdrom_device_info *, + struct cdrom_multisession *); + int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *); + int (*reset)(struct cdrom_device_info *); + int (*audio_ioctl)(struct cdrom_device_info *, + unsigned int, void *); + const int capability; /* capability flags */ + int (*generic_packet)(struct cdrom_device_info *, + struct packet_command *); + }; + +When a low-level device driver implements one of these capabilities, +it should add a function pointer to this *struct*. When a particular +function is not implemented, however, this *struct* should contain a +NULL instead. The *capability* flags specify the capabilities of the +CD-ROM hardware and/or low-level CD-ROM driver when a CD-ROM drive +is registered with the Uniform CD-ROM Driver. + +Note that most functions have fewer parameters than their +*blkdev_fops* counterparts. This is because very little of the +information in the structures *inode* and *file* is used. For most +drivers, the main parameter is the *struct* *cdrom_device_info*, from +which the major and minor number can be extracted.
(Most low-level +CD-ROM drivers don't even look at the major and minor number though, +since many of them only support one device.) This will be available +through *dev* in *cdrom_device_info* described below. + +The drive-specific, minor-like information that is registered with +`cdrom.c`, currently contains the following fields:: + + struct cdrom_device_info { + const struct cdrom_device_ops * ops; /* device operations for this major */ + struct list_head list; /* linked list of all device_info */ + struct gendisk * disk; /* matching block layer disk */ + void * handle; /* driver-dependent data */ + + int mask; /* mask of capability: disables them */ + int speed; /* maximum speed for reading data */ + int capacity; /* number of discs in a jukebox */ + + unsigned int options:30; /* options flags */ + unsigned mc_flags:2; /* media-change buffer flags */ + unsigned int vfs_events; /* cached events for vfs path */ + unsigned int ioctl_events; /* cached events for ioctl path */ + int use_count; /* number of times device is opened */ + char name[20]; /* name of the device type */ + + __u8 sanyo_slot : 2; /* Sanyo 3-CD changer support */ + __u8 keeplocked : 1; /* CDROM_LOCKDOOR status */ + __u8 reserved : 5; /* not used yet */ + int cdda_method; /* see CDDA_* flags */ + __u8 last_sense; /* saves last sense key */ + __u8 media_written; /* dirty flag, DVD+RW bookkeeping */ + unsigned short mmc3_profile; /* current MMC3 profile */ + int for_data; /* unknown:TBD */ + int (*exit)(struct cdrom_device_info *);/* unknown:TBD */ + int mrw_mode_page; /* which MRW mode page is in use */ + }; + +Using this *struct*, a linked list of the registered minor devices is +built, using the *next* field. The device number, the device operations +struct and specifications of properties of the drive are stored in this +structure. + +The *mask* flags can be used to mask out some of the capabilities listed +in *ops->capability*, if a specific drive doesn't support a feature +of the driver. 
The value *speed* specifies the maximum head-rate of the +drive, measured in units of normal audio speed (176kB/sec raw data or +150kB/sec file system data). The parameters are declared *const* +because they describe properties of the drive, which don't change after +registration. + +A few registers contain variables local to the CD-ROM drive. The +flags *options* are used to specify how the general CD-ROM routines +should behave. These various flags registers should provide enough +flexibility to adapt to the different users' wishes (and **not** the +`arbitrary` wishes of the author of the low-level device driver, as is +the case in the old scheme). The register *mc_flags* is used to buffer +the information from *media_changed()* to two separate queues. Other +data that is specific to a minor drive, can be accessed through *handle*, +which can point to a data structure specific to the low-level driver. +The fields *use_count*, *next*, *options* and *mc_flags* need not be +initialized. + +The intermediate software layer that `cdrom.c` forms will perform some +additional bookkeeping. The use count of the device (the number of +processes that have the device opened) is registered in *use_count*. The +function *cdrom_ioctl()* will verify the appropriate user-memory regions +for read and write, and in case a location on the CD is transferred, +it will `sanitize` the format by making requests to the low-level +drivers in a standard format, and translating all formats between the +user-software and low level drivers. This relieves much of the drivers' +memory checking and format checking and translation. Also, the necessary +structures will be declared on the program stack. + +The implementation of the functions should be as defined in the +following sections. Two functions **must** be implemented, namely +*open()* and *release()*. Other functions may be omitted, their +corresponding capability flags will be cleared upon registration. 
+Generally, a function returns zero on success and negative on error. A +function call should return only after the command has completed, but of +course waiting for the device should not use processor time. + +:: + + int open(struct cdrom_device_info *cdi, int purpose) + +*Open()* should try to open the device for a specific *purpose*, which +can be either: + +- Open for reading data, as done by `mount()` (2), or the + user commands `dd` or `cat`. +- Open for *ioctl* commands, as done by audio-CD playing programs. + +Notice that any strategic code (closing tray upon *open()*, etc.) is +done by the calling routine in `cdrom.c`, so the low-level routine +should only be concerned with proper initialization, such as spinning +up the disc, etc. + +:: + + void release(struct cdrom_device_info *cdi) + +Device-specific actions should be taken such as spinning down the device. +However, strategic actions such as ejection of the tray, or unlocking +the door, should be left over to the general routine *cdrom_release()*. +This is the only function returning type *void*. + +.. _cdrom_drive_status: + +:: + + int drive_status(struct cdrom_device_info *cdi, int slot_nr) + +The function *drive_status*, if implemented, should provide +information on the status of the drive (not the status of the disc, +which may or may not be in the drive). If the drive is not a changer, +*slot_nr* should be ignored. In `cdrom.h` the possibilities are listed:: + + + CDS_NO_INFO /* no information available */ + CDS_NO_DISC /* no disc is inserted, tray is closed */ + CDS_TRAY_OPEN /* tray is opened */ + CDS_DRIVE_NOT_READY /* something is wrong, tray is moving? */ + CDS_DISC_OK /* a disc is loaded and everything is fine */ + +:: + + int media_changed(struct cdrom_device_info *cdi, int disc_nr) + +This function is very similar to the original function in *struct +file_operations*. It returns 1 if the medium of the device *cdi->dev* +has changed since the last call, and 0 otherwise.
The parameter +*disc_nr* identifies a specific slot in a juke-box, it should be +ignored for single-disc drives. Note that by `re-routing` this +function through *cdrom_media_changed()*, we can implement separate +queues for the VFS and a new *ioctl()* function that can report device +changes to software (e. g., an auto-mounting daemon). + +:: + + int tray_move(struct cdrom_device_info *cdi, int position) + +This function, if implemented, should control the tray movement. (No +other function should control this.) The parameter *position* controls +the desired direction of movement: + +- 0 Close tray +- 1 Open tray + +This function returns 0 upon success, and a non-zero value upon +error. Note that if the tray is already in the desired position, no +action need be taken, and the return value should be 0. + +:: + + int lock_door(struct cdrom_device_info *cdi, int lock) + +This function (and no other code) controls locking of the door, if the +drive allows this. The value of *lock* controls the desired locking +state: + +- 0 Unlock door, manual opening is allowed +- 1 Lock door, tray cannot be ejected manually + +This function returns 0 upon success, and a non-zero value upon +error. Note that if the door is already in the requested state, no +action need be taken, and the return value should be 0. + +:: + + int select_speed(struct cdrom_device_info *cdi, int speed) + +Some CD-ROM drives are capable of changing their head-speed. There +are several reasons for changing the speed of a CD-ROM drive. Badly +pressed CD-ROM s may benefit from less-than-maximum head rate. Modern +CD-ROM drives can obtain very high head rates (up to *24x* is +common). It has been reported that these drives can make reading +errors at these high speeds, reducing the speed can prevent data loss +in these circumstances. Finally, some of these drives can +make an annoyingly loud noise, which a lower speed may reduce. 
+ +This function specifies the speed at which data is read or audio is +played back. The value of *speed* specifies the head-speed of the +drive, measured in units of standard cdrom speed (176kB/sec raw data +or 150kB/sec file system data). So to request that a CD-ROM drive +operate at 300kB/sec you would call the CDROM_SELECT_SPEED *ioctl* +with *speed=2*. The special value `0` means `auto-selection`, i. e., +maximum data-rate or real-time audio rate. If the drive doesn't have +this `auto-selection` capability, the decision should be made on the +current disc loaded and the return value should be positive. A negative +return value indicates an error. + +:: + + int select_disc(struct cdrom_device_info *cdi, int number) + +If the drive can store multiple discs (a juke-box) this function +will perform disc selection. It should return the number of the +selected disc on success, a negative value on error. Currently, only +the ide-cd driver supports this functionality. + +:: + + int get_last_session(struct cdrom_device_info *cdi, + struct cdrom_multisession *ms_info) + +This function should implement the old corresponding *ioctl()*. For +device *cdi->dev*, the start of the last session of the current disc +should be returned in the pointer argument *ms_info*. Note that +routines in `cdrom.c` have sanitized this argument: its requested +format will **always** be of the type *CDROM_LBA* (linear block +addressing mode), whatever the calling software requested. But +sanitization goes even further: the low-level implementation may +return the requested information in *CDROM_MSF* format if it wishes so +(setting the *ms_info->addr_format* field appropriately, of +course) and the routines in `cdrom.c` will make the transformation if +necessary. The return value is 0 upon success. + +:: + + int get_mcn(struct cdrom_device_info *cdi, + struct cdrom_mcn *mcn) + +Some discs carry a `Media Catalog Number` (MCN), also called +`Universal Product Code` (UPC). 
This number should reflect the number +that is generally found in the bar-code on the product. Unfortunately, +the few discs that carry such a number on the disc don't even use the +same format. The return argument to this function is a pointer to a +pre-declared memory region of type *struct cdrom_mcn*. The MCN is +expected as a 13-character string, terminated by a null-character. + +:: + + int reset(struct cdrom_device_info *cdi) + +This call should perform a hard-reset on the drive (although in +circumstances that a hard-reset is necessary, a drive may very well not +listen to commands anymore). Preferably, control is returned to the +caller only after the drive has finished resetting. If the drive is no +longer listening, it may be wise for the underlying low-level cdrom +driver to time out. + +:: + + int audio_ioctl(struct cdrom_device_info *cdi, + unsigned int cmd, void *arg) + +Some of the CD-ROM-\ *ioctl()*\ 's defined in `cdrom.h` can be +implemented by the routines described above, and hence the function +*cdrom_ioctl* will use those. However, most *ioctl()*\ 's deal with +audio-control. We have decided to leave these to be accessed through a +single function, repeating the arguments *cmd* and *arg*. Note that +the latter is of type *void*, rather than *unsigned long int*. +The routine *cdrom_ioctl()* does do some useful things, +though. It sanitizes the address format type to *CDROM_MSF* (Minutes, +Seconds, Frames) for all audio calls. It also verifies the memory +location of *arg*, and reserves stack-memory for the argument. This +makes implementation of the *audio_ioctl()* much simpler than in the +old driver scheme. For example, you may look up the function +*cm206_audio_ioctl()* `cm206.c` that should be updated with +this documentation. + +An unimplemented ioctl should return *-ENOSYS*, but a harmless request +(e. g., *CDROMSTART*) may be ignored by returning 0 (success). Other +errors should be according to the standards, whatever they are. 
When +an error is returned by the low-level driver, the Uniform CD-ROM Driver +tries whenever possible to return the error code to the calling program. +(We may decide to sanitize the return value in *cdrom_ioctl()* though, in +order to guarantee a uniform interface to the audio-player software.) + +:: + + int dev_ioctl(struct cdrom_device_info *cdi, + unsigned int cmd, unsigned long arg) + +Some *ioctl()'s* seem to be specific to certain CD-ROM drives. That is, +they are introduced to service some capabilities of certain drives. In +fact, there are 6 different *ioctl()'s* for reading data, either in some +particular kind of format, or audio data. Not many drives support +reading audio tracks as data, I believe this is because of protection +of copyrights of artists. Moreover, I think that if audio-tracks are +supported, it should be done through the VFS and not via *ioctl()'s*. A +problem here could be the fact that audio-frames are 2352 bytes long, +so either the audio-file-system should ask for 75264 bytes at once +(the least common multiple of 512 and 2352), or the drivers should +bend their backs to cope with this incoherence (to which I would be +opposed). Furthermore, it is very difficult for the hardware to find +the exact frame boundaries, since there are no synchronization headers +in audio frames. Once these issues are resolved, this code should be +standardized in `cdrom.c`. + +Because there are so many *ioctl()'s* that seem to be introduced to +satisfy certain drivers [#f2]_, any non-standard *ioctl()*\ s +are routed through the call *dev_ioctl()*. In principle, `private` +*ioctl()*\ 's should be numbered after the device's major number, and not +the general CD-ROM *ioctl* number, `0x53`. Currently the +non-supported *ioctl()'s* are: + + CDROMREADMODE1, CDROMREADMODE2, CDROMREADAUDIO, CDROMREADRAW, + CDROMREADCOOKED, CDROMSEEK, CDROMPLAYBLK and CDROMREADALL + +.. [#f2] + + Is there software around that actually uses these? I'd be interested! + +..
_cdrom_capabilities: + +CD-ROM capabilities +------------------- + +Instead of just implementing some *ioctl* calls, the interface in +`cdrom.c` supplies the possibility to indicate the **capabilities** +of a CD-ROM drive. This can be done by ORing any number of +capability-constants that are defined in `cdrom.h` at the registration +phase. Currently, the capabilities are any of:: + + CDC_CLOSE_TRAY /* can close tray by software control */ + CDC_OPEN_TRAY /* can open tray */ + CDC_LOCK /* can lock and unlock the door */ + CDC_SELECT_SPEED /* can select speed, in units of ~150 kB/s */ + CDC_SELECT_DISC /* drive is juke-box */ + CDC_MULTI_SESSION /* can read sessions > 1 */ + CDC_MCN /* can read Media Catalog Number */ + CDC_MEDIA_CHANGED /* can report if disc has changed */ + CDC_PLAY_AUDIO /* can perform audio-functions (play, pause, etc) */ + CDC_RESET /* hard reset device */ + CDC_IOCTLS /* driver has non-standard ioctls */ + CDC_DRIVE_STATUS /* driver implements drive status */ + +The capability flag is declared *const*, to prevent drivers from +accidentally tampering with the contents. The capability flags actually +inform `cdrom.c` of what the driver can do. If the drive found +by the driver does not have the capability, it can be masked out by +the *cdrom_device_info* variable *mask*. For instance, the SCSI CD-ROM +driver has implemented the code for loading and ejecting CD-ROM's, and +hence its corresponding flags in *capability* will be set. But a SCSI +CD-ROM drive might be a caddy system, which can't load the tray, and +hence for this drive the *cdrom_device_info* struct will have set +the *CDC_CLOSE_TRAY* bit in *mask*. + +In the file `cdrom.c` you will encounter many constructions of the type:: + + if (cdo->capability & ~cdi->mask & CDC_<capability>) ... + +There is no *ioctl* to set the mask... The reason is that +I think it is better to control the **behavior** rather than the +**capabilities**.
+ +Options +------- + +A final flag register controls the **behavior** of the CD-ROM +drives, in order to satisfy different users' wishes, hopefully +independently of the ideas of the respective author who happened to +have made the drive's support available to the Linux community. The +current behavior options are:: + + CDO_AUTO_CLOSE /* try to close tray upon device open() */ + CDO_AUTO_EJECT /* try to open tray on last device close() */ + CDO_USE_FFLAGS /* use file_pointer->f_flags to indicate purpose for open() */ + CDO_LOCK /* try to lock door if device is opened */ + CDO_CHECK_TYPE /* ensure disc type is data if opened for data */ + +The initial value of this register is +`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK`, reflecting my own view on user +interface and software standards. Before you protest, there are two +new *ioctl()'s* implemented in `cdrom.c`, that allow you to control the +behavior by software. These are:: + + CDROM_SET_OPTIONS /* set options specified in (int)arg */ + CDROM_CLEAR_OPTIONS /* clear options specified in (int)arg */ + +One option needs some more explanation: *CDO_USE_FFLAGS*. In the next +section we explain what the need for this option is. + +A software package `setcd`, available from the Debian distribution +and `sunsite.unc.edu`, allows user level control of these flags. + + +The need to know the purpose of opening the CD-ROM device +========================================================= + +Traditionally, Unix devices can be used in two different `modes`, +either by reading/writing to the device file, or by issuing +controlling commands to the device, by the device's *ioctl()* +call. The problem with CD-ROM drives, is that they can be used for +two entirely different purposes. One is to mount removable +file systems, CD-ROM's, the other is to play audio CD's. Audio commands +are implemented entirely through *ioctl()'s*, presumably because the +first implementation (SUN?) has been such.
In principle there is +nothing wrong with this, but a good control of the `CD player` demands +that the device can **always** be opened in order to give the +*ioctl* commands, regardless of the state the drive is in. + +On the other hand, when used as a removable-media disc drive (what the +original purpose of CD-ROM s is) we would like to make sure that the +disc drive is ready for operation upon opening the device. In the old +scheme, some CD-ROM drivers don't do any integrity checking, resulting +in a number of i/o errors reported by the VFS to the kernel when an +attempt for mounting a CD-ROM on an empty drive occurs. This is not a +particularly elegant way to find out that there is no CD-ROM inserted; +it more-or-less looks like the old IBM-PC trying to read an empty floppy +drive for a couple of seconds, after which the system complains it +can't read from it. Nowadays we can **sense** the existence of a +removable medium in a drive, and we believe we should exploit that +fact. An integrity check on opening of the device, that verifies the +availability of a CD-ROM and its correct type (data), would be +desirable. + +These two ways of using a CD-ROM drive, principally for data and +secondarily for playing audio discs, have different demands for the +behavior of the *open()* call. Audio use simply wants to open the +device in order to get a file handle which is needed for issuing +*ioctl* commands, while data use wants to open for correct and +reliable data transfer. The only way user programs can indicate what +their *purpose* of opening the device is, is through the *flags* +parameter (see `open(2)`). For CD-ROM devices, these flags aren't +implemented (some drivers implement checking for write-related flags, +but this is not strictly necessary if the device file has correct +permission flags). Most option flags simply don't make sense to +CD-ROM devices: *O_CREAT*, *O_NOCTTY*, *O_TRUNC*, *O_APPEND*, and +*O_SYNC* have no meaning to a CD-ROM. 
+ +We therefore propose to use the flag *O_NONBLOCK* to indicate +that the device is opened just for issuing *ioctl* +commands. Strictly, the meaning of *O_NONBLOCK* is that opening and +subsequent calls to the device don't cause the calling process to +wait. We could interpret this as don't wait until someone has +inserted some valid data-CD-ROM. Thus, our proposal of the +implementation for the *open()* call for CD-ROM s is: + +- If no other flags are set than *O_RDONLY*, the device is opened + for data transfer, and the return value will be 0 only upon successful + initialization of the transfer. The call may even induce some actions + on the CD-ROM, such as closing the tray. +- If the option flag *O_NONBLOCK* is set, opening will always be + successful, unless the whole device doesn't exist. The drive will take + no actions whatsoever. + +And what about standards? +------------------------- + +You might hesitate to accept this proposal as it comes from the +Linux community, and not from some standardizing institute. What +about SUN, SGI, HP and all those other Unix and hardware vendors? +Well, these companies are in the lucky position that they generally +control both the hardware and software of their supported products, +and are large enough to set their own standard. They do not have to +deal with a dozen or more different, competing hardware +configurations\ [#f3]_. + +.. [#f3] + + Incidentally, I think that SUN's approach to mounting CD-ROM s is very + good in origin: under Solaris a volume-daemon automatically mounts a + newly inserted CD-ROM under `/cdrom/*<volume-name>*`. + + In my opinion they should have pushed this + further and have **every** CD-ROM on the local area network be + mounted at the similar location, i. e., no matter in which particular + machine you insert a CD-ROM, it will always appear at the same + position in the directory tree, on every system. 
When I wanted to + implement such a user-program for Linux, I came across the + differences in behavior of the various drivers, and the need for an + *ioctl* informing about media changes. + +We believe that using *O_NONBLOCK* to indicate that a device is being opened +for *ioctl* commands only can be easily introduced in the Linux +community. All the CD-player authors will have to be informed, we can +even send in our own patches to the programs. The use of *O_NONBLOCK* +has most likely no influence on the behavior of the CD-players on +other operating systems than Linux. Finally, a user can always revert +to old behavior by a call to +*ioctl(file_descriptor, CDROM_CLEAR_OPTIONS, CDO_USE_FFLAGS)*. + +The preferred strategy of *open()* +---------------------------------- + +The routines in `cdrom.c` are designed in such a way that run-time +configuration of the behavior of CD-ROM devices (of **any** type) +can be carried out, by the *CDROM_SET/CLEAR_OPTIONS* *ioctls*. Thus, various +modes of operation can be set: + +`CDO_AUTO_CLOSE | CDO_USE_FFLAGS | CDO_LOCK` + This is the default setting. (With *CDO_CHECK_TYPE* it will be better, in + the future.) If the device is not yet opened by any other process, and if + the device is being opened for data (*O_NONBLOCK* is not set) and the + tray is found to be open, an attempt to close the tray is made. Then, + it is verified that a disc is in the drive and, if *CDO_CHECK_TYPE* is + set, that it contains tracks of type `data mode 1`. Only if all tests + are passed is the return value zero. The door is locked to prevent file + system corruption. If the drive is opened for audio (*O_NONBLOCK* is + set), no actions are taken and a value of 0 will be returned. + +`CDO_AUTO_CLOSE | CDO_AUTO_EJECT | CDO_LOCK` + This mimics the behavior of the current sbpcd-driver. The option flags are + ignored, the tray is closed on the first open, if necessary. Similarly, + the tray is opened on the last release, i. 
e., if a CD-ROM is unmounted, + it is automatically ejected, such that the user can replace it. + +We hope that these options can convince everybody (both driver +maintainers and user program developers) to adopt the new CD-ROM +driver scheme and option flag interpretation. + +Description of routines in `cdrom.c` +==================================== + +Only a few routines in `cdrom.c` are exported to the drivers. In this +new section we will discuss these, as well as the functions that `take +over` the CD-ROM interface to the kernel. The header file belonging +to `cdrom.c` is called `cdrom.h`. Formerly, some of the contents of this +file were placed in the file `ucdrom.h`, but this file has now been +merged back into `cdrom.h`. + +:: + + struct file_operations cdrom_fops + +The contents of this structure were described in cdrom_api_. +A pointer to this structure is assigned to the *fops* field +of the *struct gendisk*. + +:: + + int register_cdrom(struct cdrom_device_info *cdi) + +This function is used in about the same way one registers *cdrom_fops* +with the kernel, the device operations and information structures, +as described in cdrom_api_, should be registered with the +Uniform CD-ROM Driver:: + + register_cdrom(&<device>_info); + + +This function returns zero upon success, and non-zero upon +failure. The structure *<device>_info* should have a pointer to the +driver's *<device>_dops*, as in:: + + struct cdrom_device_info <device>_info = { + <device>_dops; + ... + } + +Note that a driver must have one static structure, *<device>_dops*, while +it may have as many structures *<device>_info* as there are minor devices +active. *Register_cdrom()* builds a linked list from these. + + +:: + + void unregister_cdrom(struct cdrom_device_info *cdi) + +Unregistering device *cdi* with minor number *MINOR(cdi->dev)* removes +the minor device from the list.
If it was the last registered minor for +the low-level driver, this disconnects the registered device-operation +routines from the CD-ROM interface. This function returns zero upon +success, and non-zero upon failure. + +:: + + int cdrom_open(struct inode * ip, struct file * fp) + +This function is not called directly by the low-level drivers, it is +listed in the standard *cdrom_fops*. If the VFS opens a file, this +function becomes active. A strategy is implemented in this routine, +taking care of all capabilities and options that are set in the +*cdrom_device_ops* connected to the device. Then, the program flow is +transferred to the device-dependent *open()* call. + +:: + + void cdrom_release(struct inode *ip, struct file *fp) + +This function implements the reverse-logic of *cdrom_open()*, and then +calls the device-dependent *release()* routine. When the use-count has +reached 0, the allocated buffers are flushed by calls to *sync_dev(dev)* +and *invalidate_buffers(dev)*. + + +.. _cdrom_ioctl: + +:: + + int cdrom_ioctl(struct inode *ip, struct file *fp, + unsigned int cmd, unsigned long arg) + +This function handles all the standard *ioctl* requests for CD-ROM +devices in a uniform way. The different calls fall into three +categories: *ioctl()'s* that can be directly implemented by device +operations, ones that are routed through the call *audio_ioctl()*, and +the remaining ones, that are presumably device-dependent. Generally, a +negative return value indicates an error. + +Directly implemented *ioctl()'s* +-------------------------------- + +The following `old` CD-ROM *ioctl()*\ 's are implemented by directly +calling device-operations in *cdrom_device_ops*, if implemented and +not masked: + +`CDROMMULTISESSION` + Requests the last session on a CD-ROM. +`CDROMEJECT` + Open tray. +`CDROMCLOSETRAY` + Close tray.
+`CDROMEJECT_SW` + If *arg != 0*, set behavior to auto-close (close + tray on first open) and auto-eject (eject on last release), otherwise + set behavior to non-moving on *open()* and *release()* calls. +`CDROM_GET_MCN` + Get the Media Catalog Number from a CD. + +*Ioctl*s routed through *audio_ioctl()* +--------------------------------------- + +The following set of *ioctl()'s* are all implemented through a call to +the *cdrom_fops* function *audio_ioctl()*. Memory checks and +allocation are performed in *cdrom_ioctl()*, and also sanitization of +address format (*CDROM_LBA*/*CDROM_MSF*) is done. + +`CDROMSUBCHNL` + Get sub-channel data in argument *arg* of type + `struct cdrom_subchnl *`. +`CDROMREADTOCHDR` + Read Table of Contents header, in *arg* of type + `struct cdrom_tochdr *`. +`CDROMREADTOCENTRY` + Read a Table of Contents entry in *arg* and specified by *arg* + of type `struct cdrom_tocentry *`. +`CDROMPLAYMSF` + Play audio fragment specified in Minute, Second, Frame format, + delimited by *arg* of type `struct cdrom_msf *`. +`CDROMPLAYTRKIND` + Play audio fragment in track-index format delimited by *arg* + of type `struct cdrom_ti *`. +`CDROMVOLCTRL` + Set volume specified by *arg* of type `struct cdrom_volctrl *`. +`CDROMVOLREAD` + Read volume into *arg* of type `struct cdrom_volctrl *`. +`CDROMSTART` + Spin up disc. +`CDROMSTOP` + Stop playback of audio fragment. +`CDROMPAUSE` + Pause playback of audio fragment. +`CDROMRESUME` + Resume playing. + +New *ioctl()'s* in `cdrom.c` +---------------------------- + +The following *ioctl()'s* have been introduced to allow user programs to +control the behavior of individual CD-ROM devices. New *ioctl* +commands can be identified by the underscores in their names. + +`CDROM_SET_OPTIONS` + Set options specified by *arg*. Returns the option flag register + after modification. Use *arg = 0* for reading the current flags. +`CDROM_CLEAR_OPTIONS` + Clear options specified by *arg*.
Returns the option flag register + after modification. +`CDROM_SELECT_SPEED` + Select head-rate speed of disc as specified by *arg* in units + of standard cdrom speed (176kB/sec raw data or + 150kB/sec file system data). The value 0 means `auto-select`, + i. e., play audio discs at real time and data discs at maximum speed. + The value *arg* is checked against the maximum head rate of the + drive found in the *cdrom_dops*. +`CDROM_SELECT_DISC` + Select disc numbered *arg* from a juke-box. + + First disc is numbered 0. The number *arg* is checked against the + maximum number of discs in the juke-box found in the *cdrom_dops*. +`CDROM_MEDIA_CHANGED` + Returns 1 if a disc has been changed since the last call. + Note that calls to *cdrom_media_changed* by the VFS are treated + by an independent queue, so both mechanisms will detect a + media change once. For juke-boxes, an extra argument *arg* + specifies the slot for which the information is given. The special + value *CDSL_CURRENT* requests that information about the currently + selected slot be returned. +`CDROM_DRIVE_STATUS` + Returns the status of the drive by a call to + *drive_status()*. Return values are defined in cdrom_drive_status_. + Note that this call doesn't return information on the + current playing activity of the drive; this can be polled through + an *ioctl* call to *CDROMSUBCHNL*. For juke-boxes, an extra argument + *arg* specifies the slot for which (possibly limited) information is + given. The special value *CDSL_CURRENT* requests that information + about the currently selected slot be returned. +`CDROM_DISC_STATUS` + Returns the type of the disc currently in the drive. + It should be viewed as a complement to *CDROM_DRIVE_STATUS*. + This *ioctl* can provide *some* information about the current + disc that is inserted in the drive. This functionality used to be + implemented in the low level drivers, but is now carried out + entirely in the Uniform CD-ROM Driver.
+ + The history of development of the CD's use as a carrier medium for + various digital information has led to many different disc types. + This *ioctl* is useful only in the case that CDs have *only + one* type of data on them. While this is often the case, it is + also very common for CDs to have some tracks with data, and some + tracks with audio. Because this is an existing interface, rather + than fixing this interface by changing the assumptions it was made + under, thereby breaking all user applications that use this + function, the Uniform CD-ROM Driver implements this *ioctl* as + follows: If the CD in question has audio tracks on it, and it has + absolutely no CD-I, XA, or data tracks on it, it will be reported + as *CDS_AUDIO*. If it has both audio and data tracks, it will + return *CDS_MIXED*. If there are no audio tracks on the disc, and + if the CD in question has any CD-I tracks on it, it will be + reported as *CDS_XA_2_2*. Failing that, if the CD in question + has any XA tracks on it, it will be reported as *CDS_XA_2_1*. + Finally, if the CD in question has any data tracks on it, + it will be reported as a data CD (*CDS_DATA_1*). + + This *ioctl* can return:: + + CDS_NO_INFO /* no information available */ + CDS_NO_DISC /* no disc is inserted, or tray is opened */ + CDS_AUDIO /* Audio disc (2352 audio bytes/frame) */ + CDS_DATA_1 /* data disc, mode 1 (2048 user bytes/frame) */ + CDS_XA_2_1 /* mixed data (XA), mode 2, form 1 (2048 user bytes) */ + CDS_XA_2_2 /* mixed data (XA), mode 2, form 2 (2324 user bytes) */ + CDS_MIXED /* mixed audio/data disc */ + + For some information concerning frame layout of the various disc + types, see a recent version of `cdrom.h`. + +`CDROM_CHANGER_NSLOTS` + Returns the number of slots in a juke-box. +`CDROMRESET` + Reset the drive. +`CDROM_GET_CAPABILITY` + Returns the *capability* flags for the drive. Refer to section + cdrom_capabilities_ for more information on these flags.
+`CDROM_LOCKDOOR` + Locks the door of the drive. `arg == 0` unlocks the door, + any other value locks it. +`CDROM_DEBUG` + Turns on debugging info. Only root is allowed to do this. + Same semantics as CDROM_LOCKDOOR. + + +Device dependent *ioctl()'s* +---------------------------- + +Finally, all other *ioctl()'s* are passed to the function *dev_ioctl()*, +if implemented. No memory allocation or verification is carried out. + +How to update your driver +========================= + +- Make a backup of your current driver. +- Get hold of the files `cdrom.c` and `cdrom.h`, they should be in + the directory tree that came with this documentation. +- Make sure you include `cdrom.h`. +- Change the 3rd argument of *register_blkdev* from `&<your-drive>_fops` + to `&cdrom_fops`. +- Just after that line, add the following to register with the Uniform + CD-ROM Driver:: + + register_cdrom(&<your-drive>_info); + + Similarly, add a call to *unregister_cdrom()* at the appropriate place. +- Copy an example of the device-operations *struct* to your + source, e. g., from `cm206.c` *cm206_dops*, and change all + entries to names corresponding to your driver, or names you just + happen to like. If your driver doesn't support a certain function, + make the entry *NULL*. At the entry *capability* you should list all + capabilities your driver currently supports. If your driver + has a capability that is not listed, please send me a message. +- Copy the *cdrom_device_info* declaration from the same example + driver, and modify the entries according to your needs. If your + driver dynamically determines the capabilities of the hardware, this + structure should also be declared dynamically. +- Implement all functions in your `<device>_dops` structure, + according to prototypes listed in `cdrom.h`, and specifications given + in cdrom_api_. Most likely you have already implemented + the code in a large part, and you will almost certainly need to adapt the + prototype and return values.
+- Rename your `<device>_ioctl()` function to *audio_ioctl* and + change the prototype a little. Remove entries listed in the first + part in cdrom_ioctl_, if your code was OK, these are + just calls to the routines you adapted in the previous step. +- You may remove all remaining memory checking code in the + *audio_ioctl()* function that deals with audio commands (these are + listed in the second part of cdrom_ioctl_). There is no + need for memory allocation either, so most *case*\ s in the *switch* + statement look similar to:: + + case CDROMREADTOCENTRY: + get_toc_entry((struct cdrom_tocentry *) arg); + +- All remaining *ioctl* cases must be moved to a separate + function, *<device>_ioctl*, the device-dependent *ioctl()'s*. Note that + memory checking and allocation must be kept in this code! +- Change the prototypes of *<device>_open()* and + *<device>_release()*, and remove any strategic code (i. e., tray + movement, door locking, etc.). +- Try to recompile the drivers. We advise you to use modules, both + for `cdrom.o` and your driver, as debugging is much easier this + way. + +Thanks +====== + +Thanks to all the people involved. First, Erik Andersen, who has +taken over the torch in maintaining `cdrom.c` and integrating much +CD-ROM-related code in the 2.1-kernel. Thanks to Scott Snyder and +Gerd Knorr, who were the first to implement this interface for SCSI +and IDE-CD drivers and added many ideas for extension of the data +structures relative to kernel 2.0. Further thanks to Heiko Eißfeldt, +Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard Mönkeberg and Andrew Kroll, +the Linux CD-ROM device driver developers who were kind +enough to give suggestions and criticisms during the writing. Finally +of course, I want to thank Linus Torvalds for making this possible in +the first place.
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex deleted file mode 100644 index f7cd455973f7..000000000000 --- a/Documentation/cdrom/cdrom-standard.tex +++ /dev/null @@ -1,1026 +0,0 @@ -\documentclass{article} -\def\version{$Id: cdrom-standard.tex,v 1.9 1997/12/28 15:42:49 david Exp $} -\newcommand{\newsection}[1]{\newpage\section{#1}} - -\evensidemargin=0pt -\oddsidemargin=0pt -\topmargin=-\headheight \advance\topmargin by -\headsep -\textwidth=15.99cm \textheight=24.62cm % normal A4, 1'' margin - -\def\linux{{\sc Linux}} -\def\cdrom{{\sc cd-rom}} -\def\UCD{{\sc Uniform cd-rom Driver}} -\def\cdromc{{\tt {cdrom.c}}} -\def\cdromh{{\tt {cdrom.h}}} -\def\fo{\sl} % foreign words -\def\ie{{\fo i.e.}} -\def\eg{{\fo e.g.}} - -\everymath{\it} \everydisplay{\it} -\catcode `\_=\active \def_{\_\penalty100 } -\catcode`\<=\active \def<#1>{{\langle\hbox{\rm#1}\rangle}} - -\begin{document} -\title{A \linux\ \cdrom\ standard} -\author{David van Leeuwen\\{\normalsize\tt david@ElseWare.cistron.nl} -\\{\footnotesize updated by Erik Andersen {\tt(andersee@debian.org)}} -\\{\footnotesize updated by Jens Axboe {\tt(axboe@image.dk)}}} -\date{12 March 1999} - -\maketitle - -\newsection{Introduction} - -\linux\ is probably the Unix-like operating system that supports -the widest variety of hardware devices. The reasons for this are -presumably -\begin{itemize} -\item - The large list of hardware devices available for the many platforms - that \linux\ now supports (\ie, i386-PCs, Sparc Suns, etc.) -\item - The open design of the operating system, such that anybody can write a - driver for \linux. -\item - There is plenty of source code around as examples of how to write a driver. -\end{itemize} -The openness of \linux, and the many different types of available -hardware has allowed \linux\ to support many different hardware devices. 
-Unfortunately, the very openness that has allowed \linux\ to support -all these different devices has also allowed the behavior of each -device driver to differ significantly from one device to another. -This divergence of behavior has been very significant for \cdrom\ -devices; the way a particular drive reacts to a `standard' $ioctl()$ -call varies greatly from one device driver to another. To avoid making -their drivers totally inconsistent, the writers of \linux\ \cdrom\ -drivers generally created new device drivers by understanding, copying, -and then changing an existing one. Unfortunately, this practice did not -maintain uniform behavior across all the \linux\ \cdrom\ drivers. - -This document describes an effort to establish Uniform behavior across -all the different \cdrom\ device drivers for \linux. This document also -defines the various $ioctl$s, and how the low-level \cdrom\ device -drivers should implement them. Currently (as of the \linux\ 2.1.$x$ -development kernels) several low-level \cdrom\ device drivers, including -both IDE/ATAPI and SCSI, now use this Uniform interface. - -When the \cdrom\ was developed, the interface between the \cdrom\ drive -and the computer was not specified in the standards. As a result, many -different \cdrom\ interfaces were developed. Some of them had their -own proprietary design (Sony, Mitsumi, Panasonic, Philips), other -manufacturers adopted an existing electrical interface and changed -the functionality (CreativeLabs/SoundBlaster, Teac, Funai) or simply -adapted their drives to one or more of the already existing electrical -interfaces (Aztech, Sanyo, Funai, Vertos, Longshine, Optics Storage and -most of the `NoName' manufacturers). In cases where a new drive really -brought its own interface or used its own command set and flow control -scheme, either a separate driver had to be written, or an existing -driver had to be enhanced. History has delivered us \cdrom\ support for -many of these different interfaces. 
Nowadays, almost all new \cdrom\ -drives are either IDE/ATAPI or SCSI, and it is very unlikely that any -manufacturer will create a new interface. Even finding drives for the -old proprietary interfaces is getting difficult. - -When (in the 1.3.70's) I looked at the existing software interface, -which was expressed through \cdromh, it appeared to be a rather wild -set of commands and data formats.\footnote{I cannot recollect what -kernel version I looked at, then, presumably 1.2.13 and 1.3.34---the -latest kernel that I was indirectly involved in.} It seemed that many -features of the software interface had been added to accommodate the -capabilities of a particular drive, in an {\fo ad hoc\/} manner. More -importantly, it appeared that the behavior of the `standard' commands -was different for most of the different drivers: \eg, some drivers -close the tray if an $open()$ call occurs when the tray is open, while -others do not. Some drivers lock the door upon opening the device, to -prevent an incoherent file system, but others don't, to allow software -ejection. Undoubtedly, the capabilities of the different drives vary, -but even when two drives have the same capability their drivers' -behavior was usually different. - -I decided to start a discussion on how to make all the \linux\ \cdrom\ -drivers behave more uniformly. I began by contacting the developers of -the many \cdrom\ drivers found in the \linux\ kernel. Their reactions -encouraged me to write the \UCD\ which this document is intended to -describe. The implementation of the \UCD\ is in the file \cdromc. This -driver is intended to be an additional software layer that sits on top -of the low-level device drivers for each \cdrom\ drive. By adding this -additional layer, it is possible to have all the different \cdrom\ -devices behave {\em exactly\/} the same (insofar as the underlying -hardware will allow). 
- -The goal of the \UCD\ is {\em not\/} to alienate driver developers who -have not yet taken steps to support this effort. The goal of \UCD\ is -simply to give people writing application programs for \cdrom\ drives -{\em one\/} \linux\ \cdrom\ interface with consistent behavior for all -\cdrom\ devices. In addition, this also provides a consistent interface -between the low-level device driver code and the \linux\ kernel. Care -is taken that 100\,\% compatibility exists with the data structures and -programmer's interface defined in \cdromh. This guide was written to -help \cdrom\ driver developers adapt their code to use the \UCD\ code -defined in \cdromc. - -Personally, I think that the most important hardware interfaces are -the IDE/ATAPI drives and, of course, the SCSI drives, but as prices -of hardware drop continuously, it is also likely that people may have -more than one \cdrom\ drive, possibly of mixed types. It is important -that these drives behave in the same way. In December 1994, one of the -cheapest \cdrom\ drives was a Philips cm206, a double-speed proprietary -drive. In the months that I was busy writing a \linux\ driver for it, -proprietary drives became obsolete and IDE/ATAPI drives became the -standard. At the time of the last update to this document (November -1997) it is becoming difficult to even {\em find} anything less than a -16 speed \cdrom\ drive, and 24 speed drives are common. - -\newsection{Standardizing through another software level} -\label{cdrom.c} - -At the time this document was conceived, all drivers directly -implemented the \cdrom\ $ioctl()$ calls through their own routines. This -led to the danger of different drivers forgetting to do important things -like checking that the user was giving the driver valid data. More -importantly, this led to the divergence of behavior, which has already -been discussed. 
- -For this reason, the \UCD\ was created to enforce consistent \cdrom\ -drive behavior, and to provide a common set of services to the various -low-level \cdrom\ device drivers. The \UCD\ now provides another -software-level, that separates the $ioctl()$ and $open()$ implementation -from the actual hardware implementation. Note that this effort has -made few changes which will affect a user's application programs. The -greatest change involved moving the contents of the various low-level -\cdrom\ drivers' header files to the kernel's cdrom directory. This was -done to help ensure that the user is only presented with only one cdrom -interface, the interface defined in \cdromh. - -\cdrom\ drives are specific enough (\ie, different from other -block-devices such as floppy or hard disc drives), to define a set -of common {\em \cdrom\ device operations}, $<cdrom-device>_dops$. -These operations are different from the classical block-device file -operations, $<block-device>_fops$. - -The routines for the \UCD\ interface level are implemented in the file -\cdromc. In this file, the \UCD\ interfaces with the kernel as a block -device by registering the following general $struct\ file_operations$: -$$ -\halign{$#$\ \hfil&$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -struct& file_operations\ cdrom_fops = \{\hidewidth\cr - &NULL, & lseek \cr - &block_read, & read---general block-dev read \cr - &block_write, & write---general block-dev write \cr - &NULL, & readdir \cr - &NULL, & select \cr - &cdrom_ioctl, & ioctl \cr - &NULL, & mmap \cr - &cdrom_open, & open \cr - &cdrom_release, & release \cr - &NULL, & fsync \cr - &NULL, & fasync \cr - &cdrom_media_changed, & media change \cr - &NULL & revalidate \cr -\};\cr -} -$$ - -Every active \cdrom\ device shares this $struct$. The routines -declared above are all implemented in \cdromc, since this file is the -place where the behavior of all \cdrom-devices is defined and -standardized. 
The actual interface to the various types of \cdrom\ -hardware is still performed by various low-level \cdrom-device -drivers. These routines simply implement certain {\em capabilities\/} -that are common to all \cdrom\ (and really, all removable-media -devices). - -Registration of a low-level \cdrom\ device driver is now done through -the general routines in \cdromc, not through the Virtual File System -(VFS) any more. The interface implemented in \cdromc\ is carried out -through two general structures that contain information about the -capabilities of the driver, and the specific drives on which the -driver operates. The structures are: -\begin{description} -\item[$cdrom_device_ops$] - This structure contains information about the low-level driver for a - \cdrom\ device. This structure is conceptually connected to the major - number of the device (although some drivers may have different - major numbers, as is the case for the IDE driver). -\item[$cdrom_device_info$] - This structure contains information about a particular \cdrom\ drive, - such as its device name, speed, etc. This structure is conceptually - connected to the minor number of the device. -\end{description} - -Registering a particular \cdrom\ drive with the \UCD\ is done by the -low-level device driver though a call to: -$$register_cdrom(struct\ cdrom_device_info * <device>_info) -$$ -The device information structure, $<device>_info$, contains all the -information needed for the kernel to interface with the low-level -\cdrom\ device driver. One of the most important entries in this -structure is a pointer to the $cdrom_device_ops$ structure of the -low-level driver. - -The device operations structure, $cdrom_device_ops$, contains a list -of pointers to the functions which are implemented in the low-level -device driver. When \cdromc\ accesses a \cdrom\ device, it does it -through the functions in this structure. 
It is impossible to know all -the capabilities of future \cdrom\ drives, so it is expected that this -list may need to be expanded from time to time as new technologies are -developed. For example, CD-R and CD-R/W drives are beginning to become -popular, and support will soon need to be added for them. For now, the -current $struct$ is: -$$ -\halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}& - $/*$ \rm# $*/$\hfil\cr -struct& cdrom_device_ops\ \{ \hidewidth\cr - &int& (* open)(struct\ cdrom_device_info *, int)\cr - &void& (* release)(struct\ cdrom_device_info *);\cr - &int& (* drive_status)(struct\ cdrom_device_info *, int);\cr - &unsigned\ int& (* check_events)(struct\ cdrom_device_info *, unsigned\ int, int);\cr - &int& (* media_changed)(struct\ cdrom_device_info *, int);\cr - &int& (* tray_move)(struct\ cdrom_device_info *, int);\cr - &int& (* lock_door)(struct\ cdrom_device_info *, int);\cr - &int& (* select_speed)(struct\ cdrom_device_info *, int);\cr - &int& (* select_disc)(struct\ cdrom_device_info *, int);\cr - &int& (* get_last_session) (struct\ cdrom_device_info *, - struct\ cdrom_multisession *{});\cr - &int& (* get_mcn)(struct\ cdrom_device_info *, struct\ cdrom_mcn *{});\cr - &int& (* reset)(struct\ cdrom_device_info *);\cr - &int& (* audio_ioctl)(struct\ cdrom_device_info *, unsigned\ int, - void *{});\cr -\noalign{\medskip} - &const\ int& capability;& capability flags \cr - &int& (* generic_packet)(struct\ cdrom_device_info *, struct\ packet_command *{});\cr -\};\cr -} -$$ -When a low-level device driver implements one of these capabilities, -it should add a function pointer to this $struct$. When a particular -function is not implemented, however, this $struct$ should contain a -NULL instead. The $capability$ flags specify the capabilities of the -\cdrom\ hardware and/or low-level \cdrom\ driver when a \cdrom\ drive -is registered with the \UCD. - -Note that most functions have fewer parameters than their -$blkdev_fops$ counterparts. 
This is because very little of the -information in the structures $inode$ and $file$ is used. For most -drivers, the main parameter is the $struct$ $cdrom_device_info$, from -which the major and minor number can be extracted. (Most low-level -\cdrom\ drivers don't even look at the major and minor number though, -since many of them only support one device.) This will be available -through $dev$ in $cdrom_device_info$ described below. - -The drive-specific, minor-like information that is registered with -\cdromc, currently contains the following fields: -$$ -\halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}& - $/*$ \rm# $*/$\hfil\cr -struct& cdrom_device_info\ \{ \hidewidth\cr - & const\ struct\ cdrom_device_ops *& ops;& device operations for this major\cr - & struct\ list_head& list;& linked list of all device_info\cr - & struct\ gendisk *& disk;& matching block layer disk\cr - & void *& handle;& driver-dependent data\cr -\noalign{\medskip} - & int& mask;& mask of capability: disables them \cr - & int& speed;& maximum speed for reading data \cr - & int& capacity;& number of discs in a jukebox \cr -\noalign{\medskip} - &unsigned\ int& options : 30;& options flags \cr - &unsigned& mc_flags : 2;& media-change buffer flags \cr - &unsigned\ int& vfs_events;& cached events for vfs path\cr - &unsigned\ int& ioctl_events;& cached events for ioctl path\cr - & int& use_count;& number of times device is opened\cr - & char& name[20];& name of the device type\cr -\noalign{\medskip} - &__u8& sanyo_slot : 2;& Sanyo 3-CD changer support\cr - &__u8& keeplocked : 1;& CDROM_LOCKDOOR status\cr - &__u8& reserved : 5;& not used yet\cr - & int& cdda_method;& see CDDA_* flags\cr - &__u8& last_sense;& saves last sense key\cr - &__u8& media_written;& dirty flag, DVD+RW bookkeeping\cr - &unsigned\ short& mmc3_profile;& current MMC3 profile\cr - & int& for_data;& unknown:TBD\cr - & int\ (* exit)\ (struct\ cdrom_device_info *);&& unknown:TBD\cr - & int& mrw_mode_page;& which MRW mode page is 
in use\cr -\}\cr -}$$ -Using this $struct$, a linked list of the registered minor devices is -built, using the $next$ field. The device number, the device operations -struct and specifications of properties of the drive are stored in this -structure. - -The $mask$ flags can be used to mask out some of the capabilities listed -in $ops\to capability$, if a specific drive doesn't support a feature -of the driver. The value $speed$ specifies the maximum head-rate of the -drive, measured in units of normal audio speed (176\,kB/sec raw data or -150\,kB/sec file system data). The parameters are declared $const$ -because they describe properties of the drive, which don't change after -registration. - -A few registers contain variables local to the \cdrom\ drive. The -flags $options$ are used to specify how the general \cdrom\ routines -should behave. These various flags registers should provide enough -flexibility to adapt to the different users' wishes (and {\em not\/} the -`arbitrary' wishes of the author of the low-level device driver, as is -the case in the old scheme). The register $mc_flags$ is used to buffer -the information from $media_changed()$ to two separate queues. Other -data that is specific to a minor drive, can be accessed through $handle$, -which can point to a data structure specific to the low-level driver. -The fields $use_count$, $next$, $options$ and $mc_flags$ need not be -initialized. - -The intermediate software layer that \cdromc\ forms will perform some -additional bookkeeping. The use count of the device (the number of -processes that have the device opened) is registered in $use_count$. The -function $cdrom_ioctl()$ will verify the appropriate user-memory regions -for read and write, and in case a location on the CD is transferred, -it will `sanitize' the format by making requests to the low-level -drivers in a standard format, and translating all formats between the -user-software and low level drivers. 
This relieves much of the drivers' -memory checking and format checking and translation. Also, the necessary -structures will be declared on the program stack. - -The implementation of the functions should be as defined in the -following sections. Two functions {\em must\/} be implemented, namely -$open()$ and $release()$. Other functions may be omitted, their -corresponding capability flags will be cleared upon registration. -Generally, a function returns zero on success and negative on error. A -function call should return only after the command has completed, but of -course waiting for the device should not use processor time. - -\subsection{$Int\ open(struct\ cdrom_device_info * cdi, int\ purpose)$} - -$Open()$ should try to open the device for a specific $purpose$, which -can be either: -\begin{itemize} -\item[0] Open for reading data, as done by {\tt {mount()}} (2), or the -user commands {\tt {dd}} or {\tt {cat}}. -\item[1] Open for $ioctl$ commands, as done by audio-CD playing -programs. -\end{itemize} -Notice that any strategic code (closing tray upon $open()$, etc.)\ is -done by the calling routine in \cdromc, so the low-level routine -should only be concerned with proper initialization, such as spinning -up the disc, etc. % and device-use count - - -\subsection{$Void\ release(struct\ cdrom_device_info * cdi)$} - - -Device-specific actions should be taken such as spinning down the device. -However, strategic actions such as ejection of the tray, or unlocking -the door, should be left over to the general routine $cdrom_release()$. -This is the only function returning type $void$. - -\subsection{$Int\ drive_status(struct\ cdrom_device_info * cdi, int\ slot_nr)$} -\label{drive status} - -The function $drive_status$, if implemented, should provide -information on the status of the drive (not the status of the disc, -which may or may not be in the drive). If the drive is not a changer, -$slot_nr$ should be ignored. 
In \cdromh\ the possibilities are listed: -$$ -\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -CDS_NO_INFO& no information available\cr -CDS_NO_DISC& no disc is inserted, tray is closed\cr -CDS_TRAY_OPEN& tray is opened\cr -CDS_DRIVE_NOT_READY& something is wrong, tray is moving?\cr -CDS_DISC_OK& a disc is loaded and everything is fine\cr -} -$$ - -\subsection{$Int\ media_changed(struct\ cdrom_device_info * cdi, int\ disc_nr)$} - -This function is very similar to the original function in $struct\ -file_operations$. It returns 1 if the medium of the device $cdi\to -dev$ has changed since the last call, and 0 otherwise. The parameter -$disc_nr$ identifies a specific slot in a juke-box, it should be -ignored for single-disc drives. Note that by `re-routing' this -function through $cdrom_media_changed()$, we can implement separate -queues for the VFS and a new $ioctl()$ function that can report device -changes to software (\eg, an auto-mounting daemon). - -\subsection{$Int\ tray_move(struct\ cdrom_device_info * cdi, int\ position)$} - -This function, if implemented, should control the tray movement. (No -other function should control this.) The parameter $position$ controls -the desired direction of movement: -\begin{itemize} -\item[0] Close tray -\item[1] Open tray -\end{itemize} -This function returns 0 upon success, and a non-zero value upon -error. Note that if the tray is already in the desired position, no -action need be taken, and the return value should be 0. - -\subsection{$Int\ lock_door(struct\ cdrom_device_info * cdi, int\ lock)$} - -This function (and no other code) controls locking of the door, if the -drive allows this. The value of $lock$ controls the desired locking -state: -\begin{itemize} -\item[0] Unlock door, manual opening is allowed -\item[1] Lock door, tray cannot be ejected manually -\end{itemize} -This function returns 0 upon success, and a non-zero value upon -error. 
Note that if the door is already in the requested state, no -action need be taken, and the return value should be 0. - -\subsection{$Int\ select_speed(struct\ cdrom_device_info * cdi, int\ speed)$} - -Some \cdrom\ drives are capable of changing their head-speed. There -are several reasons for changing the speed of a \cdrom\ drive. Badly -pressed \cdrom s may benefit from less-than-maximum head rate. Modern -\cdrom\ drives can obtain very high head rates (up to $24\times$ is -common). It has been reported that these drives can make reading -errors at these high speeds, reducing the speed can prevent data loss -in these circumstances. Finally, some of these drives can -make an annoyingly loud noise, which a lower speed may reduce. %Finally, -%although the audio-low-pass filters probably aren't designed for it, -%more than real-time playback of audio might be used for high-speed -%copying of audio tracks. - -This function specifies the speed at which data is read or audio is -played back. The value of $speed$ specifies the head-speed of the -drive, measured in units of standard cdrom speed (176\,kB/sec raw data -or 150\,kB/sec file system data). So to request that a \cdrom\ drive -operate at 300\,kB/sec you would call the CDROM_SELECT_SPEED $ioctl$ -with $speed=2$. The special value `0' means `auto-selection', \ie, -maximum data-rate or real-time audio rate. If the drive doesn't have -this `auto-selection' capability, the decision should be made on the -current disc loaded and the return value should be positive. A negative -return value indicates an error. - -\subsection{$Int\ select_disc(struct\ cdrom_device_info * cdi, int\ number)$} - -If the drive can store multiple discs (a juke-box) this function -will perform disc selection. It should return the number of the -selected disc on success, a negative value on error. Currently, only -the ide-cd driver supports this functionality. 
- -\subsection{$Int\ get_last_session(struct\ cdrom_device_info * cdi, struct\ - cdrom_multisession * ms_info)$} - -This function should implement the old corresponding $ioctl()$. For -device $cdi\to dev$, the start of the last session of the current disc -should be returned in the pointer argument $ms_info$. Note that -routines in \cdromc\ have sanitized this argument: its requested -format will {\em always\/} be of the type $CDROM_LBA$ (linear block -addressing mode), whatever the calling software requested. But -sanitization goes even further: the low-level implementation may -return the requested information in $CDROM_MSF$ format if it wishes so -(setting the $ms_info\rightarrow addr_format$ field appropriately, of -course) and the routines in \cdromc\ will make the transformation if -necessary. The return value is 0 upon success. - -\subsection{$Int\ get_mcn(struct\ cdrom_device_info * cdi, struct\ - cdrom_mcn * mcn)$} - -Some discs carry a `Media Catalog Number' (MCN), also called -`Universal Product Code' (UPC). This number should reflect the number -that is generally found in the bar-code on the product. Unfortunately, -the few discs that carry such a number on the disc don't even use the -same format. The return argument to this function is a pointer to a -pre-declared memory region of type $struct\ cdrom_mcn$. The MCN is -expected as a 13-character string, terminated by a null-character. - -\subsection{$Int\ reset(struct\ cdrom_device_info * cdi)$} - -This call should perform a hard-reset on the drive (although in -circumstances that a hard-reset is necessary, a drive may very well not -listen to commands anymore). Preferably, control is returned to the -caller only after the drive has finished resetting. If the drive is no -longer listening, it may be wise for the underlying low-level cdrom -driver to time out. 
- -\subsection{$Int\ audio_ioctl(struct\ cdrom_device_info * cdi, unsigned\ - int\ cmd, void * arg)$} - -Some of the \cdrom-$ioctl$s defined in \cdromh\ can be -implemented by the routines described above, and hence the function -$cdrom_ioctl$ will use those. However, most $ioctl$s deal with -audio-control. We have decided to leave these to be accessed through a -single function, repeating the arguments $cmd$ and $arg$. Note that -the latter is of type $void*{}$, rather than $unsigned\ long\ -int$. The routine $cdrom_ioctl()$ does do some useful things, -though. It sanitizes the address format type to $CDROM_MSF$ (Minutes, -Seconds, Frames) for all audio calls. It also verifies the memory -location of $arg$, and reserves stack-memory for the argument. This -makes implementation of the $audio_ioctl()$ much simpler than in the -old driver scheme. For example, you may look up the function -$cm206_audio_ioctl()$ in {\tt {cm206.c}} that should be updated with -this documentation. - -An unimplemented ioctl should return $-ENOSYS$, but a harmless request -(\eg, $CDROMSTART$) may be ignored by returning 0 (success). Other -errors should be according to the standards, whatever they are. When -an error is returned by the low-level driver, the \UCD\ tries whenever -possible to return the error code to the calling program. (We may decide -to sanitize the return value in $cdrom_ioctl()$ though, in order to -guarantee a uniform interface to the audio-player software.) - -\subsection{$Int\ dev_ioctl(struct\ cdrom_device_info * cdi, unsigned\ int\ - cmd, unsigned\ long\ arg)$} - -Some $ioctl$s seem to be specific to certain \cdrom\ drives. That is, -they are introduced to service some capabilities of certain drives. In -fact, there are 6 different $ioctl$s for reading data, either in some -particular kind of format, or audio data. Not many drives support -reading audio tracks as data, I believe this is because of protection -of copyrights of artists. 
Moreover, I think that if audio-tracks are -supported, it should be done through the VFS and not via $ioctl$s. A -problem here could be the fact that audio-frames are 2352 bytes long, -so either the audio-file-system should ask for 75264 bytes at once -(the least common multiple of 512 and 2352), or the drivers should -bend their backs to cope with this incoherence (to which I would be -opposed). Furthermore, it is very difficult for the hardware to find -the exact frame boundaries, since there are no synchronization headers -in audio frames. Once these issues are resolved, this code should be -standardized in \cdromc. - -Because there are so many $ioctl$s that seem to be introduced to -satisfy certain drivers,\footnote{Is there software around that - actually uses these? I'd be interested!} any `non-standard' $ioctl$s -are routed through the call $dev_ioctl()$. In principle, `private' -$ioctl$s should be numbered after the device's major number, and not -the general \cdrom\ $ioctl$ number, {\tt {0x53}}. Currently the -non-supported $ioctl$s are: {\it CDROMREADMODE1, CDROMREADMODE2, - CDROMREADAUDIO, CDROMREADRAW, CDROMREADCOOKED, CDROMSEEK, - CDROMPLAY\-BLK and CDROM\-READALL}. - - -\subsection{\cdrom\ capabilities} -\label{capability} - -Instead of just implementing some $ioctl$ calls, the interface in -\cdromc\ supplies the possibility to indicate the {\em capabilities\/} -of a \cdrom\ drive. This can be done by ORing any number of -capability-constants that are defined in \cdromh\ at the registration -phase. 
Currently, the capabilities are any of: -$$ -\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -CDC_CLOSE_TRAY& can close tray by software control\cr -CDC_OPEN_TRAY& can open tray\cr -CDC_LOCK& can lock and unlock the door\cr -CDC_SELECT_SPEED& can select speed, in units of $\sim$150\,kB/s\cr -CDC_SELECT_DISC& drive is juke-box\cr -CDC_MULTI_SESSION& can read sessions $>\rm1$\cr -CDC_MCN& can read Media Catalog Number\cr -CDC_MEDIA_CHANGED& can report if disc has changed\cr -CDC_PLAY_AUDIO& can perform audio-functions (play, pause, etc)\cr -CDC_RESET& hard reset device\cr -CDC_IOCTLS& driver has non-standard ioctls\cr -CDC_DRIVE_STATUS& driver implements drive status\cr -} -$$ -The capability flag is declared $const$, to prevent drivers from -accidentally tampering with the contents. The capability fags actually -inform \cdromc\ of what the driver can do. If the drive found -by the driver does not have the capability, is can be masked out by -the $cdrom_device_info$ variable $mask$. For instance, the SCSI \cdrom\ -driver has implemented the code for loading and ejecting \cdrom's, and -hence its corresponding flags in $capability$ will be set. But a SCSI -\cdrom\ drive might be a caddy system, which can't load the tray, and -hence for this drive the $cdrom_device_info$ struct will have set -the $CDC_CLOSE_TRAY$ bit in $mask$. - -In the file \cdromc\ you will encounter many constructions of the type -$$\it -if\ (cdo\rightarrow capability \mathrel\& \mathord{\sim} cdi\rightarrow mask - \mathrel{\&} CDC_<capability>) \ldots -$$ -There is no $ioctl$ to set the mask\dots The reason is that -I think it is better to control the {\em behavior\/} rather than the -{\em capabilities}. - -\subsection{Options} - -A final flag register controls the {\em behavior\/} of the \cdrom\ -drives, in order to satisfy different users' wishes, hopefully -independently of the ideas of the respective author who happened to -have made the drive's support available to the \linux\ community. 
The -current behavior options are: -$$ -\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -CDO_AUTO_CLOSE& try to close tray upon device $open()$\cr -CDO_AUTO_EJECT& try to open tray on last device $close()$\cr -CDO_USE_FFLAGS& use $file_pointer\rightarrow f_flags$ to indicate - purpose for $open()$\cr -CDO_LOCK& try to lock door if device is opened\cr -CDO_CHECK_TYPE& ensure disc type is data if opened for data\cr -} -$$ - -The initial value of this register is $CDO_AUTO_CLOSE \mathrel| -CDO_USE_FFLAGS \mathrel| CDO_LOCK$, reflecting my own view on user -interface and software standards. Before you protest, there are two -new $ioctl$s implemented in \cdromc, that allow you to control the -behavior by software. These are: -$$ -\halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr -CDROM_SET_OPTIONS& set options specified in $(int)\ arg$\cr -CDROM_CLEAR_OPTIONS& clear options specified in $(int)\ arg$\cr -} -$$ -One option needs some more explanation: $CDO_USE_FFLAGS$. In the next -newsection we explain what the need for this option is. - -A software package {\tt setcd}, available from the Debian distribution -and {\tt sunsite.unc.edu}, allows user level control of these flags. - -\newsection{The need to know the purpose of opening the \cdrom\ device} - -Traditionally, Unix devices can be used in two different `modes', -either by reading/writing to the device file, or by issuing -controlling commands to the device, by the device's $ioctl()$ -call. The problem with \cdrom\ drives, is that they can be used for -two entirely different purposes. One is to mount removable -file systems, \cdrom s, the other is to play audio CD's. Audio commands -are implemented entirely through $ioctl$s, presumably because the -first implementation (SUN?) has been such. In principle there is -nothing wrong with this, but a good control of the `CD player' demands -that the device can {\em always\/} be opened in order to give the -$ioctl$ commands, regardless of the state the drive is in. 
- -On the other hand, when used as a removable-media disc drive (what the -original purpose of \cdrom s is) we would like to make sure that the -disc drive is ready for operation upon opening the device. In the old -scheme, some \cdrom\ drivers don't do any integrity checking, resulting -in a number of i/o errors reported by the VFS to the kernel when an -attempt for mounting a \cdrom\ on an empty drive occurs. This is not a -particularly elegant way to find out that there is no \cdrom\ inserted; -it more-or-less looks like the old IBM-PC trying to read an empty floppy -drive for a couple of seconds, after which the system complains it -can't read from it. Nowadays we can {\em sense\/} the existence of a -removable medium in a drive, and we believe we should exploit that -fact. An integrity check on opening of the device, that verifies the -availability of a \cdrom\ and its correct type (data), would be -desirable. - -These two ways of using a \cdrom\ drive, principally for data and -secondarily for playing audio discs, have different demands for the -behavior of the $open()$ call. Audio use simply wants to open the -device in order to get a file handle which is needed for issuing -$ioctl$ commands, while data use wants to open for correct and -reliable data transfer. The only way user programs can indicate what -their {\em purpose\/} of opening the device is, is through the $flags$ -parameter (see {\tt {open(2)}}). For \cdrom\ devices, these flags aren't -implemented (some drivers implement checking for write-related flags, -but this is not strictly necessary if the device file has correct -permission flags). Most option flags simply don't make sense to -\cdrom\ devices: $O_CREAT$, $O_NOCTTY$, $O_TRUNC$, $O_APPEND$, and -$O_SYNC$ have no meaning to a \cdrom. - -We therefore propose to use the flag $O_NONBLOCK$ to indicate -that the device is opened just for issuing $ioctl$ -commands. 
Strictly, the meaning of $O_NONBLOCK$ is that opening and -subsequent calls to the device don't cause the calling process to -wait. We could interpret this as ``don't wait until someone has -inserted some valid data-\cdrom.'' Thus, our proposal of the -implementation for the $open()$ call for \cdrom s is: -\begin{itemize} -\item If no other flags are set than $O_RDONLY$, the device is opened -for data transfer, and the return value will be 0 only upon successful -initialization of the transfer. The call may even induce some actions -on the \cdrom, such as closing the tray. -\item If the option flag $O_NONBLOCK$ is set, opening will always be -successful, unless the whole device doesn't exist. The drive will take -no actions whatsoever. -\end{itemize} - -\subsection{And what about standards?} - -You might hesitate to accept this proposal as it comes from the -\linux\ community, and not from some standardizing institute. What -about SUN, SGI, HP and all those other Unix and hardware vendors? -Well, these companies are in the lucky position that they generally -control both the hardware and software of their supported products, -and are large enough to set their own standard. They do not have to -deal with a dozen or more different, competing hardware -configurations.\footnote{Incidentally, I think that SUN's approach to -mounting \cdrom s is very good in origin: under Solaris a -volume-daemon automatically mounts a newly inserted \cdrom\ under {\tt -{/cdrom/$<volume-name>$/}}. In my opinion they should have pushed this -further and have {\em every\/} \cdrom\ on the local area network be -mounted at the similar location, \ie, no matter in which particular -machine you insert a \cdrom, it will always appear at the same -position in the directory tree, on every system. 
When I wanted to -implement such a user-program for \linux, I came across the -differences in behavior of the various drivers, and the need for an -$ioctl$ informing about media changes.} - -We believe that using $O_NONBLOCK$ to indicate that a device is being opened -for $ioctl$ commands only can be easily introduced in the \linux\ -community. All the CD-player authors will have to be informed, we can -even send in our own patches to the programs. The use of $O_NONBLOCK$ -has most likely no influence on the behavior of the CD-players on -other operating systems than \linux. Finally, a user can always revert -to old behavior by a call to $ioctl(file_descriptor, CDROM_CLEAR_OPTIONS, -CDO_USE_FFLAGS)$. - -\subsection{The preferred strategy of $open()$} - -The routines in \cdromc\ are designed in such a way that run-time -configuration of the behavior of \cdrom\ devices (of {\em any\/} type) -can be carried out, by the $CDROM_SET/CLEAR_OPTIONS$ $ioctls$. Thus, various -modes of operation can be set: -\begin{description} -\item[$CDO_AUTO_CLOSE \mathrel| CDO_USE_FFLAGS \mathrel| CDO_LOCK$] This -is the default setting. (With $CDO_CHECK_TYPE$ it will be better, in the -future.) If the device is not yet opened by any other process, and if -the device is being opened for data ($O_NONBLOCK$ is not set) and the -tray is found to be open, an attempt to close the tray is made. Then, -it is verified that a disc is in the drive and, if $CDO_CHECK_TYPE$ is -set, that it contains tracks of type `data mode 1.' Only if all tests -are passed is the return value zero. The door is locked to prevent file -system corruption. If the drive is opened for audio ($O_NONBLOCK$ is -set), no actions are taken and a value of 0 will be returned. -\item[$CDO_AUTO_CLOSE \mathrel| CDO_AUTO_EJECT \mathrel| CDO_LOCK$] This -mimics the behavior of the current sbpcd-driver. The option flags are -ignored, the tray is closed on the first open, if necessary. 
Similarly, -the tray is opened on the last release, \ie, if a \cdrom\ is unmounted, -it is automatically ejected, such that the user can replace it. -\end{description} -We hope that these option can convince everybody (both driver -maintainers and user program developers) to adopt the new \cdrom\ -driver scheme and option flag interpretation. - -\newsection{Description of routines in \cdromc} - -Only a few routines in \cdromc\ are exported to the drivers. In this -new section we will discuss these, as well as the functions that `take -over' the \cdrom\ interface to the kernel. The header file belonging -to \cdromc\ is called \cdromh. Formerly, some of the contents of this -file were placed in the file {\tt {ucdrom.h}}, but this file has now been -merged back into \cdromh. - -\subsection{$Struct\ file_operations\ cdrom_fops$} - -The contents of this structure were described in section~\ref{cdrom.c}. -A pointer to this structure is assigned to the $fops$ field -of the $struct gendisk$. - -\subsection{$Int\ register_cdrom( struct\ cdrom_device_info\ * cdi)$} - -This function is used in about the same way one registers $cdrom_fops$ -with the kernel, the device operations and information structures, -as described in section~\ref{cdrom.c}, should be registered with the -\UCD: -$$ -register_cdrom(\&<device>_info)); -$$ -This function returns zero upon success, and non-zero upon -failure. The structure $<device>_info$ should have a pointer to the -driver's $<device>_dops$, as in -$$ -\vbox{\halign{&$#$\hfil\cr -struct\ &cdrom_device_info\ <device>_info = \{\cr -& <device>_dops;\cr -&\ldots\cr -\}\cr -}}$$ -Note that a driver must have one static structure, $<device>_dops$, while -it may have as many structures $<device>_info$ as there are minor devices -active. $Register_cdrom()$ builds a linked list from these. 
- -\subsection{$Void\ unregister_cdrom(struct\ cdrom_device_info * cdi)$} - -Unregistering device $cdi$ with minor number $MINOR(cdi\to dev)$ removes -the minor device from the list. If it was the last registered minor for -the low-level driver, this disconnects the registered device-operation -routines from the \cdrom\ interface. This function returns zero upon -success, and non-zero upon failure. - -\subsection{$Int\ cdrom_open(struct\ inode * ip, struct\ file * fp)$} - -This function is not called directly by the low-level drivers, it is -listed in the standard $cdrom_fops$. If the VFS opens a file, this -function becomes active. A strategy is implemented in this routine, -taking care of all capabilities and options that are set in the -$cdrom_device_ops$ connected to the device. Then, the program flow is -transferred to the device_dependent $open()$ call. - -\subsection{$Void\ cdrom_release(struct\ inode *ip, struct\ file -*fp)$} - -This function implements the reverse-logic of $cdrom_open()$, and then -calls the device-dependent $release()$ routine. When the use-count has -reached 0, the allocated buffers are flushed by calls to $sync_dev(dev)$ -and $invalidate_buffers(dev)$. - - -\subsection{$Int\ cdrom_ioctl(struct\ inode *ip, struct\ file *fp, -unsigned\ int\ cmd, unsigned\ long\ arg)$} -\label{cdrom-ioctl} - -This function handles all the standard $ioctl$ requests for \cdrom\ -devices in a uniform way. The different calls fall into three -categories: $ioctl$s that can be directly implemented by device -operations, ones that are routed through the call $audio_ioctl()$, and -the remaining ones, that are presumable device-dependent. Generally, a -negative return value indicates an error. 
- -\subsubsection{Directly implemented $ioctl$s} -\label{ioctl-direct} - -The following `old' \cdrom-$ioctl$s are implemented by directly -calling device-operations in $cdrom_device_ops$, if implemented and -not masked: -\begin{description} -\item[CDROMMULTISESSION] Requests the last session on a \cdrom. -\item[CDROMEJECT] Open tray. -\item[CDROMCLOSETRAY] Close tray. -\item[CDROMEJECT_SW] If $arg\not=0$, set behavior to auto-close (close -tray on first open) and auto-eject (eject on last release), otherwise -set behavior to non-moving on $open()$ and $release()$ calls. -\item[CDROM_GET_MCN] Get the Media Catalog Number from a CD. -\end{description} - -\subsubsection{$Ioctl$s routed through $audio_ioctl()$} -\label{ioctl-audio} - -The following set of $ioctl$s are all implemented through a call to -the $cdrom_fops$ function $audio_ioctl()$. Memory checks and -allocation are performed in $cdrom_ioctl()$, and also sanitization of -address format ($CDROM_LBA$/$CDROM_MSF$) is done. -\begin{description} -\item[CDROMSUBCHNL] Get sub-channel data in argument $arg$ of type $struct\ -cdrom_subchnl *{}$. -\item[CDROMREADTOCHDR] Read Table of Contents header, in $arg$ of type -$struct\ cdrom_tochdr *{}$. -\item[CDROMREADTOCENTRY] Read a Table of Contents entry in $arg$ and -specified by $arg$ of type $struct\ cdrom_tocentry *{}$. -\item[CDROMPLAYMSF] Play audio fragment specified in Minute, Second, -Frame format, delimited by $arg$ of type $struct\ cdrom_msf *{}$. -\item[CDROMPLAYTRKIND] Play audio fragment in track-index format -delimited by $arg$ of type $struct\ \penalty-1000 cdrom_ti *{}$. -\item[CDROMVOLCTRL] Set volume specified by $arg$ of type $struct\ -cdrom_volctrl *{}$. -\item[CDROMVOLREAD] Read volume into by $arg$ of type $struct\ -cdrom_volctrl *{}$. -\item[CDROMSTART] Spin up disc. -\item[CDROMSTOP] Stop playback of audio fragment. -\item[CDROMPAUSE] Pause playback of audio fragment. -\item[CDROMRESUME] Resume playing. 
-\end{description} - -\subsubsection{New $ioctl$s in \cdromc} - -The following $ioctl$s have been introduced to allow user programs to -control the behavior of individual \cdrom\ devices. New $ioctl$ -commands can be identified by the underscores in their names. -\begin{description} -\item[CDROM_SET_OPTIONS] Set options specified by $arg$. Returns the -option flag register after modification. Use $arg = \rm0$ for reading -the current flags. -\item[CDROM_CLEAR_OPTIONS] Clear options specified by $arg$. Returns - the option flag register after modification. -\item[CDROM_SELECT_SPEED] Select head-rate speed of disc specified as - by $arg$ in units of standard cdrom speed (176\,kB/sec raw data or - 150\,kB/sec file system data). The value 0 means `auto-select', \ie, - play audio discs at real time and data discs at maximum speed. The value - $arg$ is checked against the maximum head rate of the drive found in the - $cdrom_dops$. -\item[CDROM_SELECT_DISC] Select disc numbered $arg$ from a juke-box. - First disc is numbered 0. The number $arg$ is checked against the - maximum number of discs in the juke-box found in the $cdrom_dops$. -\item[CDROM_MEDIA_CHANGED] Returns 1 if a disc has been changed since - the last call. Note that calls to $cdrom_media_changed$ by the VFS - are treated by an independent queue, so both mechanisms will detect - a media change once. For juke-boxes, an extra argument $arg$ - specifies the slot for which the information is given. The special - value $CDSL_CURRENT$ requests that information about the currently - selected slot be returned. -\item[CDROM_DRIVE_STATUS] Returns the status of the drive by a call to - $drive_status()$. Return values are defined in section~\ref{drive - status}. Note that this call doesn't return information on the - current playing activity of the drive; this can be polled through an - $ioctl$ call to $CDROMSUBCHNL$. 
For juke-boxes, an extra argument - $arg$ specifies the slot for which (possibly limited) information is - given. The special value $CDSL_CURRENT$ requests that information - about the currently selected slot be returned. -\item[CDROM_DISC_STATUS] Returns the type of the disc currently in the - drive. It should be viewed as a complement to $CDROM_DRIVE_STATUS$. - This $ioctl$ can provide \emph {some} information about the current - disc that is inserted in the drive. This functionality used to be - implemented in the low level drivers, but is now carried out - entirely in \UCD. - - The history of development of the CD's use as a carrier medium for - various digital information has lead to many different disc types. - This $ioctl$ is useful only in the case that CDs have \emph {only - one} type of data on them. While this is often the case, it is - also very common for CDs to have some tracks with data, and some - tracks with audio. Because this is an existing interface, rather - than fixing this interface by changing the assumptions it was made - under, thereby breaking all user applications that use this - function, the \UCD\ implements this $ioctl$ as follows: If the CD in - question has audio tracks on it, and it has absolutely no CD-I, XA, - or data tracks on it, it will be reported as $CDS_AUDIO$. If it has - both audio and data tracks, it will return $CDS_MIXED$. If there - are no audio tracks on the disc, and if the CD in question has any - CD-I tracks on it, it will be reported as $CDS_XA_2_2$. Failing - that, if the CD in question has any XA tracks on it, it will be - reported as $CDS_XA_2_1$. Finally, if the CD in question has any - data tracks on it, it will be reported as a data CD ($CDS_DATA_1$). 
- - This $ioctl$ can return: - $$ - \halign{$#$\ \hfil&$/*$ \rm# $*/$\hfil\cr - CDS_NO_INFO& no information available\cr - CDS_NO_DISC& no disc is inserted, or tray is opened\cr - CDS_AUDIO& Audio disc (2352 audio bytes/frame)\cr - CDS_DATA_1& data disc, mode 1 (2048 user bytes/frame)\cr - CDS_XA_2_1& mixed data (XA), mode 2, form 1 (2048 user bytes)\cr - CDS_XA_2_2& mixed data (XA), mode 2, form 1 (2324 user bytes)\cr - CDS_MIXED& mixed audio/data disc\cr - } - $$ - For some information concerning frame layout of the various disc - types, see a recent version of \cdromh. - -\item[CDROM_CHANGER_NSLOTS] Returns the number of slots in a - juke-box. -\item[CDROMRESET] Reset the drive. -\item[CDROM_GET_CAPABILITY] Returns the $capability$ flags for the - drive. Refer to section \ref{capability} for more information on - these flags. -\item[CDROM_LOCKDOOR] Locks the door of the drive. $arg == \rm0$ - unlocks the door, any other value locks it. -\item[CDROM_DEBUG] Turns on debugging info. Only root is allowed - to do this. Same semantics as CDROM_LOCKDOOR. -\end{description} - -\subsubsection{Device dependent $ioctl$s} - -Finally, all other $ioctl$s are passed to the function $dev_ioctl()$, -if implemented. No memory allocation or verification is carried out. - -\newsection{How to update your driver} - -\begin{enumerate} -\item Make a backup of your current driver. -\item Get hold of the files \cdromc\ and \cdromh, they should be in - the directory tree that came with this documentation. -\item Make sure you include \cdromh. -\item Change the 3rd argument of $register_blkdev$ from -$\&<your-drive>_fops$ to $\&cdrom_fops$. -\item Just after that line, add the following to register with the \UCD: - $$register_cdrom(\&<your-drive>_info);$$ - Similarly, add a call to $unregister_cdrom()$ at the appropriate place. 
-\item Copy an example of the device-operations $struct$ to your - source, \eg, from {\tt {cm206.c}} $cm206_dops$, and change all - entries to names corresponding to your driver, or names you just - happen to like. If your driver doesn't support a certain function, - make the entry $NULL$. At the entry $capability$ you should list all - capabilities your driver currently supports. If your driver - has a capability that is not listed, please send me a message. -\item Copy the $cdrom_device_info$ declaration from the same example - driver, and modify the entries according to your needs. If your - driver dynamically determines the capabilities of the hardware, this - structure should also be declared dynamically. -\item Implement all functions in your $<device>_dops$ structure, - according to prototypes listed in \cdromh, and specifications given - in section~\ref{cdrom.c}. Most likely you have already implemented - the code in a large part, and you will almost certainly need to adapt the - prototype and return values. -\item Rename your $<device>_ioctl()$ function to $audio_ioctl$ and - change the prototype a little. Remove entries listed in the first - part in section~\ref{cdrom-ioctl}, if your code was OK, these are - just calls to the routines you adapted in the previous step. -\item You may remove all remaining memory checking code in the - $audio_ioctl()$ function that deals with audio commands (these are - listed in the second part of section~\ref{cdrom-ioctl}). There is no - need for memory allocation either, so most $case$s in the $switch$ - statement look similar to: - $$ - case\ CDROMREADTOCENTRY\colon get_toc_entry\bigl((struct\ - cdrom_tocentry *{})\ arg\bigr); - $$ -\item All remaining $ioctl$ cases must be moved to a separate - function, $<device>_ioctl$, the device-dependent $ioctl$s. Note that - memory checking and allocation must be kept in this code! 
-\item Change the prototypes of $<device>_open()$ and - $<device>_release()$, and remove any strategic code (\ie, tray - movement, door locking, etc.). -\item Try to recompile the drivers. We advise you to use modules, both - for {\tt {cdrom.o}} and your driver, as debugging is much easier this - way. -\end{enumerate} - -\newsection{Thanks} - -Thanks to all the people involved. First, Erik Andersen, who has -taken over the torch in maintaining \cdromc\ and integrating much -\cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and -Gerd Knorr, who were the first to implement this interface for SCSI -and IDE-CD drivers and added many ideas for extension of the data -structures relative to kernel~2.0. Further thanks to Heiko Ei{\ss}feldt, -Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew -Kroll, the \linux\ \cdrom\ device driver developers who were kind -enough to give suggestions and criticisms during the writing. Finally -of course, I want to thank Linus Torvalds for making this possible in -the first place. - -\vfill -$ \version\ $ -\eject -\end{document} diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd.rst index a5f2a7f1ff46..bdccb74fc92d 100644 --- a/Documentation/cdrom/ide-cd +++ b/Documentation/cdrom/ide-cd.rst @@ -1,18 +1,20 @@ IDE-CD driver documentation -Originally by scott snyder <snyder@fnald0.fnal.gov> (19 May 1996) -Carrying on the torch is: Erik Andersen <andersee@debian.org> -New maintainers (19 Oct 1998): Jens Axboe <axboe@image.dk> +=========================== + +:Originally by: scott snyder <snyder@fnald0.fnal.gov> (19 May 1996) +:Carrying on the torch is: Erik Andersen <andersee@debian.org> +:New maintainers (19 Oct 1998): Jens Axboe <axboe@image.dk> 1. Introduction --------------- -The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant +The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant CDROM drives which attach to an IDE interface. 
Note that some CDROM vendors (including Mitsumi, Sony, Creative, Aztech, and Goldstar) have made both ATAPI-compliant drives and drives which use a proprietary interface. If your drive uses one of those proprietary interfaces, this driver will not work with it (but one of the other CDROM drivers -probably will). This driver will not work with `ATAPI' drives which +probably will). This driver will not work with `ATAPI` drives which attach to the parallel port. In addition, there is at least one drive (CyCDROM CR520ie) which attaches to the IDE port but is not ATAPI; this driver will not work with drives like that either (but see the @@ -31,7 +33,7 @@ This driver provides the following features: from audio tracks. The program cdda2wav can be used for this. Note, however, that only some drives actually support this. - - There is now support for CDROM changers which comply with the + - There is now support for CDROM changers which comply with the ATAPI 2.6 draft standard (such as the NEC CDR-251). This additional functionality includes a function call to query which slot is the currently selected slot, a function call to query which slots contain @@ -45,22 +47,22 @@ This driver provides the following features: --------------- 0. The ide-cd relies on the ide disk driver. See - Documentation/ide/ide.txt for up-to-date information on the ide + Documentation/ide/ide.rst for up-to-date information on the ide driver. 1. Make sure that the ide and ide-cd drivers are compiled into the - kernel you're using. When configuring the kernel, in the section - entitled "Floppy, IDE, and other block devices", say either `Y' - (which will compile the support directly into the kernel) or `M' + kernel you're using. 
When configuring the kernel, in the section + entitled "Floppy, IDE, and other block devices", say either `Y` + (which will compile the support directly into the kernel) or `M` (to compile support as a module which can be loaded and unloaded) - to the options: + to the options:: ATA/ATAPI/MFM/RLL support Include IDE/ATAPI CDROM support Depending on what type of IDE interface you have, you may need to specify additional configuration options. See - Documentation/ide/ide.txt. + Documentation/ide/ide.rst. 2. You should also ensure that the iso9660 filesystem is either compiled into the kernel or available as a loadable module. You @@ -72,35 +74,35 @@ This driver provides the following features: address and an IRQ number, the standard assignments being 0x1f0 and 14 for the primary interface and 0x170 and 15 for the secondary interface. Each interface can control up to two devices, - where each device can be a hard drive, a CDROM drive, a floppy drive, - or a tape drive. The two devices on an interface are called `master' - and `slave'; this is usually selectable via a jumper on the drive. + where each device can be a hard drive, a CDROM drive, a floppy drive, + or a tape drive. The two devices on an interface are called `master` + and `slave`; this is usually selectable via a jumper on the drive. Linux names these devices as follows. The master and slave devices - on the primary IDE interface are called `hda' and `hdb', + on the primary IDE interface are called `hda` and `hdb`, respectively. The drives on the secondary interface are called - `hdc' and `hdd'. (Interfaces at other locations get other letters - in the third position; see Documentation/ide/ide.txt.) + `hdc` and `hdd`. (Interfaces at other locations get other letters + in the third position; see Documentation/ide/ide.rst.) If you want your CDROM drive to be found automatically by the driver, you should make sure your IDE interface uses either the primary or secondary addresses mentioned above. 
In addition, if the CDROM drive is the only device on the IDE interface, it should - be jumpered as `master'. (If for some reason you cannot configure + be jumpered as `master`. (If for some reason you cannot configure your system in this manner, you can probably still use the driver. You may have to pass extra configuration information to the kernel - when you boot, however. See Documentation/ide/ide.txt for more + when you boot, however. See Documentation/ide/ide.rst for more information.) 4. Boot the system. If the drive is recognized, you should see a - message which looks like + message which looks like:: hdb: NEC CD-ROM DRIVE:260, ATAPI CDROM drive If you do not see this, see section 5 below. 5. You may want to create a symbolic link /dev/cdrom pointing to the - actual device. You can do this with the command + actual device. You can do this with the command:: ln -s /dev/hdX /dev/cdrom @@ -108,14 +110,14 @@ This driver provides the following features: drive is installed. 6. You should be able to see any error messages from the driver with - the `dmesg' command. + the `dmesg` command. 3. Basic usage -------------- -An ISO 9660 CDROM can be mounted by putting the disc in the drive and -typing (as root) +An ISO 9660 CDROM can be mounted by putting the disc in the drive and +typing (as root):: mount -t iso9660 /dev/cdrom /mnt/cdrom @@ -123,7 +125,7 @@ where it is assumed that /dev/cdrom is a link pointing to the actual device (as described in step 5 of the last section) and /mnt/cdrom is an empty directory. You should now be able to see the contents of the CDROM under the /mnt/cdrom directory. If you want to eject the CDROM, -you must first dismount it with a command like +you must first dismount it with a command like:: umount /mnt/cdrom @@ -148,7 +150,7 @@ such as cdda2wav. The only types of drive which I've heard support this are Sony and Toshiba drives. You will get errors if you try to use this function on a drive which does not support it. 
-For supported changers, you can use the `cdchange' program (appended to +For supported changers, you can use the `cdchange` program (appended to the end of this file) to switch between changer slots. Note that the drive should be unmounted before attempting this. The program takes two arguments: the CDROM device, and the slot number to which you wish @@ -161,17 +163,17 @@ to change. If the slot number is -1, the drive is unloaded. This section discusses some common problems encountered when trying to use the driver, and some possible solutions. Note that if you are experiencing problems, you should probably also review -Documentation/ide/ide.txt for current information about the underlying +Documentation/ide/ide.rst for current information about the underlying IDE support code. Some of these items apply only to earlier versions of the driver, but are mentioned here for completeness. -In most cases, you should probably check with `dmesg' for any errors +In most cases, you should probably check with `dmesg` for any errors from the driver. a. Drive is not detected during booting. - Review the configuration instructions above and in - Documentation/ide/ide.txt, and check how your hardware is + Documentation/ide/ide.rst, and check how your hardware is configured. - If your drive is the only device on an IDE interface, it should @@ -179,14 +181,14 @@ a. Drive is not detected during booting. - If your IDE interface is not at the standard addresses of 0x170 or 0x1f0, you'll need to explicitly inform the driver using a - lilo option. See Documentation/ide/ide.txt. (This feature was + lilo option. See Documentation/ide/ide.rst. (This feature was added around kernel version 1.3.30.) - If the autoprobing is not finding your drive, you can tell the driver to assume that one exists by using a lilo option of the - form `hdX=cdrom', where X is the drive letter corresponding to - where your drive is installed. 
Note that if you do this and you - see a boot message like + form `hdX=cdrom`, where X is the drive letter corresponding to + where your drive is installed. Note that if you do this and you + see a boot message like:: hdX: ATAPI cdrom (?) @@ -205,7 +207,7 @@ a. Drive is not detected during booting. Support for some interfaces needing extra initialization is provided in later 1.3.x kernels. You may need to turn on additional kernel configuration options to get them to work; - see Documentation/ide/ide.txt. + see Documentation/ide/ide.rst. Even if support is not available for your interface, you may be able to get it to work with the following procedure. First boot @@ -220,7 +222,7 @@ b. Timeout/IRQ errors. probably not making it to the host. - IRQ problems may also be indicated by the message - `IRQ probe failed (<n>)' while booting. If <n> is zero, that + `IRQ probe failed (<n>)` while booting. If <n> is zero, that means that the system did not see an interrupt from the drive when it was expecting one (on any feasible IRQ). If <n> is negative, that means the system saw interrupts on multiple IRQ lines, when @@ -240,27 +242,27 @@ b. Timeout/IRQ errors. there are hardware problems with the interrupt setup; they apparently don't use interrupts. - - If you own a Pioneer DR-A24X, you _will_ get nasty error messages + - If you own a Pioneer DR-A24X, you _will_ get nasty error messages on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }" The Pioneer DR-A24X CDROM drives are fairly popular these days. Unfortunately, these drives seem to become very confused when we perform the standard Linux ATA disk drive probe. 
If you own one of these drives, - you can bypass the ATA probing which confuses these CDROM drives, by - adding `append="hdX=noprobe hdX=cdrom"' to your lilo.conf file and running - lilo (again where X is the drive letter corresponding to where your drive + you can bypass the ATA probing which confuses these CDROM drives, by + adding `append="hdX=noprobe hdX=cdrom"` to your lilo.conf file and running + lilo (again where X is the drive letter corresponding to where your drive is installed.) - + c. System hangups. - If the system locks up when you try to access the CDROM, the most likely cause is that you have a buggy IDE adapter which doesn't properly handle simultaneous transactions on multiple interfaces. The most notorious of these is the CMD640B chip. This problem can - be worked around by specifying the `serialize' option when + be worked around by specifying the `serialize` option when booting. Recent kernels should be able to detect the need for this automatically in most cases, but the detection is not - foolproof. See Documentation/ide/ide.txt for more information - about the `serialize' option and the CMD640B. + foolproof. See Documentation/ide/ide.rst for more information + about the `serialize` option and the CMD640B. - Note that many MS-DOS CDROM drivers will work with such buggy hardware, apparently because they never attempt to overlap CDROM @@ -269,14 +271,14 @@ c. System hangups. d. Can't mount a CDROM. - - If you get errors from mount, it may help to check `dmesg' to see + - If you get errors from mount, it may help to check `dmesg` to see if there are any more specific errors from the driver or from the filesystem. - Make sure there's a CDROM loaded in the drive, and that's it's an ISO 9660 disc. You can't mount an audio CD. - - With the CDROM in the drive and unmounted, try something like + - With the CDROM in the drive and unmounted, try something like:: cat /dev/cdrom | od | more @@ -284,9 +286,9 @@ d. Can't mount a CDROM. 
OK, and the problem is at the filesystem level (i.e., the CDROM is not ISO 9660 or has errors in the filesystem structure). - - If you see `not a block device' errors, check that the definitions + - If you see `not a block device` errors, check that the definitions of the device special files are correct. They should be as - follows: + follows:: brw-rw---- 1 root disk 3, 0 Nov 11 18:48 /dev/hda brw-rw---- 1 root disk 3, 64 Nov 11 18:48 /dev/hdb @@ -301,7 +303,7 @@ d. Can't mount a CDROM. If you have a /dev/cdrom symbolic link, check that it is pointing to the correct device file. - If you hear people talking of the devices `hd1a' and `hd1b', these + If you hear people talking of the devices `hd1a` and `hd1b`, these were old names for what are now called hdc and hdd. Those names should be considered obsolete. @@ -311,8 +313,8 @@ d. Can't mount a CDROM. always give meaningful error messages. -e. Directory listings are unpredictably truncated, and `dmesg' shows - `buffer botch' error messages from the driver. +e. Directory listings are unpredictably truncated, and `dmesg` shows + `buffer botch` error messages from the driver. - There was a bug in the version of the driver in 1.2.x kernels which could cause this. It was fixed in 1.3.0. If you can't @@ -335,34 +337,36 @@ f. Data corruption. 5. cdchange.c ------------- -/* - * cdchange.c [-v] <device> [<slot>] - * - * This loads a CDROM from a specified slot in a changer, and displays - * information about the changer status. The drive should be unmounted before - * using this program. - * - * Changer information is displayed if either the -v flag is specified - * or no slot was specified. - * - * Based on code originally from Gerhard Zuber <zuber@berlin.snafu.de>. - * Changer status information, and rewrite for the new Uniform CDROM driver - * interface by Erik Andersen <andersee@debian.org>. 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <errno.h> -#include <string.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/ioctl.h> -#include <linux/cdrom.h> - - -int -main (int argc, char **argv) -{ +:: + + /* + * cdchange.c [-v] <device> [<slot>] + * + * This loads a CDROM from a specified slot in a changer, and displays + * information about the changer status. The drive should be unmounted before + * using this program. + * + * Changer information is displayed if either the -v flag is specified + * or no slot was specified. + * + * Based on code originally from Gerhard Zuber <zuber@berlin.snafu.de>. + * Changer status information, and rewrite for the new Uniform CDROM driver + * interface by Erik Andersen <andersee@debian.org>. + */ + + #include <stdio.h> + #include <stdlib.h> + #include <errno.h> + #include <string.h> + #include <unistd.h> + #include <fcntl.h> + #include <sys/ioctl.h> + #include <linux/cdrom.h> + + + int + main (int argc, char **argv) + { char *program; char *device; int fd; /* file descriptor for CD-ROM device */ @@ -382,30 +386,30 @@ main (int argc, char **argv) fprintf (stderr, " Slots are numbered 1 -- n.\n"); exit (1); } - + if (strcmp (argv[0], "-v") == 0) { verbose = 1; ++argv; --argc; } - + device = argv[0]; - + if (argc == 2) slot = atoi (argv[1]) - 1; - /* open device */ + /* open device */ fd = open(device, O_RDONLY | O_NONBLOCK); if (fd < 0) { - fprintf (stderr, "%s: open failed for `%s': %s\n", + fprintf (stderr, "%s: open failed for `%s`: %s\n", program, device, strerror (errno)); exit (1); } - /* Check CD player status */ + /* Check CD player status */ total_slots_available = ioctl (fd, CDROM_CHANGER_NSLOTS); if (total_slots_available <= 1 ) { - fprintf (stderr, "%s: Device `%s' is not an ATAPI " + fprintf (stderr, "%s: Device `%s` is not an ATAPI " "compliant CD changer.\n", program, device); exit (1); } @@ -418,7 +422,7 @@ main (int argc, char **argv) exit (1); } - /* load */ + /* load */ slot=ioctl (fd, 
CDROM_SELECT_DISC, slot); if (slot<0) { fflush(stdout); @@ -462,14 +466,14 @@ main (int argc, char **argv) for (x_slot=0; x_slot<total_slots_available; x_slot++) { printf ("Slot %2d: ", x_slot+1); - status = ioctl (fd, CDROM_DRIVE_STATUS, x_slot); - if (status<0) { - perror(" CDROM_DRIVE_STATUS"); - } else switch(status) { + status = ioctl (fd, CDROM_DRIVE_STATUS, x_slot); + if (status<0) { + perror(" CDROM_DRIVE_STATUS"); + } else switch(status) { case CDS_DISC_OK: printf ("Disc present."); break; - case CDS_NO_DISC: + case CDS_NO_DISC: printf ("Empty slot."); break; case CDS_TRAY_OPEN: @@ -507,11 +511,11 @@ main (int argc, char **argv) break; } } - status = ioctl (fd, CDROM_MEDIA_CHANGED, x_slot); - if (status<0) { + status = ioctl (fd, CDROM_MEDIA_CHANGED, x_slot); + if (status<0) { perror(" CDROM_MEDIA_CHANGED"); - } - switch (status) { + } + switch (status) { case 1: printf ("Changed.\n"); break; @@ -525,10 +529,10 @@ main (int argc, char **argv) /* close device */ status = close (fd); if (status != 0) { - fprintf (stderr, "%s: close failed for `%s': %s\n", + fprintf (stderr, "%s: close failed for `%s`: %s\n", program, device, strerror (errno)); exit (1); } - + exit (0); -} + } diff --git a/Documentation/cdrom/index.rst b/Documentation/cdrom/index.rst new file mode 100644 index 000000000000..efbd5d111825 --- /dev/null +++ b/Documentation/cdrom/index.rst @@ -0,0 +1,19 @@ +:orphan: + +===== +cdrom +===== + +.. toctree:: + :maxdepth: 1 + + cdrom-standard + ide-cd + packet-writing + +.. 
only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.rst index 2834170d821e..c5c957195a5a 100644 --- a/Documentation/cdrom/packet-writing.txt +++ b/Documentation/cdrom/packet-writing.rst @@ -1,3 +1,7 @@ +============== +Packet writing +============== + Getting started quick --------------------- @@ -10,13 +14,16 @@ Getting started quick Download from http://sourceforge.net/projects/linux-udf/ - Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute - as appropriate): + as appropriate):: + # cdrwtool -d /dev/hdc -q -- Setup your writer +- Set up your writer:: + # pktsetup dev_name /dev/hdc -- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy! +- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy:: + # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime @@ -25,11 +32,11 @@ Packet writing for DVD-RW media DVD-RW discs can be written to much like CD-RW discs if they are in the so called "restricted overwrite" mode. To put a disc in restricted -overwrite mode, run: +overwrite mode, run:: # dvd+rw-format /dev/hdc -You can then use the disc the same way you would use a CD-RW disc: +You can then use the disc the same way you would use a CD-RW disc:: # pktsetup dev_name /dev/hdc # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime @@ -41,7 +48,7 @@ Packet writing for DVD+RW media According to the DVD+RW specification, a drive supporting DVD+RW discs shall implement "true random writes with 2KB granularity", which means that it should be possible to put any filesystem with a block size >= -2KB on such a disc. For example, it should be possible to do: +2KB on such a disc. For example, it should be possible to do:: # dvd+rw-format /dev/hdc (only needed if the disc has never been formatted) @@ -54,7 +61,7 @@ follow the specification, but suffer bad performance problems if the writes are not 32KB aligned.
Both problems can be solved by using the pktcdvd driver, which always -generates aligned writes. +generates aligned writes:: # dvd+rw-format /dev/hdc # pktsetup dev_name /dev/hdc @@ -83,7 +90,7 @@ Notes - Since the pktcdvd driver makes the disc appear as a regular block device with a 2KB block size, you can put any filesystem you like on - the disc. For example, run: + the disc. For example, run:: # /sbin/mke2fs /dev/pktcdvd/dev_name @@ -97,7 +104,7 @@ Since Linux 2.6.20, the pktcdvd module has a sysfs interface and can be controlled by it. For example the "pktcdvd" tool uses this interface. (see http://tom.ist-im-web.de/download/pktcdvd ) -"pktcdvd" works similar to "pktsetup", e.g.: +"pktcdvd" works similarly to "pktsetup", e.g.:: # pktcdvd -a dev_name /dev/hdc # mkudffs /dev/pktcdvd/dev_name @@ -115,7 +122,7 @@ For a description of the sysfs interface look into the file: Using the pktcdvd debugfs interface ----------------------------------- -To read pktcdvd device infos in human readable form, do: +To read pktcdvd device information in human-readable form, do:: # cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info diff --git a/Documentation/conf.py b/Documentation/conf.py index 7ace3f8852bd..3b2397bcb565 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -34,7 +34,8 @@ needs_sphinx = '1.3' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig'] +extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', + 'kfigure', 'sphinx.ext.ifconfig', 'automarkup'] # The name of the math extension changed on Sphinx 1.4 if (major == 1 and minor > 3) or (major > 1): @@ -200,7 +201,7 @@ html_context = { # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities.
-#html_use_smartypants = True +html_use_smartypants = False # Custom sidebar templates, maps document names to template names. #html_sidebars = {} diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index ee1bb8983a88..322ac954b390 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -34,6 +34,8 @@ Core utilities timekeeping boot-time-mm memory-hotplug + protection-keys + ../RCU/index Interfaces for kernel debugging diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index a29c99d13331..824f24ccf401 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -33,6 +33,9 @@ String Conversions .. kernel-doc:: lib/kstrtox.c :export: +.. kernel-doc:: lib/string_helpers.c + :export: + String Manipulation ------------------- @@ -138,6 +141,15 @@ Base 2 log and power Functions .. kernel-doc:: include/linux/log2.h :internal: +Integer power Functions +----------------------- + +.. kernel-doc:: lib/math/int_pow.c + :export: + +.. kernel-doc:: lib/math/int_sqrt.c + :export: + Division Functions ------------------ @@ -358,8 +370,6 @@ Read-Copy Update (RCU) .. kernel-doc:: kernel/rcu/tree.c -.. kernel-doc:: kernel/rcu/tree_plugin.h - .. kernel-doc:: kernel/rcu/tree_exp.h .. 
kernel-doc:: kernel/rcu/update.c diff --git a/Documentation/x86/protection-keys.rst b/Documentation/core-api/protection-keys.rst index 49d9833af871..49d9833af871 100644 --- a/Documentation/x86/protection-keys.rst +++ b/Documentation/core-api/protection-keys.rst diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst index 20ee447a50f3..c0ffa30c7c37 100644 --- a/Documentation/core-api/timekeeping.rst +++ b/Documentation/core-api/timekeeping.rst @@ -115,7 +115,7 @@ Some additional variants exist for more specialized cases: void ktime_get_coarse_clocktai_ts64( struct timespec64 * ) These are quicker than the non-coarse versions, but less accurate, - corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE + corresponding to CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE in user space, along with the equivalent boottime/tai/raw timebase not available in user space. diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index ef6f9f98f595..fcedc5349ace 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -30,27 +30,27 @@ it called marks. Each mark may be set or cleared independently of the others. You can iterate over entries which are marked. Normal pointers may be stored in the XArray directly. They must be 4-byte -aligned, which is true for any pointer returned from :c:func:`kmalloc` and -:c:func:`alloc_page`. It isn't true for arbitrary user-space pointers, +aligned, which is true for any pointer returned from kmalloc() and +alloc_page(). It isn't true for arbitrary user-space pointers, nor for function pointers. You can store pointers to statically allocated objects, as long as those objects have an alignment of at least 4. You can also store integers between 0 and ``LONG_MAX`` in the XArray. -You must first convert it into an entry using :c:func:`xa_mk_value`. +You must first convert it into an entry using xa_mk_value(). 
When you retrieve an entry from the XArray, you can check whether it is -a value entry by calling :c:func:`xa_is_value`, and convert it back to -an integer by calling :c:func:`xa_to_value`. +a value entry by calling xa_is_value(), and convert it back to +an integer by calling xa_to_value(). Some users want to store tagged pointers instead of using the marks -described above. They can call :c:func:`xa_tag_pointer` to create an -entry with a tag, :c:func:`xa_untag_pointer` to turn a tagged entry -back into an untagged pointer and :c:func:`xa_pointer_tag` to retrieve +described above. They can call xa_tag_pointer() to create an +entry with a tag, xa_untag_pointer() to turn a tagged entry +back into an untagged pointer and xa_pointer_tag() to retrieve the tag of an entry. Tagged pointers use the same bits that are used to distinguish value entries from normal pointers, so each user must decide whether they want to store value entries or tagged pointers in any particular XArray. -The XArray does not support storing :c:func:`IS_ERR` pointers as some +The XArray does not support storing IS_ERR() pointers as some conflict with value entries or internal entries. An unusual feature of the XArray is the ability to create entries which @@ -64,89 +64,89 @@ entry will cause the XArray to forget about the range. Normal API ========== -Start by initialising an XArray, either with :c:func:`DEFINE_XARRAY` -for statically allocated XArrays or :c:func:`xa_init` for dynamically +Start by initialising an XArray, either with DEFINE_XARRAY() +for statically allocated XArrays or xa_init() for dynamically allocated ones. A freshly-initialised XArray contains a ``NULL`` pointer at every index. -You can then set entries using :c:func:`xa_store` and get entries -using :c:func:`xa_load`. xa_store will overwrite any entry with the +You can then set entries using xa_store() and get entries +using xa_load(). 
xa_store will overwrite any entry with the new entry and return the previous entry stored at that index. You can -use :c:func:`xa_erase` instead of calling :c:func:`xa_store` with a +use xa_erase() instead of calling xa_store() with a ``NULL`` entry. There is no difference between an entry that has never been stored to, one that has been erased and one that has most recently had ``NULL`` stored to it. You can conditionally replace an entry at an index by using -:c:func:`xa_cmpxchg`. Like :c:func:`cmpxchg`, it will only succeed if +xa_cmpxchg(). Like cmpxchg(), it will only succeed if the entry at that index has the 'old' value. It also returns the entry which was at that index; if it returns the same entry which was passed as -'old', then :c:func:`xa_cmpxchg` succeeded. +'old', then xa_cmpxchg() succeeded. If you want to only store a new entry to an index if the current entry -at that index is ``NULL``, you can use :c:func:`xa_insert` which +at that index is ``NULL``, you can use xa_insert() which returns ``-EBUSY`` if the entry is not empty. You can enquire whether a mark is set on an entry by using -:c:func:`xa_get_mark`. If the entry is not ``NULL``, you can set a mark -on it by using :c:func:`xa_set_mark` and remove the mark from an entry by -calling :c:func:`xa_clear_mark`. You can ask whether any entry in the -XArray has a particular mark set by calling :c:func:`xa_marked`. +xa_get_mark(). If the entry is not ``NULL``, you can set a mark +on it by using xa_set_mark() and remove the mark from an entry by +calling xa_clear_mark(). You can ask whether any entry in the +XArray has a particular mark set by calling xa_marked(). You can copy entries out of the XArray into a plain array by calling -:c:func:`xa_extract`. Or you can iterate over the present entries in -the XArray by calling :c:func:`xa_for_each`. You may prefer to use -:c:func:`xa_find` or :c:func:`xa_find_after` to move to the next present +xa_extract(). 
Or you can iterate over the present entries in +the XArray by calling xa_for_each(). You may prefer to use +xa_find() or xa_find_after() to move to the next present entry in the XArray. -Calling :c:func:`xa_store_range` stores the same entry in a range +Calling xa_store_range() stores the same entry in a range of indices. If you do this, some of the other operations will behave in a slightly odd way. For example, marking the entry at one index may result in the entry being marked at some, but not all of the other indices. Storing into one index may result in the entry retrieved by some, but not all of the other indices changing. -Sometimes you need to ensure that a subsequent call to :c:func:`xa_store` -will not need to allocate memory. The :c:func:`xa_reserve` function +Sometimes you need to ensure that a subsequent call to xa_store() +will not need to allocate memory. The xa_reserve() function will store a reserved entry at the indicated index. Users of the normal API will see this entry as containing ``NULL``. If you do -not need to use the reserved entry, you can call :c:func:`xa_release` +not need to use the reserved entry, you can call xa_release() to remove the unused entry. If another user has stored to the entry -in the meantime, :c:func:`xa_release` will do nothing; if instead you -want the entry to become ``NULL``, you should use :c:func:`xa_erase`. -Using :c:func:`xa_insert` on a reserved entry will fail. +in the meantime, xa_release() will do nothing; if instead you +want the entry to become ``NULL``, you should use xa_erase(). +Using xa_insert() on a reserved entry will fail. -If all entries in the array are ``NULL``, the :c:func:`xa_empty` function +If all entries in the array are ``NULL``, the xa_empty() function will return ``true``. Finally, you can remove all entries from an XArray by calling -:c:func:`xa_destroy`. If the XArray entries are pointers, you may wish +xa_destroy(). 
If the XArray entries are pointers, you may wish to free the entries first. You can do this by iterating over all present -entries in the XArray using the :c:func:`xa_for_each` iterator. +entries in the XArray using the xa_for_each() iterator. Allocating XArrays ------------------ -If you use :c:func:`DEFINE_XARRAY_ALLOC` to define the XArray, or -initialise it by passing ``XA_FLAGS_ALLOC`` to :c:func:`xa_init_flags`, +If you use DEFINE_XARRAY_ALLOC() to define the XArray, or +initialise it by passing ``XA_FLAGS_ALLOC`` to xa_init_flags(), the XArray changes to track whether entries are in use or not. -You can call :c:func:`xa_alloc` to store the entry at an unused index +You can call xa_alloc() to store the entry at an unused index in the XArray. If you need to modify the array from interrupt context, -you can use :c:func:`xa_alloc_bh` or :c:func:`xa_alloc_irq` to disable +you can use xa_alloc_bh() or xa_alloc_irq() to disable interrupts while allocating the ID. -Using :c:func:`xa_store`, :c:func:`xa_cmpxchg` or :c:func:`xa_insert` will +Using xa_store(), xa_cmpxchg() or xa_insert() will also mark the entry as being allocated. Unlike a normal XArray, storing -``NULL`` will mark the entry as being in use, like :c:func:`xa_reserve`. -To free an entry, use :c:func:`xa_erase` (or :c:func:`xa_release` if +``NULL`` will mark the entry as being in use, like xa_reserve(). +To free an entry, use xa_erase() (or xa_release() if you only want to free the entry if it's ``NULL``). By default, the lowest free entry is allocated starting from 0. If you want to allocate entries starting at 1, it is more efficient to use -:c:func:`DEFINE_XARRAY_ALLOC1` or ``XA_FLAGS_ALLOC1``. If you want to +DEFINE_XARRAY_ALLOC1() or ``XA_FLAGS_ALLOC1``. If you want to allocate IDs up to a maximum, then wrap back around to the lowest free -ID, you can use :c:func:`xa_alloc_cyclic`. +ID, you can use xa_alloc_cyclic(). 
You cannot use ``XA_MARK_0`` with an allocating XArray as this mark is used to track whether an entry is free or not. The other marks are @@ -155,17 +155,17 @@ available for your use. Memory allocation ----------------- -The :c:func:`xa_store`, :c:func:`xa_cmpxchg`, :c:func:`xa_alloc`, -:c:func:`xa_reserve` and :c:func:`xa_insert` functions take a gfp_t +The xa_store(), xa_cmpxchg(), xa_alloc(), +xa_reserve() and xa_insert() functions take a gfp_t parameter in case the XArray needs to allocate memory to store this entry. If the entry is being deleted, no memory allocation needs to be performed, and the GFP flags specified will be ignored. It is possible for no memory to be allocatable, particularly if you pass a restrictive set of GFP flags. In that case, the functions return a -special value which can be turned into an errno using :c:func:`xa_err`. +special value which can be turned into an errno using xa_err(). If you don't need to know exactly which error occurred, using -:c:func:`xa_is_err` is slightly more efficient. +xa_is_err() is slightly more efficient. Locking ------- @@ -174,54 +174,54 @@ When using the Normal API, you do not have to worry about locking. 
The XArray uses RCU and an internal spinlock to synchronise access: No lock needed: - * :c:func:`xa_empty` - * :c:func:`xa_marked` + * xa_empty() + * xa_marked() Takes RCU read lock: - * :c:func:`xa_load` - * :c:func:`xa_for_each` - * :c:func:`xa_find` - * :c:func:`xa_find_after` - * :c:func:`xa_extract` - * :c:func:`xa_get_mark` + * xa_load() + * xa_for_each() + * xa_find() + * xa_find_after() + * xa_extract() + * xa_get_mark() Takes xa_lock internally: - * :c:func:`xa_store` - * :c:func:`xa_store_bh` - * :c:func:`xa_store_irq` - * :c:func:`xa_insert` - * :c:func:`xa_insert_bh` - * :c:func:`xa_insert_irq` - * :c:func:`xa_erase` - * :c:func:`xa_erase_bh` - * :c:func:`xa_erase_irq` - * :c:func:`xa_cmpxchg` - * :c:func:`xa_cmpxchg_bh` - * :c:func:`xa_cmpxchg_irq` - * :c:func:`xa_store_range` - * :c:func:`xa_alloc` - * :c:func:`xa_alloc_bh` - * :c:func:`xa_alloc_irq` - * :c:func:`xa_reserve` - * :c:func:`xa_reserve_bh` - * :c:func:`xa_reserve_irq` - * :c:func:`xa_destroy` - * :c:func:`xa_set_mark` - * :c:func:`xa_clear_mark` + * xa_store() + * xa_store_bh() + * xa_store_irq() + * xa_insert() + * xa_insert_bh() + * xa_insert_irq() + * xa_erase() + * xa_erase_bh() + * xa_erase_irq() + * xa_cmpxchg() + * xa_cmpxchg_bh() + * xa_cmpxchg_irq() + * xa_store_range() + * xa_alloc() + * xa_alloc_bh() + * xa_alloc_irq() + * xa_reserve() + * xa_reserve_bh() + * xa_reserve_irq() + * xa_destroy() + * xa_set_mark() + * xa_clear_mark() Assumes xa_lock held on entry: - * :c:func:`__xa_store` - * :c:func:`__xa_insert` - * :c:func:`__xa_erase` - * :c:func:`__xa_cmpxchg` - * :c:func:`__xa_alloc` - * :c:func:`__xa_set_mark` - * :c:func:`__xa_clear_mark` + * __xa_store() + * __xa_insert() + * __xa_erase() + * __xa_cmpxchg() + * __xa_alloc() + * __xa_set_mark() + * __xa_clear_mark() If you want to take advantage of the lock to protect the data structures -that you are storing in the XArray, you can call :c:func:`xa_lock` -before calling :c:func:`xa_load`, then take a reference count on the 
-object you have found before calling :c:func:`xa_unlock`. This will +that you are storing in the XArray, you can call xa_lock() +before calling xa_load(), then take a reference count on the +object you have found before calling xa_unlock(). This will prevent stores from removing the object from the array between looking up the object and incrementing the refcount. You can also use RCU to avoid dereferencing freed memory, but an explanation of that is beyond @@ -261,7 +261,7 @@ context and then erase them in softirq context, you can do that this way:: } If you are going to modify the XArray from interrupt or softirq context, -you need to initialise the array using :c:func:`xa_init_flags`, passing +you need to initialise the array using xa_init_flags(), passing ``XA_FLAGS_LOCK_IRQ`` or ``XA_FLAGS_LOCK_BH``. The above example also shows a common pattern of wanting to extend the @@ -269,20 +269,20 @@ coverage of the xa_lock on the store side to protect some statistics associated with the array. Sharing the XArray with interrupt context is also possible, either -using :c:func:`xa_lock_irqsave` in both the interrupt handler and process -context, or :c:func:`xa_lock_irq` in process context and :c:func:`xa_lock` +using xa_lock_irqsave() in both the interrupt handler and process +context, or xa_lock_irq() in process context and xa_lock() in the interrupt handler. Some of the more common patterns have helper -functions such as :c:func:`xa_store_bh`, :c:func:`xa_store_irq`, -:c:func:`xa_erase_bh`, :c:func:`xa_erase_irq`, :c:func:`xa_cmpxchg_bh` -and :c:func:`xa_cmpxchg_irq`. +functions such as xa_store_bh(), xa_store_irq(), +xa_erase_bh(), xa_erase_irq(), xa_cmpxchg_bh() +and xa_cmpxchg_irq(). Sometimes you need to protect access to the XArray with a mutex because that lock sits above another mutex in the locking hierarchy. 
That does -not entitle you to use functions like :c:func:`__xa_erase` without taking +not entitle you to use functions like __xa_erase() without taking the xa_lock; the xa_lock is used for lockdep validation and will be used for other purposes in the future. -The :c:func:`__xa_set_mark` and :c:func:`__xa_clear_mark` functions are also +The __xa_set_mark() and __xa_clear_mark() functions are also available for situations where you look up an entry and want to atomically set or clear a mark. It may be more efficient to use the advanced API in this case, as it will save you from walking the tree twice. @@ -300,27 +300,27 @@ indeed the normal API is implemented in terms of the advanced API. The advanced API is only available to modules with a GPL-compatible license. The advanced API is based around the xa_state. This is an opaque data -structure which you declare on the stack using the :c:func:`XA_STATE` +structure which you declare on the stack using the XA_STATE() macro. This macro initialises the xa_state ready to start walking around the XArray. It is used as a cursor to maintain the position in the XArray and let you compose various operations together without having to restart from the top every time. The xa_state is also used to store errors. You can call -:c:func:`xas_error` to retrieve the error. All operations check whether +xas_error() to retrieve the error. All operations check whether the xa_state is in an error state before proceeding, so there's no need for you to check for an error after each call; you can make multiple calls in succession and only check at a convenient point. The only errors currently generated by the XArray code itself are ``ENOMEM`` and ``EINVAL``, but it supports arbitrary errors in case you want to call -:c:func:`xas_set_err` yourself. +xas_set_err() yourself. 
-If the xa_state is holding an ``ENOMEM`` error, calling :c:func:`xas_nomem` +If the xa_state is holding an ``ENOMEM`` error, calling xas_nomem() will attempt to allocate more memory using the specified gfp flags and cache it in the xa_state for the next attempt. The idea is that you take the xa_lock, attempt the operation and drop the lock. The operation attempts to allocate memory while holding the lock, but it is more -likely to fail. Once you have dropped the lock, :c:func:`xas_nomem` +likely to fail. Once you have dropped the lock, xas_nomem() can try harder to allocate more memory. It will return ``true`` if it is worth retrying the operation (i.e. that there was a memory error *and* more memory was allocated). If it has previously allocated memory, and @@ -333,7 +333,7 @@ Internal Entries The XArray reserves some entries for its own purposes. These are never exposed through the normal API, but when using the advanced API, it's possible to see them. Usually the best way to handle them is to pass them -to :c:func:`xas_retry`, and retry the operation if it returns ``true``. +to xas_retry(), and retry the operation if it returns ``true``. .. flat-table:: :widths: 1 1 6 @@ -343,89 +343,89 @@ to :c:func:`xas_retry`, and retry the operation if it returns ``true``. - Usage * - Node - - :c:func:`xa_is_node` + - xa_is_node() - An XArray node. May be visible when using a multi-index xa_state. * - Sibling - - :c:func:`xa_is_sibling` + - xa_is_sibling() - A non-canonical entry for a multi-index entry. The value indicates which slot in this node has the canonical entry. * - Retry - - :c:func:`xa_is_retry` + - xa_is_retry() - This entry is currently being modified by a thread which has the xa_lock. The node containing this entry may be freed at the end of this RCU period. You should restart the lookup from the head of the array. 
* - Zero - - :c:func:`xa_is_zero` + - xa_is_zero() - Zero entries appear as ``NULL`` through the Normal API, but occupy an entry in the XArray which can be used to reserve the index for future use. This is used by allocating XArrays for allocated entries which are ``NULL``. Other internal entries may be added in the future. As far as possible, they -will be handled by :c:func:`xas_retry`. +will be handled by xas_retry(). Additional functionality ------------------------ -The :c:func:`xas_create_range` function allocates all the necessary memory +The xas_create_range() function allocates all the necessary memory to store every entry in a range. It will set ENOMEM in the xa_state if it cannot allocate memory. -You can use :c:func:`xas_init_marks` to reset the marks on an entry +You can use xas_init_marks() to reset the marks on an entry to their default state. This is usually all marks clear, unless the XArray is marked with ``XA_FLAGS_TRACK_FREE``, in which case mark 0 is set and all other marks are clear. Replacing one entry with another using -:c:func:`xas_store` will not reset the marks on that entry; if you want +xas_store() will not reset the marks on that entry; if you want the marks reset, you should do that explicitly. -The :c:func:`xas_load` will walk the xa_state as close to the entry +The xas_load() will walk the xa_state as close to the entry as it can. If you know the xa_state has already been walked to the entry and need to check that the entry hasn't changed, you can use -:c:func:`xas_reload` to save a function call. +xas_reload() to save a function call. If you need to move to a different index in the XArray, call -:c:func:`xas_set`. This resets the cursor to the top of the tree, which +xas_set(). This resets the cursor to the top of the tree, which will generally make the next operation walk the cursor to the desired spot in the tree. If you want to move to the next or previous index, -call :c:func:`xas_next` or :c:func:`xas_prev`. 
Setting the index does +call xas_next() or xas_prev(). Setting the index does not walk the cursor around the array so does not require a lock to be held, while moving to the next or previous index does. -You can search for the next present entry using :c:func:`xas_find`. This -is the equivalent of both :c:func:`xa_find` and :c:func:`xa_find_after`; +You can search for the next present entry using xas_find(). This +is the equivalent of both xa_find() and xa_find_after(); if the cursor has been walked to an entry, then it will find the next entry after the one currently referenced. If not, it will return the -entry at the index of the xa_state. Using :c:func:`xas_next_entry` to -move to the next present entry instead of :c:func:`xas_find` will save +entry at the index of the xa_state. Using xas_next_entry() to +move to the next present entry instead of xas_find() will save a function call in the majority of cases at the expense of emitting more inline code. -The :c:func:`xas_find_marked` function is similar. If the xa_state has +The xas_find_marked() function is similar. If the xa_state has not been walked, it will return the entry at the index of the xa_state, if it is marked. Otherwise, it will return the first marked entry after -the entry referenced by the xa_state. The :c:func:`xas_next_marked` -function is the equivalent of :c:func:`xas_next_entry`. +the entry referenced by the xa_state. The xas_next_marked() +function is the equivalent of xas_next_entry(). -When iterating over a range of the XArray using :c:func:`xas_for_each` -or :c:func:`xas_for_each_marked`, it may be necessary to temporarily stop -the iteration. The :c:func:`xas_pause` function exists for this purpose. +When iterating over a range of the XArray using xas_for_each() +or xas_for_each_marked(), it may be necessary to temporarily stop +the iteration. The xas_pause() function exists for this purpose. 
After you have done the necessary work and wish to resume, the xa_state is in an appropriate state to continue the iteration after the entry you last processed. If you have interrupts disabled while iterating, then it is good manners to pause the iteration and reenable interrupts every ``XA_CHECK_SCHED`` entries. -The :c:func:`xas_get_mark`, :c:func:`xas_set_mark` and -:c:func:`xas_clear_mark` functions require the xa_state cursor to have +The xas_get_mark(), xas_set_mark() and +xas_clear_mark() functions require the xa_state cursor to have been moved to the appropriate location in the xarray; they will do -nothing if you have called :c:func:`xas_pause` or :c:func:`xas_set` +nothing if you have called xas_pause() or xas_set() immediately before. -You can call :c:func:`xas_set_update` to have a callback function +You can call xas_set_update() to have a callback function called each time the XArray updates a node. This is used by the page cache workingset code to maintain its list of nodes which contain only shadow entries. @@ -443,25 +443,25 @@ eg indices 64-127 may be tied together, but 2-6 may not be. This may save substantial quantities of memory; for example tying 512 entries together will save over 4kB. -You can create a multi-index entry by using :c:func:`XA_STATE_ORDER` -or :c:func:`xas_set_order` followed by a call to :c:func:`xas_store`. -Calling :c:func:`xas_load` with a multi-index xa_state will walk the +You can create a multi-index entry by using XA_STATE_ORDER() +or xas_set_order() followed by a call to xas_store(). +Calling xas_load() with a multi-index xa_state will walk the xa_state to the right location in the tree, but the return value is not meaningful, potentially being an internal entry or ``NULL`` even when there -is an entry stored within the range. Calling :c:func:`xas_find_conflict` +is an entry stored within the range. 
Calling xas_find_conflict() will return the first entry within the range or ``NULL`` if there are no -entries in the range. The :c:func:`xas_for_each_conflict` iterator will +entries in the range. The xas_for_each_conflict() iterator will iterate over every entry which overlaps the specified range. -If :c:func:`xas_load` encounters a multi-index entry, the xa_index +If xas_load() encounters a multi-index entry, the xa_index in the xa_state will not be changed. When iterating over an XArray -or calling :c:func:`xas_find`, if the initial index is in the middle +or calling xas_find(), if the initial index is in the middle of a multi-index entry, it will not be altered. Subsequent calls or iterations will move the index to the first index in the range. Each entry will only be returned once, no matter how many indices it occupies. -Using :c:func:`xas_next` or :c:func:`xas_prev` with a multi-index xa_state +Using xas_next() or xas_prev() with a multi-index xa_state is not supported. Using either of these functions on a multi-index entry will reveal sibling entries; these should be skipped over by the caller. diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.rst index 86786d87d9a8..b17fe352fc41 100644 --- a/Documentation/device-mapper/cache-policies.txt +++ b/Documentation/device-mapper/cache-policies.rst @@ -1,3 +1,4 @@ +============================= Guidance for writing policies ============================= @@ -30,7 +31,7 @@ multiqueue (mq) This policy is now an alias for smq (see below). -The following tunables are accepted, but have no effect: +The following tunables are accepted, but have no effect:: 'sequential_threshold <#nr_sequential_ios>' 'random_threshold <#nr_random_ios>' @@ -56,7 +57,9 @@ mq policy's hints to be dropped. Also, performance of the cache may degrade slightly until smq recalculates the origin device's hotspots that should be cached. 
-Memory usage: +Memory usage +^^^^^^^^^^^^ + The mq policy used a lot of memory; 88 bytes per cache block on a 64 bit machine. @@ -69,7 +72,9 @@ cache block). All this means smq uses ~25bytes per cache block. Still a lot of memory, but a substantial improvement nonetheless. -Level balancing: +Level balancing +^^^^^^^^^^^^^^^ + mq placed entries in different levels of the multiqueue structures based on their hit count (~ln(hit count)). This meant the bottom levels generally had the most entries, and the top ones had very @@ -94,7 +99,9 @@ is used to decide which blocks to promote. If the hotspot queue is performing badly then it starts moving entries more quickly between levels. This lets it adapt to new IO patterns very quickly. -Performance: +Performance +^^^^^^^^^^^ + Testing smq shows substantially better performance than mq. cleaner @@ -105,16 +112,19 @@ The cleaner writes back all dirty blocks in a cache to decommission it. Examples ======== -The syntax for a table is: +The syntax for a table is:: + cache <metadata dev> <cache dev> <origin dev> <block size> <#feature_args> [<feature arg>]* <policy> <#policy_args> [<policy arg>]* -The syntax to send a message using the dmsetup command is: +The syntax to send a message using the dmsetup command is:: + dmsetup message <mapped device> 0 sequential_threshold 1024 dmsetup message <mapped device> 0 random_threshold 8 -Using dmsetup: +Using dmsetup:: + dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \ /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8" creates a 128GB large mapped device named 'blah' with the diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.rst index 8ae1cf8e94da..f15e5254d05b 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.rst @@ -1,3 +1,7 @@ +===== +Cache +===== + Introduction ============ @@ -24,10 +28,13 @@ scenarios (eg. a vm image server).
Glossary ======== - Migration - Movement of the primary copy of a logical block from one + Migration + Movement of the primary copy of a logical block from one device to the other. - Promotion - Migration from slow device to fast device. - Demotion - Migration from fast device to slow device. + Promotion + Migration from slow device to fast device. + Demotion + Migration from fast device to slow device. The origin device always contains a copy of the logical block, which may be out of date or kept in sync with the copy on the cache device @@ -169,45 +176,53 @@ Target interface Constructor ----------- - cache <metadata dev> <cache dev> <origin dev> <block size> - <#feature args> [<feature arg>]* - <policy> <#policy args> [policy args]* + :: + + cache <metadata dev> <cache dev> <origin dev> <block size> + <#feature args> [<feature arg>]* + <policy> <#policy args> [policy args]* - metadata dev : fast device holding the persistent metadata - cache dev : fast device holding cached data blocks - origin dev : slow device holding original data blocks - block size : cache unit size in sectors + ================ ======================================================= + metadata dev fast device holding the persistent metadata + cache dev fast device holding cached data blocks + origin dev slow device holding original data blocks + block size cache unit size in sectors - #feature args : number of feature arguments passed - feature args : writethrough or passthrough (The default is writeback.) + #feature args number of feature arguments passed + feature args writethrough or passthrough (The default is writeback.) - policy : the replacement policy to use - #policy args : an even number of arguments corresponding to - key/value pairs passed to the policy - policy args : key/value pairs passed to the policy - E.g. 'sequential_threshold 1024' - See cache-policies.txt for details. 
+ policy the replacement policy to use + #policy args an even number of arguments corresponding to + key/value pairs passed to the policy + policy args key/value pairs passed to the policy + E.g. 'sequential_threshold 1024' + See cache-policies.txt for details. + ================ ======================================================= Optional feature arguments are: - writethrough : write through caching that prohibits cache block - content from being different from origin block content. - Without this argument, the default behaviour is to write - back cache block contents later for performance reasons, - so they may differ from the corresponding origin blocks. - - passthrough : a degraded mode useful for various cache coherency - situations (e.g., rolling back snapshots of - underlying storage). Reads and writes always go to - the origin. If a write goes to a cached origin - block, then the cache block is invalidated. - To enable passthrough mode the cache must be clean. - - metadata2 : use version 2 of the metadata. This stores the dirty bits - in a separate btree, which improves speed of shutting - down the cache. - - no_discard_passdown : disable passing down discards from the cache - to the origin's data device. + + + ==================== ======================================================== + writethrough write through caching that prohibits cache block + content from being different from origin block content. + Without this argument, the default behaviour is to write + back cache block contents later for performance reasons, + so they may differ from the corresponding origin blocks. + + passthrough a degraded mode useful for various cache coherency + situations (e.g., rolling back snapshots of + underlying storage). Reads and writes always go to + the origin. If a write goes to a cached origin + block, then the cache block is invalidated. + To enable passthrough mode the cache must be clean. + + metadata2 use version 2 of the metadata. 
This stores the dirty + bits in a separate btree, which improves speed of + shutting down the cache. + + no_discard_passdown disable passing down discards from the cache + to the origin's data device. + ==================== ======================================================== A policy called 'default' is always registered. This is an alias for the policy we currently think is giving best all round performance. @@ -218,54 +233,61 @@ the characteristics of a specific policy, always request it by name. Status ------ -<metadata block size> <#used metadata blocks>/<#total metadata blocks> -<cache block size> <#used cache blocks>/<#total cache blocks> -<#read hits> <#read misses> <#write hits> <#write misses> -<#demotions> <#promotions> <#dirty> <#features> <features>* -<#core args> <core args>* <policy name> <#policy args> <policy args>* -<cache metadata mode> - -metadata block size : Fixed block size for each metadata block in - sectors -#used metadata blocks : Number of metadata blocks used -#total metadata blocks : Total number of metadata blocks -cache block size : Configurable block size for the cache device - in sectors -#used cache blocks : Number of blocks resident in the cache -#total cache blocks : Total number of cache blocks -#read hits : Number of times a READ bio has been mapped - to the cache -#read misses : Number of times a READ bio has been mapped - to the origin -#write hits : Number of times a WRITE bio has been mapped - to the cache -#write misses : Number of times a WRITE bio has been - mapped to the origin -#demotions : Number of times a block has been removed - from the cache -#promotions : Number of times a block has been moved to - the cache -#dirty : Number of blocks in the cache that differ - from the origin -#feature args : Number of feature args to follow -feature args : 'writethrough' (optional) -#core args : Number of core arguments (must be even) -core args : Key/value pairs for tuning the core - e.g. 
migration_threshold -policy name : Name of the policy -#policy args : Number of policy arguments to follow (must be even) -policy args : Key/value pairs e.g. sequential_threshold -cache metadata mode : ro if read-only, rw if read-write - In serious cases where even a read-only mode is deemed unsafe - no further I/O will be permitted and the status will just - contain the string 'Fail'. The userspace recovery tools - should then be used. -needs_check : 'needs_check' if set, '-' if not set - A metadata operation has failed, resulting in the needs_check - flag being set in the metadata's superblock. The metadata - device must be deactivated and checked/repaired before the - cache can be made fully operational again. '-' indicates - needs_check is not set. +:: + + <metadata block size> <#used metadata blocks>/<#total metadata blocks> + <cache block size> <#used cache blocks>/<#total cache blocks> + <#read hits> <#read misses> <#write hits> <#write misses> + <#demotions> <#promotions> <#dirty> <#features> <features>* + <#core args> <core args>* <policy name> <#policy args> <policy args>* + <cache metadata mode> + + +========================= ===================================================== +metadata block size Fixed block size for each metadata block in + sectors +#used metadata blocks Number of metadata blocks used +#total metadata blocks Total number of metadata blocks +cache block size Configurable block size for the cache device + in sectors +#used cache blocks Number of blocks resident in the cache +#total cache blocks Total number of cache blocks +#read hits Number of times a READ bio has been mapped + to the cache +#read misses Number of times a READ bio has been mapped + to the origin +#write hits Number of times a WRITE bio has been mapped + to the cache +#write misses Number of times a WRITE bio has been + mapped to the origin +#demotions Number of times a block has been removed + from the cache +#promotions Number of times a block has been moved to + the 
cache +#dirty Number of blocks in the cache that differ + from the origin +#feature args Number of feature args to follow +feature args 'writethrough' (optional) +#core args Number of core arguments (must be even) +core args Key/value pairs for tuning the core + e.g. migration_threshold +policy name Name of the policy +#policy args Number of policy arguments to follow (must be even) +policy args Key/value pairs e.g. sequential_threshold +cache metadata mode ro if read-only, rw if read-write + + In serious cases where even a read-only mode is + deemed unsafe no further I/O will be permitted and + the status will just contain the string 'Fail'. + The userspace recovery tools should then be used. +needs_check 'needs_check' if set, '-' if not set + A metadata operation has failed, resulting in the + needs_check flag being set in the metadata's + superblock. The metadata device must be + deactivated and checked/repaired before the + cache can be made fully operational again. + '-' indicates needs_check is not set. +========================= ===================================================== Messages -------- @@ -274,11 +296,12 @@ Policies will have different tunables, specific to each one, so we need a generic way of getting and setting these. Device-mapper messages are used. (A sysfs interface would also be possible.) -The message format is: +The message format is:: <key> <value> -E.g. +E.g.:: + dmsetup message my_cache 0 sequential_threshold 1024 @@ -290,11 +313,12 @@ of values from 5 to 9. Each cblock must be expressed as a decimal value, in the future a variant message that takes cblock ranges expressed in hexadecimal may be needed to better support efficient invalidation of larger caches. The cache must be in passthrough mode -when invalidate_cblocks is used. +when invalidate_cblocks is used:: invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]* -E.g. 
+E.g.:: + dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789 Examples @@ -304,8 +328,10 @@ The test suite can be found here: https://github.com/jthornber/device-mapper-test-suite -dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ - /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' -dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ - /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \ - mq 4 sequential_threshold 1024 random_threshold 8' +:: + + dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ + /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' + dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ + /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \ + mq 4 sequential_threshold 1024 random_threshold 8' diff --git a/Documentation/device-mapper/delay.txt b/Documentation/device-mapper/delay.rst index 6426c45273cb..917ba8c33359 100644 --- a/Documentation/device-mapper/delay.txt +++ b/Documentation/device-mapper/delay.rst @@ -1,10 +1,12 @@ +======== dm-delay ======== Device-Mapper's "delay" target delays reads and/or writes and maps them to different devices. -Parameters: +Parameters:: + <device> <offset> <delay> [<write_device> <write_offset> <write_delay> [<flush_device> <flush_offset> <flush_delay>]] @@ -14,15 +16,16 @@ Delays are specified in milliseconds. 
Example scripts =============== -[[ -#!/bin/sh -# Create device delaying rw operation for 500ms -echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed -]] - -[[ -#!/bin/sh -# Create device delaying only write operation for 500ms and -# splitting reads and writes to different devices $1 $2 -echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed -]] + +:: + + #!/bin/sh + # Create device delaying rw operation for 500ms + echo "0 `blockdev --getsz $1` delay $1 0 500" | dmsetup create delayed + +:: + + #!/bin/sh + # Create device delaying only write operation for 500ms and + # splitting reads and writes to different devices $1 $2 + echo "0 `blockdev --getsz $1` delay $1 0 0 $2 0 500" | dmsetup create delayed diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.rst index 3b3e1de21c9c..8f4a3f889d43 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.rst @@ -1,5 +1,6 @@ +======== dm-crypt -========= +======== Device-Mapper's "crypt" target provides transparent encryption of block devices using the kernel crypto API. @@ -7,15 +8,20 @@ using the kernel crypto API. For a more detailed description of supported parameters see: https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt -Parameters: <cipher> <key> <iv_offset> <device path> \ +Parameters:: + + <cipher> <key> <iv_offset> <device path> \ <offset> [<#opt_params> <opt_params>] <cipher> Encryption cipher, encryption mode and Initial Vector (IV) generator. - The cipher specifications format is: + The cipher specifications format is:: + cipher[:keycount]-chainmode-ivmode[:ivopts] - Examples: + + Examples:: + aes-cbc-essiv:sha256 aes-xts-plain64 serpent-xts-plain64 @@ -25,12 +31,17 @@ Parameters: <cipher> <key> <iv_offset> <device path> \ as for the first format type. This format is mainly used for specification of authenticated modes. 
- The crypto API cipher specifications format is: + The crypto API cipher specifications format is:: + capi:cipher_api_spec-ivmode[:ivopts] - Examples: + + Examples:: + capi:cbc(aes)-essiv:sha256 capi:xts(aes)-plain64 - Examples of authenticated modes: + + Examples of authenticated modes:: + capi:gcm(aes)-random capi:authenc(hmac(sha256),xts(aes))-random capi:rfc7539(chacha20,poly1305)-random @@ -142,21 +153,21 @@ LUKS (Linux Unified Key Setup) is now the preferred way to set up disk encryption with dm-crypt using the 'cryptsetup' utility, see https://gitlab.com/cryptsetup/cryptsetup -[[ -#!/bin/sh -# Create a crypt device using dmsetup -dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0" -]] - -[[ -#!/bin/sh -# Create a crypt device using dmsetup when encryption key is stored in keyring service -dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0" -]] - -[[ -#!/bin/sh -# Create a crypt device using cryptsetup and LUKS header with default cipher -cryptsetup luksFormat $1 -cryptsetup luksOpen $1 crypt1 -]] +:: + + #!/bin/sh + # Create a crypt device using dmsetup + dmsetup create crypt1 --table "0 `blockdev --getsz $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0" + +:: + + #!/bin/sh + # Create a crypt device using dmsetup when encryption key is stored in keyring service + dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0" + +:: + + #!/bin/sh + # Create a crypt device using cryptsetup and LUKS header with default cipher + cryptsetup luksFormat $1 + cryptsetup luksOpen $1 crypt1 diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.rst index 9f0e247d0877..86138735879d 100644 --- a/Documentation/device-mapper/dm-flakey.txt +++ b/Documentation/device-mapper/dm-flakey.rst @@ -1,3 +1,4 @@ +========= 
dm-flakey ========= @@ -15,17 +16,26 @@ underlying devices. Table parameters ---------------- + +:: + <dev path> <offset> <up interval> <down interval> \ [<num_features> [<feature arguments>]] Mandatory parameters: - <dev path>: Full pathname to the underlying block-device, or a - "major:minor" device-number. - <offset>: Starting sector within the device. - <up interval>: Number of seconds device is available. - <down interval>: Number of seconds device returns errors. + + <dev path>: + Full pathname to the underlying block-device, or a + "major:minor" device-number. + <offset>: + Starting sector within the device. + <up interval>: + Number of seconds device is available. + <down interval>: + Number of seconds device returns errors. Optional feature parameters: + If no feature parameters are present, during the periods of unreliability, all I/O returns errors. @@ -41,17 +51,24 @@ Optional feature parameters: During <down interval>, replace <Nth_byte> of the data of each matching bio with <value>. - <Nth_byte>: The offset of the byte to replace. - Counting starts at 1, to replace the first byte. - <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes. - 'w' is incompatible with drop_writes. - <value>: The value (from 0-255) to write. - <flags>: Perform the replacement only if bio->bi_opf has all the - selected flags set. + <Nth_byte>: + The offset of the byte to replace. + Counting starts at 1, to replace the first byte. + <direction>: + Either 'r' to corrupt reads or 'w' to corrupt writes. + 'w' is incompatible with drop_writes. + <value>: + The value (from 0-255) to write. + <flags>: + Perform the replacement only if bio->bi_opf has all the + selected flags set. 
Examples: + +Replaces the 32nd byte of READ bios with the value 1:: + corrupt_bio_byte 32 r 1 0 - - replaces the 32nd byte of READ bios with the value 1 + +Replaces the 224th byte of REQ_META (=32) bios with the value 0:: corrupt_bio_byte 224 w 0 32 - - replaces the 224th byte of REQ_META (=32) bios with the value 0 diff --git a/Documentation/device-mapper/dm-init.txt b/Documentation/device-mapper/dm-init.rst index 8464ee7c01b8..e5242ff17e9b 100644 --- a/Documentation/device-mapper/dm-init.txt +++ b/Documentation/device-mapper/dm-init.rst @@ -1,5 +1,6 @@ +================================ Early creation of mapped devices -==================================== +================================ It is possible to configure a device-mapper device to act as the root device for your system in two ways. @@ -12,15 +13,17 @@ The second is to create one or more device-mappers using the module parameter The format is specified as a string of data separated by commas and optionally semi-colons, where: + - a comma is used to separate fields like name, uuid, flags and table (specifies one device) - a semi-colon is used to separate devices. -So the format will look like this: +So the format will look like this:: dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+] -Where, +Where:: + <name> ::= The device name. <uuid> ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "" <minor> ::= The device minor number | "" @@ -29,7 +32,7 @@ Where, <target_type> ::= "verity" | "linear" | ... (see list below) The dm line should be equivalent to the one used by the dmsetup tool with the ---concise argument. +`--concise` argument. Target types ============ @@ -38,32 +41,34 @@ Not all target types are available as there are serious risks in allowing activation of certain DM targets without first using userspace tools to check the validity of associated metadata. 
- "cache": constrained, userspace should verify cache device - "crypt": allowed - "delay": allowed - "era": constrained, userspace should verify metadata device - "flakey": constrained, meant for test - "linear": allowed - "log-writes": constrained, userspace should verify metadata device - "mirror": constrained, userspace should verify main/mirror device - "raid": constrained, userspace should verify metadata device - "snapshot": constrained, userspace should verify src/dst device - "snapshot-origin": allowed - "snapshot-merge": constrained, userspace should verify src/dst device - "striped": allowed - "switch": constrained, userspace should verify dev path - "thin": constrained, requires dm target message from userspace - "thin-pool": constrained, requires dm target message from userspace - "verity": allowed - "writecache": constrained, userspace should verify cache device - "zero": constrained, not meant for rootfs +======================= ======================================================= +`cache` constrained, userspace should verify cache device +`crypt` allowed +`delay` allowed +`era` constrained, userspace should verify metadata device +`flakey` constrained, meant for test +`linear` allowed +`log-writes` constrained, userspace should verify metadata device +`mirror` constrained, userspace should verify main/mirror device +`raid` constrained, userspace should verify metadata device +`snapshot` constrained, userspace should verify src/dst device +`snapshot-origin` allowed +`snapshot-merge` constrained, userspace should verify src/dst device +`striped` allowed +`switch` constrained, userspace should verify dev path +`thin` constrained, requires dm target message from userspace +`thin-pool` constrained, requires dm target message from userspace +`verity` allowed +`writecache` constrained, userspace should verify cache device +`zero` constrained, not meant for rootfs +======================= ======================================================= If the 
target is not listed above, it is constrained by default (not tested). Examples ======== An example of booting to a linear array made up of user-mode linux block -devices: +devices:: dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0 @@ -71,43 +76,49 @@ This will boot to a rw dm-linear target of 8192 sectors split across two block devices identified by their major:minor numbers. After boot, udev will rename this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned. -An example of multiple device-mappers, with the dm-mod.create="..." contents is shown here -split on multiple lines for readability: +An example of multiple device-mappers, with the dm-mod.create="..." contents +is shown here split on multiple lines for readability:: - vroot,,,ro, - 0 1740800 verity 254:0 254:0 1740800 sha1 - 76e9be054b15884a9fa85973e9cb274c93afadb6 - 5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe; - vram,,,rw, - 0 32768 linear 1:0 0, - 32768 32768 linear 1:1 0 + dm-linear,,1,rw, + 0 32768 linear 8:1 0, + 32768 1024000 linear 8:2 0; + dm-verity,,3,ro, + 0 1638400 verity 1 /dev/sdc1 /dev/sdc2 4096 4096 204800 1 sha256 + ac87db56303c9c1da433d7209b5a6ef3e4779df141200cbd7c157dcb8dd89c42 + 5ebfe87f7df3235b80a117ebc4078e44f55045487ad4a96581d1adb564615b51 Other examples (per target): -"crypt": +"crypt":: + dm-crypt,,8,ro, 0 1048576 crypt aes-xts-plain64 babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0 /dev/sda 0 1 allow_discards -"delay": +"delay":: + dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500 -"linear": +"linear":: + dm-linear,,,rw, 0 32768 linear /dev/sda1 0, 32768 1024000 linear /dev/sda2 0, 1056768 204800 linear /dev/sda3 0, 1261568 512000 linear /dev/sda4 0 -"snapshot-origin": +"snapshot-origin":: + dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2 -"striped": +"striped":: + dm-striped,,4,ro,0 1638400 striped 4 4096 /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0 -"verity": +"verity":: + 
dm-verity,,4,ro, 0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256 fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.rst index d63d78ffeb73..a30aa91b5fbe 100644 --- a/Documentation/device-mapper/dm-integrity.txt +++ b/Documentation/device-mapper/dm-integrity.rst @@ -1,3 +1,7 @@ +============ +dm-integrity +============ + The dm-integrity target emulates a block device that has additional per-sector tags that can be used for storing integrity information. @@ -35,15 +39,16 @@ zeroes. If the superblock is neither valid nor zeroed, the dm-integrity target can't be loaded. To use the target for the first time: + 1. overwrite the superblock with zeroes 2. load the dm-integrity target with one-sector size, the kernel driver - will format the device + will format the device 3. unload the dm-integrity target 4. read the "provided_data_sectors" value from the superblock 5. load the dm-integrity target with the target size - "provided_data_sectors" + "provided_data_sectors" 6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target - with the size "provided_data_sectors" + with the size "provided_data_sectors" Target arguments: @@ -51,17 +56,20 @@ Target arguments: 1. the underlying block device 2. the number of reserved sectors at the beginning of the device - the - dm-integrity won't read of write these sectors + dm-integrity won't read or write these sectors 3. the size of the integrity tag (if "-" is used, the size is taken from - the internal-hash algorithm) + the internal-hash algorithm) 4. mode: - D - direct writes (without journal) - in this mode, journaling is + + D - direct writes (without journal) + in this mode, journaling is not used and data sectors and integrity tags are written separately. In case of crash, it is possible that the data and integrity tag doesn't match.
- J - journaled writes - data and integrity tags are written to the + J - journaled writes + data and integrity tags are written to the journal and atomicity is guaranteed. In case of crash, either both data and tag or none of them are written. The journaled mode degrades write throughput twice because the @@ -178,9 +186,12 @@ and the reloaded target would be non-functional. The layout of the formatted block device: -* reserved sectors (they are not used by this target, they can be used for - storing LUKS metadata or for other purpose), the size of the reserved - area is specified in the target arguments + +* reserved sectors + (they are not used by this target, they can be used for + storing LUKS metadata or for other purpose), the size of the reserved + area is specified in the target arguments + * superblock (4kiB) * magic string - identifies that the device was formatted * version @@ -192,40 +203,55 @@ The layout of the formatted block device: metadata and padding). The user of this target should not send bios that access data beyond the "provided data sectors" limit. 
* flags - SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used - SB_FLAG_RECALCULATING - recalculating is in progress - SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty - blocks + SB_FLAG_HAVE_JOURNAL_MAC + - a flag is set if journal_mac is used + SB_FLAG_RECALCULATING + - recalculating is in progress + SB_FLAG_DIRTY_BITMAP + - journal area contains the bitmap of dirty + blocks * log2(sectors per block) * a position where recalculating finished * journal The journal is divided into sections, each section contains: + * metadata area (4kiB), it contains journal entries - every journal entry contains: + + - every journal entry contains: + * logical sector (specifies where the data and tag should be written) * last 8 bytes of data * integrity tag (the size is specified in the superblock) - every metadata sector ends with + + - every metadata sector ends with + * mac (8-bytes), all the macs in 8 metadata sectors form a 64-byte value. It is used to store hmac of sector numbers in the journal section, to protect against a possibility that the attacker tampers with sector numbers in the journal. * commit id + * data area (the size is variable; it depends on how many journal entries fit into the metadata area) - every sector in the data area contains: + + - every sector in the data area contains: + * data (504 bytes of data, the last 8 bytes are stored in the journal entry) * commit id + To test if the whole journal section was written correctly, every 512-byte sector of the journal ends with 8-byte commit id. If the commit id matches on all sectors in a journal section, then it is assumed that the section was written correctly. If the commit id doesn't match, the section was written partially and it should not be replayed. -* one or more runs of interleaved tags and data. Each run contains: + +* one or more runs of interleaved tags and data. + Each run contains: + * tag area - it contains integrity tags. 
There is one tag for each sector in the data area * data area - it contains data sectors. The number of data sectors diff --git a/Documentation/device-mapper/dm-io.txt b/Documentation/device-mapper/dm-io.rst index 3b5d9a52cdcf..d2492917a1f5 100644 --- a/Documentation/device-mapper/dm-io.txt +++ b/Documentation/device-mapper/dm-io.rst @@ -1,3 +1,4 @@ +===== dm-io ===== @@ -7,7 +8,7 @@ version. The user must set up an io_region structure to describe the desired location of the I/O. Each io_region indicates a block-device along with the starting -sector and size of the region. +sector and size of the region:: struct io_region { struct block_device *bdev; @@ -19,7 +20,7 @@ Dm-io can read from one io_region or write to one or more io_regions. Writes to multiple regions are specified by an array of io_region structures. The first I/O service type takes a list of memory pages as the data buffer for -the I/O, along with an offset into the first page. +the I/O, along with an offset into the first page:: struct page_list { struct page_list *next; @@ -35,7 +36,7 @@ the I/O, along with an offset into the first page. The second I/O service type takes an array of bio vectors as the data buffer for the I/O. This service can be handy if the caller has a pre-assembled bio, -but wants to direct different portions of the bio to different devices. +but wants to direct different portions of the bio to different devices:: int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, struct bio_vec *bvec, @@ -47,7 +48,7 @@ but wants to direct different portions of the bio to different devices. The third I/O service type takes a pointer to a vmalloc'd memory buffer as the data buffer for the I/O. This service can be handy if the caller needs to do I/O to a large region but doesn't want to allocate a large number of individual -memory pages. 
+memory pages:: int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw, void *data, unsigned long *error_bits); @@ -55,11 +56,11 @@ memory pages. void *data, io_notify_fn fn, void *context); Callers of the asynchronous I/O services must include the name of a completion -callback routine and a pointer to some context data for the I/O. +callback routine and a pointer to some context data for the I/O:: typedef void (*io_notify_fn)(unsigned long error, void *context); -The "error" parameter in this callback, as well as the "*error" parameter in +The "error" parameter in this callback, as well as the `*error` parameter in all of the synchronous versions, is a bitset (instead of a simple error value). In the case of a write-I/O to multiple regions, this bitset allows dm-io to indicate success or failure on each individual region. @@ -72,4 +73,3 @@ always available in order to avoid unnecessary waiting while performing I/O. When the user is finished using the dm-io services, they should call dm_io_put() and specify the same number of pages that were given on the dm_io_get() call. - diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.rst index c155ac569c44..ba4fce39bc27 100644 --- a/Documentation/device-mapper/dm-log.txt +++ b/Documentation/device-mapper/dm-log.rst @@ -1,3 +1,4 @@ +===================== Device-Mapper Logging ===================== The device-mapper logging code is used by some of the device-mapper @@ -16,11 +17,13 @@ dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different logging implementations are available and provide different capabilities.
The list includes: +============== ============================================================== Type Files -==== ===== +============== ============================================================== disk drivers/md/dm-log.c core drivers/md/dm-log.c userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h +============== ============================================================== The "disk" log type ------------------- diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.rst index f4db2562175c..d8e381c1cb02 100644 --- a/Documentation/device-mapper/dm-queue-length.txt +++ b/Documentation/device-mapper/dm-queue-length.rst @@ -1,3 +1,4 @@ +=============== dm-queue-length =============== @@ -6,12 +7,18 @@ which selects a path with the least number of in-flight I/Os. The path selector name is 'queue-length'. Table parameters for each path: [<repeat_count>] + +:: + <repeat_count>: The number of I/Os to dispatch using the selected path before switching to the next path. If not given, internal default is used. To check the default value, see the activated table. Status for each path: <status> <fail-count> <in-flight> + +:: + <status>: 'A' if the path is active, 'F' if the path is failed. <fail-count>: The number of path failures. <in-flight>: The number of in-flight I/Os on the path. @@ -29,11 +36,13 @@ Examples ======== In case that 2 paths (sda and sdb) are used with repeat_count == 128. 
-# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \ - dmsetup create test -# -# dmsetup table -test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 -# -# dmsetup status -test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 +:: + + # echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0 diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.rst index 2355bef14653..2fe255b130fb 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.rst @@ -1,3 +1,4 @@ +======= dm-raid ======= @@ -8,49 +9,66 @@ interface. Mapping Table Interface ----------------------- -The target is named "raid" and it accepts the following parameters: +The target is named "raid" and it accepts the following parameters:: <raid_type> <#raid_params> <raid_params> \ <#raid_devs> <metadata_dev0> <dev0> [.. 
<metadata_devN> <devN>] <raid_type>: + + ============= =============================================================== raid0 RAID0 striping (no resilience) raid1 RAID1 mirroring raid4 RAID4 with dedicated last parity disk raid5_n RAID5 with dedicated last parity disk supporting takeover Same as raid4 - -Transitory layout + + - Transitory layout raid5_la RAID5 left asymmetric + - rotating parity 0 with data continuation raid5_ra RAID5 right asymmetric + - rotating parity N with data continuation raid5_ls RAID5 left symmetric + - rotating parity 0 with data restart raid5_rs RAID5 right symmetric + - rotating parity N with data restart raid6_zr RAID6 zero restart + - rotating parity zero (left-to-right) with data restart raid6_nr RAID6 N restart + - rotating parity N (right-to-left) with data restart raid6_nc RAID6 N continue + - rotating parity N (right-to-left) with data continuation raid6_n_6 RAID6 with dedicate parity disks + - parity and Q-syndrome on the last 2 disks; layout for takeover from/to raid4/raid5_n raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk + - layout for takeover from raid5_la from/to raid6 raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk + - layout for takeover from raid5_ra from/to raid6 raid6_ls_6 Same as "raid5_ls" dedicated last Q-syndrome disk + - layout for takeover from raid5_ls from/to raid6 raid6_rs_6 Same as "raid5_rs" dedicated last Q-syndrome disk + - layout for takeover from raid5_rs from/to raid6 raid10 Various RAID10 inspired algorithms chosen by additional params (see raid10_format and raid10_copies below) + - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') - RAID1E: Integrated Adjacent Stripe Mirroring - RAID1E: Integrated Offset Stripe Mirroring - - and other similar RAID10 variants + - and other similar RAID10 variants + ============= =============================================================== Reference: Chapter 4 of 
http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf @@ -58,33 +76,41 @@ The target is named "raid" and it accepts the following parameters: <#raid_params>: The number of parameters that follow. <raid_params> consists of + Mandatory parameters: - <chunk_size>: Chunk size in sectors. This parameter is often known as + <chunk_size>: + Chunk size in sectors. This parameter is often known as "stripe size". It is the only mandatory parameter and is placed first. followed by optional parameters (in any order): - [sync|nosync] Force or prevent RAID initialization. + [sync|nosync] + Force or prevent RAID initialization. - [rebuild <idx>] Rebuild drive number 'idx' (first drive is 0). + [rebuild <idx>] + Rebuild drive number 'idx' (first drive is 0). [daemon_sleep <ms>] Interval between runs of the bitmap daemon that clear bits. A longer interval means less bitmap I/O but resyncing after a failure is likely to take longer. - [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization - [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization - [write_mostly <idx>] Mark drive index 'idx' write-mostly. - [max_write_behind <sectors>] See '--write-behind=' (man mdadm) - [stripe_cache <sectors>] Stripe cache size (RAID 4/5/6 only) + [min_recovery_rate <kB/sec/disk>] + Throttle RAID initialization + [max_recovery_rate <kB/sec/disk>] + Throttle RAID initialization + [write_mostly <idx>] + Mark drive index 'idx' write-mostly. + [max_write_behind <sectors>] + See '--write-behind=' (man mdadm) + [stripe_cache <sectors>] + Stripe cache size (RAID 4/5/6 only) [region_size <sectors>] The region_size multiplied by the number of regions is the logical size of the array. The bitmap records the device synchronisation state for each region. - [raid10_copies <# copies>] - [raid10_format <near|far|offset>] + [raid10_copies <# copies>], [raid10_format <near|far|offset>] These two options are used to alter the default layout of a RAID10 configuration. 
The number of copies can be specified, but the default is 2. There are also three @@ -93,13 +119,17 @@ The target is named "raid" and it accepts the following parameters: respect to mirroring. If these options are left unspecified, or 'raid10_copies 2' and/or 'raid10_format near' are given, then the layouts for 2, 3 and 4 devices are: + + ======== ========== ============== 2 drives 3 drives 4 drives - -------- ---------- -------------- + ======== ========== ============== A1 A1 A1 A1 A2 A1 A1 A2 A2 A2 A2 A2 A3 A3 A3 A3 A4 A4 A3 A3 A4 A4 A5 A5 A5 A6 A6 A4 A4 A5 A6 A6 A7 A7 A8 A8 .. .. .. .. .. .. .. .. .. + ======== ========== ============== + The 2-device layout is equivalent to 2-way RAID1. The 4-device layout is what a traditional RAID10 would look like. The 3-device layout is what might be called a 'RAID1E - Integrated @@ -107,8 +137,10 @@ The target is named "raid" and it accepts the following parameters: If 'raid10_copies 2' and 'raid10_format far', then the layouts for 2, 3 and 4 devices are: + + ======== ============ =================== 2 drives 3 drives 4 drives - -------- -------------- -------------------- + ======== ============ =================== A1 A2 A1 A2 A3 A1 A2 A3 A4 A3 A4 A4 A5 A6 A5 A6 A7 A8 A5 A6 A7 A8 A9 A9 A10 A11 A12 @@ -117,11 +149,14 @@ The target is named "raid" and it accepts the following parameters: A4 A3 A6 A4 A5 A6 A5 A8 A7 A6 A5 A9 A7 A8 A10 A9 A12 A11 .. .. .. .. .. .. .. .. .. + ======== ============ =================== If 'raid10_copies 2' and 'raid10_format offset', then the layouts for 2, 3 and 4 devices are: + + ======== ========== ================ 2 drives 3 drives 4 drives - -------- ------------ ----------------- + ======== ========== ================ A1 A2 A1 A2 A3 A1 A2 A3 A4 A2 A1 A3 A1 A2 A2 A1 A4 A3 A3 A4 A4 A5 A6 A5 A6 A7 A8 @@ -129,6 +164,8 @@ The target is named "raid" and it accepts the following parameters: A5 A6 A7 A8 A9 A9 A10 A11 A12 A6 A5 A9 A7 A8 A10 A9 A12 A11 .. .. .. .. .. .. .. .. ..
+ ======== ========== ================ + Here we see layouts closely akin to 'RAID1E - Integrated Offset Stripe Mirroring'. @@ -190,22 +227,25 @@ The target is named "raid" and it accepts the following parameters: Example Tables -------------- -# RAID4 - 4 data drives, 1 parity (no metadata devices) -# No metadata devices specified to hold superblock/bitmap info -# Chunk size of 1MiB -# (Lines separated for easy reading) -0 1960893648 raid \ - raid4 1 2048 \ - 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 +:: -# RAID4 - 4 data drives, 1 parity (with metadata devices) -# Chunk size of 1MiB, force RAID initialization, -# min recovery rate at 20 kiB/sec/disk + # RAID4 - 4 data drives, 1 parity (no metadata devices) + # No metadata devices specified to hold superblock/bitmap info + # Chunk size of 1MiB + # (Lines separated for easy reading) -0 1960893648 raid \ - raid4 4 2048 sync min_recovery_rate 20 \ - 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 + 0 1960893648 raid \ + raid4 1 2048 \ + 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 + + # RAID4 - 4 data drives, 1 parity (with metadata devices) + # Chunk size of 1MiB, force RAID initialization, + # min recovery rate at 20 kiB/sec/disk + + 0 1960893648 raid \ + raid4 4 2048 sync min_recovery_rate 20 \ + 5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82 Status Output @@ -219,41 +259,58 @@ Arguments that can be repeated are ordered by value. 'dmsetup status' yields information on the state and health of the array. The output is as follows (normally a single line, but expanded here for -clarity): -1: <s> <l> raid \ -2: <raid_type> <#devices> <health_chars> \ -3: <sync_ratio> <sync_action> <mismatch_cnt> +clarity):: + + 1: <s> <l> raid \ + 2: <raid_type> <#devices> <health_chars> \ + 3: <sync_ratio> <sync_action> <mismatch_cnt> Line 1 is the standard output produced by device-mapper. 
-Line 2 & 3 are produced by the raid target and are best explained by example: + +Line 2 & 3 are produced by the raid target and are best explained by example:: + 0 1960893648 raid raid4 5 AAAAA 2/490221568 init 0 + Here we can see the RAID type is raid4, there are 5 devices - all of which are 'A'live, and the array is 2/490221568 complete with its initial recovery. Here is a fuller description of the individual fields: + + =============== ========================================================= <raid_type> Same as the <raid_type> used to create the array. - <health_chars> One char for each device, indicating: 'A' = alive and - in-sync, 'a' = alive but not in-sync, 'D' = dead/failed. + <health_chars> One char for each device, indicating: + + - 'A' = alive and in-sync + - 'a' = alive but not in-sync + - 'D' = dead/failed. <sync_ratio> The ratio indicating how much of the array has undergone the process described by 'sync_action'. If the 'sync_action' is "check" or "repair", then the process of "resync" or "recover" can be considered complete. <sync_action> One of the following possible states: - idle - No synchronization action is being performed. - frozen - The current action has been halted. - resync - Array is undergoing its initial synchronization + + idle + - No synchronization action is being performed. + frozen + - The current action has been halted. + resync + - Array is undergoing its initial synchronization or is resynchronizing after an unclean shutdown (possibly aided by a bitmap). - recover - A device in the array is being rebuilt or + recover + - A device in the array is being rebuilt or replaced. - check - A user-initiated full check of the array is + check + - A user-initiated full check of the array is being performed. All blocks are read and checked for consistency. The number of discrepancies found are recorded in <mismatch_cnt>. No changes are made to the array by this action. 
- repair - The same as "check", but discrepancies are + repair + - The same as "check", but discrepancies are corrected. - reshape - The array is undergoing a reshape. + reshape + - The array is undergoing a reshape. <mismatch_cnt> The number of discrepancies found between mirror copies in RAID1/10 or wrong parity values found in RAID4/5/6. This value is valid only after a "check" of the array @@ -261,10 +318,11 @@ recovery. Here is a fuller description of the individual fields: <data_offset> The current data offset to the start of the user data on each component device of a raid set (see the respective raid parameter to support out-of-place reshaping). - <journal_char> 'A' - active write-through journal device. - 'a' - active write-back journal device. - 'D' - dead journal device. - '-' - no journal device. + <journal_char> - 'A' - active write-through journal device. + - 'a' - active write-back journal device. + - 'D' - dead journal device. + - '-' - no journal device. + =============== ========================================================= Message Interface @@ -272,12 +330,15 @@ Message Interface The dm-raid target will accept certain actions through the 'message' interface. ('man dmsetup' for more information on the message interface.) These actions include: - "idle" - Halt the current sync action. - "frozen" - Freeze the current sync action. - "resync" - Initiate/continue a resync. - "recover"- Initiate/continue a recover process. - "check" - Initiate a check (i.e. a "scrub") of the array. - "repair" - Initiate a repair of the array. + + ========= ================================================ + "idle" Halt the current sync action. + "frozen" Freeze the current sync action. + "resync" Initiate/continue a resync. + "recover" Initiate/continue a recover process. + "check" Initiate a check (i.e. a "scrub") of the array. + "repair" Initiate a repair of the array. 
+ ========= ================================================ Discard Support @@ -307,48 +368,52 @@ increasingly whitelisted in the kernel and can thus be trusted. For trusted devices, the following dm-raid module parameter can be set to safely enable discard support for RAID 4/5/6: + 'devices_handle_discards_safely' Version History --------------- -1.0.0 Initial version. Support for RAID 4/5/6 -1.1.0 Added support for RAID 1 -1.2.0 Handle creation of arrays that contain failed devices. -1.3.0 Added support for RAID 10 -1.3.1 Allow device replacement/rebuild for RAID 10 -1.3.2 Fix/improve redundancy checking for RAID10 -1.4.0 Non-functional change. Removes arg from mapping function. -1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). -1.4.2 Add RAID10 "far" and "offset" algorithm support. -1.5.0 Add message interface to allow manipulation of the sync_action. + +:: + + 1.0.0 Initial version. Support for RAID 4/5/6 + 1.1.0 Added support for RAID 1 + 1.2.0 Handle creation of arrays that contain failed devices. + 1.3.0 Added support for RAID 10 + 1.3.1 Allow device replacement/rebuild for RAID 10 + 1.3.2 Fix/improve redundancy checking for RAID10 + 1.4.0 Non-functional change. Removes arg from mapping function. + 1.4.1 RAID10 fix redundancy validation checks (commit 55ebbb5). + 1.4.2 Add RAID10 "far" and "offset" algorithm support. + 1.5.0 Add message interface to allow manipulation of the sync_action. New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt. -1.5.1 Add ability to restore transiently failed devices on resume. -1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". -1.6.0 Add discard support (and devices_handle_discard_safely module param). -1.7.0 Add support for MD RAID0 mappings. -1.8.0 Explicitly check for compatible flags in the superblock metadata + 1.5.1 Add ability to restore transiently failed devices on resume. + 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". 
+ 1.6.0 Add discard support (and devices_handle_discard_safely module param). + 1.7.0 Add support for MD RAID0 mappings. + 1.8.0 Explicitly check for compatible flags in the superblock metadata and reject to start the raid set if any are set by a newer target version, thus avoiding data corruption on a raid set with a reshape in progress. -1.9.0 Add support for RAID level takeover/reshape/region size + 1.9.0 Add support for RAID level takeover/reshape/region size and set size reduction. -1.9.1 Fix activation of existing RAID 4/10 mapped devices -1.9.2 Don't emit '- -' on the status table line in case the constructor + 1.9.1 Fix activation of existing RAID 4/10 mapped devices + 1.9.2 Don't emit '- -' on the status table line in case the constructor fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and 'D' on the status line. If '- -' is passed into the constructor, emit '- -' on the table line and '-' as the status line health character. -1.10.0 Add support for raid4/5/6 journal device -1.10.1 Fix data corruption on reshape request -1.11.0 Fix table line argument order + 1.10.0 Add support for raid4/5/6 journal device + 1.10.1 Fix data corruption on reshape request + 1.11.0 Fix table line argument order (wrong raid10_copies/raid10_format sequence) -1.11.1 Add raid4/5/6 journal write-back support via journal_mode option -1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available -1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') -1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size an + 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option + 1.12.1 Fix for MD deadlock between mddev_suspend() and md_write_start() available + 1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') + 1.13.1 Fix deadlock caused by early md_stop_writes(). Also fix size and state races.
-1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen -1.14.0 Fix reshape race on small devices. Fix stripe adding reshape + 1.13.2 Fix raid redundancy validation and avoid keeping raid set frozen + 1.14.0 Fix reshape race on small devices. Fix stripe adding reshape deadlock/potential data corruption. Update superblock when specific devices are requested via rebuild. Fix RAID leg rebuild errors. diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.rst index fb1d4a0cf122..facf277fc13c 100644 --- a/Documentation/device-mapper/dm-service-time.txt +++ b/Documentation/device-mapper/dm-service-time.rst @@ -1,3 +1,4 @@ +=============== dm-service-time =============== @@ -12,25 +13,34 @@ in a path-group, and it can be specified as a table argument. The path selector name is 'service-time'. -Table parameters for each path: [<repeat_count> [<relative_throughput>]] - <repeat_count>: The number of I/Os to dispatch using the selected +Table parameters for each path: + + [<repeat_count> [<relative_throughput>]] + <repeat_count>: + The number of I/Os to dispatch using the selected path before switching to the next path. If not given, internal default is used. To check the default value, see the activated table. - <relative_throughput>: The relative throughput value of the path + <relative_throughput>: + The relative throughput value of the path among all paths in the path-group. The valid range is 0-100. If not given, minimum value '1' is used. If '0' is given, the path isn't selected while other paths having a positive value are available. -Status for each path: <status> <fail-count> <in-flight-size> \ - <relative_throughput> - <status>: 'A' if the path is active, 'F' if the path is failed. - <fail-count>: The number of path failures. - <in-flight-size>: The size of in-flight I/Os on the path. - <relative_throughput>: The relative throughput value of the path - among all paths in the path-group. 
+Status for each path: + + <status> <fail-count> <in-flight-size> <relative_throughput> + <status>: + 'A' if the path is active, 'F' if the path is failed. + <fail-count>: + The number of path failures. + <in-flight-size>: + The size of in-flight I/Os on the path. + <relative_throughput>: + The relative throughput value of the path + among all paths in the path-group. Algorithm @@ -39,7 +49,7 @@ Algorithm dm-service-time adds the I/O size to 'in-flight-size' when the I/O is dispatched and subtracts when completed. Basically, dm-service-time selects a path having minimum service time -which is calculated by: +which is calculated by:: ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' @@ -67,25 +77,25 @@ Examples ======== In case that 2 paths (sda and sdb) are used with repeat_count == 128 and sda has an average throughput 1GB/s and sdb has 4GB/s, -'relative_throughput' value may be '1' for sda and '4' for sdb. - -# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ - dmsetup create test -# -# dmsetup table -test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 -# -# dmsetup status -test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 - - -Or '2' for sda and '8' for sdb would be also true. 
- -# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ - dmsetup create test -# -# dmsetup table -test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 -# -# dmsetup status -test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 +'relative_throughput' value may be '1' for sda and '4' for sdb:: + + # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 + + +Or '2' for sda and '8' for sdb would be also true:: + + # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ + dmsetup create test + # + # dmsetup table + test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 + # + # dmsetup status + test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 diff --git a/Documentation/device-mapper/dm-uevent.rst b/Documentation/device-mapper/dm-uevent.rst new file mode 100644 index 000000000000..4a8ee8d069c9 --- /dev/null +++ b/Documentation/device-mapper/dm-uevent.rst @@ -0,0 +1,110 @@ +==================== +device-mapper uevent +==================== + +The device-mapper uevent code adds the capability to device-mapper to create +and send kobject uevents (uevents). Previously device-mapper events were only +available through the ioctl interface. The advantage of the uevents interface +is the event contains environment attributes providing increased context for +the event avoiding the need to query the state of the device-mapper device after +the event is received. + +There are two functions currently for device-mapper events. 
The first function +listed creates the event and the second function sends the event(s):: + + void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, + const char *path, unsigned nr_valid_paths) + + void dm_send_uevents(struct list_head *events, struct kobject *kobj) + + +The variables added to the uevent environment are: + +Variable Name: DM_TARGET +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: +:Value: Name of device-mapper target that generated the event. + +Variable Name: DM_ACTION +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: +:Value: Device-mapper specific action that caused the uevent action. + PATH_FAILED - A path has failed; + PATH_REINSTATED - A path has been reinstated. + +Variable Name: DM_SEQNUM +------------------------ +:Uevent Action(s): KOBJ_CHANGE +:Type: unsigned integer +:Description: A sequence number for this specific device-mapper device. +:Value: Valid unsigned integer range. + +Variable Name: DM_PATH +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: Major and minor number of the path device pertaining to this + event. +:Value: Path name in the form of "Major:Minor" + +Variable Name: DM_NR_VALID_PATHS +-------------------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: unsigned integer +:Description: +:Value: Valid unsigned integer range. + +Variable Name: DM_NAME +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: Name of the device-mapper device. +:Value: Name + +Variable Name: DM_UUID +---------------------- +:Uevent Action(s): KOBJ_CHANGE +:Type: string +:Description: UUID of the device-mapper device. +:Value: UUID. (Empty string if there isn't one.) + +An example of the uevents generated as captured by udevmonitor is shown +below + +1.) 
Path failure:: + + UEVENT[1192521009.711215] change@/block/dm-3 + ACTION=change + DEVPATH=/block/dm-3 + SUBSYSTEM=block + DM_TARGET=multipath + DM_ACTION=PATH_FAILED + DM_SEQNUM=1 + DM_PATH=8:32 + DM_NR_VALID_PATHS=0 + DM_NAME=mpath2 + DM_UUID=mpath-35333333000002328 + MINOR=3 + MAJOR=253 + SEQNUM=1130 + +2.) Path reinstate:: + + UEVENT[1192521132.989927] change@/block/dm-3 + ACTION=change + DEVPATH=/block/dm-3 + SUBSYSTEM=block + DM_TARGET=multipath + DM_ACTION=PATH_REINSTATED + DM_SEQNUM=2 + DM_PATH=8:32 + DM_NR_VALID_PATHS=1 + DM_NAME=mpath2 + DM_UUID=mpath-35333333000002328 + MINOR=3 + MAJOR=253 + SEQNUM=1131 diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt deleted file mode 100644 index 07edbd85c714..000000000000 --- a/Documentation/device-mapper/dm-uevent.txt +++ /dev/null @@ -1,97 +0,0 @@ -The device-mapper uevent code adds the capability to device-mapper to create -and send kobject uevents (uevents). Previously device-mapper events were only -available through the ioctl interface. The advantage of the uevents interface -is the event contains environment attributes providing increased context for -the event avoiding the need to query the state of the device-mapper device after -the event is received. - -There are two functions currently for device-mapper events. The first function -listed creates the event and the second function sends the event(s). - -void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, - const char *path, unsigned nr_valid_paths) - -void dm_send_uevents(struct list_head *events, struct kobject *kobj) - - -The variables added to the uevent environment are: - -Variable Name: DM_TARGET -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: -Value: Name of device-mapper target that generated the event. - -Variable Name: DM_ACTION -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: -Value: Device-mapper specific action that caused the uevent action. 
- PATH_FAILED - A path has failed. - PATH_REINSTATED - A path has been reinstated. - -Variable Name: DM_SEQNUM -Uevent Action(s): KOBJ_CHANGE -Type: unsigned integer -Description: A sequence number for this specific device-mapper device. -Value: Valid unsigned integer range. - -Variable Name: DM_PATH -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: Major and minor number of the path device pertaining to this -event. -Value: Path name in the form of "Major:Minor" - -Variable Name: DM_NR_VALID_PATHS -Uevent Action(s): KOBJ_CHANGE -Type: unsigned integer -Description: -Value: Valid unsigned integer range. - -Variable Name: DM_NAME -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: Name of the device-mapper device. -Value: Name - -Variable Name: DM_UUID -Uevent Action(s): KOBJ_CHANGE -Type: string -Description: UUID of the device-mapper device. -Value: UUID. (Empty string if there isn't one.) - -An example of the uevents generated as captured by udevmonitor is shown -below. - -1.) Path failure. -UEVENT[1192521009.711215] change@/block/dm-3 -ACTION=change -DEVPATH=/block/dm-3 -SUBSYSTEM=block -DM_TARGET=multipath -DM_ACTION=PATH_FAILED -DM_SEQNUM=1 -DM_PATH=8:32 -DM_NR_VALID_PATHS=0 -DM_NAME=mpath2 -DM_UUID=mpath-35333333000002328 -MINOR=3 -MAJOR=253 -SEQNUM=1130 - -2.) Path reinstate. -UEVENT[1192521132.989927] change@/block/dm-3 -ACTION=change -DEVPATH=/block/dm-3 -SUBSYSTEM=block -DM_TARGET=multipath -DM_ACTION=PATH_REINSTATED -DM_SEQNUM=2 -DM_PATH=8:32 -DM_NR_VALID_PATHS=1 -DM_NAME=mpath2 -DM_UUID=mpath-35333333000002328 -MINOR=3 -MAJOR=253 -SEQNUM=1131 diff --git a/Documentation/device-mapper/dm-zoned.txt b/Documentation/device-mapper/dm-zoned.rst index 736fcc78d193..07f56ebc1730 100644 --- a/Documentation/device-mapper/dm-zoned.txt +++ b/Documentation/device-mapper/dm-zoned.rst @@ -1,3 +1,4 @@ +======== dm-zoned ======== @@ -133,12 +134,13 @@ A zoned block device must first be formatted using the dmzadm tool. 
This will analyze the device zone configuration, determine where to place the metadata sets on the device and initialize the metadata sets. -Ex: +Ex:: -dmzadm --format /dev/sdxx + dmzadm --format /dev/sdxx For a formatted device, the target can be created normally with the dmsetup utility. The only parameter that dm-zoned requires is the -underlying zoned block device name. Ex: +underlying zoned block device name. Ex:: -echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | dmsetup create dmz-`basename ${dev}` + echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | \ + dmsetup create dmz-`basename ${dev}` diff --git a/Documentation/device-mapper/era.txt b/Documentation/device-mapper/era.rst index 3c6d01be3560..90dd5c670b9f 100644 --- a/Documentation/device-mapper/era.txt +++ b/Documentation/device-mapper/era.rst @@ -1,3 +1,7 @@ +====== +dm-era +====== + Introduction ============ @@ -14,12 +18,14 @@ coherency after rolling back a vendor snapshot. Constructor =========== - era <metadata dev> <origin dev> <block size> +era <metadata dev> <origin dev> <block size> - metadata dev : fast device holding the persistent metadata - origin dev : device holding data blocks that may change - block size : block size of origin data device, granularity that is - tracked by the target + ================ ====================================================== + metadata dev fast device holding the persistent metadata + origin dev device holding data blocks that may change + block size block size of origin data device, granularity that is + tracked by the target + ================ ====================================================== Messages ======== @@ -49,14 +55,16 @@ Status <metadata block size> <#used metadata blocks>/<#total metadata blocks> <current era> <held metadata root | '-'> -metadata block size : Fixed block size for each metadata block in - sectors -#used metadata blocks : Number of metadata blocks used -#total metadata blocks : Total number of metadata blocks -current era 
: The current era -held metadata root : The location, in blocks, of the metadata root - that has been 'held' for userspace read - access. '-' indicates there is no held root +========================= ============================================== +metadata block size Fixed block size for each metadata block in + sectors +#used metadata blocks Number of metadata blocks used +#total metadata blocks Total number of metadata blocks +current era The current era +held metadata root The location, in blocks, of the metadata root + that has been 'held' for userspace read + access. '-' indicates there is no held root +========================= ============================================== Detailed use case ================= @@ -88,7 +96,7 @@ Memory usage The target uses a bitset to record writes in the current era. It also has a spare bitset ready for switching over to a new era. Other than -that it uses a few 4k blocks for updating metadata. +that it uses a few 4k blocks for updating metadata:: (4 * nr_blocks) bytes + buffers diff --git a/Documentation/device-mapper/index.rst b/Documentation/device-mapper/index.rst new file mode 100644 index 000000000000..105e253bc231 --- /dev/null +++ b/Documentation/device-mapper/index.rst @@ -0,0 +1,44 @@ +:orphan: + +============= +Device Mapper +============= + +.. toctree:: + :maxdepth: 1 + + cache-policies + cache + delay + dm-crypt + dm-flakey + dm-init + dm-integrity + dm-io + dm-log + dm-queue-length + dm-raid + dm-service-time + dm-uevent + dm-zoned + era + kcopyd + linear + log-writes + persistent-data + snapshot + statistics + striped + switch + thin-provisioning + unstriped + verity + writecache + zero + +.. 
only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/device-mapper/kcopyd.txt b/Documentation/device-mapper/kcopyd.rst index 820382c4cecf..7651d395127f 100644 --- a/Documentation/device-mapper/kcopyd.txt +++ b/Documentation/device-mapper/kcopyd.rst @@ -1,3 +1,4 @@ +====== kcopyd ====== @@ -7,7 +8,7 @@ notification. It is used by dm-snapshot and dm-mirror. Users of kcopyd must first create a client and indicate how many memory pages to set aside for their copy jobs. This is done with a call to -kcopyd_client_create(). +kcopyd_client_create():: int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result); @@ -16,7 +17,7 @@ To start a copy job, the user must set up io_region structures to describe the source and destinations of the copy. Each io_region indicates a block-device along with the starting sector and size of the region. The source of the copy is given as one io_region structure, and the destinations of the -copy are given as an array of io_region structures. +copy are given as an array of io_region structures:: struct io_region { struct block_device *bdev; @@ -26,7 +27,7 @@ copy are given as an array of io_region structures. To start the copy, the user calls kcopyd_copy(), passing in the client pointer, pointers to the source and destination io_regions, the name of a -completion callback routine, and a pointer to some context data for the copy. +completion callback routine, and a pointer to some context data for the copy:: int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, unsigned int num_dests, struct io_region *dests, @@ -41,7 +42,6 @@ write error occurred during the copy. When a user is done with all their copy jobs, they should call kcopyd_client_destroy() to delete the kcopyd client, which will release the -associated memory pages. 
+associated memory pages:: void kcopyd_client_destroy(struct kcopyd_client *kc); - diff --git a/Documentation/device-mapper/linear.rst b/Documentation/device-mapper/linear.rst new file mode 100644 index 000000000000..9d17fc6e64a9 --- /dev/null +++ b/Documentation/device-mapper/linear.rst @@ -0,0 +1,63 @@ +========= +dm-linear +========= + +Device-Mapper's "linear" target maps a linear range of the Device-Mapper +device onto a linear range of another device. This is the basic building +block of logical volume managers. + +Parameters: <dev path> <offset> + <dev path>: + Full pathname to the underlying block-device, or a + "major:minor" device-number. + <offset>: + Starting sector within the device. + + +Example scripts +=============== + +:: + + #!/bin/sh + # Create an identity mapping for a device + echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity + +:: + + #!/bin/sh + # Join 2 devices together + size1=`blockdev --getsz $1` + size2=`blockdev --getsz $2` + echo "0 $size1 linear $1 0 + $size1 $size2 linear $2 0" | dmsetup create joined + +:: + + #!/usr/bin/perl -w + # Split a device into 4M chunks and then join them together in reverse order. + + my $name = "reverse"; + my $extent_size = 4 * 1024 * 2; + my $dev = $ARGV[0]; + my $table = ""; + my $count = 0; + + if (!defined($dev)) { + die("Please specify a device.\n"); + } + + my $dev_size = `blockdev --getsz $dev`; + my $extents = int($dev_size / $extent_size) - + (($dev_size % $extent_size) ? 
1 : 0); + + while ($extents > 0) { + my $this_start = $count * $extent_size; + $extents--; + $count++; + my $this_offset = $extents * $extent_size; + + $table .= "$this_start $extent_size linear $dev $this_offset\n"; + } + + `echo \"$table\" | dmsetup create $name`; diff --git a/Documentation/device-mapper/linear.txt b/Documentation/device-mapper/linear.txt deleted file mode 100644 index 7cb98d89d3f8..000000000000 --- a/Documentation/device-mapper/linear.txt +++ /dev/null @@ -1,61 +0,0 @@ -dm-linear -========= - -Device-Mapper's "linear" target maps a linear range of the Device-Mapper -device onto a linear range of another device. This is the basic building -block of logical volume managers. - -Parameters: <dev path> <offset> - <dev path>: Full pathname to the underlying block-device, or a - "major:minor" device-number. - <offset>: Starting sector within the device. - - -Example scripts -=============== -[[ -#!/bin/sh -# Create an identity mapping for a device -echo "0 `blockdev --getsz $1` linear $1 0" | dmsetup create identity -]] - - -[[ -#!/bin/sh -# Join 2 devices together -size1=`blockdev --getsz $1` -size2=`blockdev --getsz $2` -echo "0 $size1 linear $1 0 -$size1 $size2 linear $2 0" | dmsetup create joined -]] - - -[[ -#!/usr/bin/perl -w -# Split a device into 4M chunks and then join them together in reverse order. - -my $name = "reverse"; -my $extent_size = 4 * 1024 * 2; -my $dev = $ARGV[0]; -my $table = ""; -my $count = 0; - -if (!defined($dev)) { - die("Please specify a device.\n"); -} - -my $dev_size = `blockdev --getsz $dev`; -my $extents = int($dev_size / $extent_size) - - (($dev_size % $extent_size) ? 
1 : 0); - -while ($extents > 0) { - my $this_start = $count * $extent_size; - $extents--; - $count++; - my $this_offset = $extents * $extent_size; - - $table .= "$this_start $extent_size linear $dev $this_offset\n"; -} - -`echo \"$table\" | dmsetup create $name`; -]] diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.rst index b638d124be6a..23141f2ffb7c 100644 --- a/Documentation/device-mapper/log-writes.txt +++ b/Documentation/device-mapper/log-writes.rst @@ -1,3 +1,4 @@ +============= dm-log-writes ============= @@ -25,11 +26,11 @@ completed WRITEs, at the time the REQ_PREFLUSH is issued, are added in order to simulate the worst case scenario with regard to power failures. Consider the following example (W means write, C means complete): -W1,W2,W3,C3,C2,Wflush,C1,Cflush + W1,W2,W3,C3,C2,Wflush,C1,Cflush -The log would show the following +The log would show the following: -W3,W2,flush,W1.... + W3,W2,flush,W1.... Again this is to simulate what is actually on disk, this allows us to detect cases where a power failure at a particular point in time would create an @@ -42,11 +43,11 @@ Any REQ_OP_DISCARD requests are treated like WRITE requests. Otherwise we would have all the DISCARD requests, and then the WRITE requests and then the FLUSH request. Consider the following example: -WRITE block 1, DISCARD block 1, FLUSH + WRITE block 1, DISCARD block 1, FLUSH -If we logged DISCARD when it completed, the replay would look like this +If we logged DISCARD when it completed, the replay would look like this: -DISCARD 1, WRITE 1, FLUSH + DISCARD 1, WRITE 1, FLUSH which isn't quite what happened and wouldn't be caught during the log replay. @@ -57,15 +58,19 @@ i) Constructor log-writes <dev_path> <log_dev_path> - dev_path : Device that all of the IO will go to normally. - log_dev_path : Device where the log entries are written to. 
+ ============= ============================================== + dev_path Device that all of the IO will go to normally. + log_dev_path Device where the log entries are written to. + ============= ============================================== ii) Status <#logged entries> <highest allocated sector> - #logged entries : Number of logged entries - highest allocated sector : Highest allocated sector + =========================== ======================== + #logged entries Number of logged entries + highest allocated sector Highest allocated sector + =========================== ======================== iii) Messages @@ -75,15 +80,15 @@ iii) Messages For example say you want to fsck a file system after every write, but first you need to replay up to the mkfs to make sure we're fsck'ing something reasonable, you would do something like - this: + this:: mkfs.btrfs -f /dev/mapper/log dmsetup message log 0 mark mkfs <run test> - This would allow you to replay the log up to the mkfs mark and - then replay from that point on doing the fsck check in the - interval that you want. + This would allow you to replay the log up to the mkfs mark and + then replay from that point on doing the fsck check in the + interval that you want. Every log has a mark at the end labeled "dm-log-writes-end". @@ -97,42 +102,42 @@ Example usage ============= Say you want to test fsync on your file system. 
You would do something like -this: - -TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" -dmsetup create log --table "$TABLE" -mkfs.btrfs -f /dev/mapper/log -dmsetup message log 0 mark mkfs - -mount /dev/mapper/log /mnt/btrfs-test -<some test that does fsync at the end> -dmsetup message log 0 mark fsync -md5sum /mnt/btrfs-test/foo -umount /mnt/btrfs-test - -dmsetup remove log -replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync -mount /dev/sdb /mnt/btrfs-test -md5sum /mnt/btrfs-test/foo -<verify md5sum's are correct> - -Another option is to do a complicated file system operation and verify the file -system is consistent during the entire operation. You could do this with: - -TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" -dmsetup create log --table "$TABLE" -mkfs.btrfs -f /dev/mapper/log -dmsetup message log 0 mark mkfs - -mount /dev/mapper/log /mnt/btrfs-test -<fsstress to dirty the fs> -btrfs filesystem balance /mnt/btrfs-test -umount /mnt/btrfs-test -dmsetup remove log - -replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs -btrfsck /dev/sdb -replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ +this:: + + TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" + dmsetup create log --table "$TABLE" + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + mount /dev/mapper/log /mnt/btrfs-test + <some test that does fsync at the end> + dmsetup message log 0 mark fsync + md5sum /mnt/btrfs-test/foo + umount /mnt/btrfs-test + + dmsetup remove log + replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync + mount /dev/sdb /mnt/btrfs-test + md5sum /mnt/btrfs-test/foo + <verify md5sum's are correct> + +Another option is to do a complicated file system operation and verify the file +system is consistent during the entire operation. 
You could do this with:: + + TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" + dmsetup create log --table "$TABLE" + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + + mount /dev/mapper/log /mnt/btrfs-test + <fsstress to dirty the fs> + btrfs filesystem balance /mnt/btrfs-test + umount /mnt/btrfs-test + dmsetup remove log + + replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs + btrfsck /dev/sdb + replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ --fsck "btrfsck /dev/sdb" --check fua And that will replay the log until it sees a FUA request, run the fsck command diff --git a/Documentation/device-mapper/persistent-data.txt b/Documentation/device-mapper/persistent-data.rst index a333bcb3a6c2..2065c3c5a091 100644 --- a/Documentation/device-mapper/persistent-data.txt +++ b/Documentation/device-mapper/persistent-data.rst @@ -1,3 +1,7 @@ +=============== +Persistent data +=============== + Introduction ============ diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.rst index b8bbb516f989..4c53304e72f1 100644 --- a/Documentation/device-mapper/snapshot.txt +++ b/Documentation/device-mapper/snapshot.rst @@ -1,15 +1,16 @@ +============================== Device-mapper snapshot support ============================== Device-mapper allows you, without massive data copying: -*) To create snapshots of any block device i.e. mountable, saved states of -the block device which are also writable without interfering with the -original content; -*) To create device "forks", i.e. multiple different versions of the -same data stream. -*) To merge a snapshot of a block device back into the snapshot's origin -device. +- To create snapshots of any block device i.e. mountable, saved states of + the block device which are also writable without interfering with the + original content; +- To create device "forks", i.e. multiple different versions of the + same data stream. 
+- To merge a snapshot of a block device back into the snapshot's origin + device. In the first two cases, dm copies only the chunks of data that get changed and uses a separate copy-on-write (COW) block device for @@ -22,7 +23,7 @@ the origin device. There are three dm targets available: snapshot, snapshot-origin, and snapshot-merge. -*) snapshot-origin <origin> +- snapshot-origin <origin> which will normally have one or more snapshots based on it. Reads will be mapped directly to the backing device. For each write, the @@ -30,7 +31,7 @@ original data will be saved in the <COW device> of each snapshot to keep its visible content unchanged, at least until the <COW device> fills up. -*) snapshot <origin> <COW device> <persistent?> <chunksize> +- snapshot <origin> <COW device> <persistent?> <chunksize> A snapshot of the <origin> block device is created. Changed chunks of <chunksize> sectors will be stored on the <COW device>. Writes will @@ -83,25 +84,25 @@ When you create the first LVM2 snapshot of a volume, four dm devices are used: source volume), whose table is replaced by a "snapshot-origin" mapping from device #1. 
-A fixed naming scheme is used, so with the following commands: +A fixed naming scheme is used, so with the following commands:: -lvcreate -L 1G -n base volumeGroup -lvcreate -L 100M --snapshot -n snap volumeGroup/base + lvcreate -L 1G -n base volumeGroup + lvcreate -L 100M --snapshot -n snap volumeGroup/base -we'll have this situation (with volumes in above order): +we'll have this situation (with volumes in above order):: -# dmsetup table|grep volumeGroup + # dmsetup table|grep volumeGroup -volumeGroup-base-real: 0 2097152 linear 8:19 384 -volumeGroup-snap-cow: 0 204800 linear 8:19 2097536 -volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16 -volumeGroup-base: 0 2097152 snapshot-origin 254:11 + volumeGroup-base-real: 0 2097152 linear 8:19 384 + volumeGroup-snap-cow: 0 204800 linear 8:19 2097536 + volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16 + volumeGroup-base: 0 2097152 snapshot-origin 254:11 -# ls -lL /dev/mapper/volumeGroup-* -brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real -brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow -brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap -brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base + # ls -lL /dev/mapper/volumeGroup-* + brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real + brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow + brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap + brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base How snapshot-merge is used by LVM2 @@ -114,27 +115,28 @@ merging snapshot after it completes. The "snapshot" that hands over its COW device to the "snapshot-merge" is deactivated (unless using lvchange --refresh); but if it is left active it will simply return I/O errors. 
-A snapshot will merge into its origin with the following command: +A snapshot will merge into its origin with the following command:: -lvconvert --merge volumeGroup/snap + lvconvert --merge volumeGroup/snap -we'll now have this situation: +we'll now have this situation:: -# dmsetup table|grep volumeGroup + # dmsetup table|grep volumeGroup -volumeGroup-base-real: 0 2097152 linear 8:19 384 -volumeGroup-base-cow: 0 204800 linear 8:19 2097536 -volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 + volumeGroup-base-real: 0 2097152 linear 8:19 384 + volumeGroup-base-cow: 0 204800 linear 8:19 2097536 + volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16 -# ls -lL /dev/mapper/volumeGroup-* -brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real -brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow -brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base + # ls -lL /dev/mapper/volumeGroup-* + brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real + brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow + brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base How to determine when a merging is complete =========================================== The snapshot-merge and snapshot status lines end with: + <sectors_allocated>/<total_sectors> <metadata_sectors> Both <sectors_allocated> and <total_sectors> include both data and metadata. @@ -142,35 +144,37 @@ During merging, the number of sectors allocated gets smaller and smaller. Merging has finished when the number of sectors holding data is zero, in other words <sectors_allocated> == <metadata_sectors>. 
-Here is a practical example (using a hybrid of lvm and dmsetup commands): +Here is a practical example (using a hybrid of lvm and dmsetup commands):: -# lvs - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup owi-a- 4.00g - snap volumeGroup swi-a- 1.00g base 18.97 + # lvs + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup owi-a- 4.00g + snap volumeGroup swi-a- 1.00g base 18.97 -# dmsetup status volumeGroup-snap -0 8388608 snapshot 397896/2097152 1560 - ^^^^ metadata sectors + # dmsetup status volumeGroup-snap + 0 8388608 snapshot 397896/2097152 1560 + ^^^^ metadata sectors -# lvconvert --merge -b volumeGroup/snap - Merging of volume snap started. + # lvconvert --merge -b volumeGroup/snap + Merging of volume snap started. -# lvs volumeGroup/snap - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup Owi-a- 4.00g 17.23 + # lvs volumeGroup/snap + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup Owi-a- 4.00g 17.23 -# dmsetup status volumeGroup-base -0 8388608 snapshot-merge 281688/2097152 1104 + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 281688/2097152 1104 -# dmsetup status volumeGroup-base -0 8388608 snapshot-merge 180480/2097152 712 + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 180480/2097152 712 -# dmsetup status volumeGroup-base -0 8388608 snapshot-merge 16/2097152 16 + # dmsetup status volumeGroup-base + 0 8388608 snapshot-merge 16/2097152 16 Merging has finished. 
-# lvs - LV VG Attr LSize Origin Snap% Move Log Copy% Convert - base volumeGroup owi-a- 4.00g +:: + + # lvs + LV VG Attr LSize Origin Snap% Move Log Copy% Convert + base volumeGroup owi-a- 4.00g diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.rst index 170ac02a1f50..3d80a9f850cc 100644 --- a/Documentation/device-mapper/statistics.txt +++ b/Documentation/device-mapper/statistics.rst @@ -1,3 +1,4 @@ +============= DM statistics ============= @@ -11,7 +12,7 @@ Individual statistics will be collected for each step-sized area within the range specified. The I/O statistics counters for each step-sized area of a region are -in the same format as /sys/block/*/stat or /proc/diskstats (see: +in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: Documentation/iostats.txt). But two extra counters (12 and 13) are provided: total time spent reading and writing. When the histogram argument is used, the 14th parameter is reported that represents the @@ -32,40 +33,45 @@ on each other's data. The creation of DM statistics will allocate memory via kmalloc or fallback to using vmalloc space. At most, 1/4 of the overall system memory may be allocated by DM statistics. The admin can see how much -memory is used by reading -/sys/module/dm_mod/parameters/stats_current_allocated_bytes +memory is used by reading: + + /sys/module/dm_mod/parameters/stats_current_allocated_bytes Messages ======== - @stats_create <range> <step> - [<number_of_optional_arguments> <optional_arguments>...] - [<program_id> [<aux_data>]] - + @stats_create <range> <step> [<number_of_optional_arguments> <optional_arguments>...] [<program_id> [<aux_data>]] Create a new region and return the region_id. <range> - "-" - whole device - "<start_sector>+<length>" - a range of <length> 512-byte sectors - starting with <start_sector>. + "-" + whole device + "<start_sector>+<length>" + a range of <length> 512-byte sectors + starting with <start_sector>. 
<step> - "<area_size>" - the range is subdivided into areas each containing - <area_size> sectors. - "/<number_of_areas>" - the range is subdivided into the specified - number of areas. + "<area_size>" + the range is subdivided into areas each containing + <area_size> sectors. + "/<number_of_areas>" + the range is subdivided into the specified + number of areas. <number_of_optional_arguments> The number of optional arguments <optional_arguments> - The following optional arguments are supported - precise_timestamps - use precise timer with nanosecond resolution + The following optional arguments are supported: + + precise_timestamps + use precise timer with nanosecond resolution instead of the "jiffies" variable. When this argument is used, the resulting times are in nanoseconds instead of milliseconds. Precise timestamps are a little bit slower to obtain than jiffies-based timestamps. - histogram:n1,n2,n3,n4,... - collect histogram of latencies. The + histogram:n1,n2,n3,n4,... + collect histogram of latencies. The numbers n1, n2, etc are times that represent the boundaries of the histogram. If precise_timestamps is not used, the times are in milliseconds, otherwise they are in @@ -96,21 +102,18 @@ Messages @stats_list message, but it doesn't use this value for anything. @stats_delete <region_id> - Delete the region with the specified id. <region_id> region_id returned from @stats_create @stats_clear <region_id> - Clear all the counters except the in-flight i/o counters. <region_id> region_id returned from @stats_create @stats_list [<program_id>] - List all regions registered with @stats_create. <program_id> @@ -127,7 +130,6 @@ Messages if they were specified when creating the region. @stats_print <region_id> [<starting_line> <number_of_lines>] - Print counters for each step-sized area of a region. 
<region_id> @@ -143,10 +145,11 @@ Messages Output format for each step-sized area of a region: - <start_sector>+<length> counters + <start_sector>+<length> + counters The first 11 counters have the same meaning as - /sys/block/*/stat or /proc/diskstats. + `/sys/block/*/stat` or `/proc/diskstats`. Please refer to Documentation/iostats.txt for details. @@ -163,11 +166,11 @@ Messages 11. the weighted number of milliseconds spent doing I/Os Additional counters: + 12. the total time spent reading in milliseconds 13. the total time spent writing in milliseconds @stats_print_clear <region_id> [<starting_line> <number_of_lines>] - Atomically print and then clear all the counters except the in-flight i/o counters. Useful when the client consuming the statistics does not want to lose any statistics (those updated @@ -185,7 +188,6 @@ Messages If omitted, all lines are printed and then cleared. @stats_set_aux <region_id> <aux_data> - Store auxiliary data aux_data for the specified region. <region_id> @@ -201,23 +203,23 @@ Examples ======== Subdivide the DM device 'vol' into 100 pieces and start collecting -statistics on them: +statistics on them:: dmsetup message vol 0 @stats_create - /100 Set the auxiliary data string to "foo bar baz" (the escape for each -space must also be escaped, otherwise the shell will consume them): +space must also be escaped, otherwise the shell will consume them):: dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz -List the statistics: +List the statistics:: dmsetup message vol 0 @stats_list -Print the statistics: +Print the statistics:: dmsetup message vol 0 @stats_print 0 -Delete the statistics: +Delete the statistics:: dmsetup message vol 0 @stats_delete 0 diff --git a/Documentation/device-mapper/striped.rst b/Documentation/device-mapper/striped.rst new file mode 100644 index 000000000000..e9a8da192ae1 --- /dev/null +++ b/Documentation/device-mapper/striped.rst @@ -0,0 +1,61 @@ +========= +dm-stripe +========= + +Device-Mapper's "striped" 
target is used to create a striped (i.e. RAID-0) +device across one or more underlying devices. Data is written in "chunks", +with consecutive chunks rotating among the underlying devices. This can +potentially provide improved I/O throughput by utilizing several physical +devices in parallel. + +Parameters: <num devs> <chunk size> [<dev path> <offset>]+ + <num devs>: + Number of underlying devices. + <chunk size>: + Size of each chunk of data. Must be at least as + large as the system's PAGE_SIZE. + <dev path>: + Full pathname to the underlying block-device, or a + "major:minor" device-number. + <offset>: + Starting sector within the device. + +One or more underlying devices can be specified. The striped device size must +be a multiple of the chunk size multiplied by the number of underlying devices. + + +Example scripts +=============== + +:: + + #!/usr/bin/perl -w + # Create a striped device across any number of underlying devices. The device + # will be called "stripe_dev" and have a chunk-size of 128k. + + my $chunk_size = 128 * 2; + my $dev_name = "stripe_dev"; + my $num_devs = @ARGV; + my @devs = @ARGV; + my ($min_dev_size, $stripe_dev_size, $i); + + if (!$num_devs) { + die("Specify at least one device\n"); + } + + $min_dev_size = `blockdev --getsz $devs[0]`; + for ($i = 1; $i < $num_devs; $i++) { + my $this_size = `blockdev --getsz $devs[$i]`; + $min_dev_size = ($min_dev_size < $this_size) ? 
+ $min_dev_size : $this_size; + } + + $stripe_dev_size = $min_dev_size * $num_devs; + $stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs); + + $table = "0 $stripe_dev_size striped $num_devs $chunk_size"; + for ($i = 0; $i < $num_devs; $i++) { + $table .= " $devs[$i] 0"; + } + + `echo $table | dmsetup create $dev_name`; diff --git a/Documentation/device-mapper/striped.txt b/Documentation/device-mapper/striped.txt deleted file mode 100644 index 07ec492cceee..000000000000 --- a/Documentation/device-mapper/striped.txt +++ /dev/null @@ -1,57 +0,0 @@ -dm-stripe -========= - -Device-Mapper's "striped" target is used to create a striped (i.e. RAID-0) -device across one or more underlying devices. Data is written in "chunks", -with consecutive chunks rotating among the underlying devices. This can -potentially provide improved I/O throughput by utilizing several physical -devices in parallel. - -Parameters: <num devs> <chunk size> [<dev path> <offset>]+ - <num devs>: Number of underlying devices. - <chunk size>: Size of each chunk of data. Must be at least as - large as the system's PAGE_SIZE. - <dev path>: Full pathname to the underlying block-device, or a - "major:minor" device-number. - <offset>: Starting sector within the device. - -One or more underlying devices can be specified. The striped device size must -be a multiple of the chunk size multiplied by the number of underlying devices. - - -Example scripts -=============== - -[[ -#!/usr/bin/perl -w -# Create a striped device across any number of underlying devices. The device -# will be called "stripe_dev" and have a chunk-size of 128k. 
- -my $chunk_size = 128 * 2; -my $dev_name = "stripe_dev"; -my $num_devs = @ARGV; -my @devs = @ARGV; -my ($min_dev_size, $stripe_dev_size, $i); - -if (!$num_devs) { - die("Specify at least one device\n"); -} - -$min_dev_size = `blockdev --getsz $devs[0]`; -for ($i = 1; $i < $num_devs; $i++) { - my $this_size = `blockdev --getsz $devs[$i]`; - $min_dev_size = ($min_dev_size < $this_size) ? - $min_dev_size : $this_size; -} - -$stripe_dev_size = $min_dev_size * $num_devs; -$stripe_dev_size -= $stripe_dev_size % ($chunk_size * $num_devs); - -$table = "0 $stripe_dev_size striped $num_devs $chunk_size"; -for ($i = 0; $i < $num_devs; $i++) { - $table .= " $devs[$i] 0"; -} - -`echo $table | dmsetup create $dev_name`; -]] - diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.rst index 5bd4831db4a8..7dde06be1a4f 100644 --- a/Documentation/device-mapper/switch.txt +++ b/Documentation/device-mapper/switch.rst @@ -1,3 +1,4 @@ +========= dm-switch ========= @@ -67,27 +68,25 @@ b-tree can achieve. Construction Parameters ======================= - <num_paths> <region_size> <num_optional_args> [<optional_args>...] - [<dev_path> <offset>]+ - -<num_paths> - The number of paths across which to distribute the I/O. + <num_paths> <region_size> <num_optional_args> [<optional_args>...] [<dev_path> <offset>]+ + <num_paths> + The number of paths across which to distribute the I/O. -<region_size> - The number of 512-byte sectors in a region. Each region can be redirected - to any of the available paths. + <region_size> + The number of 512-byte sectors in a region. Each region can be redirected + to any of the available paths. -<num_optional_args> - The number of optional arguments. Currently, no optional arguments - are supported and so this must be zero. + <num_optional_args> + The number of optional arguments. Currently, no optional arguments + are supported and so this must be zero. 
-<dev_path> - The block device that represents a specific path to the device. + <dev_path> + The block device that represents a specific path to the device. -<offset> - The offset of the start of data on the specific <dev_path> (in units - of 512-byte sectors). This number is added to the sector number when - forwarding the request to the specific path. Typically it is zero. + <offset> + The offset of the start of data on the specific <dev_path> (in units + of 512-byte sectors). This number is added to the sector number when + forwarding the request to the specific path. Typically it is zero. Messages ======== @@ -122,17 +121,21 @@ Example Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with the same size. -Create a switch device with 64kB region size: +Create a switch device with 64kB region size:: + dmsetup create switch --table "0 `blockdev --getsz /dev/vg1/switch0` switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0" Set mappings for the first 7 entries to point to devices switch0, switch1, -switch2, switch0, switch1, switch2, switch1: +switch2, switch0, switch1, switch2, switch1:: + dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 -Set repetitive mapping. This command: +Set repetitive mapping. This command:: + dmsetup message switch 0 set_region_mappings 1000:1 :2 R2,10 -is equivalent to: + +is equivalent to:: + dmsetup message switch 0 set_region_mappings 1000:1 :2 :1 :2 :1 :2 :1 :2 \ :1 :2 :1 :2 :1 :2 :1 :2 :1 :2 - diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.rst index 883e7ca5f745..bafebf79da4b 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.rst @@ -1,3 +1,7 @@ +================= +Thin provisioning +================= + Introduction ============ @@ -95,6 +99,8 @@ previously.) 
Using an existing pool device ----------------------------- +:: + dmsetup create pool \ --table "0 20971520 thin-pool $metadata_dev $data_dev \ $data_block_size $low_water_mark" @@ -154,7 +160,7 @@ Thin provisioning i) Creating a new thinly-provisioned volume. To create a new thinly- provisioned volume you must send a message to an - active pool device, /dev/mapper/pool in this example. + active pool device, /dev/mapper/pool in this example:: dmsetup message /dev/mapper/pool 0 "create_thin 0" @@ -164,7 +170,7 @@ i) Creating a new thinly-provisioned volume. ii) Using a thinly-provisioned volume. - Thinly-provisioned volumes are activated using the 'thin' target: + Thinly-provisioned volumes are activated using the 'thin' target:: dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0" @@ -181,6 +187,8 @@ i) Creating an internal snapshot. must suspend it before creating the snapshot to avoid corruption. This is NOT enforced at the moment, so please be careful! + :: + dmsetup suspend /dev/mapper/thin dmsetup message /dev/mapper/pool 0 "create_snap 1 0" dmsetup resume /dev/mapper/thin @@ -198,14 +206,14 @@ ii) Using an internal snapshot. activating or removing them both. (This differs from conventional device-mapper snapshots.) - Activate it exactly the same way as any other thinly-provisioned volume: + Activate it exactly the same way as any other thinly-provisioned volume:: dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1" External snapshots ------------------ -You can use an external _read only_ device as an origin for a +You can use an external **read only** device as an origin for a thinly-provisioned volume. Any read to an unprovisioned area of the thin device will be passed through to the origin. Writes trigger the allocation of new blocks as usual. @@ -223,11 +231,13 @@ i) Creating a snapshot of an external device This is the same as creating a thin device. You don't mention the origin at this stage. 
+ :: + dmsetup message /dev/mapper/pool 0 "create_thin 0" ii) Using a snapshot of an external device. - Append an extra parameter to the thin target specifying the origin: + Append an extra parameter to the thin target specifying the origin:: dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 0 /dev/image" @@ -240,6 +250,8 @@ Deactivation All devices using a pool must be deactivated before the pool itself can be. +:: + dmsetup remove thin dmsetup remove snap dmsetup remove pool @@ -252,25 +264,32 @@ Reference i) Constructor - thin-pool <metadata dev> <data dev> <data block size (sectors)> \ - <low water mark (blocks)> [<number of feature args> [<arg>]*] + :: + + thin-pool <metadata dev> <data dev> <data block size (sectors)> \ + <low water mark (blocks)> [<number of feature args> [<arg>]*] Optional feature arguments: - skip_block_zeroing: Skip the zeroing of newly-provisioned blocks. + skip_block_zeroing: + Skip the zeroing of newly-provisioned blocks. - ignore_discard: Disable discard support. + ignore_discard: + Disable discard support. - no_discard_passdown: Don't pass discards down to the underlying - data device, but just remove the mapping. + no_discard_passdown: + Don't pass discards down to the underlying + data device, but just remove the mapping. - read_only: Don't allow any changes to be made to the pool + read_only: + Don't allow any changes to be made to the pool metadata. This mode is only available after the thin-pool has been created and first used in full read/write mode. It cannot be specified on initial thin-pool creation. - error_if_no_space: Error IOs, instead of queueing, if no space. + error_if_no_space: + Error IOs, instead of queueing, if no space. Data block size must be between 64KB (128 sectors) and 1GB (2097152 sectors) inclusive. 
@@ -278,10 +297,12 @@ i) Constructor ii) Status - <transaction id> <used metadata blocks>/<total metadata blocks> - <used data blocks>/<total data blocks> <held metadata root> - ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space - needs_check|- metadata_low_watermark + :: + + <transaction id> <used metadata blocks>/<total metadata blocks> + <used data blocks>/<total data blocks> <held metadata root> + ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space + needs_check|- metadata_low_watermark transaction id: A 64-bit number used by userspace to help synchronise with metadata @@ -336,13 +357,11 @@ ii) Status iii) Messages create_thin <dev id> - Create a new thinly-provisioned device. <dev id> is an arbitrary unique 24-bit identifier chosen by the caller. create_snap <dev id> <origin id> - Create a new snapshot of another thinly-provisioned device. <dev id> is an arbitrary unique 24-bit identifier chosen by the caller. @@ -350,11 +369,9 @@ iii) Messages of which the new device will be a snapshot. delete <dev id> - Deletes a thin device. Irreversible. set_transaction_id <current id> <new id> - Userland volume managers, such as LVM, need a way to synchronise their external metadata with the internal metadata of the pool target. The thin-pool target offers to store an @@ -364,14 +381,12 @@ iii) Messages compare-and-swap message. reserve_metadata_snap - Reserve a copy of the data mapping btree for use by userland. This allows userland to inspect the mappings as they were when this message was executed. Use the pool's status command to get the root block associated with the metadata snapshot. release_metadata_snap - Release a previously reserved copy of the data mapping btree. 'thin' target @@ -379,7 +394,9 @@ iii) Messages i) Constructor - thin <pool dev> <dev id> [<external origin dev>] + :: + + thin <pool dev> <dev id> [<external origin dev>] pool dev: the thin-pool device, e.g. 
/dev/mapper/my_pool or 253:0 @@ -401,8 +418,7 @@ provisioned as and when needed. ii) Status - <nr mapped sectors> <highest mapped sector> - + <nr mapped sectors> <highest mapped sector> If the pool has encountered device errors and failed, the status will just contain the string 'Fail'. The userspace recovery tools should then be used. diff --git a/Documentation/device-mapper/unstriped.txt b/Documentation/device-mapper/unstriped.rst index 0b2a306c54ee..0a8d3eb3f072 100644 --- a/Documentation/device-mapper/unstriped.txt +++ b/Documentation/device-mapper/unstriped.rst @@ -1,3 +1,7 @@ +================================ +Device-mapper "unstriped" target +================================ + Introduction ============ @@ -34,46 +38,46 @@ striped target to combine the 4 devices into one. It then will use the unstriped target ontop of the striped device to access the individual backing loop devices. We write data to the newly exposed unstriped devices and verify the data written matches the correct -underlying device on the striped array. 
+underlying device on the striped array:: -#!/bin/bash + #!/bin/bash -MEMBER_SIZE=$((128 * 1024 * 1024)) -NUM=4 -SEQ_END=$((${NUM}-1)) -CHUNK=256 -BS=4096 + MEMBER_SIZE=$((128 * 1024 * 1024)) + NUM=4 + SEQ_END=$((${NUM}-1)) + CHUNK=256 + BS=4096 -RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) -DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" -COUNT=$((${MEMBER_SIZE} / ${BS})) + RAID_SIZE=$((${MEMBER_SIZE}*${NUM}/512)) + DM_PARMS="0 ${RAID_SIZE} striped ${NUM} ${CHUNK}" + COUNT=$((${MEMBER_SIZE} / ${BS})) -for i in $(seq 0 ${SEQ_END}); do - dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct - losetup /dev/loop${i} member-${i} - DM_PARMS+=" /dev/loop${i} 0" -done + for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/zero of=member-${i} bs=${MEMBER_SIZE} count=1 oflag=direct + losetup /dev/loop${i} member-${i} + DM_PARMS+=" /dev/loop${i} 0" + done -echo $DM_PARMS | dmsetup create raid0 -for i in $(seq 0 ${SEQ_END}); do - echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} -done; + echo $DM_PARMS | dmsetup create raid0 + for i in $(seq 0 ${SEQ_END}); do + echo "0 1 unstriped ${NUM} ${CHUNK} ${i} /dev/mapper/raid0 0" | dmsetup create set-${i} + done; -for i in $(seq 0 ${SEQ_END}); do - dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct - diff /dev/mapper/set-${i} member-${i} -done; + for i in $(seq 0 ${SEQ_END}); do + dd if=/dev/urandom of=/dev/mapper/set-${i} bs=${BS} count=${COUNT} oflag=direct + diff /dev/mapper/set-${i} member-${i} + done; -for i in $(seq 0 ${SEQ_END}); do - dmsetup remove set-${i} -done + for i in $(seq 0 ${SEQ_END}); do + dmsetup remove set-${i} + done -dmsetup remove raid0 + dmsetup remove raid0 -for i in $(seq 0 ${SEQ_END}); do - losetup -d /dev/loop${i} - rm -f member-${i} -done + for i in $(seq 0 ${SEQ_END}); do + losetup -d /dev/loop${i} + rm -f member-${i} + done Another example --------------- @@ -81,7 +85,7 @@ Another example Intel NVMe drives contain two cores on the 
physical device. Each core of the drive has segregated access to its LBA range. The current LBA model has a RAID 0 128k chunk on each core, resulting -in a 256k stripe across the two cores: +in a 256k stripe across the two cores:: Core 0: Core 1: __________ __________ @@ -108,17 +112,24 @@ Example dmsetup usage unstriped ontop of Intel NVMe device that has 2 cores ----------------------------------------------------- -dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' -dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' + +:: + + dmsetup create nvmset0 --table '0 512 unstriped 2 256 0 /dev/nvme0n1 0' + dmsetup create nvmset1 --table '0 512 unstriped 2 256 1 /dev/nvme0n1 0' There will now be two devices that expose Intel NVMe core 0 and 1 -respectively: -/dev/mapper/nvmset0 -/dev/mapper/nvmset1 +respectively:: + + /dev/mapper/nvmset0 + /dev/mapper/nvmset1 unstriped ontop of striped with 4 drives using 128K chunk size -------------------------------------------------------------- -dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' -dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' -dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' -dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' + +:: + + dmsetup create raid_disk0 --table '0 512 unstriped 4 256 0 /dev/mapper/striped 0' + dmsetup create raid_disk1 --table '0 512 unstriped 4 256 1 /dev/mapper/striped 0' + dmsetup create raid_disk2 --table '0 512 unstriped 4 256 2 /dev/mapper/striped 0' + dmsetup create raid_disk3 --table '0 512 unstriped 4 256 3 /dev/mapper/striped 0' diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.rst index b3d2e4a42255..a4d1c1476d72 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.rst @@ -1,5 +1,6 @@ +========= dm-verity -========== +========= 
Device-Mapper's "verity" target provides transparent integrity checking of block devices using a cryptographic digest provided by the kernel crypto API. @@ -7,6 +8,9 @@ This target is read-only. Construction Parameters ======================= + +:: + <version> <dev> <hash_dev> <data_block_size> <hash_block_size> <num_data_blocks> <hash_start_block> @@ -160,7 +164,9 @@ calculating the parent node. The tree looks something like: -alg = sha256, num_blocks = 32768, block_size = 4096 + alg = sha256, num_blocks = 32768, block_size = 4096 + +:: [ root ] / . . . \ @@ -189,6 +195,7 @@ block boundary) are the hash blocks which are stored a depth at a time The full specification of kernel parameters and on-disk metadata format is available at the cryptsetup project's wiki page + https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity Status @@ -198,7 +205,8 @@ If any check failed, C (for Corruption) is returned. Example ======= -Set up a device: +Set up a device:: + # dmsetup create vroot --readonly --table \ "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 "\ "4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 "\ @@ -209,11 +217,13 @@ the hash tree or activate the kernel device. This is available from the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/ (as a libcryptsetup extension). -Create hash on the device: +Create hash on the device:: + # veritysetup format /dev/sda1 /dev/sda2 ... 
Root hash: 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 -Activate the device: +Activate the device:: + # veritysetup create vroot /dev/sda1 /dev/sda2 \ 4392712ba01368efdf14b05c76f9e4df0d53664630b5d48632ed17a137f39076 diff --git a/Documentation/device-mapper/writecache.txt b/Documentation/device-mapper/writecache.rst index 01532b3008ae..d3d7690f5e8d 100644 --- a/Documentation/device-mapper/writecache.txt +++ b/Documentation/device-mapper/writecache.rst @@ -1,3 +1,7 @@ +================= +Writecache target +================= + The writecache target caches writes on persistent memory or on SSD. It doesn't cache reads because reads are supposed to be cached in page cache in normal RAM. @@ -6,15 +10,18 @@ When the device is constructed, the first sector should be zeroed or the first sector should contain valid superblock from previous invocation. Constructor parameters: + 1. type of the cache device - "p" or "s" - p - persistent memory - s - SSD + + - p - persistent memory + - s - SSD 2. the underlying device that will be cached 3. the cache device 4. block size (4096 is recommended; the maximum block size is the page size) 5. the number of optional parameters (the parameters with an argument count as two) + start_sector n (default: 0) offset from the start of cache device in 512-byte sectors high_watermark n (default: 50) @@ -43,6 +50,7 @@ Constructor parameters: applicable only to persistent memory - don't use the FUA flag when writing back data and send the FLUSH request afterwards + - some underlying devices perform better with fua, some with nofua. The user should test it @@ -60,6 +68,7 @@ Messages: flush the cache device on next suspend. Use this message when you are going to remove the cache device. The proper sequence for removing the cache device is: + 1. send the "flush_on_suspend" message 2. 
load an inactive table with a linear target that maps to the underlying device diff --git a/Documentation/device-mapper/zero.txt b/Documentation/device-mapper/zero.rst index 20fb38e7fa7e..11fb5cf4597c 100644 --- a/Documentation/device-mapper/zero.txt +++ b/Documentation/device-mapper/zero.rst @@ -1,3 +1,4 @@ +======= dm-zero ======= @@ -18,20 +19,19 @@ filesystem limitations. To create a sparse device, start by creating a dm-zero device that's the desired size of the sparse device. For this example, we'll assume a 10TB -sparse device. +sparse device:: -TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors -echo "0 $TEN_TERABYTES zero" | dmsetup create zero1 + TEN_TERABYTES=`expr 10 \* 1024 \* 1024 \* 1024 \* 2` # 10 TB in sectors + echo "0 $TEN_TERABYTES zero" | dmsetup create zero1 Then create a snapshot of the zero device, using any available block-device as the COW device. The size of the COW device will determine the amount of real space available to the sparse device. For this example, we'll assume /dev/sdb1 -is an available 10GB partition. +is an available 10GB partition:: -echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \ - dmsetup create sparse1 + echo "0 $TEN_TERABYTES snapshot /dev/mapper/zero1 /dev/sdb1 p 128" | \ + dmsetup create sparse1 This will create a 10TB sparse device called /dev/mapper/sparse1 that has 10GB of actual storage space available. If more than 10GB of data is written to this device, it will start returning I/O errors. - diff --git a/Documentation/devicetree/bindings/net/fsl-enetc.txt b/Documentation/devicetree/bindings/net/fsl-enetc.txt index c812e25ae90f..25fc687419db 100644 --- a/Documentation/devicetree/bindings/net/fsl-enetc.txt +++ b/Documentation/devicetree/bindings/net/fsl-enetc.txt @@ -16,8 +16,8 @@ Required properties: In this case, the ENETC node should include a "mdio" sub-node that in turn should contain the "ethernet-phy" node describing the external phy. 
Below properties are required, their bindings -already defined in ethernet.txt or phy.txt, under -Documentation/devicetree/bindings/net/*. +already defined in Documentation/devicetree/bindings/net/ethernet.txt or +Documentation/devicetree/bindings/net/phy.txt. Required: @@ -51,8 +51,7 @@ Example: connection: In this case, the ENETC port node defines a fixed link connection, -as specified by "fixed-link.txt", under -Documentation/devicetree/bindings/net/*. +as specified by Documentation/devicetree/bindings/net/fixed-link.txt. Required: diff --git a/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt b/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt index 12b18f82d441..efa2c8b9b85a 100644 --- a/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt +++ b/Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt @@ -3,7 +3,7 @@ Amlogic Meson AXG DWC PCIE SoC controller Amlogic Meson PCIe host controller is based on the Synopsys DesignWare PCI core. It shares common functions with the PCIe DesignWare core driver and inherits common properties defined in -Documentation/devicetree/bindings/pci/designware-pci.txt. +Documentation/devicetree/bindings/pci/designware-pcie.txt. Additional properties are described here: diff --git a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt index 7ef2dbe48e8a..14d2eee96b3d 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt +++ b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt @@ -97,7 +97,7 @@ Second Level Nodes - Regulators sent for this regulator including those which are for a strictly lower power state. -Other properties defined in Documentation/devicetree/bindings/regulator.txt +Other properties defined in Documentation/devicetree/bindings/regulator/regulator.txt may also be used. 
regulator-initial-mode and regulator-allowed-modes may be specified for VRM regulators using mode values from include/dt-bindings/regulator/qcom,rpmh-regulator.h. regulator-allow-bypass diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt index e86bd2f64117..60f8640f2b2f 100644 --- a/Documentation/devicetree/booting-without-of.txt +++ b/Documentation/devicetree/booting-without-of.txt @@ -277,7 +277,7 @@ it with special cases. the decompressor (the real mode entry point goes to the same 32bit entry point once it switched into protected mode). That entry point supports one calling convention which is documented in - Documentation/x86/boot.txt + Documentation/x86/boot.rst The physical pointer to the device-tree block (defined in chapter II) is passed via setup_data which requires at least boot protocol 2.09. The type filed is defined as diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst index f96059767c8c..192c36af39e2 100644 --- a/Documentation/doc-guide/kernel-doc.rst +++ b/Documentation/doc-guide/kernel-doc.rst @@ -359,7 +359,7 @@ Domain`_ references. ``monospaced font``. Useful if you need to use special characters that would otherwise have some - meaning either by kernel-doc script of by reStructuredText. + meaning either by kernel-doc script or by reStructuredText. This is particularly useful if you need to use things like ``%ph`` inside a function description. diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst index c039224b404e..f71ddd592aaa 100644 --- a/Documentation/doc-guide/sphinx.rst +++ b/Documentation/doc-guide/sphinx.rst @@ -27,8 +27,7 @@ Sphinx Install ============== The ReST markups currently used by the Documentation/ files are meant to be -built with ``Sphinx`` version 1.3 or higher. If you desire to build -PDF output, it is recommended to use version 1.4.6 or higher. +built with ``Sphinx`` version 1.3 or higher. 
There's a script that checks for the Sphinx requirements. Please see :ref:`sphinx-pre-install` for further details. @@ -56,13 +55,13 @@ or ``virtualenv``, depending on how your distribution packaged Python 3. those expressions are written using LaTeX notation. It needs texlive installed with amdfonts and amsmath in order to evaluate them. -In summary, if you want to install Sphinx version 1.4.9, you should do:: +In summary, if you want to install Sphinx version 1.7.9, you should do:: - $ virtualenv sphinx_1.4 - $ . sphinx_1.4/bin/activate - (sphinx_1.4) $ pip install -r Documentation/sphinx/requirements.txt + $ virtualenv sphinx_1.7.9 + $ . sphinx_1.7.9/bin/activate + (sphinx_1.7.9) $ pip install -r Documentation/sphinx/requirements.txt -After running ``. sphinx_1.4/bin/activate``, the prompt will change, +After running ``. sphinx_1.7.9/bin/activate``, the prompt will change, in order to indicate that you're using the new environment. If you open a new shell, you need to rerun this command to enter again at the virtual environment before building the documentation. @@ -105,8 +104,8 @@ command line options for your distro:: You should run: sudo dnf install -y texlive-luatex85 - /usr/bin/virtualenv sphinx_1.4 - . sphinx_1.4/bin/activate + /usr/bin/virtualenv sphinx_1.7.9 + . sphinx_1.7.9/bin/activate pip install -r Documentation/sphinx/requirements.txt Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468. @@ -218,7 +217,7 @@ Here are some specific guidelines for the kernel documentation: examples, etc.), use ``::`` for anything that doesn't really benefit from syntax highlighting, especially short snippets. Use ``.. code-block:: <language>`` for longer code blocks that benefit - from highlighting. + from highlighting. For a short snippet of code embedded in the text, use \`\`. the C domain @@ -242,11 +241,14 @@ The C domain of the kernel-doc has some additional features. E.g. you can The func-name (e.g. 
ioctl) remains in the output but the ref-name changed from ``ioctl`` to ``VIDIOC_LOG_STATUS``. The index entry for this function is also -changed to ``VIDIOC_LOG_STATUS`` and the function can now referenced by: - -.. code-block:: rst - - :c:func:`VIDIOC_LOG_STATUS` +changed to ``VIDIOC_LOG_STATUS``. + +Please note that there is no need to use ``c:func:`` to generate cross +references to function documentation. Due to some Sphinx extension magic, +the documentation build system will automatically turn a reference to +``function()`` into a cross reference if an index entry for the given +function name exists. If you see ``c:func:`` use in a kernel document, +please feel free to remove it. list tables diff --git a/Documentation/docutils.conf b/Documentation/docutils.conf index 2830772264c8..f1a180b97dec 100644 --- a/Documentation/docutils.conf +++ b/Documentation/docutils.conf @@ -4,4 +4,4 @@ # http://docutils.sourceforge.net/docs/user/config.html [general] -halt_level: severe
\ No newline at end of file +halt_level: severe diff --git a/Documentation/driver-api/basics.rst b/Documentation/driver-api/basics.rst index e970fadf4d1a..1ba88c7b3984 100644 --- a/Documentation/driver-api/basics.rst +++ b/Documentation/driver-api/basics.rst @@ -115,9 +115,6 @@ Kernel utility functions .. kernel-doc:: kernel/rcu/tree.c :export: -.. kernel-doc:: kernel/rcu/tree_plugin.h - :export: - .. kernel-doc:: kernel/rcu/update.c :export: diff --git a/Documentation/driver-api/clk.rst b/Documentation/driver-api/clk.rst index 593cca5058b1..3cad45d14187 100644 --- a/Documentation/driver-api/clk.rst +++ b/Documentation/driver-api/clk.rst @@ -175,9 +175,9 @@ the following:: To take advantage of your data you'll need to support valid operations for your clk:: - struct clk_ops clk_foo_ops { - .enable = &clk_foo_enable; - .disable = &clk_foo_disable; + struct clk_ops clk_foo_ops = { + .enable = &clk_foo_enable, + .disable = &clk_foo_disable, }; Implement the above functions using container_of:: diff --git a/Documentation/driver-api/firmware/other_interfaces.rst b/Documentation/driver-api/firmware/other_interfaces.rst index a4ac54b5fd79..b81794e0cfbb 100644 --- a/Documentation/driver-api/firmware/other_interfaces.rst +++ b/Documentation/driver-api/firmware/other_interfaces.rst @@ -33,7 +33,7 @@ of the requests on to a secure monitor (EL3). :functions: stratix10_svc_client_msg .. kernel-doc:: include/linux/firmware/intel/stratix10-svc-client.h - :functions: stratix10_svc_command_reconfig_payload + :functions: stratix10_svc_command_config_type .. 
kernel-doc:: include/linux/firmware/intel/stratix10-svc-client.h :functions: stratix10_svc_cb_data diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index b37f3f7b8926..ce91518bf9f4 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -101,7 +101,7 @@ with the help of _DSD (Device Specific Data), introduced in ACPI 5.1:: } For more information about the ACPI GPIO bindings see -Documentation/acpi/gpio-properties.txt. +Documentation/firmware-guide/acpi/gpio-properties.rst. Platform Data ------------- diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst index 9559aa3cbcef..423492d125b9 100644 --- a/Documentation/driver-api/gpio/consumer.rst +++ b/Documentation/driver-api/gpio/consumer.rst @@ -435,7 +435,7 @@ case, it will be handled by the GPIO subsystem automatically. However, if the _DSD is not present, the mappings between GpioIo()/GpioInt() resources and GPIO connection IDs need to be provided by device drivers. -For details refer to Documentation/acpi/gpio-properties.txt +For details refer to Documentation/firmware-guide/acpi/gpio-properties.rst Interacting With the Legacy GPIO Subsystem diff --git a/Documentation/driver-api/iio/hw-consumer.rst b/Documentation/driver-api/iio/hw-consumer.rst index e0fe0b98230e..819fb9edc005 100644 --- a/Documentation/driver-api/iio/hw-consumer.rst +++ b/Documentation/driver-api/iio/hw-consumer.rst @@ -45,7 +45,6 @@ A typical IIO HW consumer setup looks like this:: More details ============ -.. kernel-doc:: include/linux/iio/hw-consumer.h .. 
kernel-doc:: drivers/iio/buffer/industrialio-hw-consumer.c :export: diff --git a/Documentation/pps/pps.txt b/Documentation/driver-api/pps.rst index 99f5d8c4c652..1456d2c32ebd 100644 --- a/Documentation/pps/pps.txt +++ b/Documentation/driver-api/pps.rst @@ -1,8 +1,10 @@ +:orphan: - PPS - Pulse Per Second - ---------------------- +====================== +PPS - Pulse Per Second +====================== -(C) Copyright 2007 Rodolfo Giometti <giometti@enneenne.com> +Copyright (C) 2007 Rodolfo Giometti <giometti@enneenne.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -88,7 +90,7 @@ Coding example -------------- To register a PPS source into the kernel you should define a struct -pps_source_info as follows: +pps_source_info as follows:: static struct pps_source_info pps_ktimer_info = { .name = "ktimer", @@ -101,12 +103,12 @@ pps_source_info as follows: }; and then calling the function pps_register_source() in your -initialization routine as follows: +initialization routine as follows:: source = pps_register_source(&pps_ktimer_info, PPS_CAPTUREASSERT | PPS_OFFSETASSERT); -The pps_register_source() prototype is: +The pps_register_source() prototype is:: int pps_register_source(struct pps_source_info *info, int default_params) @@ -118,7 +120,7 @@ pps_source_info which describe the capabilities of the driver). Once you have registered a new PPS source into the system you can signal an assert event (for example in the interrupt handler routine) -just using: +just using:: pps_event(source, &ts, PPS_CAPTUREASSERT, ptr) @@ -134,13 +136,13 @@ Please see the file drivers/pps/clients/pps-ktimer.c for example code. 
SYSFS support ------------- -If the SYSFS filesystem is enabled in the kernel it provides a new class: +If the SYSFS filesystem is enabled in the kernel it provides a new class:: $ ls /sys/class/pps/ pps0/ pps1/ pps2/ Every directory is the ID of a PPS sources defined in the system and -inside you find several files: +inside you find several files:: $ ls -F /sys/class/pps/pps0/ assert dev mode path subsystem@ @@ -148,7 +150,7 @@ inside you find several files: Inside each "assert" and "clear" file you can find the timestamp and a -sequence number: +sequence number:: $ cat /sys/class/pps/pps0/assert 1170026870.983207967#8 @@ -175,11 +177,11 @@ and the userland tools available in your distribution's pps-tools package, http://linuxpps.org , or https://github.com/redlab-i/pps-tools. Once you have enabled the compilation of pps-ktimer just modprobe it (if -not statically compiled): +not statically compiled):: # modprobe pps-ktimer -and the run ppstest as follow: +and then run ppstest as follows:: $ ./ppstest /dev/pps1 trying PPS source "/dev/pps1" @@ -204,26 +206,27 @@ nor affordable. The cheap way is to load a PPS generator on one of the computers (master) and PPS clients on others (slaves), and use very simple cables to deliver signals using parallel ports, for example.
-Parallel port cable pinout: -pin name master slave -1 STROBE *------ * -2 D0 * | * -3 D1 * | * -4 D2 * | * -5 D3 * | * -6 D4 * | * -7 D5 * | * -8 D6 * | * -9 D7 * | * -10 ACK * ------* -11 BUSY * * -12 PE * * -13 SEL * * -14 AUTOFD * * -15 ERROR * * -16 INIT * * -17 SELIN * * -18-25 GND *-----------* +Parallel port cable pinout:: + + pin name master slave + 1 STROBE *------ * + 2 D0 * | * + 3 D1 * | * + 4 D2 * | * + 5 D3 * | * + 6 D4 * | * + 7 D5 * | * + 8 D6 * | * + 9 D7 * | * + 10 ACK * ------* + 11 BUSY * * + 12 PE * * + 13 SEL * * + 14 AUTOFD * * + 15 ERROR * * + 16 INIT * * + 17 SELIN * * + 18-25 GND *-----------* Please note that parallel port interrupt occurs only on high->low transition, so it is used for PPS assert edge. PPS clear edge can be determined only diff --git a/Documentation/ptp/ptp.txt b/Documentation/driver-api/ptp.rst index 11e904ee073f..b6e65d66d37a 100644 --- a/Documentation/ptp/ptp.txt +++ b/Documentation/driver-api/ptp.rst @@ -1,5 +1,8 @@ +:orphan: -* PTP hardware clock infrastructure for Linux +=========================================== +PTP hardware clock infrastructure for Linux +=========================================== This patch set introduces support for IEEE 1588 PTP clocks in Linux. Together with the SO_TIMESTAMPING socket options, this @@ -22,7 +25,8 @@ - Period output signals configurable from user space - Synchronization of the Linux system time via the PPS subsystem -** PTP hardware clock kernel API +PTP hardware clock kernel API +============================= A PTP clock driver registers itself with the class driver. The class driver handles all of the dealings with user space. The @@ -36,7 +40,8 @@ development, it can be useful to have more than one clock in a single system, in order to allow performance comparisons. -** PTP hardware clock user space API +PTP hardware clock user space API +================================= The class driver also creates a character device for each registered clock. 
User space can use an open file descriptor from @@ -49,7 +54,8 @@ ancillary clock features. User space can receive time stamped events via blocking read() and poll(). -** Writing clock drivers +Writing clock drivers +===================== Clock drivers include include/linux/ptp_clock_kernel.h and register themselves by presenting a 'struct ptp_clock_info' to the @@ -66,14 +72,17 @@ class driver, since the lock may also be needed by the clock driver's interrupt service routine. -** Supported hardware +Supported hardware +================== + + * Freescale eTSEC gianfar - + Freescale eTSEC gianfar - 2 Time stamp external triggers, programmable polarity (opt. interrupt) - 2 Alarm registers (optional interrupt) - 3 Periodic signals (optional interrupt) - + National DP83640 + * National DP83640 + - 6 GPIOs programmable as inputs or outputs - 6 GPIOs with dedicated functions (LED/JTAG/clock) can also be used as general inputs or outputs @@ -81,6 +90,7 @@ - GPIO outputs can produce periodic signals - 1 interrupt pin - + Intel IXP465 + * Intel IXP465 + - Auxiliary Slave/Master Mode Snapshot (optional interrupt) - Target Time (optional interrupt) diff --git a/Documentation/driver-api/target.rst b/Documentation/driver-api/target.rst index 4363611dd86d..620ec6173a93 100644 --- a/Documentation/driver-api/target.rst +++ b/Documentation/driver-api/target.rst @@ -10,8 +10,8 @@ TBD Target core device interfaces ============================= -.. kernel-doc:: drivers/target/target_core_device.c - :export: +This section is blank because no kerneldoc comments have been added to +drivers/target/target_core_device.c. 
Target core transport interfaces ================================ diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.rst index a17517a083c3..f51bb21d20e4 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.rst @@ -1,3 +1,4 @@ +=========================================== Fault injection capabilities infrastructure =========================================== @@ -7,36 +8,36 @@ See also drivers/md/md-faulty.c and "every_nth" module option for scsi_debug. Available fault injection capabilities -------------------------------------- -o failslab +- failslab injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), ...) -o fail_page_alloc +- fail_page_alloc injects page allocation failures. (alloc_pages(), get_free_pages(), ...) -o fail_futex +- fail_futex injects futex deadlock and uaddr fault errors. -o fail_make_request +- fail_make_request injects disk IO errors on devices permitted by setting /sys/block/<device>/make-it-fail or /sys/block/<device>/<partition>/make-it-fail. (generic_make_request()) -o fail_mmc_request +- fail_mmc_request injects MMC data errors on devices permitted by setting debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request -o fail_function +- fail_function injects error return on specific functions, which are marked by ALLOW_ERROR_INJECTION() macro, by setting debugfs entries under /sys/kernel/debug/fail_function. No boot option supported. -o NVMe fault injection +- NVMe fault injection inject NVMe status code and retry flag on devices permitted by setting debugfs entries under /sys/kernel/debug/nvme*/fault_inject. 
The default @@ -47,7 +48,8 @@ o NVMe fault injection Configure fault-injection capabilities behavior ----------------------------------------------- -o debugfs entries +debugfs entries +^^^^^^^^^^^^^^^ fault-inject-debugfs kernel module provides some debugfs entries for runtime configuration of fault-injection capabilities. @@ -55,6 +57,7 @@ configuration of fault-injection capabilities. - /sys/kernel/debug/fail*/probability: likelihood of failure injection, in percent. + Format: <percent> Note that one-failure-per-hundred is a very high error rate @@ -83,6 +86,7 @@ configuration of fault-injection capabilities. - /sys/kernel/debug/fail*/verbose Format: { 0 | 1 | 2 } + specifies the verbosity of the messages when failure is injected. '0' means no messages; '1' will print only a single log line per failure; '2' will print a call trace too -- useful @@ -91,14 +95,15 @@ configuration of fault-injection capabilities. - /sys/kernel/debug/fail*/task-filter: Format: { 'Y' | 'N' } + A value of 'N' disables filtering by process (default). Any positive value limits failures to only processes indicated by /proc/<pid>/make-it-fail==1. -- /sys/kernel/debug/fail*/require-start: -- /sys/kernel/debug/fail*/require-end: -- /sys/kernel/debug/fail*/reject-start: -- /sys/kernel/debug/fail*/reject-end: +- /sys/kernel/debug/fail*/require-start, + /sys/kernel/debug/fail*/require-end, + /sys/kernel/debug/fail*/reject-start, + /sys/kernel/debug/fail*/reject-end: specifies the range of virtual addresses tested during stacktrace walking. Failure is injected only if some caller @@ -116,6 +121,7 @@ configuration of fault-injection capabilities. - /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem: Format: { 'Y' | 'N' } + default is 'N', setting it to 'Y' won't inject failures into highmem/user allocations. @@ -123,6 +129,7 @@ configuration of fault-injection capabilities. 
- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait: Format: { 'Y' | 'N' } + default is 'N', setting it to 'Y' will inject failures only into non-sleep allocations (GFP_ATOMIC allocations). @@ -134,12 +141,14 @@ configuration of fault-injection capabilities. - /sys/kernel/debug/fail_futex/ignore-private: Format: { 'Y' | 'N' } + default is 'N', setting it to 'Y' will disable failure injections when dealing with private (address space) futexes. - /sys/kernel/debug/fail_function/inject: Format: { 'function-name' | '!function-name' | '' } + specifies the target function of error injection by name. If the function name leads '!' prefix, given function is removed from injection list. If nothing specified ('') @@ -160,10 +169,11 @@ configuration of fault-injection capabilities. function for given function. This will be created when user specifies new injection entry. -o Boot option +Boot option +^^^^^^^^^^^ In order to inject faults while debugfs is not available (early boot time), -use the boot option: +use the boot option:: failslab= fail_page_alloc= @@ -171,10 +181,11 @@ use the boot option: fail_futex= mmc_core.fail_request=<interval>,<probability>,<space>,<times> -o proc entries +proc entries +^^^^^^^^^^^^ -- /proc/<pid>/fail-nth: -- /proc/self/task/<tid>/fail-nth: +- /proc/<pid>/fail-nth, + /proc/self/task/<tid>/fail-nth: Write to this file of integer N makes N-th call in the task fail. Read from this file returns a integer value. A value of '0' indicates @@ -191,16 +202,16 @@ o proc entries How to add new fault injection capability ----------------------------------------- -o #include <linux/fault-inject.h> +- #include <linux/fault-inject.h> -o define the fault attributes +- define the fault attributes DECLARE_FAULT_ATTR(name); Please see the definition of struct fault_attr in fault-inject.h for details. 
-o provide a way to configure fault attributes +- provide a way to configure fault attributes - boot option @@ -222,126 +233,126 @@ o provide a way to configure fault attributes single kernel module, it is better to provide module parameters to configure the fault attributes. -o add a hook to insert failures +- add a hook to insert failures - Upon should_fail() returning true, client code should inject a failure. + Upon should_fail() returning true, client code should inject a failure: should_fail(attr, size); Application Examples -------------------- -o Inject slab allocation failures into module init/exit code +- Inject slab allocation failures into module init/exit code:: -#!/bin/bash + #!/bin/bash -FAILTYPE=failslab -echo Y > /sys/kernel/debug/$FAILTYPE/task-filter -echo 10 > /sys/kernel/debug/$FAILTYPE/probability -echo 100 > /sys/kernel/debug/$FAILTYPE/interval -echo -1 > /sys/kernel/debug/$FAILTYPE/times -echo 0 > /sys/kernel/debug/$FAILTYPE/space -echo 2 > /sys/kernel/debug/$FAILTYPE/verbose -echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait + FAILTYPE=failslab + echo Y > /sys/kernel/debug/$FAILTYPE/task-filter + echo 10 > /sys/kernel/debug/$FAILTYPE/probability + echo 100 > /sys/kernel/debug/$FAILTYPE/interval + echo -1 > /sys/kernel/debug/$FAILTYPE/times + echo 0 > /sys/kernel/debug/$FAILTYPE/space + echo 2 > /sys/kernel/debug/$FAILTYPE/verbose + echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait -faulty_system() -{ + faulty_system() + { bash -c "echo 1 > /proc/self/make-it-fail && exec $*" -} + } -if [ $# -eq 0 ] -then + if [ $# -eq 0 ] + then echo "Usage: $0 modulename [ modulename ... ]" exit 1 -fi + fi -for m in $* -do + for m in $* + do echo inserting $m... faulty_system modprobe $m echo removing $m... 
faulty_system modprobe -r $m -done + done ------------------------------------------------------------------------------ -o Inject page allocation failures only for a specific module +- Inject page allocation failures only for a specific module:: -#!/bin/bash + #!/bin/bash -FAILTYPE=fail_page_alloc -module=$1 + FAILTYPE=fail_page_alloc + module=$1 -if [ -z $module ] -then + if [ -z $module ] + then echo "Usage: $0 <modulename>" exit 1 -fi + fi -modprobe $module + modprobe $module -if [ ! -d /sys/module/$module/sections ] -then + if [ ! -d /sys/module/$module/sections ] + then echo Module $module is not loaded exit 1 -fi + fi -cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start -cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end + cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start + cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end -echo N > /sys/kernel/debug/$FAILTYPE/task-filter -echo 10 > /sys/kernel/debug/$FAILTYPE/probability -echo 100 > /sys/kernel/debug/$FAILTYPE/interval -echo -1 > /sys/kernel/debug/$FAILTYPE/times -echo 0 > /sys/kernel/debug/$FAILTYPE/space -echo 2 > /sys/kernel/debug/$FAILTYPE/verbose -echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait -echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem -echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth + echo N > /sys/kernel/debug/$FAILTYPE/task-filter + echo 10 > /sys/kernel/debug/$FAILTYPE/probability + echo 100 > /sys/kernel/debug/$FAILTYPE/interval + echo -1 > /sys/kernel/debug/$FAILTYPE/times + echo 0 > /sys/kernel/debug/$FAILTYPE/space + echo 2 > /sys/kernel/debug/$FAILTYPE/verbose + echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait + echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem + echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth -trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT + trap "echo 0 > 
/sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT -echo "Injecting errors into the module $module... (interrupt to stop)" -sleep 1000000 + echo "Injecting errors into the module $module... (interrupt to stop)" + sleep 1000000 ------------------------------------------------------------------------------ -o Inject open_ctree error while btrfs mount - -#!/bin/bash - -rm -f testfile.img -dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 -DEVICE=$(losetup --show -f testfile.img) -mkfs.btrfs -f $DEVICE -mkdir -p tmpmnt - -FAILTYPE=fail_function -FAILFUNC=open_ctree -echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject -echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval -echo N > /sys/kernel/debug/$FAILTYPE/task-filter -echo 100 > /sys/kernel/debug/$FAILTYPE/probability -echo 0 > /sys/kernel/debug/$FAILTYPE/interval -echo -1 > /sys/kernel/debug/$FAILTYPE/times -echo 0 > /sys/kernel/debug/$FAILTYPE/space -echo 1 > /sys/kernel/debug/$FAILTYPE/verbose - -mount -t btrfs $DEVICE tmpmnt -if [ $? -ne 0 ] -then +- Inject open_ctree error while btrfs mount:: + + #!/bin/bash + + rm -f testfile.img + dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 + DEVICE=$(losetup --show -f testfile.img) + mkfs.btrfs -f $DEVICE + mkdir -p tmpmnt + + FAILTYPE=fail_function + FAILFUNC=open_ctree + echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject + echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval + echo N > /sys/kernel/debug/$FAILTYPE/task-filter + echo 100 > /sys/kernel/debug/$FAILTYPE/probability + echo 0 > /sys/kernel/debug/$FAILTYPE/interval + echo -1 > /sys/kernel/debug/$FAILTYPE/times + echo 0 > /sys/kernel/debug/$FAILTYPE/space + echo 1 > /sys/kernel/debug/$FAILTYPE/verbose + + mount -t btrfs $DEVICE tmpmnt + if [ $? -ne 0 ] + then echo "SUCCESS!" -else + else echo "FAILED!" 
umount tmpmnt -fi + fi -echo > /sys/kernel/debug/$FAILTYPE/inject + echo > /sys/kernel/debug/$FAILTYPE/inject -rmdir tmpmnt -losetup -d $DEVICE -rm testfile.img + rmdir tmpmnt + losetup -d $DEVICE + rm testfile.img Tool to run command with failslab or fail_page_alloc @@ -354,43 +365,43 @@ see the following examples. Examples: Run a command "make -C tools/testing/selftests/ run_tests" with injecting slab -allocation failure. +allocation failure:: # ./tools/testing/fault-injection/failcmd.sh \ -- make -C tools/testing/selftests/ run_tests Same as above except to specify 100 times failures at most instead of one time -at most by default. +at most by default:: # ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests Same as above except to inject page allocation failure instead of slab -allocation failure. +allocation failure:: # env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ - -- make -C tools/testing/selftests/ run_tests + -- make -C tools/testing/selftests/ run_tests Systematic faults using fail-nth --------------------------------- The following code systematically faults 0-th, 1-st, 2-nd and so on -capabilities in the socketpair() system call. 
- -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/socket.h> -#include <sys/syscall.h> -#include <fcntl.h> -#include <unistd.h> -#include <string.h> -#include <stdlib.h> -#include <stdio.h> -#include <errno.h> - -int main() -{ +capabilities in the socketpair() system call:: + + #include <sys/types.h> + #include <sys/stat.h> + #include <sys/socket.h> + #include <sys/syscall.h> + #include <fcntl.h> + #include <unistd.h> + #include <string.h> + #include <stdlib.h> + #include <stdio.h> + #include <errno.h> + + int main() + { int i, err, res, fail_nth, fds[2]; char buf[128]; @@ -413,23 +424,23 @@ int main() break; } return 0; -} - -An example output: - -1-th fault Y: res=-1/23 -2-th fault Y: res=-1/23 -3-th fault Y: res=-1/12 -4-th fault Y: res=-1/12 -5-th fault Y: res=-1/23 -6-th fault Y: res=-1/23 -7-th fault Y: res=-1/23 -8-th fault Y: res=-1/12 -9-th fault Y: res=-1/12 -10-th fault Y: res=-1/12 -11-th fault Y: res=-1/12 -12-th fault Y: res=-1/12 -13-th fault Y: res=-1/12 -14-th fault Y: res=-1/12 -15-th fault Y: res=-1/12 -16-th fault N: res=0/12 + } + +An example output:: + + 1-th fault Y: res=-1/23 + 2-th fault Y: res=-1/23 + 3-th fault Y: res=-1/12 + 4-th fault Y: res=-1/12 + 5-th fault Y: res=-1/23 + 6-th fault Y: res=-1/23 + 7-th fault Y: res=-1/23 + 8-th fault Y: res=-1/12 + 9-th fault Y: res=-1/12 + 10-th fault Y: res=-1/12 + 11-th fault Y: res=-1/12 + 12-th fault Y: res=-1/12 + 13-th fault Y: res=-1/12 + 14-th fault Y: res=-1/12 + 15-th fault Y: res=-1/12 + 16-th fault N: res=0/12 diff --git a/Documentation/fault-injection/index.rst b/Documentation/fault-injection/index.rst new file mode 100644 index 000000000000..92b5639ed07a --- /dev/null +++ b/Documentation/fault-injection/index.rst @@ -0,0 +1,20 @@ +:orphan: + +=============== +fault-injection +=============== + +.. toctree:: + :maxdepth: 1 + + fault-injection + notifier-error-inject + nvme-fault-injection + provoke-crashes + +.. 
only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/fault-injection/notifier-error-inject.txt b/Documentation/fault-injection/notifier-error-inject.rst index e861d761de24..1668b6e48d3a 100644 --- a/Documentation/fault-injection/notifier-error-inject.txt +++ b/Documentation/fault-injection/notifier-error-inject.rst @@ -14,7 +14,8 @@ modules that can be used to test the following notifiers. PM notifier error injection module ---------------------------------- This feature is controlled through debugfs interface -/sys/kernel/debug/notifier-error-inject/pm/actions/<notifier event>/error + + /sys/kernel/debug/notifier-error-inject/pm/actions/<notifier event>/error Possible PM notifier events to be failed are: @@ -22,7 +23,7 @@ Possible PM notifier events to be failed are: * PM_SUSPEND_PREPARE * PM_RESTORE_PREPARE -Example: Inject PM suspend error (-12 = -ENOMEM) +Example: Inject PM suspend error (-12 = -ENOMEM):: # cd /sys/kernel/debug/notifier-error-inject/pm/ # echo -12 > actions/PM_SUSPEND_PREPARE/error @@ -32,14 +33,15 @@ Example: Inject PM suspend error (-12 = -ENOMEM) Memory hotplug notifier error injection module ---------------------------------------------- This feature is controlled through debugfs interface -/sys/kernel/debug/notifier-error-inject/memory/actions/<notifier event>/error + + /sys/kernel/debug/notifier-error-inject/memory/actions/<notifier event>/error Possible memory notifier events to be failed are: * MEM_GOING_ONLINE * MEM_GOING_OFFLINE -Example: Inject memory hotplug offline error (-12 == -ENOMEM) +Example: Inject memory hotplug offline error (-12 == -ENOMEM):: # cd /sys/kernel/debug/notifier-error-inject/memory # echo -12 > actions/MEM_GOING_OFFLINE/error @@ -49,7 +51,8 @@ Example: Inject memory hotplug offline error (-12 == -ENOMEM) powerpc pSeries reconfig notifier error injection module -------------------------------------------------------- This feature is controlled through debugfs 
interface -/sys/kernel/debug/notifier-error-inject/pSeries-reconfig/actions/<notifier event>/error + + /sys/kernel/debug/notifier-error-inject/pSeries-reconfig/actions/<notifier event>/error Possible pSeries reconfig notifier events to be failed are: @@ -61,7 +64,8 @@ Possible pSeries reconfig notifier events to be failed are: Netdevice notifier error injection module ---------------------------------------------- This feature is controlled through debugfs interface -/sys/kernel/debug/notifier-error-inject/netdev/actions/<notifier event>/error + + /sys/kernel/debug/notifier-error-inject/netdev/actions/<notifier event>/error Netdevice notifier events which can be failed are: @@ -75,7 +79,7 @@ Netdevice notifier events which can be failed are: * NETDEV_PRECHANGEUPPER * NETDEV_CHANGEUPPER -Example: Inject netdevice mtu change error (-22 == -EINVAL) +Example: Inject netdevice mtu change error (-22 == -EINVAL):: # cd /sys/kernel/debug/notifier-error-inject/netdev # echo -22 > actions/NETDEV_CHANGEMTU/error diff --git a/Documentation/fault-injection/nvme-fault-injection.rst b/Documentation/fault-injection/nvme-fault-injection.rst new file mode 100644 index 000000000000..cdb2e829228e --- /dev/null +++ b/Documentation/fault-injection/nvme-fault-injection.rst @@ -0,0 +1,178 @@ +NVMe Fault Injection +==================== +Linux's fault injection framework provides a systematic way to support +error injection via debugfs in the /sys/kernel/debug directory. When +enabled, the default NVME_SC_INVALID_OPCODE with no retry will be +injected into the nvme_end_request. Users can change the default status +code and no retry flag via the debugfs. The list of Generic Command +Status can be found in include/linux/nvme.h + +Following examples show how to inject an error into the nvme. + +First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config, +recompile the kernel. After booting up the kernel, do the +following. 
+ +Example 1: Inject default status code with no retry +--------------------------------------------------- + +:: + + mount /dev/nvme0n1 /mnt + echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times + echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability + cp a.file /mnt + +Expected Result:: + + cp: cannot stat ‘/mnt/a.file’: Input/output error + +Message from dmesg:: + + FAULT_INJECTION: forcing a failure. + name fault_inject, interval 1, probability 100, space 0, times 1 + CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2 + Hardware name: innotek GmbH VirtualBox/VirtualBox, + BIOS VirtualBox 12/01/2006 + Call Trace: + <IRQ> + dump_stack+0x5c/0x7d + should_fail+0x148/0x170 + nvme_should_fail+0x2f/0x50 [nvme_core] + nvme_process_cq+0xe7/0x1d0 [nvme] + nvme_irq+0x1e/0x40 [nvme] + __handle_irq_event_percpu+0x3a/0x190 + handle_irq_event_percpu+0x30/0x70 + handle_irq_event+0x36/0x60 + handle_fasteoi_irq+0x78/0x120 + handle_irq+0xa7/0x130 + ? tick_irq_enter+0xa8/0xc0 + do_IRQ+0x43/0xc0 + common_interrupt+0xa2/0xa2 + </IRQ> + RIP: 0010:native_safe_halt+0x2/0x10 + RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd + RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 + RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480 + R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000 + ? __sched_text_end+0x4/0x4 + default_idle+0x18/0xf0 + do_idle+0x150/0x1d0 + cpu_startup_entry+0x6f/0x80 + start_kernel+0x4c4/0x4e4 + ? 
set_init_arg+0x55/0x55 + secondary_startup_64+0xa5/0xb0 + print_req_error: I/O error, dev nvme0n1, sector 9240 + EXT4-fs error (device nvme0n1): ext4_find_entry:1436: + inode #2: comm cp: reading directory lblock 0 + +Example 2: Inject default status code with retry +------------------------------------------------ + +:: + + mount /dev/nvme0n1 /mnt + echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times + echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability + echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status + echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry + + cp a.file /mnt + +Expected Result:: + + command success without error + +Message from dmesg:: + + FAULT_INJECTION: forcing a failure. + name fault_inject, interval 1, probability 100, space 0, times 1 + CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4 + Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 + Call Trace: + <IRQ> + dump_stack+0x5c/0x7d + should_fail+0x148/0x170 + nvme_should_fail+0x30/0x60 [nvme_core] + nvme_loop_queue_response+0x84/0x110 [nvme_loop] + nvmet_req_complete+0x11/0x40 [nvmet] + nvmet_bio_done+0x28/0x40 [nvmet] + blk_update_request+0xb0/0x310 + blk_mq_end_request+0x18/0x60 + flush_smp_call_function_queue+0x3d/0xf0 + smp_call_function_single_interrupt+0x2c/0xc0 + call_function_single_interrupt+0xa2/0xb0 + </IRQ> + RIP: 0010:native_safe_halt+0x2/0x10 + RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04 + RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 + RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680 + R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000 + ? 
__sched_text_end+0x4/0x4 + default_idle+0x18/0xf0 + do_idle+0x150/0x1d0 + cpu_startup_entry+0x6f/0x80 + start_secondary+0x187/0x1e0 + secondary_startup_64+0xa5/0xb0 + +Example 3: Inject an error into the 10th admin command +------------------------------------------------------ + +:: + + echo 100 > /sys/kernel/debug/nvme0/fault_inject/probability + echo 10 > /sys/kernel/debug/nvme0/fault_inject/space + echo 1 > /sys/kernel/debug/nvme0/fault_inject/times + nvme reset /dev/nvme0 + +Expected Result:: + + After NVMe controller reset, the reinitialization may or may not succeed. + It depends on which admin command is actually forced to fail. + +Message from dmesg:: + + nvme nvme0: resetting controller + FAULT_INJECTION: forcing a failure. + name fault_inject, interval 1, probability 100, space 1, times 1 + CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0-rc2+ #2 + Hardware name: MSI MS-7A45/B150M MORTAR ARCTIC (MS-7A45), BIOS 1.50 04/25/2017 + Call Trace: + <IRQ> + dump_stack+0x63/0x85 + should_fail+0x14a/0x170 + nvme_should_fail+0x38/0x80 [nvme_core] + nvme_irq+0x129/0x280 [nvme] + ? 
blk_mq_end_request+0xb3/0x120 + __handle_irq_event_percpu+0x84/0x1a0 + handle_irq_event_percpu+0x32/0x80 + handle_irq_event+0x3b/0x60 + handle_edge_irq+0x7f/0x1a0 + handle_irq+0x20/0x30 + do_IRQ+0x4e/0xe0 + common_interrupt+0xf/0xf + </IRQ> + RIP: 0010:cpuidle_enter_state+0xc5/0x460 + Code: ff e8 8f 5f 86 ff 80 7d c7 00 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 69 03 00 00 31 ff e8 62 aa 8c ff fb 66 0f 1f 44 00 00 <45> 85 ed 0f 88 37 03 00 00 4c 8b 45 d0 4c 2b 45 b8 48 ba cf f7 53 + RSP: 0018:ffffffff88c03dd0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdc + RAX: ffff9dac25a2ac80 RBX: ffffffff88d53760 RCX: 000000000000001f + RDX: 0000000000000000 RSI: 000000002d958403 RDI: 0000000000000000 + RBP: ffffffff88c03e18 R08: fffffff75e35ffb7 R09: 00000a49a56c0b48 + R10: ffffffff88c03da0 R11: 0000000000001b0c R12: ffff9dac25a34d00 + R13: 0000000000000006 R14: 0000000000000006 R15: ffffffff88d53760 + cpuidle_enter+0x2e/0x40 + call_cpuidle+0x23/0x40 + do_idle+0x201/0x280 + cpu_startup_entry+0x1d/0x20 + rest_init+0xaa/0xb0 + arch_call_rest_init+0xe/0x1b + start_kernel+0x51c/0x53b + x86_64_start_reservations+0x24/0x26 + x86_64_start_kernel+0x74/0x77 + secondary_startup_64+0xa4/0xb0 + nvme nvme0: Could not set queue count (16385) + nvme nvme0: IO queues not created diff --git a/Documentation/fault-injection/nvme-fault-injection.txt b/Documentation/fault-injection/nvme-fault-injection.txt deleted file mode 100644 index efcb339a3add..000000000000 --- a/Documentation/fault-injection/nvme-fault-injection.txt +++ /dev/null @@ -1,172 +0,0 @@ -NVMe Fault Injection -==================== -Linux's fault injection framework provides a systematic way to support -error injection via debugfs in the /sys/kernel/debug directory. When -enabled, the default NVME_SC_INVALID_OPCODE with no retry will be -injected into the nvme_end_request. Users can change the default status -code and no retry flag via the debugfs. 
The list of Generic Command -Status can be found in include/linux/nvme.h - -Following examples show how to inject an error into the nvme. - -First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config, -recompile the kernel. After booting up the kernel, do the -following. - -Example 1: Inject default status code with no retry ---------------------------------------------------- - -mount /dev/nvme0n1 /mnt -echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times -echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability -cp a.file /mnt - -Expected Result: - -cp: cannot stat ‘/mnt/a.file’: Input/output error - -Message from dmesg: - -FAULT_INJECTION: forcing a failure. -name fault_inject, interval 1, probability 100, space 0, times 1 -CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2 -Hardware name: innotek GmbH VirtualBox/VirtualBox, -BIOS VirtualBox 12/01/2006 -Call Trace: - <IRQ> - dump_stack+0x5c/0x7d - should_fail+0x148/0x170 - nvme_should_fail+0x2f/0x50 [nvme_core] - nvme_process_cq+0xe7/0x1d0 [nvme] - nvme_irq+0x1e/0x40 [nvme] - __handle_irq_event_percpu+0x3a/0x190 - handle_irq_event_percpu+0x30/0x70 - handle_irq_event+0x36/0x60 - handle_fasteoi_irq+0x78/0x120 - handle_irq+0xa7/0x130 - ? tick_irq_enter+0xa8/0xc0 - do_IRQ+0x43/0xc0 - common_interrupt+0xa2/0xa2 - </IRQ> -RIP: 0010:native_safe_halt+0x2/0x10 -RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd -RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000 -RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 -RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000 -R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480 -R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000 - ? __sched_text_end+0x4/0x4 - default_idle+0x18/0xf0 - do_idle+0x150/0x1d0 - cpu_startup_entry+0x6f/0x80 - start_kernel+0x4c4/0x4e4 - ? 
set_init_arg+0x55/0x55 - secondary_startup_64+0xa5/0xb0 - print_req_error: I/O error, dev nvme0n1, sector 9240 -EXT4-fs error (device nvme0n1): ext4_find_entry:1436: -inode #2: comm cp: reading directory lblock 0 - -Example 2: Inject default status code with retry ------------------------------------------------- - -mount /dev/nvme0n1 /mnt -echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times -echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability -echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status -echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry - -cp a.file /mnt - -Expected Result: - -command success without error - -Message from dmesg: - -FAULT_INJECTION: forcing a failure. -name fault_inject, interval 1, probability 100, space 0, times 1 -CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4 -Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 -Call Trace: - <IRQ> - dump_stack+0x5c/0x7d - should_fail+0x148/0x170 - nvme_should_fail+0x30/0x60 [nvme_core] - nvme_loop_queue_response+0x84/0x110 [nvme_loop] - nvmet_req_complete+0x11/0x40 [nvmet] - nvmet_bio_done+0x28/0x40 [nvmet] - blk_update_request+0xb0/0x310 - blk_mq_end_request+0x18/0x60 - flush_smp_call_function_queue+0x3d/0xf0 - smp_call_function_single_interrupt+0x2c/0xc0 - call_function_single_interrupt+0xa2/0xb0 - </IRQ> -RIP: 0010:native_safe_halt+0x2/0x10 -RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04 -RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000 -RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 -RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000 -R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680 -R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000 - ? 
__sched_text_end+0x4/0x4 - default_idle+0x18/0xf0 - do_idle+0x150/0x1d0 - cpu_startup_entry+0x6f/0x80 - start_secondary+0x187/0x1e0 - secondary_startup_64+0xa5/0xb0 - -Example 3: Inject an error into the 10th admin command ------------------------------------------------------- - -echo 100 > /sys/kernel/debug/nvme0/fault_inject/probability -echo 10 > /sys/kernel/debug/nvme0/fault_inject/space -echo 1 > /sys/kernel/debug/nvme0/fault_inject/times -nvme reset /dev/nvme0 - -Expected Result: - -After NVMe controller reset, the reinitialization may or may not succeed. -It depends on which admin command is actually forced to fail. - -Message from dmesg: - -nvme nvme0: resetting controller -FAULT_INJECTION: forcing a failure. -name fault_inject, interval 1, probability 100, space 1, times 1 -CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0-rc2+ #2 -Hardware name: MSI MS-7A45/B150M MORTAR ARCTIC (MS-7A45), BIOS 1.50 04/25/2017 -Call Trace: - <IRQ> - dump_stack+0x63/0x85 - should_fail+0x14a/0x170 - nvme_should_fail+0x38/0x80 [nvme_core] - nvme_irq+0x129/0x280 [nvme] - ? 
blk_mq_end_request+0xb3/0x120 - __handle_irq_event_percpu+0x84/0x1a0 - handle_irq_event_percpu+0x32/0x80 - handle_irq_event+0x3b/0x60 - handle_edge_irq+0x7f/0x1a0 - handle_irq+0x20/0x30 - do_IRQ+0x4e/0xe0 - common_interrupt+0xf/0xf - </IRQ> -RIP: 0010:cpuidle_enter_state+0xc5/0x460 -Code: ff e8 8f 5f 86 ff 80 7d c7 00 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 69 03 00 00 31 ff e8 62 aa 8c ff fb 66 0f 1f 44 00 00 <45> 85 ed 0f 88 37 03 00 00 4c 8b 45 d0 4c 2b 45 b8 48 ba cf f7 53 -RSP: 0018:ffffffff88c03dd0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdc -RAX: ffff9dac25a2ac80 RBX: ffffffff88d53760 RCX: 000000000000001f -RDX: 0000000000000000 RSI: 000000002d958403 RDI: 0000000000000000 -RBP: ffffffff88c03e18 R08: fffffff75e35ffb7 R09: 00000a49a56c0b48 -R10: ffffffff88c03da0 R11: 0000000000001b0c R12: ffff9dac25a34d00 -R13: 0000000000000006 R14: 0000000000000006 R15: ffffffff88d53760 - cpuidle_enter+0x2e/0x40 - call_cpuidle+0x23/0x40 - do_idle+0x201/0x280 - cpu_startup_entry+0x1d/0x20 - rest_init+0xaa/0xb0 - arch_call_rest_init+0xe/0x1b - start_kernel+0x51c/0x53b - x86_64_start_reservations+0x24/0x26 - x86_64_start_kernel+0x74/0x77 - secondary_startup_64+0xa4/0xb0 -nvme nvme0: Could not set queue count (16385) -nvme nvme0: IO queues not created diff --git a/Documentation/fault-injection/provoke-crashes.rst b/Documentation/fault-injection/provoke-crashes.rst new file mode 100644 index 000000000000..9279a3e12278 --- /dev/null +++ b/Documentation/fault-injection/provoke-crashes.rst @@ -0,0 +1,48 @@ +=============== +Provoke crashes +=============== + +The lkdtm module provides an interface to crash or injure the kernel at +predefined crashpoints to evaluate the reliability of crash dumps obtained +using different dumping solutions. The module uses KPROBEs to instrument +crashing points, but can also crash the kernel directly without KPROBE +support. + + +You can provide the way either through module arguments when inserting +the module, or through a debugfs interface.
+ +Usage:: + + insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<> + [cpoint_count={>0}] + +recur_count + Recursion level for the stack overflow test. Default is 10. + +cpoint_name + Crash point where the kernel is to be crashed. It can be + one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY, + FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD, + IDE_CORE_CP, DIRECT + +cpoint_type + Indicates the action to be taken on hitting the crash point. + It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW, + CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION, + WRITE_AFTER_FREE, + +cpoint_count + Indicates the number of times the crash point is to be hit + to trigger an action. The default is 10. + +You can also induce failures by mounting debugfs and writing the type to +<mountpoint>/provoke-crash/<crashpoint>. E.g.:: + + mount -t debugfs debugfs /mnt + echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY + + +A special file is `DIRECT` which will induce the crash directly without +KPROBE instrumentation. This mode is the only one available when the module +is built on a kernel without KPROBEs support. diff --git a/Documentation/fault-injection/provoke-crashes.txt b/Documentation/fault-injection/provoke-crashes.txt deleted file mode 100644 index 7a9d3d81525b..000000000000 --- a/Documentation/fault-injection/provoke-crashes.txt +++ /dev/null @@ -1,38 +0,0 @@ -The lkdtm module provides an interface to crash or injure the kernel at -predefined crashpoints to evaluate the reliability of crash dumps obtained -using different dumping solutions. The module uses KPROBEs to instrument -crashing points, but can also crash the kernel directly without KRPOBE -support. - - -You can provide the way either through module arguments when inserting -the module, or through a debugfs interface. - -Usage: insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<> - [cpoint_count={>0}] - - recur_count : Recursion level for the stack overflow test. 
Default is 10. - - cpoint_name : Crash point where the kernel is to be crashed. It can be - one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY, - FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD, - IDE_CORE_CP, DIRECT - - cpoint_type : Indicates the action to be taken on hitting the crash point. - It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW, - CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION, - WRITE_AFTER_FREE, - - cpoint_count : Indicates the number of times the crash point is to be hit - to trigger an action. The default is 10. - -You can also induce failures by mounting debugfs and writing the type to -<mountpoint>/provoke-crash/<crashpoint>. E.g., - - mount -t debugfs debugfs /mnt - echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY - - -A special file is `DIRECT' which will induce the crash directly without -KPROBE instrumentation. This mode is the only one available when the module -is built on a kernel without KPROBEs support. diff --git a/Documentation/fb/api.txt b/Documentation/fb/api.rst index d52cf1e3b975..79ec33dded74 100644 --- a/Documentation/fb/api.txt +++ b/Documentation/fb/api.rst @@ -1,5 +1,6 @@ - The Frame Buffer Device API - --------------------------- +=========================== +The Frame Buffer Device API +=========================== Last revised: June 21, 2011 @@ -21,13 +22,13 @@ deal with different behaviours. --------------- Device and driver capabilities are reported in the fixed screen information -capabilities field. +capabilities field:: -struct fb_fix_screeninfo { + struct fb_fix_screeninfo { ... __u16 capabilities; /* see FB_CAP_* */ ... -}; + }; Application should use those capabilities to find out what features they can expect from the device and driver. @@ -151,9 +152,9 @@ fb_fix_screeninfo and fb_var_screeninfo structure respectively. struct fb_fix_screeninfo stores device independent unchangeable information about the frame buffer device and the current format. 
Those information can't be directly modified by applications, but can be changed by the driver when an -application modifies the format. +application modifies the format:: -struct fb_fix_screeninfo { + struct fb_fix_screeninfo { char id[16]; /* identification string eg "TT Builtin" */ unsigned long smem_start; /* Start of frame buffer mem */ /* (physical address) */ @@ -172,13 +173,13 @@ struct fb_fix_screeninfo { /* specific chip/card we have */ __u16 capabilities; /* see FB_CAP_* */ __u16 reserved[2]; /* Reserved for future compatibility */ -}; + }; struct fb_var_screeninfo stores device independent changeable information about a frame buffer device, its current format and video mode, as well as -other miscellaneous parameters. +other miscellaneous parameters:: -struct fb_var_screeninfo { + struct fb_var_screeninfo { __u32 xres; /* visible resolution */ __u32 yres; __u32 xres_virtual; /* virtual resolution */ @@ -216,7 +217,7 @@ struct fb_var_screeninfo { __u32 rotate; /* angle we rotate counter clockwise */ __u32 colorspace; /* colorspace for FOURCC-based modes */ __u32 reserved[4]; /* Reserved for future compatibility */ -}; + }; To modify variable information, applications call the FBIOPUT_VSCREENINFO ioctl with a pointer to a fb_var_screeninfo structure. If the call is @@ -255,14 +256,14 @@ monochrome, grayscale or pseudocolor visuals, although this is not required. - For truecolor and directcolor formats, applications set the grayscale field to zero, and the red, blue, green and transp fields to describe the layout of - color components in memory. + color components in memory:: -struct fb_bitfield { + struct fb_bitfield { __u32 offset; /* beginning of bitfield */ __u32 length; /* length of bitfield */ __u32 msb_right; /* != 0 : Most significant bit is */ /* right */ -}; + }; Pixel values are bits_per_pixel wide and are split in non-overlapping red, green, blue and alpha (transparency) components. 
Location and size of each diff --git a/Documentation/fb/arkfb.txt b/Documentation/fb/arkfb.rst index e8487a9d6a05..aeca8773dd7e 100644 --- a/Documentation/fb/arkfb.txt +++ b/Documentation/fb/arkfb.rst @@ -1,6 +1,6 @@ - - arkfb - fbdev driver for ARK Logic chips - ======================================== +======================================== +arkfb - fbdev driver for ARK Logic chips +======================================== Supported Hardware @@ -47,7 +47,7 @@ Missing Features (alias TODO list) * secondary (not initialized by BIOS) device support - * big endian support + * big endian support * DPMS support * MMIO support * interlaced mode variant diff --git a/Documentation/fb/aty128fb.txt b/Documentation/fb/aty128fb.rst index b605204fcfe1..3f107718f933 100644 --- a/Documentation/fb/aty128fb.txt +++ b/Documentation/fb/aty128fb.rst @@ -1,8 +1,9 @@ -[This file is cloned from VesaFB/matroxfb] - +================= What is aty128fb? ================= +.. [This file is cloned from VesaFB/matroxfb] + This is a driver for a graphic framebuffer for ATI Rage128 based devices on Intel and PPC boxes. @@ -24,15 +25,15 @@ How to use it? ============== Switching modes is done using the video=aty128fb:<resolution>... modedb -boot parameter or using `fbset' program. +boot parameter or using `fbset` program. -See Documentation/fb/modedb.txt for more information on modedb +See Documentation/fb/modedb.rst for more information on modedb resolutions. You should compile in both vgacon (to boot if you remove your Rage128 from box) and aty128fb (for graphics mode). You should not compile-in vesafb -unless you have primary display on non-Rage128 VBE2.0 device (see -Documentation/fb/vesafb.txt for details). +unless you have primary display on non-Rage128 VBE2.0 device (see +Documentation/fb/vesafb.rst for details). 
X11 @@ -48,16 +49,18 @@ Configuration ============= You can pass kernel command line options to vesafb with -`video=aty128fb:option1,option2:value2,option3' (multiple options should -be separated by comma, values are separated from options by `:'). +`video=aty128fb:option1,option2:value2,option3` (multiple options should +be separated by comma, values are separated from options by `:`). Accepted options: -noaccel - do not use acceleration engine. It is default. -accel - use acceleration engine. Not finished. -vmode:x - chooses PowerMacintosh video mode <x>. Deprecated. -cmode:x - chooses PowerMacintosh colour mode <x>. Deprecated. -<XxX@X> - selects startup videomode. See modedb.txt for detailed - explanation. Default is 640x480x8bpp. +========= ======================================================= +noaccel do not use acceleration engine. It is default. +accel use acceleration engine. Not finished. +vmode:x chooses PowerMacintosh video mode <x>. Deprecated. +cmode:x chooses PowerMacintosh colour mode <x>. Deprecated. +<XxX@X> selects startup videomode. See modedb.txt for detailed + explanation. Default is 640x480x8bpp. +========= ======================================================= Limitations @@ -65,8 +68,8 @@ Limitations There are known and unknown bugs, features and misfeatures. Currently there are following known bugs: - + This driver is still experimental and is not finished. Too many + + - This driver is still experimental and is not finished. Too many bugs/errata to list here. 
--- Brad Douglas <brad@neruo.com> diff --git a/Documentation/fb/cirrusfb.txt b/Documentation/fb/cirrusfb.rst index f75950d330a4..8c3e6c6cb114 100644 --- a/Documentation/fb/cirrusfb.txt +++ b/Documentation/fb/cirrusfb.rst @@ -1,32 +1,32 @@ +============================================ +Framebuffer driver for Cirrus Logic chipsets +============================================ - Framebuffer driver for Cirrus Logic chipsets - Copyright 1999 Jeff Garzik <jgarzik@pobox.com> +Copyright 1999 Jeff Garzik <jgarzik@pobox.com> - -{ just a little something to get people going; contributors welcome! } - +.. just a little something to get people going; contributors welcome! Chip families supported: - SD64 - Piccolo - Picasso - Spectrum - Alpine (GD-543x/4x) - Picasso4 (GD-5446) - GD-5480 - Laguna (GD-546x) + - SD64 + - Piccolo + - Picasso + - Spectrum + - Alpine (GD-543x/4x) + - Picasso4 (GD-5446) + - GD-5480 + - Laguna (GD-546x) Bus's supported: - PCI - Zorro + - PCI + - Zorro Architectures supported: - i386 - Alpha - PPC (Motorola Powerstack) - m68k (Amiga) + - i386 + - Alpha + - PPC (Motorola Powerstack) + - m68k (Amiga) @@ -34,10 +34,9 @@ Default video modes ------------------- At the moment, there are two kernel command line arguments supported: -mode:640x480 -mode:800x600 - or -mode:1024x768 +- mode:640x480 +- mode:800x600 +- mode:1024x768 Full support for startup video modes (modedb) will be integrated soon. @@ -93,5 +92,3 @@ Version 1.9.4 Version 1.9.3 ------------- * Bundled with kernel 2.3.14-pre1 or later. - - diff --git a/Documentation/fb/cmap_xfbdev.txt b/Documentation/fb/cmap_xfbdev.rst index 55e1f0a3d2b4..5db5e9787361 100644 --- a/Documentation/fb/cmap_xfbdev.txt +++ b/Documentation/fb/cmap_xfbdev.rst @@ -1,26 +1,29 @@ +========================== Understanding fbdev's cmap --------------------------- +========================== These notes explain how X's dix layer uses fbdev's cmap structures. -*. 
example of relevant structures in fbdev as used for a 3-bit grayscale cmap -struct fb_var_screeninfo { - .bits_per_pixel = 8, - .grayscale = 1, - .red = { 4, 3, 0 }, - .green = { 0, 0, 0 }, - .blue = { 0, 0, 0 }, -} -struct fb_fix_screeninfo { - .visual = FB_VISUAL_STATIC_PSEUDOCOLOR, -} -for (i = 0; i < 8; i++) +- example of relevant structures in fbdev as used for a 3-bit grayscale cmap:: + + struct fb_var_screeninfo { + .bits_per_pixel = 8, + .grayscale = 1, + .red = { 4, 3, 0 }, + .green = { 0, 0, 0 }, + .blue = { 0, 0, 0 }, + } + struct fb_fix_screeninfo { + .visual = FB_VISUAL_STATIC_PSEUDOCOLOR, + } + for (i = 0; i < 8; i++) info->cmap.red[i] = (((2*i)+1)*(0xFFFF))/16; -memcpy(info->cmap.green, info->cmap.red, sizeof(u16)*8); -memcpy(info->cmap.blue, info->cmap.red, sizeof(u16)*8); + memcpy(info->cmap.green, info->cmap.red, sizeof(u16)*8); + memcpy(info->cmap.blue, info->cmap.red, sizeof(u16)*8); -*. X11 apps do something like the following when trying to use grayscale. -for (i=0; i < 8; i++) { +- X11 apps do something like the following when trying to use grayscale:: + + for (i=0; i < 8; i++) { char colorspec[64]; memset(colorspec,0,64); sprintf(colorspec, "rgb:%x/%x/%x", i*36,i*36,i*36); @@ -28,26 +31,26 @@ for (i=0; i < 8; i++) { printf("Can't get color %s\n",colorspec); XAllocColor(outputDisplay, testColormap, &wantedColor); grays[i] = wantedColor; -} + } + There's also named equivalents like gray1..x provided you have an rgb.txt. Somewhere in X's callchain, this results in a call to X code that handles the colormap. 
For example, Xfbdev hits the following: -xc-011010/programs/Xserver/dix/colormap.c: +xc-011010/programs/Xserver/dix/colormap.c:: -FindBestPixel(pentFirst, size, prgb, channel) + FindBestPixel(pentFirst, size, prgb, channel) -dr = (long) pent->co.local.red - prgb->red; -dg = (long) pent->co.local.green - prgb->green; -db = (long) pent->co.local.blue - prgb->blue; -sq = dr * dr; -UnsignedToBigNum (sq, &sum); -BigNumAdd (&sum, &temp, &sum); + dr = (long) pent->co.local.red - prgb->red; + dg = (long) pent->co.local.green - prgb->green; + db = (long) pent->co.local.blue - prgb->blue; + sq = dr * dr; + UnsignedToBigNum (sq, &sum); + BigNumAdd (&sum, &temp, &sum); co.local.red are entries that were brought in through FBIOGETCMAP which come directly from the info->cmap.red that was listed above. The prgb is the rgb that the app wants to match to. The above code is doing what looks like a least squares matching function. That's why the cmap entries can't be set to the left hand side boundaries of a color range. - diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.rst index 748328370250..7300cff255a3 100644 --- a/Documentation/fb/deferred_io.txt +++ b/Documentation/fb/deferred_io.rst @@ -1,5 +1,6 @@ +=========== Deferred IO ------------ +=========== Deferred IO is a way to delay and repurpose IO. It uses host memory as a buffer and the MMU pagefault as a pretrigger for when to perform the device @@ -16,7 +17,7 @@ works: - app continues writing to that page with no additional cost. this is the key benefit. - the workqueue task comes in and mkcleans the pages on the list, then - completes the work associated with updating the framebuffer. this is + completes the work associated with updating the framebuffer. this is the real work talking to the device. 
- app tries to write to the address (that has now been mkcleaned) - get pagefault and the above sequence occurs again @@ -47,29 +48,32 @@ How to use it: (for fbdev drivers) ---------------------------------- The following example may be helpful. -1. Setup your structure. Eg: +1. Setup your structure. Eg:: -static struct fb_deferred_io hecubafb_defio = { - .delay = HZ, - .deferred_io = hecubafb_dpy_deferred_io, -}; + static struct fb_deferred_io hecubafb_defio = { + .delay = HZ, + .deferred_io = hecubafb_dpy_deferred_io, + }; The delay is the minimum delay between when the page_mkwrite trigger occurs and when the deferred_io callback is called. The deferred_io callback is explained below. -2. Setup your deferred IO callback. Eg: -static void hecubafb_dpy_deferred_io(struct fb_info *info, - struct list_head *pagelist) +2. Setup your deferred IO callback. Eg:: + + static void hecubafb_dpy_deferred_io(struct fb_info *info, + struct list_head *pagelist) The deferred_io callback is where you would perform all your IO to the display device. You receive the pagelist which is the list of pages that were written to during the delay. You must not modify this list. This callback is called from a workqueue. -3. Call init +3. Call init:: + info->fbdefio = &hecubafb_defio; fb_deferred_io_init(info); -4. Call cleanup +4. Call cleanup:: + fb_deferred_io_cleanup(info); diff --git a/Documentation/fb/efifb.txt b/Documentation/fb/efifb.rst index 1a85c1bdaf38..04840331a00e 100644 --- a/Documentation/fb/efifb.txt +++ b/Documentation/fb/efifb.rst @@ -1,6 +1,6 @@ - +============== What is efifb? -=============== +============== This is a generic EFI platform driver for Intel based Apple computers. efifb is only for EFI booted Intel Macs. @@ -8,16 +8,17 @@ efifb is only for EFI booted Intel Macs. Supported Hardware ================== -iMac 17"/20" -Macbook -Macbook Pro 15"/17" -MacMini +- iMac 17"/20" +- Macbook +- Macbook Pro 15"/17" +- MacMini How to use it? 
============== efifb does not have any kind of autodetection of your machine. -You have to add the following kernel parameters in your elilo.conf: +You have to add the following kernel parameters in your elilo.conf:: + Macbook : video=efifb:macbook MacMini : @@ -29,9 +30,10 @@ You have to add the following kernel parameters in your elilo.conf: Accepted options: +======= =========================================================== nowc Don't map the framebuffer write combined. This can be used to workaround side-effects and slowdowns on other CPU cores when large amounts of console data are written. +======= =========================================================== --- Edgar Hucek <gimli@dark-green.com> diff --git a/Documentation/fb/ep93xx-fb.txt b/Documentation/fb/ep93xx-fb.rst index 5af1bd9effae..6f7767926d1a 100644 --- a/Documentation/fb/ep93xx-fb.txt +++ b/Documentation/fb/ep93xx-fb.rst @@ -4,7 +4,7 @@ Driver for EP93xx LCD controller The EP93xx LCD controller can drive both standard desktop monitors and embedded LCD displays. If you have a standard desktop monitor then you -can use the standard Linux video mode database. In your board file: +can use the standard Linux video mode database. In your board file:: static struct ep93xxfb_mach_info some_board_fb_info = { .num_modes = EP93XXFB_USE_MODEDB, @@ -12,7 +12,7 @@ can use the standard Linux video mode database. In your board file: }; If you have an embedded LCD display then you need to define a video -mode for it as follows: +mode for it as follows:: static struct fb_videomode some_board_video_modes[] = { { @@ -23,11 +23,11 @@ mode for it as follows: Note that the pixel clock value is in pico-seconds. You can use the KHZ2PICOS macro to convert the pixel clock value. Most other values -are in pixel clocks. See Documentation/fb/framebuffer.txt for further +are in pixel clocks. See Documentation/fb/framebuffer.rst for further details. 
The ep93xxfb_mach_info structure for your board should look like the -following: +following:: static struct ep93xxfb_mach_info some_board_fb_info = { .num_modes = ARRAY_SIZE(some_board_video_modes), @@ -37,7 +37,7 @@ following: }; The framebuffer device can be registered by adding the following to -your board initialisation function: +your board initialisation function:: ep93xx_register_fb(&some_board_fb_info); @@ -50,6 +50,7 @@ to configure the controller. The video attributes flags are fully documented in section 7 of the EP93xx users' guide. The following flags are available: +=============================== ========================================== EP93XXFB_PCLK_FALLING Clock data on the falling edge of the pixel clock. The default is to clock data on the rising edge. @@ -62,10 +63,12 @@ EP93XXFB_SYNC_HORIZ_HIGH Horizontal sync is active high. By EP93XXFB_SYNC_VERT_HIGH Vertical sync is active high. By default the vertical sync is active high. +=============================== ========================================== The physical address of the framebuffer can be controlled using the following flags: +=============================== ====================================== EP93XXFB_USE_SDCSN0 Use SDCSn[0] for the framebuffer. This is the default setting. @@ -74,6 +77,7 @@ EP93XXFB_USE_SDCSN1 Use SDCSn[1] for the framebuffer. EP93XXFB_USE_SDCSN2 Use SDCSn[2] for the framebuffer. EP93XXFB_USE_SDCSN3 Use SDCSn[3] for the framebuffer. +=============================== ====================================== ================== Platform callbacks @@ -87,7 +91,7 @@ blanked or unblanked. The setup and teardown devices pass the platform_device structure as an argument. 
The fb_info and ep93xxfb_mach_info structures can be -obtained as follows: +obtained as follows:: static int some_board_fb_setup(struct platform_device *pdev) { @@ -101,17 +105,17 @@ obtained as follows: Setting the video mode ====================== -The video mode is set using the following syntax: +The video mode is set using the following syntax:: video=XRESxYRES[-BPP][@REFRESH] If the EP93xx video driver is built-in then the video mode is set on -the Linux kernel command line, for example: +the Linux kernel command line, for example:: video=ep93xx-fb:800x600-16@60 If the EP93xx video driver is built as a module then the video mode is -set when the module is installed: +set when the module is installed:: modprobe ep93xx-fb video=320x240 @@ -121,13 +125,14 @@ Screenpage bug At least on the EP9315 there is a silicon bug which causes bit 27 of the VIDSCRNPAGE (framebuffer physical offset) to be tied low. There is -an unofficial errata for this bug at: +an unofficial errata for this bug at:: + http://marc.info/?l=linux-arm-kernel&m=110061245502000&w=2 By default the EP93xx framebuffer driver checks if the allocated physical address has bit 27 set. If it does, then the memory is freed and an error is returned. The check can be disabled by adding the following -option when loading the driver: +option when loading the driver:: ep93xx-fb.check_screenpage_bug=0 diff --git a/Documentation/fb/fbcon.txt b/Documentation/fb/fbcon.rst index 5a865437b33f..1da65b9000de 100644 --- a/Documentation/fb/fbcon.txt +++ b/Documentation/fb/fbcon.rst @@ -1,39 +1,41 @@ +======================= The Framebuffer Console ======================= - The framebuffer console (fbcon), as its name implies, is a text +The framebuffer console (fbcon), as its name implies, is a text console running on top of the framebuffer device. 
It has the functionality of any standard text console driver, such as the VGA console, with the added features that can be attributed to the graphical nature of the framebuffer. - In the x86 architecture, the framebuffer console is optional, and +In the x86 architecture, the framebuffer console is optional, and some even treat it as a toy. For other architectures, it is the only available display device, text or graphical. - What are the features of fbcon? The framebuffer console supports +What are the features of fbcon? The framebuffer console supports high resolutions, varying font types, display rotation, primitive multihead, etc. Theoretically, multi-colored fonts, blending, aliasing, and any feature made available by the underlying graphics card are also possible. A. Configuration +================ - The framebuffer console can be enabled by using your favorite kernel +The framebuffer console can be enabled by using your favorite kernel configuration tool. It is under Device Drivers->Graphics Support->Frame buffer Devices->Console display driver support->Framebuffer Console Support. Select 'y' to compile support statically or 'm' for module support. The module will be fbcon. - In order for fbcon to activate, at least one framebuffer driver is +In order for fbcon to activate, at least one framebuffer driver is required, so choose from any of the numerous drivers available. For x86 systems, they almost universally have VGA cards, so vga16fb and vesafb will always be available. However, using a chipset-specific driver will give you more speed and features, such as the ability to change the video mode dynamically. - To display the penguin logo, choose any logo available in Graphics +To display the penguin logo, choose any logo available in Graphics support->Bootup logo. 
- Also, you will need to select at least one compiled-in font, but if +Also, you will need to select at least one compiled-in font, but if you don't do anything, the kernel configuration tool will select one for you, usually an 8x16 font. @@ -44,6 +46,7 @@ fortunate to have a driver that does not alter the graphics chip, then you will still get a VGA console. B. Loading +========== Possible scenarios: @@ -72,33 +75,33 @@ Possible scenarios: C. Boot options - The framebuffer console has several, largely unknown, boot options - that can change its behavior. + The framebuffer console has several, largely unknown, boot options + that can change its behavior. 1. fbcon=font:<name> - Select the initial font to use. The value 'name' can be any of the - compiled-in fonts: 10x18, 6x10, 7x14, Acorn8x8, MINI4x6, - PEARL8x8, ProFont6x11, SUN12x22, SUN8x16, TER16x32, VGA8x16, VGA8x8. + Select the initial font to use. The value 'name' can be any of the + compiled-in fonts: 10x18, 6x10, 7x14, Acorn8x8, MINI4x6, + PEARL8x8, ProFont6x11, SUN12x22, SUN8x16, TER16x32, VGA8x16, VGA8x8. Note, not all drivers can handle font with widths not divisible by 8, - such as vga16fb. + such as vga16fb. 2. fbcon=scrollback:<value>[k] - The scrollback buffer is memory that is used to preserve display - contents that has already scrolled past your view. This is accessed - by using the Shift-PageUp key combination. The value 'value' is any - integer. It defaults to 32KB. The 'k' suffix is optional, and will - multiply the 'value' by 1024. + The scrollback buffer is memory that is used to preserve display + contents that has already scrolled past your view. This is accessed + by using the Shift-PageUp key combination. The value 'value' is any + integer. It defaults to 32KB. The 'k' suffix is optional, and will + multiply the 'value' by 1024. 3. fbcon=map:<0123> - This is an interesting option. It tells which driver gets mapped to - which console. 
The value '0123' is a sequence that gets repeated until - the total length is 64 which is the number of consoles available. In - the above example, it is expanded to 012301230123... and the mapping - will be: + This is an interesting option. It tells which driver gets mapped to + which console. The value '0123' is a sequence that gets repeated until + the total length is 64 which is the number of consoles available. In + the above example, it is expanded to 012301230123... and the mapping + will be:: tty | 1 2 3 4 5 6 7 8 9 ... fb | 0 1 2 3 0 1 2 3 0 ... @@ -126,20 +129,20 @@ C. Boot options 4. fbcon=rotate:<n> - This option changes the orientation angle of the console display. The - value 'n' accepts the following: + This option changes the orientation angle of the console display. The + value 'n' accepts the following: - 0 - normal orientation (0 degree) - 1 - clockwise orientation (90 degrees) - 2 - upside down orientation (180 degrees) - 3 - counterclockwise orientation (270 degrees) + - 0 - normal orientation (0 degree) + - 1 - clockwise orientation (90 degrees) + - 2 - upside down orientation (180 degrees) + - 3 - counterclockwise orientation (270 degrees) The angle can be changed anytime afterwards by 'echoing' the same numbers to any one of the 2 attributes found in /sys/class/graphics/fbcon: - rotate - rotate the display of the active console - rotate_all - rotate the display of all consoles + - rotate - rotate the display of the active console + - rotate_all - rotate the display of all consoles Console rotation will only become available if Framebuffer Console Rotation support is compiled in your kernel. @@ -177,9 +180,9 @@ Before going on to how to attach, detach and unload the framebuffer console, an illustration of the dependencies may help. The console layer, as with most subsystems, needs a driver that interfaces with -the hardware. Thus, in a VGA console: +the hardware. Thus, in a VGA console:: -console ---> VGA driver ---> hardware. 
+ console ---> VGA driver ---> hardware. Assuming the VGA driver can be unloaded, one must first unbind the VGA driver from the console layer before unloading the driver. The VGA driver cannot be @@ -187,9 +190,9 @@ unloaded if it is still bound to the console layer. (See Documentation/console/console.txt for more information). This is more complicated in the case of the framebuffer console (fbcon), -because fbcon is an intermediate layer between the console and the drivers: +because fbcon is an intermediate layer between the console and the drivers:: -console ---> fbcon ---> fbdev drivers ---> hardware + console ---> fbcon ---> fbdev drivers ---> hardware The fbdev drivers cannot be unloaded if bound to fbcon, and fbcon cannot be unloaded if it's bound to the console layer. @@ -204,12 +207,12 @@ So, how do we unbind fbcon from the console? Part of the answer is in Documentation/console/console.txt. To summarize: Echo a value to the bind file that represents the framebuffer console -driver. So assuming vtcon1 represents fbcon, then: +driver. So assuming vtcon1 represents fbcon, then:: -echo 1 > sys/class/vtconsole/vtcon1/bind - attach framebuffer console to - console layer -echo 0 > sys/class/vtconsole/vtcon1/bind - detach framebuffer console from - console layer + echo 1 > sys/class/vtconsole/vtcon1/bind - attach framebuffer console to + console layer + echo 0 > sys/class/vtconsole/vtcon1/bind - detach framebuffer console from + console layer If fbcon is detached from the console layer, your boot console driver (which is usually VGA text mode) will take over. A few drivers (rivafb and i810fb) will @@ -223,19 +226,19 @@ restored properly. The following is one of the several methods that you can do: 2. In your kernel configuration, ensure that CONFIG_FRAMEBUFFER_CONSOLE is set to 'y' or 'm'. Enable one or more of your favorite framebuffer drivers. -3. Boot into text mode and as root run: +3. 
Boot into text mode and as root run:: vbetool vbestate save > <vga state file> - The above command saves the register contents of your graphics - hardware to <vga state file>. You need to do this step only once as - the state file can be reused. + The above command saves the register contents of your graphics + hardware to <vga state file>. You need to do this step only once as + the state file can be reused. -4. If fbcon is compiled as a module, load fbcon by doing: +4. If fbcon is compiled as a module, load fbcon by doing:: modprobe fbcon -5. Now to detach fbcon: +5. Now to detach fbcon:: vbetool vbestate restore < <vga state file> && \ echo 0 > /sys/class/vtconsole/vtcon1/bind @@ -243,7 +246,7 @@ restored properly. The following is one of the several methods that you can do: 6. That's it, you're back to VGA mode. And if you compiled fbcon as a module, you can unload it by 'rmmod fbcon'. -7. To reattach fbcon: +7. To reattach fbcon:: echo 1 > /sys/class/vtconsole/vtcon1/bind @@ -266,82 +269,82 @@ the following: Variation 1: - a. Before detaching fbcon, do + a. Before detaching fbcon, do:: - vbetool vbemode save > <vesa state file> # do once for each vesafb mode, - # the file can be reused + vbetool vbemode save > <vesa state file> # do once for each vesafb mode, + # the file can be reused b. Detach fbcon as in step 5. - c. Attach fbcon + c. Attach fbcon:: - vbetool vbestate restore < <vesa state file> && \ + vbetool vbestate restore < <vesa state file> && \ echo 1 > /sys/class/vtconsole/vtcon1/bind Variation 2: - a. Before detaching fbcon, do: - echo <ID> > /sys/class/tty/console/bind + a. Before detaching fbcon, do:: + echo <ID> > /sys/class/tty/console/bind - vbetool vbemode get + vbetool vbemode get b. Take note of the mode number b. Detach fbcon as in step 5. - c. Attach fbcon: + c. 
Attach fbcon:: - vbetool vbemode set <mode number> && \ - echo 1 > /sys/class/vtconsole/vtcon1/bind + vbetool vbemode set <mode number> && \ + echo 1 > /sys/class/vtconsole/vtcon1/bind Samples: ======== Here are 2 sample bash scripts that you can use to bind or unbind the -framebuffer console driver if you are on an X86 box: +framebuffer console driver if you are on an X86 box:: ---------------------------------------------------------------------------- -#!/bin/bash -# Unbind fbcon + #!/bin/bash + # Unbind fbcon -# Change this to where your actual vgastate file is located -# Or Use VGASTATE=$1 to indicate the state file at runtime -VGASTATE=/tmp/vgastate + # Change this to where your actual vgastate file is located + # Or Use VGASTATE=$1 to indicate the state file at runtime + VGASTATE=/tmp/vgastate -# path to vbetool -VBETOOL=/usr/local/bin + # path to vbetool + VBETOOL=/usr/local/bin -for (( i = 0; i < 16; i++)) -do - if test -x /sys/class/vtconsole/vtcon$i; then - if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \ - = 1 ]; then + for (( i = 0; i < 16; i++)) + do + if test -x /sys/class/vtconsole/vtcon$i; then + if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \ + = 1 ]; then if test -x $VBETOOL/vbetool; then echo Unbinding vtcon$i $VBETOOL/vbetool vbestate restore < $VGASTATE echo 0 > /sys/class/vtconsole/vtcon$i/bind fi - fi - fi -done + fi + fi + done --------------------------------------------------------------------------- -#!/bin/bash -# Bind fbcon - -for (( i = 0; i < 16; i++)) -do - if test -x /sys/class/vtconsole/vtcon$i; then - if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \ - = 1 ]; then + +:: + + #!/bin/bash + # Bind fbcon + + for (( i = 0; i < 16; i++)) + do + if test -x /sys/class/vtconsole/vtcon$i; then + if [ `cat /sys/class/vtconsole/vtcon$i/name | grep -c "frame buffer"` \ + = 1 ]; then echo Unbinding vtcon$i echo 1 > /sys/class/vtconsole/vtcon$i/bind - fi - fi -done 
---------------------------------------------------------------------------- + fi + fi + done --- Antonino Daplas <adaplas@pol.net> diff --git a/Documentation/fb/framebuffer.txt b/Documentation/fb/framebuffer.rst index 58c5ae2e9f59..7fe087310c82 100644 --- a/Documentation/fb/framebuffer.txt +++ b/Documentation/fb/framebuffer.rst @@ -1,7 +1,7 @@ - The Frame Buffer Device - ----------------------- +======================= +The Frame Buffer Device +======================= -Maintained by Geert Uytterhoeven <geert@linux-m68k.org> Last revised: May 10, 2001 @@ -26,7 +26,7 @@ other device in /dev. It's a character device using major 29; the minor specifies the frame buffer number. By convention, the following device nodes are used (numbers indicate the device -minor numbers): +minor numbers):: 0 = /dev/fb0 First frame buffer 1 = /dev/fb1 Second frame buffer @@ -34,15 +34,15 @@ minor numbers): 31 = /dev/fb31 32nd frame buffer For backwards compatibility, you may want to create the following symbolic -links: +links:: /dev/fb0current -> fb0 /dev/fb1current -> fb1 and so on... -The frame buffer devices are also `normal' memory devices, this means, you can -read and write their contents. You can, for example, make a screen snapshot by +The frame buffer devices are also `normal` memory devices, this means, you can +read and write their contents. You can, for example, make a screen snapshot by:: cp /dev/fb0 myfile @@ -54,11 +54,11 @@ Application software that uses the frame buffer device (e.g. the X server) will use /dev/fb0 by default (older software uses /dev/fb0current). You can specify an alternative frame buffer device by setting the environment variable $FRAMEBUFFER to the path name of a frame buffer device, e.g. (for sh/bash -users): +users):: export FRAMEBUFFER=/dev/fb1 -or (for csh users): +or (for csh users):: setenv FRAMEBUFFER /dev/fb1 @@ -90,9 +90,9 @@ which data structures they work. Here's just a brief overview: possible). 
- You can get and set parts of the color map. Communication is done with 16 - bits per color part (red, green, blue, transparency) to support all - existing hardware. The driver does all the computations needed to apply - it to the hardware (round it down to less bits, maybe throw away + bits per color part (red, green, blue, transparency) to support all + existing hardware. The driver does all the computations needed to apply + it to the hardware (round it down to less bits, maybe throw away transparency). All this hardware abstraction makes the implementation of application programs @@ -113,10 +113,10 @@ much trouble... 3. Frame Buffer Resolution Maintenance -------------------------------------- -Frame buffer resolutions are maintained using the utility `fbset'. It can +Frame buffer resolutions are maintained using the utility `fbset`. It can change the video mode properties of a frame buffer device. Its main usage is -to change the current video mode, e.g. during boot up in one of your /etc/rc.* -or /etc/init.d/* files. +to change the current video mode, e.g. during boot up in one of your `/etc/rc.*` +or `/etc/init.d/*` files. Fbset uses a video mode database stored in a configuration file, so you can easily add your own modes and refer to them with a simple identifier. @@ -129,8 +129,8 @@ The X server (XF68_FBDev) is the most notable application program for the frame buffer device. Starting with XFree86 release 3.2, the X server is part of XFree86 and has 2 modes: - - If the `Display' subsection for the `fbdev' driver in the /etc/XF86Config - file contains a + - If the `Display` subsection for the `fbdev` driver in the /etc/XF86Config + file contains a:: Modes "default" @@ -146,7 +146,7 @@ XFree86 and has 2 modes: same virtual desktop size. The frame buffer device that's used is still /dev/fb0current (or $FRAMEBUFFER), but the available resolutions are defined by /etc/XF86Config now. 
The disadvantage is that you have to - specify the timings in a different format (but `fbset -x' may help). + specify the timings in a different format (but `fbset -x` may help). To tune a video mode, you can use fbset or xvidtune. Note that xvidtune doesn't work 100% with XF68_FBDev: the reported clock values are always incorrect. @@ -172,29 +172,29 @@ retrace, the electron beam is turned off (blanked). The speed at which the electron beam paints the pixels is determined by the dotclock in the graphics board. For a dotclock of e.g. 28.37516 MHz (millions -of cycles per second), each pixel is 35242 ps (picoseconds) long: +of cycles per second), each pixel is 35242 ps (picoseconds) long:: 1/(28.37516E6 Hz) = 35.242E-9 s -If the screen resolution is 640x480, it will take +If the screen resolution is 640x480, it will take:: 640*35.242E-9 s = 22.555E-6 s to paint the 640 (xres) pixels on one scanline. But the horizontal retrace -also takes time (e.g. 272 `pixels'), so a full scanline takes +also takes time (e.g. 272 `pixels`), so a full scanline takes:: (640+272)*35.242E-9 s = 32.141E-6 s -We'll say that the horizontal scanrate is about 31 kHz: +We'll say that the horizontal scanrate is about 31 kHz:: 1/(32.141E-6 s) = 31.113E3 Hz A full screen counts 480 (yres) lines, but we have to consider the vertical -retrace too (e.g. 49 `lines'). So a full screen will take +retrace too (e.g. 49 `lines`). So a full screen will take:: (480+49)*32.141E-6 s = 17.002E-3 s -The vertical scanrate is about 59 Hz: +The vertical scanrate is about 59 Hz:: 1/(17.002E-3 s) = 58.815 Hz @@ -212,7 +212,7 @@ influenced by the moments at which the synchronization pulses occur. The following picture summarizes all timings. The horizontal retrace time is the sum of the left margin, the right margin and the hsync length, while the vertical retrace time is the sum of the upper margin, the lower margin and the -vsync length. 
+vsync length:: +----------+---------------------------------------------+----------+-------+ | | ↑ | | | @@ -256,7 +256,8 @@ The frame buffer device expects all horizontal timings in number of dotclocks 6. Converting XFree86 timing values info frame buffer device timings -------------------------------------------------------------------- -An XFree86 mode line consists of the following fields: +An XFree86 mode line consists of the following fields:: + "800x600" 50 800 856 976 1040 600 637 643 666 < name > DCF HR SH1 SH2 HFL VR SV1 SV2 VFL @@ -271,19 +272,27 @@ The frame buffer device uses the following fields: - vsync_len: length of vertical sync 1) Pixelclock: + xfree: in MHz + fb: in picoseconds (ps) pixclock = 1000000 / DCF 2) horizontal timings: + left_margin = HFL - SH2 + right_margin = SH1 - HR + hsync_len = SH2 - SH1 3) vertical timings: + upper_margin = VFL - SV2 + lower_margin = SV1 - VR + vsync_len = SV2 - SV1 Good examples for VESA timings can be found in the XFree86 source tree, @@ -303,9 +312,10 @@ and to the following documentation: - The manual pages for fbset: fbset(8), fb.modes(5) - The manual pages for XFree86: XF68_FBDev(1), XF86Config(4/5) - The mighty kernel sources: - o linux/drivers/video/ - o linux/include/linux/fb.h - o linux/include/video/ + + - linux/drivers/video/ + - linux/include/linux/fb.h + - linux/include/video/ @@ -330,14 +340,14 @@ and on its mirrors. The latest version of fbset can be found at - http://www.linux-fbdev.org/ + http://www.linux-fbdev.org/ + + +10. Credits +----------- - -10. Credits ----------- - This readme was written by Geert Uytterhoeven, partly based on the original -`X-framebuffer.README' by Roman Hodek and Martin Schaller. Section 6 was +`X-framebuffer.README` by Roman Hodek and Martin Schaller. Section 6 was provided by Frank Neumann. The frame buffer device abstraction was designed by Martin Schaller. 
diff --git a/Documentation/fb/gxfb.txt b/Documentation/fb/gxfb.rst index 2f640903bbb2..5738709bccbb 100644 --- a/Documentation/fb/gxfb.txt +++ b/Documentation/fb/gxfb.rst @@ -1,7 +1,8 @@ -[This file is cloned from VesaFB/aty128fb] - +============= What is gxfb? -================= +============= + +.. [This file is cloned from VesaFB/aty128fb] This is a graphics framebuffer driver for AMD Geode GX2 based processors. @@ -23,9 +24,9 @@ How to use it? ============== Switching modes is done using gxfb.mode_option=<resolution>... boot -parameter or using `fbset' program. +parameter or using `fbset` program. -See Documentation/fb/modedb.txt for more information on modedb +See Documentation/fb/modedb.rst for more information on modedb resolutions. @@ -42,11 +43,12 @@ You can pass kernel command line options to gxfb with gxfb.<option>. For example, gxfb.mode_option=800x600@75. Accepted options: -mode_option - specify the video mode. Of the form - <x>x<y>[-<bpp>][@<refresh>] -vram - size of video ram (normally auto-detected) -vt_switch - enable vt switching during suspend/resume. The vt - switch is slow, but harmless. +================ ================================================== +mode_option specify the video mode. Of the form + <x>x<y>[-<bpp>][@<refresh>] +vram size of video ram (normally auto-detected) +vt_switch enable vt switching during suspend/resume. The vt + switch is slow, but harmless. +================ ================================================== --- Andres Salomon <dilinger@debian.org> diff --git a/Documentation/fb/index.rst b/Documentation/fb/index.rst new file mode 100644 index 000000000000..d47313714635 --- /dev/null +++ b/Documentation/fb/index.rst @@ -0,0 +1,50 @@ +:orphan: + +============ +Frame Buffer +============ + +.. 
toctree:: + :maxdepth: 1 + + api + arkfb + aty128fb + cirrusfb + cmap_xfbdev + deferred_io + efifb + ep93xx-fb + fbcon + framebuffer + gxfb + intel810 + intelfb + internals + lxfb + matroxfb + metronomefb + modedb + pvr2fb + pxafb + s3fb + sa1100fb + sh7760fb + sisfb + sm501 + sm712fb + sstfb + tgafb + tridentfb + udlfb + uvesafb + vesafb + viafb + vt8623fb + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/fb/intel810.txt b/Documentation/fb/intel810.rst index a8e9f5bca6f3..eb86098db91f 100644 --- a/Documentation/fb/intel810.txt +++ b/Documentation/fb/intel810.rst @@ -1,26 +1,31 @@ +================================ Intel 810/815 Framebuffer driver - Tony Daplas <adaplas@pol.net> - http://i810fb.sourceforge.net +================================ - March 17, 2002 +Tony Daplas <adaplas@pol.net> - First Released: July 2001 - Last Update: September 12, 2005 -================================================================ +http://i810fb.sourceforge.net + +March 17, 2002 + +First Released: July 2001 +Last Update: September 12, 2005 A. Introduction +=============== This is a framebuffer driver for various Intel 810/815 compatible graphics devices. These include: - Intel 810 - Intel 810E - Intel 810-DC100 - Intel 815 Internal graphics only, 100Mhz FSB - Intel 815 Internal graphics only - Intel 815 Internal graphics and AGP + - Intel 810 + - Intel 810E + - Intel 810-DC100 + - Intel 815 Internal graphics only, 100Mhz FSB + - Intel 815 Internal graphics only + - Intel 815 Internal graphics and AGP B. Features +============ - Choice of using Discrete Video Timings, VESA Generalized Timing Formula, or a framebuffer specific database to set the video mode @@ -45,10 +50,11 @@ B. Features - Can concurrently run with xfree86 running with native i810 drivers - Hardware Cursor Support - + - Supports EDID probing either by DDC/I2C or through the BIOS C. List of available options +============================= a. 
"video=i810fb" enables the i810 driver @@ -158,7 +164,7 @@ C. List of available options (default = not set) n. "dcolor" - Use directcolor visual instead of truecolor for pixel depths greater + Use directcolor visual instead of truecolor for pixel depths greater than 8 bpp. Useful for color tuning, such as gamma control. Recommendation: do not set @@ -167,35 +173,37 @@ C. List of available options o. <xres>x<yres>[-<bpp>][@<refresh>] The driver will now accept specification of boot mode option. If this is specified, the options 'xres' and 'yres' will be ignored. See - Documentation/fb/modedb.txt for usage. + Documentation/fb/modedb.rst for usage. D. Kernel booting +================= Separate each option/option-pair by commas (,) and the option from its value -with a colon (:) as in the following: +with a colon (:) as in the following:: -video=i810fb:option1,option2:value2 + video=i810fb:option1,option2:value2 Sample Usage ------------ -In /etc/lilo.conf, add the line: +In /etc/lilo.conf, add the line:: -append="video=i810fb:vram:2,xres:1024,yres:768,bpp:8,hsync1:30,hsync2:55, \ - vsync1:50,vsync2:85,accel,mtrr" + append="video=i810fb:vram:2,xres:1024,yres:768,bpp:8,hsync1:30,hsync2:55, \ + vsync1:50,vsync2:85,accel,mtrr" This will initialize the framebuffer to 1024x768 at 8bpp. The framebuffer will use 2 MB of System RAM. MTRR support will be enabled. The refresh rate will be computed based on the hsync1/hsync2 and vsync1/vsync2 values. IMPORTANT: -You must include hsync1, hsync2, vsync1 and vsync2 to enable video modes -better than 640x480 at 60Hz. HOWEVER, if your chipset/display combination -supports I2C and has an EDID block, you can safely exclude hsync1, hsync2, -vsync1 and vsync2 parameters. These parameters will be taken from the EDID -block. + You must include hsync1, hsync2, vsync1 and vsync2 to enable video modes + better than 640x480 at 60Hz. 
HOWEVER, if your chipset/display combination + supports I2C and has an EDID block, you can safely exclude hsync1, hsync2, + vsync1 and vsync2 parameters. These parameters will be taken from the EDID + block. E. Module options +================== The module parameters are essentially similar to the kernel parameters. The main difference is that you need to include a Boolean value @@ -206,31 +214,32 @@ Example, to enable MTRR, include "mtrr=1". Sample Usage ------------ -Using the same setup as described above, load the module like this: +Using the same setup as described above, load the module like this:: modprobe i810fb vram=2 xres=1024 bpp=8 hsync1=30 hsync2=55 vsync1=50 \ - vsync2=85 accel=1 mtrr=1 + vsync2=85 accel=1 mtrr=1 -Or just add the following to a configuration file in /etc/modprobe.d/ +Or just add the following to a configuration file in /etc/modprobe.d/:: options i810fb vram=2 xres=1024 bpp=16 hsync1=30 hsync2=55 vsync1=50 \ vsync2=85 accel=1 mtrr=1 -and just do a +and just do a:: modprobe i810fb F. Setup +========= - a. Do your usual method of configuring the kernel. + a. Do your usual method of configuring the kernel - make menuconfig/xconfig/config + make menuconfig/xconfig/config b. Under "Code maturity level options" enable "Prompt for development and/or incomplete code/drivers". - c. Enable agpgart support for the Intel 810/815 on-board graphics. + c. Enable agpgart support for the Intel 810/815 on-board graphics. This is required. The option is under "Character Devices". d. Under "Graphics Support", select "Intel 810/815" either statically @@ -242,7 +251,7 @@ F. Setup set 'Enable DDC Support' to 'y'. To make this option appear, set 'use VESA Generalized Timing Formula' to 'y'. - f. If you want a framebuffer console, enable it under "Console + f. If you want a framebuffer console, enable it under "Console Drivers". g. Compile your kernel. @@ -253,6 +262,7 @@ F. Setup patch to see the chipset in action (or inaction :-). G. 
Acknowledgment: +=================== 1. Geert Uytterhoeven - his excellent howto and the virtual framebuffer driver code made this possible. @@ -269,10 +279,9 @@ G. Acknowledgment: optimizations possible. H. Home Page: +============== A more complete, and probably updated information is provided at http://i810fb.sourceforge.net. -########################### Tony - diff --git a/Documentation/fb/intelfb.txt b/Documentation/fb/intelfb.rst index feac4e4d6968..e2d0903f4efb 100644 --- a/Documentation/fb/intelfb.txt +++ b/Documentation/fb/intelfb.rst @@ -1,24 +1,28 @@ +============================================================= Intel 830M/845G/852GM/855GM/865G/915G/945G Framebuffer driver -================================================================ +============================================================= A. Introduction - This is a framebuffer driver for various Intel 8xx/9xx compatible +=============== + +This is a framebuffer driver for various Intel 8xx/9xx compatible graphics devices. These would include: - Intel 830M - Intel 845G - Intel 852GM - Intel 855GM - Intel 865G - Intel 915G - Intel 915GM - Intel 945G - Intel 945GM - Intel 945GME - Intel 965G - Intel 965GM + - Intel 830M + - Intel 845G + - Intel 852GM + - Intel 855GM + - Intel 865G + - Intel 915G + - Intel 915GM + - Intel 945G + - Intel 945GM + - Intel 945GME + - Intel 965G + - Intel 965GM B. List of available options +============================= a. "video=intelfb" enables the intelfb driver @@ -39,12 +43,12 @@ B. List of available options (default = 4 MB) d. "voffset=<value>" - select at what offset in MB of the logical memory to allocate the + select at what offset in MB of the logical memory to allocate the framebuffer memory. The intent is to avoid the memory blocks used by standard graphics applications (XFree86). Depending on your - usage, adjust the value up or down, (0 for maximum usage, 63/127 MB - for the least amount). Note, an arbitrary setting may conflict - with XFree86. 
+ usage, adjust the value up or down, (0 for maximum usage, 63/127 MB + for the least amount). Note, an arbitrary setting may conflict + with XFree86. Recommendation: do not set (default = 48 MB) @@ -80,18 +84,19 @@ B. List of available options The default parameter (not named) is the mode. C. Kernel booting +================= Separate each option/option-pair by commas (,) and the option from its value -with an equals sign (=) as in the following: +with an equals sign (=) as in the following:: -video=intelfb:option1,option2=value2 + video=intelfb:option1,option2=value2 Sample Usage ------------ -In /etc/lilo.conf, add the line: +In /etc/lilo.conf, add the line:: -append="video=intelfb:mode=800x600-32@75,accel,hwcursor,vram=8" + append="video=intelfb:mode=800x600-32@75,accel,hwcursor,vram=8" This will initialize the framebuffer to 800x600 at 32bpp and 75Hz. The framebuffer will use 8 MB of System RAM. hw acceleration of text and cursor @@ -106,8 +111,9 @@ in this directory. D. Module options +================== - The module parameters are essentially similar to the kernel +The module parameters are essentially similar to the kernel parameters. The main difference is that you need to include a Boolean value (1 for TRUE, and 0 for FALSE) for those options which don't need a value. @@ -116,23 +122,24 @@ Example, to enable MTRR, include "mtrr=1". Sample Usage ------------ -Using the same setup as described above, load the module like this: +Using the same setup as described above, load the module like this:: modprobe intelfb mode=800x600-32@75 vram=8 accel=1 hwcursor=1 -Or just add the following to a configuration file in /etc/modprobe.d/ +Or just add the following to a configuration file in /etc/modprobe.d/:: options intelfb mode=800x600-32@75 vram=8 accel=1 hwcursor=1 -and just do a +and just do a:: modprobe intelfb E. Acknowledgment: +=================== 1. Geert Uytterhoeven - his excellent howto and the virtual - framebuffer driver code made this possible. 
+ framebuffer driver code made this possible. 2. Jeff Hartmann for his agpgart code. @@ -145,5 +152,4 @@ E. Acknowledgment: 6. Andrew Morton for his kernel patches maintenance. -########################### Sylvain diff --git a/Documentation/fb/internals.txt b/Documentation/fb/internals.rst index 9b2a2b2f3e57..696b50aa7c24 100644 --- a/Documentation/fb/internals.txt +++ b/Documentation/fb/internals.rst @@ -1,13 +1,19 @@ +============================= +Frame Buffer device internals +============================= This is a first start for some documentation about frame buffer device internals. -Geert Uytterhoeven <geert@linux-m68k.org>, 21 July 1998 -James Simmons <jsimmons@user.sf.net>, Nov 26 2002 +Authors: + +- Geert Uytterhoeven <geert@linux-m68k.org>, 21 July 1998 +- James Simmons <jsimmons@user.sf.net>, Nov 26 2002 -------------------------------------------------------------------------------- - *** STRUCTURES USED BY THE FRAME BUFFER DEVICE API *** +Structures used by the frame buffer device API +============================================== The following structures play a role in the game of frame buffer devices. They are defined in <linux/fb.h>. @@ -40,19 +46,18 @@ are defined in <linux/fb.h>. Generic information, API and low level information about a specific frame buffer device instance (slot number, board address, ...). - - struct `par' + - struct `par` Device dependent information that uniquely defines the video mode for this particular piece of hardware. --------------------------------------------------------------------------------- - - *** VISUALS USED BY THE FRAME BUFFER DEVICE API *** +Visuals used by the frame buffer device API +=========================================== Monochrome (FB_VISUAL_MONO01 and FB_VISUAL_MONO10) -------------------------------------------------- +-------------------------------------------------- Each pixel is either black or white. @@ -70,7 +75,7 @@ The pixel value is broken up into red, green, and blue fields. 
Direct color (FB_VISUAL_DIRECTCOLOR) ------------------------------------ -The pixel value is broken up into red, green, and blue fields, each of which +The pixel value is broken up into red, green, and blue fields, each of which are looked up in separate red, green, and blue lookup tables. @@ -79,4 +84,3 @@ Grayscale displays Grayscale and static grayscale are special variants of pseudo color and static pseudo color, where the red, green and blue components are always equal to each other. - diff --git a/Documentation/fb/lxfb.txt b/Documentation/fb/lxfb.rst index 38b3ca6f6ca7..863e6b98fbae 100644 --- a/Documentation/fb/lxfb.txt +++ b/Documentation/fb/lxfb.rst @@ -1,7 +1,9 @@ -[This file is cloned from VesaFB/aty128fb] - +============= What is lxfb? -================= +============= + +.. [This file is cloned from VesaFB/aty128fb] + This is a graphics framebuffer driver for AMD Geode LX based processors. @@ -23,9 +25,9 @@ How to use it? ============== Switching modes is done using lxfb.mode_option=<resolution>... boot -parameter or using `fbset' program. +parameter or using `fbset` program. -See Documentation/fb/modedb.txt for more information on modedb +See Documentation/fb/modedb.rst for more information on modedb resolutions. @@ -42,11 +44,12 @@ You can pass kernel command line options to lxfb with lxfb.<option>. For example, lxfb.mode_option=800x600@75. Accepted options: -mode_option - specify the video mode. Of the form - <x>x<y>[-<bpp>][@<refresh>] -vram - size of video ram (normally auto-detected) -vt_switch - enable vt switching during suspend/resume. The vt - switch is slow, but harmless. +================ ================================================== +mode_option specify the video mode. Of the form + <x>x<y>[-<bpp>][@<refresh>] +vram size of video ram (normally auto-detected) +vt_switch enable vt switching during suspend/resume. The vt + switch is slow, but harmless. 
+================ ================================================== --- Andres Salomon <dilinger@debian.org> diff --git a/Documentation/fb/matroxfb.rst b/Documentation/fb/matroxfb.rst new file mode 100644 index 000000000000..f1859d98606e --- /dev/null +++ b/Documentation/fb/matroxfb.rst @@ -0,0 +1,443 @@ +================= +What is matroxfb? +================= + +.. [This file is cloned from VesaFB. Thanks go to Gerd Knorr] + + +This is a driver for a graphic framebuffer for Matrox devices on +Alpha, Intel and PPC boxes. + +Advantages: + + * It provides a nice large console (128 cols + 48 lines with 1024x768) + without using tiny, unreadable fonts. + * You can run XF{68,86}_FBDev or XFree86 fbdev driver on top of /dev/fb0 + * Most important: boot logo :-) + +Disadvantages: + + * graphic mode is slower than text mode... but you should not notice + if you use same resolution as you used in textmode. + + +How to use it? +============== + +Switching modes is done using the video=matroxfb:vesa:... boot parameter +or using `fbset` program. + +If you want, for example, enable a resolution of 1280x1024x24bpp you should +pass to the kernel this command line: "video=matroxfb:vesa:0x1BB". + +You should compile in both vgacon (to boot if you remove you Matrox from +box) and matroxfb (for graphics mode). You should not compile-in vesafb +unless you have primary display on non-Matrox VBE2.0 device (see +Documentation/fb/vesafb.rst for details). + +Currently supported video modes are (through vesa:... 
interface, PowerMac +has [as addon] compatibility code): + + +Graphic modes +------------- + +=== ======= ======= ======= ======= ======= +bpp 640x400 640x480 768x576 800x600 960x720 +=== ======= ======= ======= ======= ======= + 4 0x12 0x102 + 8 0x100 0x101 0x180 0x103 0x188 + 15 0x110 0x181 0x113 0x189 + 16 0x111 0x182 0x114 0x18A + 24 0x1B2 0x184 0x1B5 0x18C + 32 0x112 0x183 0x115 0x18B +=== ======= ======= ======= ======= ======= + + +Graphic modes (continued) +------------------------- + +=== ======== ======== ========= ========= ========= +bpp 1024x768 1152x864 1280x1024 1408x1056 1600x1200 +=== ======== ======== ========= ========= ========= + 4 0x104 0x106 + 8 0x105 0x190 0x107 0x198 0x11C + 15 0x116 0x191 0x119 0x199 0x11D + 16 0x117 0x192 0x11A 0x19A 0x11E + 24 0x1B8 0x194 0x1BB 0x19C 0x1BF + 32 0x118 0x193 0x11B 0x19B +=== ======== ======== ========= ========= ========= + + +Text modes +---------- + +==== ======= ======= ======== ======== ======== +text 640x400 640x480 1056x344 1056x400 1056x480 +==== ======= ======= ======== ======== ======== + 8x8 0x1C0 0x108 0x10A 0x10B 0x10C +8x16 2, 3, 7 0x109 +==== ======= ======= ======== ======== ======== + +You can enter these number either hexadecimal (leading `0x`) or decimal +(0x100 = 256). You can also use value + 512 to achieve compatibility +with your old number passed to vesafb. + +Non-listed number can be achieved by more complicated command-line, for +example 1600x1200x32bpp can be specified by `video=matroxfb:vesa:0x11C,depth:32`. + + +X11 +=== + +XF{68,86}_FBDev should work just fine, but it is non-accelerated. On non-intel +architectures there are some glitches for 24bpp videomodes. 8, 16 and 32bpp +works fine. + +Running another (accelerated) X-Server like XF86_SVGA works too. But (at least) +XFree servers have big troubles in multihead configurations (even on first +head, not even talking about second). 
Running XFree86 4.x accelerated mga +driver is possible, but you must not enable DRI - if you do, resolution and +color depth of your X desktop must match resolution and color depths of your +virtual consoles, otherwise X will corrupt accelerator settings. + + +SVGALib +======= + +Driver contains SVGALib compatibility code. It is turned on by choosing textual +mode for console. You can do it at boot time by using videomode +2,3,7,0x108-0x10C or 0x1C0. At runtime, `fbset -depth 0` does this work. +Unfortunately, after SVGALib application exits, screen contents is corrupted. +Switching to another console and back fixes it. I hope that it is SVGALib's +problem and not mine, but I'm not sure. + + +Configuration +============= + +You can pass kernel command line options to matroxfb with +`video=matroxfb:option1,option2:value2,option3` (multiple options should be +separated by comma, values are separated from options by `:`). +Accepted options: + +============ =================================================================== +mem:X size of memory (X can be in megabytes, kilobytes or bytes) + You can only decrease value determined by driver because of + it always probe for memory. Default is to use whole detected + memory usable for on-screen display (i.e. max. 8 MB). +disabled do not load driver; you can use also `off`, but `disabled` + is here too. +enabled load driver, if you have `video=matroxfb:disabled` in LILO + configuration, you can override it by this (you cannot override + `off`). It is default. +noaccel do not use acceleration engine. It does not work on Alphas. +accel use acceleration engine. It is default. +nopan create initial consoles with vyres = yres, thus disabling virtual + scrolling. +pan create initial consoles as tall as possible (vyres = memory/vxres). + It is default. +nopciretry disable PCI retries. It is needed for some broken chipsets, + it is autodetected for intel's 82437. 
In this case device does + not comply with PCI 2.1 specs (it will not guarantee that every + transaction terminates with success or retry in 32 PCLK). +pciretry enable PCI retries. It is default, except for intel's 82437. +novga disables VGA I/O ports. It is default if BIOS did not enable + device. You should not use this option, some boards then do not + restart without power off. +vga preserve state of VGA I/O ports. It is default. Driver does not + enable VGA I/O if BIOS did not do it (it is not safe to enable it in + most cases). +nobios disables BIOS ROM. It is default if BIOS did not enable BIOS + itself. You should not use this option, some boards then do not + restart without power off. +bios preserve state of BIOS ROM. It is default. Driver does not enable + BIOS if BIOS was not enabled before. +noinit tells driver, that devices were already initialized. You should use + it if you have G100 and/or if driver cannot detect memory, you see + strange pattern on screen and so on. Devices not enabled by BIOS + are still initialized. It is default. +init driver initializes every device it knows about. +memtype specifies memory type, implies 'init'. This is valid only for G200 + and G400 and has following meaning: + + G200: + - 0 -> 2x128Kx32 chips, 2MB onboard, probably sgram + - 1 -> 2x128Kx32 chips, 4MB onboard, probably sgram + - 2 -> 2x256Kx32 chips, 4MB onboard, probably sgram + - 3 -> 2x256Kx32 chips, 8MB onboard, probably sgram + - 4 -> 2x512Kx16 chips, 8/16MB onboard, probably sdram only + - 5 -> same as above + - 6 -> 4x128Kx32 chips, 4MB onboard, probably sgram + - 7 -> 4x128Kx32 chips, 8MB onboard, probably sgram + G400: + - 0 -> 2x512Kx16 SDRAM, 16/32MB + - 2x512Kx32 SGRAM, 16/32MB + - 1 -> 2x256Kx32 SGRAM, 8/16MB + - 2 -> 4x128Kx32 SGRAM, 8/16MB + - 3 -> 4x512Kx32 SDRAM, 32MB + - 4 -> 4x256Kx32 SGRAM, 16/32MB + - 5 -> 2x1Mx32 SDRAM, 32MB + - 6 -> reserved + - 7 -> reserved + + You should use sdram or sgram parameter in addition to memtype + parameter.
+nomtrr disables write combining on frame buffer. This slows down driver + but there is reported minor incompatibility between GUS DMA and + XFree under high loads if write combining is enabled (sound + dropouts). +mtrr enables write combining on frame buffer. It speeds up video + accesses much. It is default. You must have MTRR support enabled + in kernel and your CPU must have MTRR (f.e. Pentium II have them). +sgram tells to driver that you have Gxx0 with SGRAM memory. It has no + effect without `init`. +sdram tells to driver that you have Gxx0 with SDRAM memory. + It is a default. +inv24 change timings parameters for 24bpp modes on Millennium and + Millennium II. Specify this if you see strange color shadows + around characters. +noinv24 use standard timings. It is the default. +inverse invert colors on screen (for LCD displays) +noinverse show true colors on screen. It is default. +dev:X bind driver to device X. Driver numbers device from 0 up to N, + where device 0 is first `known` device found, 1 second and so on. + lspci lists devices in this order. + Default is `every` known device. +nohwcursor disables hardware cursor (use software cursor instead). +hwcursor enables hardware cursor. It is default. If you are using + non-accelerated mode (`noaccel` or `fbset -accel false`), software + cursor is used (except for text mode). +noblink disables cursor blinking. Cursor in text mode always blinks (hw + limitation). +blink enables cursor blinking. It is default. +nofastfont disables fastfont feature. It is default. +fastfont:X enables fastfont feature. X specifies size of memory reserved for + font data, it must be >= (fontwidth*fontheight*chars_in_font)/8. + It is faster on Gx00 series, but slower on older cards. +grayscale enable grayscale summing. It works in PSEUDOCOLOR modes (text, + 4bpp, 8bpp). In DIRECTCOLOR modes it is limited to characters + displayed through putc/putcs. Direct accesses to framebuffer + can paint colors. 
+nograyscale disable grayscale summing. It is default. +cross4MB enables that pixel line can cross 4MB boundary. It is default for + non-Millennium. +nocross4MB pixel line must not cross 4MB boundary. It is default for + Millennium I or II, because of these devices have hardware + limitations which do not allow this. But this option is + incompatible with some (if not all yet released) versions of + XF86_FBDev. +dfp enables digital flat panel interface. This option is incompatible + with secondary (TV) output - if DFP is active, TV output must be + inactive and vice versa. DFP always uses same timing as primary + (monitor) output. +dfp:X use settings X for digital flat panel interface. X is number from + 0 to 0xFF, and meaning of each individual bit is described in + G400 manual, in description of DAC register 0x1F. For normal + operation you should set all bits to zero, except lowest bit. This + lowest bit selects who is source of display clocks, whether G400, + or panel. Default value is now read back from hardware - so you + should specify this value only if you are also using `init` + parameter. +outputs:XYZ set mapping between CRTC and outputs. Each letter can have value + of 0 (for no CRTC), 1 (CRTC1) or 2 (CRTC2), and first letter + corresponds to primary analog output, second letter to the + secondary analog output and third letter to the DVI output. + Default setting is 100 for cards below G400 or G400 without DFP, + 101 for G400 with DFP, and 111 for G450 and G550. You can set + mapping only on first card, use matroxset for setting up other + devices. +vesa:X selects startup videomode. X is number from 0 to 0x1FF, see table + above for detailed explanation. Default is 640x480x8bpp if driver + has 8bpp support. Otherwise first available of 640x350x4bpp, + 640x480x15bpp, 640x480x24bpp, 640x480x32bpp or 80x25 text + (80x25 text is always available). 
+============ =================================================================== + +If you are not satisfied with videomode selected by `vesa` option, you +can modify it with these options: + +============ =================================================================== +xres:X horizontal resolution, in pixels. Default is derived from `vesa` + option. +yres:X vertical resolution, in pixel lines. Default is derived from `vesa` + option. +upper:X top boundary: lines between end of VSYNC pulse and start of first + pixel line of picture. Default is derived from `vesa` option. +lower:X bottom boundary: lines between end of picture and start of VSYNC + pulse. Default is derived from `vesa` option. +vslen:X length of VSYNC pulse, in lines. Default is derived from `vesa` + option. +left:X left boundary: pixels between end of HSYNC pulse and first pixel. + Default is derived from `vesa` option. +right:X right boundary: pixels between end of picture and start of HSYNC + pulse. Default is derived from `vesa` option. +hslen:X length of HSYNC pulse, in pixels. Default is derived from `vesa` + option. +pixclock:X dotclocks, in ps (picoseconds). Default is derived from `vesa` + option and from `fh` and `fv` options. +sync:X sync. pulse - bit 0 inverts HSYNC polarity, bit 1 VSYNC polarity. + If bit 3 (value 0x08) is set, composite sync instead of HSYNC is + generated. If bit 5 (value 0x20) is set, sync on green is turned + on. Do not forget that if you want sync on green, you also probably + want composite sync. + Default depends on `vesa`. +depth:X Bits per pixel: 0=text, 4,8,15,16,24 or 32. Default depends on + `vesa`. +============ =================================================================== + +If you know capabilities of your monitor, you can specify some (or all) of +`maxclk`, `fh` and `fv`. In this case, `pixclock` is computed so that +pixclock <= maxclk, real_fh <= fh and real_fv <= fv. 
+ +============ ================================================================== +maxclk:X maximum dotclock. X can be specified in MHz, kHz or Hz. Default is + `don't care`. +fh:X maximum horizontal synchronization frequency. X can be specified + in kHz or Hz. Default is `don't care`. +fv:X maximum vertical frequency. X must be specified in Hz. Default is + 70 for modes derived from `vesa` with yres <= 400, 60Hz for + yres > 400. +============ ================================================================== + + +Limitations +=========== + +There are known and unknown bugs, features and misfeatures. +Currently there are following known bugs: + + - SVGALib does not restore screen on exit + - generic fbcon-cfbX procedures do not work on Alphas. Due to this, + `noaccel` (and cfb4 accel) driver does not work on Alpha. So everyone + with access to `/dev/fb*` on Alpha can hang machine (you should restrict + access to `/dev/fb*` - everyone with access to this device can destroy + your monitor, believe me...). + - 24bpp does not support correctly XF-FBDev on big-endian architectures. + - interlaced text mode is not supported; it looks like hardware limitation, + but I'm not sure. + - Gxx0 SGRAM/SDRAM is not autodetected. + - If you are using more than one framebuffer device, you must boot kernel + with 'video=scrollback:0'. + - maybe more... + +And following misfeatures: + + - SVGALib does not restore screen on exit. + - pixclock for text modes is limited by hardware to + + - 83 MHz on G200 + - 66 MHz on Millennium I + - 60 MHz on Millennium II + + Because I have no access to other devices, I do not know specific + frequencies for them. So driver does not check this and allows you to + set frequency higher than this. It causes sparks, black holes and other + pretty effects on screen. Device was not destroyed during tests.
:-) + - my Millennium G200 oscillator has frequency range from 35 MHz to 380 MHz + (and it works with 8bpp on about 320 MHz dotclocks (and changed mclk)). + But Matrox says on product sheet that VCO limit is 50-250 MHz, so I believe + them (maybe that chip overheats, but it has a very big cooler (G100 has + none), so it should work). + - special mixed video/graphics videomodes of Mystique and Gx00 - 2G8V16 and + G16V16 are not supported + - color keying is not supported + - feature connector of Mystique and Gx00 is set to VGA mode (it is disabled + by BIOS) + - DDC (monitor detection) is supported through dualhead driver + - some checks for input values are not as strict as they should be (you can + specify vslen=4000 and so on). + - maybe more... + +And following features: + + - 4bpp is available only on Millennium I and Millennium II. It is hardware + limitation. + - selection between 1:5:5:5 and 5:6:5 16bpp videomode is done by -rgba + option of fbset: "fbset -depth 16 -rgba 5,5,5" selects 1:5:5:5, anything + else selects 5:6:5 mode. + - text mode uses 6 bit VGA palette instead of 8 bit (one of 262144 colors + instead of one of 16M colors). It is due to hardware limitation of + Millennium I/II and SVGALib compatibility. + + +Benchmarks +========== +It is time to redraw whole screen 1000 times in 1024x768, 60Hz. It is the +time to draw 6144000 characters on screen through /dev/vcsa +(for 32bpp it is about 3GB of data (exactly 3000 MB); for 8x16 font in +16 seconds, i.e. 187 MBps). +Times were obtained from one older version of driver, now they are about 3% +faster, it is kernel-space only time on P-II/350 MHz, Millennium I in 33 MHz +PCI slot, G200 in AGP 2x slot.
I did not test vgacon:: + + NOACCEL + 8x16 12x22 + Millennium I G200 Millennium I G200 + 8bpp 16.42 9.54 12.33 9.13 + 16bpp 21.00 15.70 19.11 15.02 + 24bpp 36.66 36.66 35.00 35.00 + 32bpp 35.00 30.00 33.85 28.66 + + ACCEL, nofastfont + 8x16 12x22 6x11 + Millennium I G200 Millennium I G200 Millennium I G200 + 8bpp 7.79 7.24 13.55 7.78 30.00 21.01 + 16bpp 9.13 7.78 16.16 7.78 30.00 21.01 + 24bpp 14.17 10.72 18.69 10.24 34.99 21.01 + 32bpp 16.15 16.16 18.73 13.09 34.99 21.01 + + ACCEL, fastfont + 8x16 12x22 6x11 + Millennium I G200 Millennium I G200 Millennium I G200 + 8bpp 8.41 6.01 6.54 4.37 16.00 10.51 + 16bpp 9.54 9.12 8.76 6.17 17.52 14.01 + 24bpp 15.00 12.36 11.67 10.00 22.01 18.32 + 32bpp 16.18 18.29* 12.71 12.74 24.44 21.00 + + TEXT + 8x16 + Millennium I G200 + TEXT 3.29 1.50 + + * Yes, it is slower than Millennium I. + + +Dualhead G400 +============= +Driver supports dualhead G400 with some limitations: + + secondary head shares videomemory with primary head. It is not problem + if you have 32MB of videoram, but if you have only 16MB, you may have + to think twice before choosing videomode (for example twice 1880x1440x32bpp + is not possible). + + due to hardware limitation, secondary head can use only 16 and 32bpp + videomodes. + + secondary head is not accelerated. There were bad problems with accelerated + XFree when secondary head used to use acceleration. + + secondary head always powerups in 640x480@60-32 videomode. You have to use + fbset to change this mode. + + secondary head always powerups in monitor mode. You have to use fbmatroxset + to change it to TV mode. Also, you must select at least 525 lines for + NTSC output and 625 lines for PAL output. + + kernel is not fully multihead ready. So some things are impossible to do. + + if you compiled it as module, you must insert i2c-matroxfb, matroxfb_maven + and matroxfb_crtc2 into kernel. 
+ + +Dualhead G450 +============= +Driver supports dualhead G450 with some limitations: + + secondary head shares videomemory with primary head. It is not problem + if you have 32MB of videoram, but if you have only 16MB, you may have + to think twice before choosing videomode. + + due to hardware limitation, secondary head can use only 16 and 32bpp + videomodes. + + secondary head is not accelerated. + + secondary head always powerups in 640x480@60-32 videomode. You have to use + fbset to change this mode. + + TV output is not supported + + kernel is not fully multihead ready, so some things are impossible to do. + + if you compiled it as module, you must insert matroxfb_g450 and matroxfb_crtc2 + into kernel. + +Petr Vandrovec <vandrove@vc.cvut.cz> diff --git a/Documentation/fb/matroxfb.txt b/Documentation/fb/matroxfb.txt deleted file mode 100644 index b95f5bb522f2..000000000000 --- a/Documentation/fb/matroxfb.txt +++ /dev/null @@ -1,413 +0,0 @@ -[This file is cloned from VesaFB. Thanks go to Gerd Knorr] - -What is matroxfb? -================= - -This is a driver for a graphic framebuffer for Matrox devices on -Alpha, Intel and PPC boxes. - -Advantages: - - * It provides a nice large console (128 cols + 48 lines with 1024x768) - without using tiny, unreadable fonts. - * You can run XF{68,86}_FBDev or XFree86 fbdev driver on top of /dev/fb0 - * Most important: boot logo :-) - -Disadvantages: - - * graphic mode is slower than text mode... but you should not notice - if you use same resolution as you used in textmode. - - -How to use it? -============== - -Switching modes is done using the video=matroxfb:vesa:... boot parameter -or using `fbset' program. - -If you want, for example, enable a resolution of 1280x1024x24bpp you should -pass to the kernel this command line: "video=matroxfb:vesa:0x1BB". - -You should compile in both vgacon (to boot if you remove you Matrox from -box) and matroxfb (for graphics mode). 
You should not compile-in vesafb -unless you have primary display on non-Matrox VBE2.0 device (see -Documentation/fb/vesafb.txt for details). - -Currently supported video modes are (through vesa:... interface, PowerMac -has [as addon] compatibility code): - - -[Graphic modes] - -bpp | 640x400 640x480 768x576 800x600 960x720 -----+-------------------------------------------- - 4 | 0x12 0x102 - 8 | 0x100 0x101 0x180 0x103 0x188 - 15 | 0x110 0x181 0x113 0x189 - 16 | 0x111 0x182 0x114 0x18A - 24 | 0x1B2 0x184 0x1B5 0x18C - 32 | 0x112 0x183 0x115 0x18B - - -[Graphic modes (continued)] - -bpp | 1024x768 1152x864 1280x1024 1408x1056 1600x1200 -----+------------------------------------------------ - 4 | 0x104 0x106 - 8 | 0x105 0x190 0x107 0x198 0x11C - 15 | 0x116 0x191 0x119 0x199 0x11D - 16 | 0x117 0x192 0x11A 0x19A 0x11E - 24 | 0x1B8 0x194 0x1BB 0x19C 0x1BF - 32 | 0x118 0x193 0x11B 0x19B - - -[Text modes] - -text | 640x400 640x480 1056x344 1056x400 1056x480 ------+------------------------------------------------ - 8x8 | 0x1C0 0x108 0x10A 0x10B 0x10C -8x16 | 2, 3, 7 0x109 - -You can enter these number either hexadecimal (leading `0x') or decimal -(0x100 = 256). You can also use value + 512 to achieve compatibility -with your old number passed to vesafb. - -Non-listed number can be achieved by more complicated command-line, for -example 1600x1200x32bpp can be specified by `video=matroxfb:vesa:0x11C,depth:32'. - - -X11 -=== - -XF{68,86}_FBDev should work just fine, but it is non-accelerated. On non-intel -architectures there are some glitches for 24bpp videomodes. 8, 16 and 32bpp -works fine. - -Running another (accelerated) X-Server like XF86_SVGA works too. But (at least) -XFree servers have big troubles in multihead configurations (even on first -head, not even talking about second). 
Running XFree86 4.x accelerated mga -driver is possible, but you must not enable DRI - if you do, resolution and -color depth of your X desktop must match resolution and color depths of your -virtual consoles, otherwise X will corrupt accelerator settings. - - -SVGALib -======= - -Driver contains SVGALib compatibility code. It is turned on by choosing textual -mode for console. You can do it at boot time by using videomode -2,3,7,0x108-0x10C or 0x1C0. At runtime, `fbset -depth 0' does this work. -Unfortunately, after SVGALib application exits, screen contents is corrupted. -Switching to another console and back fixes it. I hope that it is SVGALib's -problem and not mine, but I'm not sure. - - -Configuration -============= - -You can pass kernel command line options to matroxfb with -`video=matroxfb:option1,option2:value2,option3' (multiple options should be -separated by comma, values are separated from options by `:'). -Accepted options: - -mem:X - size of memory (X can be in megabytes, kilobytes or bytes) - You can only decrease value determined by driver because of - it always probe for memory. Default is to use whole detected - memory usable for on-screen display (i.e. max. 8 MB). -disabled - do not load driver; you can use also `off', but `disabled' - is here too. -enabled - load driver, if you have `video=matroxfb:disabled' in LILO - configuration, you can override it by this (you cannot override - `off'). It is default. -noaccel - do not use acceleration engine. It does not work on Alphas. -accel - use acceleration engine. It is default. -nopan - create initial consoles with vyres = yres, thus disabling virtual - scrolling. -pan - create initial consoles as tall as possible (vyres = memory/vxres). - It is default. -nopciretry - disable PCI retries. It is needed for some broken chipsets, - it is autodetected for intel's 82437. 
In this case device does - not comply to PCI 2.1 specs (it will not guarantee that every - transaction terminate with success or retry in 32 PCLK). -pciretry - enable PCI retries. It is default, except for intel's 82437. -novga - disables VGA I/O ports. It is default if BIOS did not enable device. - You should not use this option, some boards then do not restart - without power off. -vga - preserve state of VGA I/O ports. It is default. Driver does not - enable VGA I/O if BIOS did not it (it is not safe to enable it in - most cases). -nobios - disables BIOS ROM. It is default if BIOS did not enable BIOS itself. - You should not use this option, some boards then do not restart - without power off. -bios - preserve state of BIOS ROM. It is default. Driver does not enable - BIOS if BIOS was not enabled before. -noinit - tells driver, that devices were already initialized. You should use - it if you have G100 and/or if driver cannot detect memory, you see - strange pattern on screen and so on. Devices not enabled by BIOS - are still initialized. It is default. -init - driver initializes every device it knows about. -memtype - specifies memory type, implies 'init'. This is valid only for G200 - and G400 and has following meaning: - G200: 0 -> 2x128Kx32 chips, 2MB onboard, probably sgram - 1 -> 2x128Kx32 chips, 4MB onboard, probably sgram - 2 -> 2x256Kx32 chips, 4MB onboard, probably sgram - 3 -> 2x256Kx32 chips, 8MB onboard, probably sgram - 4 -> 2x512Kx16 chips, 8/16MB onboard, probably sdram only - 5 -> same as above - 6 -> 4x128Kx32 chips, 4MB onboard, probably sgram - 7 -> 4x128Kx32 chips, 8MB onboard, probably sgram - G400: 0 -> 2x512Kx16 SDRAM, 16/32MB - 2x512Kx32 SGRAM, 16/32MB - 1 -> 2x256Kx32 SGRAM, 8/16MB - 2 -> 4x128Kx32 SGRAM, 8/16MB - 3 -> 4x512Kx32 SDRAM, 32MB - 4 -> 4x256Kx32 SGRAM, 16/32MB - 5 -> 2x1Mx32 SDRAM, 32MB - 6 -> reserved - 7 -> reserved - You should use sdram or sgram parameter in addition to memtype - parameter. 
-nomtrr - disables write combining on frame buffer. This slows down driver but - there is reported minor incompatibility between GUS DMA and XFree - under high loads if write combining is enabled (sound dropouts). -mtrr - enables write combining on frame buffer. It speeds up video accesses - much. It is default. You must have MTRR support enabled in kernel - and your CPU must have MTRR (f.e. Pentium II have them). -sgram - tells to driver that you have Gxx0 with SGRAM memory. It has no - effect without `init'. -sdram - tells to driver that you have Gxx0 with SDRAM memory. - It is a default. -inv24 - change timings parameters for 24bpp modes on Millennium and - Millennium II. Specify this if you see strange color shadows around - characters. -noinv24 - use standard timings. It is the default. -inverse - invert colors on screen (for LCD displays) -noinverse - show true colors on screen. It is default. -dev:X - bind driver to device X. Driver numbers device from 0 up to N, - where device 0 is first `known' device found, 1 second and so on. - lspci lists devices in this order. - Default is `every' known device. -nohwcursor - disables hardware cursor (use software cursor instead). -hwcursor - enables hardware cursor. It is default. If you are using - non-accelerated mode (`noaccel' or `fbset -accel false'), software - cursor is used (except for text mode). -noblink - disables cursor blinking. Cursor in text mode always blinks (hw - limitation). -blink - enables cursor blinking. It is default. -nofastfont - disables fastfont feature. It is default. -fastfont:X - enables fastfont feature. X specifies size of memory reserved for - font data, it must be >= (fontwidth*fontheight*chars_in_font)/8. - It is faster on Gx00 series, but slower on older cards. -grayscale - enable grayscale summing. It works in PSEUDOCOLOR modes (text, - 4bpp, 8bpp). In DIRECTCOLOR modes it is limited to characters - displayed through putc/putcs. Direct accesses to framebuffer - can paint colors. 
-nograyscale - disable grayscale summing. It is default. -cross4MB - enables that pixel line can cross 4MB boundary. It is default for - non-Millennium. -nocross4MB - pixel line must not cross 4MB boundary. It is default for - Millennium I or II, because of these devices have hardware - limitations which do not allow this. But this option is - incompatible with some (if not all yet released) versions of - XF86_FBDev. -dfp - enables digital flat panel interface. This option is incompatible with - secondary (TV) output - if DFP is active, TV output must be - inactive and vice versa. DFP always uses same timing as primary - (monitor) output. -dfp:X - use settings X for digital flat panel interface. X is number from - 0 to 0xFF, and meaning of each individual bit is described in - G400 manual, in description of DAC register 0x1F. For normal operation - you should set all bits to zero, except lowest bit. This lowest bit - selects who is source of display clocks, whether G400, or panel. - Default value is now read back from hardware - so you should specify - this value only if you are also using `init' parameter. -outputs:XYZ - set mapping between CRTC and outputs. Each letter can have value - of 0 (for no CRTC), 1 (CRTC1) or 2 (CRTC2), and first letter corresponds - to primary analog output, second letter to the secondary analog output - and third letter to the DVI output. Default setting is 100 for - cards below G400 or G400 without DFP, 101 for G400 with DFP, and - 111 for G450 and G550. You can set mapping only on first card, - use matroxset for setting up other devices. -vesa:X - selects startup videomode. X is number from 0 to 0x1FF, see table - above for detailed explanation. Default is 640x480x8bpp if driver - has 8bpp support. Otherwise first available of 640x350x4bpp, - 640x480x15bpp, 640x480x24bpp, 640x480x32bpp or 80x25 text - (80x25 text is always available). 
- -If you are not satisfied with videomode selected by `vesa' option, you -can modify it with these options: - -xres:X - horizontal resolution, in pixels. Default is derived from `vesa' - option. -yres:X - vertical resolution, in pixel lines. Default is derived from `vesa' - option. -upper:X - top boundary: lines between end of VSYNC pulse and start of first - pixel line of picture. Default is derived from `vesa' option. -lower:X - bottom boundary: lines between end of picture and start of VSYNC - pulse. Default is derived from `vesa' option. -vslen:X - length of VSYNC pulse, in lines. Default is derived from `vesa' - option. -left:X - left boundary: pixels between end of HSYNC pulse and first pixel. - Default is derived from `vesa' option. -right:X - right boundary: pixels between end of picture and start of HSYNC - pulse. Default is derived from `vesa' option. -hslen:X - length of HSYNC pulse, in pixels. Default is derived from `vesa' - option. -pixclock:X - dotclocks, in ps (picoseconds). Default is derived from `vesa' - option and from `fh' and `fv' options. -sync:X - sync. pulse - bit 0 inverts HSYNC polarity, bit 1 VSYNC polarity. - If bit 3 (value 0x08) is set, composite sync instead of HSYNC is - generated. If bit 5 (value 0x20) is set, sync on green is turned on. - Do not forget that if you want sync on green, you also probably - want composite sync. - Default depends on `vesa'. -depth:X - Bits per pixel: 0=text, 4,8,15,16,24 or 32. Default depends on - `vesa'. - -If you know capabilities of your monitor, you can specify some (or all) of -`maxclk', `fh' and `fv'. In this case, `pixclock' is computed so that -pixclock <= maxclk, real_fh <= fh and real_fv <= fv. - -maxclk:X - maximum dotclock. X can be specified in MHz, kHz or Hz. Default is - `don't care'. -fh:X - maximum horizontal synchronization frequency. X can be specified - in kHz or Hz. Default is `don't care'. -fv:X - maximum vertical frequency. X must be specified in Hz. 
Default is - 70 for modes derived from `vesa' with yres <= 400, 60Hz for - yres > 400. - - -Limitations -=========== - -There are known and unknown bugs, features and misfeatures. -Currently there are following known bugs: - + SVGALib does not restore screen on exit - + generic fbcon-cfbX procedures do not work on Alphas. Due to this, - `noaccel' (and cfb4 accel) driver does not work on Alpha. So everyone - with access to /dev/fb* on Alpha can hang machine (you should restrict - access to /dev/fb* - everyone with access to this device can destroy - your monitor, believe me...). - + 24bpp does not support correctly XF-FBDev on big-endian architectures. - + interlaced text mode is not supported; it looks like hardware limitation, - but I'm not sure. - + Gxx0 SGRAM/SDRAM is not autodetected. - + If you are using more than one framebuffer device, you must boot kernel - with 'video=scrollback:0'. - + maybe more... -And following misfeatures: - + SVGALib does not restore screen on exit. - + pixclock for text modes is limited by hardware to - 83 MHz on G200 - 66 MHz on Millennium I - 60 MHz on Millennium II - Because I have no access to other devices, I do not know specific - frequencies for them. So driver does not check this and allows you to - set frequency higher that this. It causes sparks, black holes and other - pretty effects on screen. Device was not destroyed during tests. :-) - + my Millennium G200 oscillator has frequency range from 35 MHz to 380 MHz - (and it works with 8bpp on about 320 MHz dotclocks (and changed mclk)). - But Matrox says on product sheet that VCO limit is 50-250 MHz, so I believe - them (maybe that chip overheats, but it has a very big cooler (G100 has - none), so it should work). 
- + special mixed video/graphics videomodes of Mystique and Gx00 - 2G8V16 and - G16V16 are not supported - + color keying is not supported - + feature connector of Mystique and Gx00 is set to VGA mode (it is disabled - by BIOS) - + DDC (monitor detection) is supported through dualhead driver - + some check for input values are not so strict how it should be (you can - specify vslen=4000 and so on). - + maybe more... -And following features: - + 4bpp is available only on Millennium I and Millennium II. It is hardware - limitation. - + selection between 1:5:5:5 and 5:6:5 16bpp videomode is done by -rgba - option of fbset: "fbset -depth 16 -rgba 5,5,5" selects 1:5:5:5, anything - else selects 5:6:5 mode. - + text mode uses 6 bit VGA palette instead of 8 bit (one of 262144 colors - instead of one of 16M colors). It is due to hardware limitation of - Millennium I/II and SVGALib compatibility. - - -Benchmarks -========== -It is time to redraw whole screen 1000 times in 1024x768, 60Hz. It is -time for draw 6144000 characters on screen through /dev/vcsa -(for 32bpp it is about 3GB of data (exactly 3000 MB); for 8x16 font in -16 seconds, i.e. 187 MBps). -Times were obtained from one older version of driver, now they are about 3% -faster, it is kernel-space only time on P-II/350 MHz, Millennium I in 33 MHz -PCI slot, G200 in AGP 2x slot. I did not test vgacon. 
- -NOACCEL - 8x16 12x22 - Millennium I G200 Millennium I G200 -8bpp 16.42 9.54 12.33 9.13 -16bpp 21.00 15.70 19.11 15.02 -24bpp 36.66 36.66 35.00 35.00 -32bpp 35.00 30.00 33.85 28.66 - -ACCEL, nofastfont - 8x16 12x22 6x11 - Millennium I G200 Millennium I G200 Millennium I G200 -8bpp 7.79 7.24 13.55 7.78 30.00 21.01 -16bpp 9.13 7.78 16.16 7.78 30.00 21.01 -24bpp 14.17 10.72 18.69 10.24 34.99 21.01 -32bpp 16.15 16.16 18.73 13.09 34.99 21.01 - -ACCEL, fastfont - 8x16 12x22 6x11 - Millennium I G200 Millennium I G200 Millennium I G200 -8bpp 8.41 6.01 6.54 4.37 16.00 10.51 -16bpp 9.54 9.12 8.76 6.17 17.52 14.01 -24bpp 15.00 12.36 11.67 10.00 22.01 18.32 -32bpp 16.18 18.29* 12.71 12.74 24.44 21.00 - -TEXT - 8x16 - Millennium I G200 -TEXT 3.29 1.50 - -* Yes, it is slower than Millennium I. - - -Dualhead G400 -============= -Driver supports dualhead G400 with some limitations: - + secondary head shares videomemory with primary head. It is not problem - if you have 32MB of videoram, but if you have only 16MB, you may have - to think twice before choosing videomode (for example twice 1880x1440x32bpp - is not possible). - + due to hardware limitation, secondary head can use only 16 and 32bpp - videomodes. - + secondary head is not accelerated. There were bad problems with accelerated - XFree when secondary head used to use acceleration. - + secondary head always powerups in 640x480@60-32 videomode. You have to use - fbset to change this mode. - + secondary head always powerups in monitor mode. You have to use fbmatroxset - to change it to TV mode. Also, you must select at least 525 lines for - NTSC output and 625 lines for PAL output. - + kernel is not fully multihead ready. So some things are impossible to do. - + if you compiled it as module, you must insert i2c-matroxfb, matroxfb_maven - and matroxfb_crtc2 into kernel. - - -Dualhead G450 -============= -Driver supports dualhead G450 with some limitations: - + secondary head shares videomemory with primary head. 
It is not problem - if you have 32MB of videoram, but if you have only 16MB, you may have - to think twice before choosing videomode. - + due to hardware limitation, secondary head can use only 16 and 32bpp - videomodes. - + secondary head is not accelerated. - + secondary head always powerups in 640x480@60-32 videomode. You have to use - fbset to change this mode. - + TV output is not supported - + kernel is not fully multihead ready, so some things are impossible to do. - + if you compiled it as module, you must insert matroxfb_g450 and matroxfb_crtc2 - into kernel. - --- -Petr Vandrovec <vandrove@vc.cvut.cz> diff --git a/Documentation/fb/metronomefb.txt b/Documentation/fb/metronomefb.rst index 237ca412582d..63e1d31a7e54 100644 --- a/Documentation/fb/metronomefb.txt +++ b/Documentation/fb/metronomefb.rst @@ -1,6 +1,9 @@ - Metronomefb - ----------- +=========== +Metronomefb +=========== + Maintained by Jaya Kumar <jayakumar.lkml.gmail.com> + Last revised: Mar 10, 2008 Metronomefb is a driver for the Metronome display controller. The controller @@ -33,4 +36,3 @@ the physical media. Metronomefb uses the deferred IO interface so that it can provide a memory mappable frame buffer. It has been tested with tinyx (Xfbdev). It is known to work at this time with xeyes, xclock, xloadimage, xpdf. - diff --git a/Documentation/fb/modedb.txt b/Documentation/fb/modedb.rst index 16aa08453911..3c2397293977 100644 --- a/Documentation/fb/modedb.txt +++ b/Documentation/fb/modedb.rst @@ -1,6 +1,6 @@ - - - modedb default video mode support +================================= +modedb default video mode support +================================= Currently all frame buffer device drivers have their own video mode databases, @@ -18,7 +18,7 @@ When a frame buffer device receives a video= option it doesn't know, it should consider that to be a video mode option. If no frame buffer device is specified in a video= option, fbmem considers that to be a global video mode option. 
-Valid mode specifiers (mode_option argument): +Valid mode specifiers (mode_option argument):: <xres>x<yres>[M][R][-<bpp>][@<refresh>][i][m][eDd] <name>[-<bpp>][@<refresh>] @@ -45,15 +45,18 @@ signals (e.g. HDMI and DVI-I). For other outputs it behaves like 'e'. If 'd' is specified the output is disabled. You can additionally specify which output the options matches to. -To force the VGA output to be enabled and drive a specific mode say: +To force the VGA output to be enabled and drive a specific mode say:: + video=VGA-1:1280x1024@60me -Specifying the option multiple times for different ports is possible, e.g.: +Specifying the option multiple times for different ports is possible, e.g.:: + video=LVDS-1:d video=HDMI-1:D -***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** +----------------------------------------------------------------------------- What is the VESA(TM) Coordinated Video Timings (CVT)? +===================================================== From the VESA(TM) Website: @@ -90,14 +93,14 @@ determined from its EDID. The version 1.3 of the EDID has extra 128-byte blocks where additional timing information is placed. As of this time, there is no support yet in the layer to parse this additional blocks.) -CVT also introduced a new naming convention (should be seen from dmesg output): +CVT also introduced a new naming convention (should be seen from dmesg output):: <pix>M<a>[-R] where: pix = total amount of pixels in MB (xres x yres) - M = always present - a = aspect ratio (3 - 4:3; 4 - 5:4; 9 - 15:9, 16:9; A - 16:10) - -R = reduced blanking + M = always present + a = aspect ratio (3 - 4:3; 4 - 5:4; 9 - 15:9, 16:9; A - 16:10) + -R = reduced blanking example: .48M3-R - 800x600 with reduced blanking @@ -110,15 +113,15 @@ Note: VESA(TM) has restrictions on what is a standard CVT timing: If one of the above are not satisfied, the kernel will print a warning but the timings will still be calculated. 
-***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** oOo ***** +----------------------------------------------------------------------------- -To find a suitable video mode, you just call +To find a suitable video mode, you just call:: -int __init fb_find_mode(struct fb_var_screeninfo *var, - struct fb_info *info, const char *mode_option, - const struct fb_videomode *db, unsigned int dbsize, - const struct fb_videomode *default_mode, - unsigned int default_bpp) + int __init fb_find_mode(struct fb_var_screeninfo *var, + struct fb_info *info, const char *mode_option, + const struct fb_videomode *db, unsigned int dbsize, + const struct fb_videomode *default_mode, + unsigned int default_bpp) with db/dbsize your non-standard video mode database, or NULL to use the standard video mode database. @@ -127,12 +130,13 @@ fb_find_mode() first tries the specified video mode (or any mode that matches, e.g. there can be multiple 640x480 modes, each of them is tried). If that fails, the default mode is tried. If that fails, it walks over all modes. -To specify a video mode at bootup, use the following boot options: +To specify a video mode at bootup, use the following boot options:: + video=<driver>:<xres>x<yres>[-<bpp>][@refresh] where <driver> is a name from the table below. Valid default modes can be found in linux/drivers/video/modedb.c. Check your driver's documentation. -There may be more modes. +There may be more modes:: Drivers that support modedb boot options Boot Name Cards Supported diff --git a/Documentation/fb/pvr2fb.rst b/Documentation/fb/pvr2fb.rst new file mode 100644 index 000000000000..fcf2c21c8fcf --- /dev/null +++ b/Documentation/fb/pvr2fb.rst @@ -0,0 +1,66 @@ +=============== +What is pvr2fb? +=============== + +This is a driver for PowerVR 2 based graphics frame buffers, such as the +one found in the Dreamcast. 
+ +Advantages: + + * It provides a nice large console (128 cols + 48 lines with 1024x768) + without using tiny, unreadable fonts (NOT on the Dreamcast) + * You can run XF86_FBDev on top of /dev/fb0 + * Most important: boot logo :-) + +Disadvantages: + + * Driver is largely untested on non-Dreamcast systems. + +Configuration +============= + +You can pass kernel command line options to pvr2fb with +`video=pvr2fb:option1,option2:value2,option3` (multiple options should be +separated by comma, values are separated from options by `:`). + +Accepted options: + +========== ================================================================== +font:X default font to use. All fonts are supported, including the + SUN12x22 font which is very nice at high resolutions. + + +mode:X default video mode with format [xres]x[yres]-<bpp>@<refresh rate> + The following video modes are supported: + 640x640-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast + defaults to 640x480-16@60. At the time of writing the + 24bpp and 32bpp modes function poorly. Work to fix that is + ongoing + + Note: the 640x240 mode is currently broken, and should not be + used for any reason. It is only mentioned here as a reference. + +inverse invert colors on screen (for LCD displays) + +nomtrr disables write combining on frame buffer. This slows down driver + but there is reported minor incompatibility between GUS DMA and + XFree under high loads if write combining is enabled (sound + dropouts). MTRR is enabled by default on systems that have it + configured and that support it. + +cable:X cable type. This can be any of the following: vga, rgb, and + composite. If none is specified, we guess. + +output:X output type. This can be any of the following: pal, ntsc, and + vga. If none is specified, we guess. +========== ================================================================== + +X11 +=== + +XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet +on any 2.6 series kernel. 
+ +Paul Mundt <lethal@linuxdc.org> + +Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk> diff --git a/Documentation/fb/pvr2fb.txt b/Documentation/fb/pvr2fb.txt deleted file mode 100644 index 36bdeff585e2..000000000000 --- a/Documentation/fb/pvr2fb.txt +++ /dev/null @@ -1,65 +0,0 @@ -$Id: pvr2fb.txt,v 1.1 2001/05/24 05:09:16 mrbrown Exp $ - -What is pvr2fb? -=============== - -This is a driver for PowerVR 2 based graphics frame buffers, such as the -one found in the Dreamcast. - -Advantages: - - * It provides a nice large console (128 cols + 48 lines with 1024x768) - without using tiny, unreadable fonts (NOT on the Dreamcast) - * You can run XF86_FBDev on top of /dev/fb0 - * Most important: boot logo :-) - -Disadvantages: - - * Driver is largely untested on non-Dreamcast systems. - -Configuration -============= - -You can pass kernel command line options to pvr2fb with -`video=pvr2fb:option1,option2:value2,option3' (multiple options should be -separated by comma, values are separated from options by `:'). -Accepted options: - -font:X - default font to use. All fonts are supported, including the - SUN12x22 font which is very nice at high resolutions. - - -mode:X - default video mode with format [xres]x[yres]-<bpp>@<refresh rate> - The following video modes are supported: - 640x640-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast - defaults to 640x480-16@60. At the time of writing the - 24bpp and 32bpp modes function poorly. Work to fix that is - ongoing - - Note: the 640x240 mode is currently broken, and should not be - used for any reason. It is only mentioned here as a reference. - -inverse - invert colors on screen (for LCD displays) - -nomtrr - disables write combining on frame buffer. This slows down driver - but there is reported minor incompatibility between GUS DMA and - XFree under high loads if write combining is enabled (sound - dropouts). MTRR is enabled by default on systems that have it - configured and that support it. - -cable:X - cable type. 
This can be any of the following: vga, rgb, and - composite. If none is specified, we guess. - -output:X - output type. This can be any of the following: pal, ntsc, and - vga. If none is specified, we guess. - -X11 -=== - -XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet -on any 2.6 series kernel. - --- -Paul Mundt <lethal@linuxdc.org> -Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk> - diff --git a/Documentation/fb/pxafb.txt b/Documentation/fb/pxafb.rst index d143a0a749f9..90177f5e7e76 100644 --- a/Documentation/fb/pxafb.txt +++ b/Documentation/fb/pxafb.rst @@ -1,59 +1,82 @@ +================================ Driver for PXA25x LCD controller ================================ The driver supports the following options, either via options=<OPTIONS> when modular or video=pxafb:<OPTIONS> when built in. -For example: +For example:: + modprobe pxafb options=vmem:2M,mode:640x480-8,passive -or on the kernel command line + +or on the kernel command line:: + video=pxafb:vmem:2M,mode:640x480-8,passive vmem: VIDEO_MEM_SIZE + Amount of video memory to allocate (can be suffixed with K or M for kilobytes or megabytes) mode:XRESxYRES[-BPP] + XRES == LCCR1_PPL + 1 + YRES == LLCR2_LPP + 1 + The resolution of the display in pixels + BPP == The bit depth. Valid values are 1, 2, 4, 8 and 16. pixclock:PIXCLOCK + Pixel clock in picoseconds left:LEFT == LCCR1_BLW + 1 + right:RIGHT == LCCR1_ELW + 1 + hsynclen:HSYNC == LCCR1_HSW + 1 + upper:UPPER == LCCR2_BFW + lower:LOWER == LCCR2_EFR + vsynclen:VSYNC == LCCR2_VSW + 1 + Display margins and sync times color | mono => LCCR0_CMS + umm... active | passive => LCCR0_PAS + Active (TFT) or Passive (STN) display single | dual => LCCR0_SDS + Single or dual panel passive display 4pix | 8pix => LCCR0_DPD + 4 or 8 pixel monochrome single panel data -hsync:HSYNC -vsync:VSYNC +hsync:HSYNC, vsync:VSYNC + Horizontal and vertical sync. 0 => active low, 1 => active high. dpc:DPC + Double pixel clock. 
1=>true, 0=>false outputen:POLARITY + Output Enable Polarity. 0 => active low, 1 => active high pixclockpol:POLARITY + pixel clock polarity 0 => falling edge, 1 => rising edge @@ -76,44 +99,50 @@ Overlay Support for PXA27x and later LCD controllers not for such purpose). 2. overlay framebuffer is allocated dynamically according to specified - 'struct fb_var_screeninfo', the amount is decided by: + 'struct fb_var_screeninfo', the amount is decided by:: - var->xres_virtual * var->yres_virtual * bpp + var->xres_virtual * var->yres_virtual * bpp bpp = 16 -- for RGB565 or RGBT555 - = 24 -- for YUV444 packed - = 24 -- for YUV444 planar - = 16 -- for YUV422 planar (1 pixel = 1 Y + 1/2 Cb + 1/2 Cr) - = 12 -- for YUV420 planar (1 pixel = 1 Y + 1/4 Cb + 1/4 Cr) + + bpp = 24 -- for YUV444 packed + + bpp = 24 -- for YUV444 planar + + bpp = 16 -- for YUV422 planar (1 pixel = 1 Y + 1/2 Cb + 1/2 Cr) + + bpp = 12 -- for YUV420 planar (1 pixel = 1 Y + 1/4 Cb + 1/4 Cr) NOTE: a. overlay does not support panning in x-direction, thus - var->xres_virtual will always be equal to var->xres + var->xres_virtual will always be equal to var->xres b. line length of overlay(s) must be on a 32-bit word boundary, - for YUV planar modes, it is a requirement for the component + for YUV planar modes, it is a requirement for the component with minimum bits per pixel, e.g. for YUV420, Cr component for one pixel is actually 2-bits, it means the line length should be a multiple of 16-pixels c. starting horizontal position (XPOS) should start on a 32-bit - word boundary, otherwise the fb_check_var() will just fail. + word boundary, otherwise the fb_check_var() will just fail. d. the rectangle of the overlay should be within the base plane, - otherwise fail + otherwise fail Applications should follow the sequence below to operate an overlay framebuffer: - a. open("/dev/fb[1-2]", ...) + a. open("/dev/fb[1-2]", ...) b. ioctl(fd, FBIOGET_VSCREENINFO, ...) c. 
modify 'var' with desired parameters: + 1) var->xres and var->yres 2) larger var->yres_virtual if more memory is required, usually for double-buffering 3) var->nonstd for starting (x, y) and color format 4) var->{red, green, blue, transp} if RGB mode is to be used + d. ioctl(fd, FBIOPUT_VSCREENINFO, ...) e. ioctl(fd, FBIOGET_FSCREENINFO, ...) f. mmap @@ -124,19 +153,21 @@ Overlay Support for PXA27x and later LCD controllers and lengths of each component within the framebuffer. 4. var->nonstd is used to pass starting (x, y) position and color format, - the detailed bit fields are shown below: + the detailed bit fields are shown below:: - 31 23 20 10 0 - +-----------------+---+----------+----------+ - | ... unused ... |FOR| XPOS | YPOS | - +-----------------+---+----------+----------+ + 31 23 20 10 0 + +-----------------+---+----------+----------+ + | ... unused ... |FOR| XPOS | YPOS | + +-----------------+---+----------+----------+ FOR - color format, as defined by OVERLAY_FORMAT_* in pxafb.h - 0 - RGB - 1 - YUV444 PACKED - 2 - YUV444 PLANAR - 3 - YUV422 PLANAR - 4 - YUR420 PLANAR + + - 0 - RGB + - 1 - YUV444 PACKED + - 2 - YUV444 PLANAR + - 3 - YUV422 PLANAR + - 4 - YUV420 PLANAR XPOS - starting horizontal position + YPOS - starting vertical position diff --git a/Documentation/fb/s3fb.txt b/Documentation/fb/s3fb.rst index 2c97770bdbaa..e809d69c21a7 100644 --- a/Documentation/fb/s3fb.txt +++ b/Documentation/fb/s3fb.rst @@ -1,6 +1,6 @@ - - s3fb - fbdev driver for S3 Trio/Virge chips - =========================================== +=========================================== +s3fb - fbdev driver for S3 Trio/Virge chips +=========================================== Supported Hardware @@ -56,7 +56,7 @@ Missing Features (alias TODO list) * secondary (not initialized by BIOS) device support - * big endian support + * big endian support * Zorro bus support * MMIO support * 24 bpp mode support on more cards diff --git a/Documentation/fb/sa1100fb.txt
b/Documentation/fb/sa1100fb.rst index f1b4220464df..67e2650e017d 100644 --- a/Documentation/fb/sa1100fb.txt +++ b/Documentation/fb/sa1100fb.rst @@ -1,17 +1,19 @@ -[This file is cloned from VesaFB/matroxfb] - +================= What is sa1100fb? ================= +.. [This file is cloned from VesaFB/matroxfb] + + This is a driver for a graphic framebuffer for the SA-1100 LCD controller. Configuration ============== -For most common passive displays, giving the option +For most common passive displays, giving the option:: -video=sa1100fb:bpp:<value>,lccr0:<value>,lccr1:<value>,lccr2:<value>,lccr3:<value> + video=sa1100fb:bpp:<value>,lccr0:<value>,lccr1:<value>,lccr2:<value>,lccr3:<value> on the kernel command line should be enough to configure the controller. The bits per pixel (bpp) value should be 4, 8, 12, or @@ -27,13 +29,12 @@ sa1100fb_init_fbinfo(), sa1100fb_activate_var(), sa1100fb_disable_lcd_controller(), and sa1100fb_enable_lcd_controller() will probably be necessary. -Accepted options: +Accepted options:: -bpp:<value> Configure for <value> bits per pixel -lccr0:<value> Configure LCD control register 0 (11.7.3) -lccr1:<value> Configure LCD control register 1 (11.7.4) -lccr2:<value> Configure LCD control register 2 (11.7.5) -lccr3:<value> Configure LCD control register 3 (11.7.6) + bpp:<value> Configure for <value> bits per pixel + lccr0:<value> Configure LCD control register 0 (11.7.3) + lccr1:<value> Configure LCD control register 1 (11.7.4) + lccr2:<value> Configure LCD control register 2 (11.7.5) + lccr3:<value> Configure LCD control register 3 (11.7.6) --- Mark Huang <mhuang@livetoy.com> diff --git a/Documentation/fb/sh7760fb.rst b/Documentation/fb/sh7760fb.rst new file mode 100644 index 000000000000..c3266485f810 --- /dev/null +++ b/Documentation/fb/sh7760fb.rst @@ -0,0 +1,130 @@ +================================================ +SH7760/SH7763 integrated LCDC Framebuffer driver +================================================ + +0. 
Overview +----------- +The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which +supports (in theory) resolutions ranging from 1x1 to 1024x1024, +with color depths ranging from 1 to 16 bits, on STN, DSTN and TFT Panels. + +Caveats: + +* Framebuffer memory must be a large chunk allocated at the top + of Area3 (HW requirement). Because of this requirement you should NOT + make the driver a module since at runtime it may become impossible to + get a large enough contiguous chunk of memory. + +* The driver does not support changing resolution while loaded + (displays aren't hotpluggable anyway) + +* Heavy flickering may be observed + a) if you're using 15/16bit color modes at >= 640x480 px resolutions, + b) during PCMCIA (or any other slow bus) activity. + +* Rotation works only 90 degrees clockwise, and only if horizontal + resolution is <= 320 pixels. + +Files: + - drivers/video/sh7760fb.c + - include/asm-sh/sh7760fb.h + - Documentation/fb/sh7760fb.rst + +1. Platform setup +----------------- +SH7760: + Video data is fetched via the DMABRG DMA engine, so you have to + configure the SH DMAC for DMABRG mode (write 0x94808080 to the + DMARSRA register somewhere at boot). + + PFC registers PCCR and PCDR must be set to peripheral mode. + (write zeros to both). + +The driver does NOT do the above for you since board setup is, well, job +of the board setup code. + +2. Panel definitions +-------------------- +The LCDC must explicitly be told about the type of LCD panel +attached. Data must be wrapped in a "struct sh7760fb_platdata" and +passed to the driver as platform_data. + +Suggest you take a closer look at the SH7760 Manual, Section 30.
+(http://documentation.renesas.com/eng/products/mpumcu/e602291_sh7760.pdf) + +The following code illustrates what needs to be done to +get the framebuffer working on a 640x480 TFT:: + + #include <linux/fb.h> + #include <asm/sh7760fb.h> + + /* + * NEC NL6440bc26-01 640x480 TFT + * dotclock 25175 kHz + * Xres 640 Yres 480 + * Htotal 800 Vtotal 525 + * HsynStart 656 VsynStart 490 + * HsynLenn 30 VsynLenn 2 + * + * The linux framebuffer layer does not use the syncstart/synclen + * values but right/left/upper/lower margin values. The comments + * for the x_margin explain how to calculate those from given + * panel sync timings. + */ + static struct fb_videomode nl6448bc26 = { + .name = "NL6448BC26", + .refresh = 60, + .xres = 640, + .yres = 480, + .pixclock = 39683, /* in picoseconds! */ + .hsync_len = 30, + .vsync_len = 2, + .left_margin = 114, /* HTOT - (HSYNSLEN + HSYNSTART) */ + .right_margin = 16, /* HSYNSTART - XRES */ + .upper_margin = 33, /* VTOT - (VSYNLEN + VSYNSTART) */ + .lower_margin = 10, /* VSYNSTART - YRES */ + .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, + .vmode = FB_VMODE_NONINTERLACED, + .flag = 0, + }; + + static struct sh7760fb_platdata sh7760fb_nl6448 = { + .def_mode = &nl6448bc26, + .ldmtr = LDMTR_TFT_COLOR_16, /* 16bit TFT panel */ + .lddfr = LDDFR_8BPP, /* we want 8bit output */ + .ldpmmr = 0x0070, + .ldpspr = 0x0500, + .ldaclnr = 0, + .ldickr = LDICKR_CLKSRC(LCDC_CLKSRC_EXTERNAL) | + LDICKR_CLKDIV(1), + .rotate = 0, + .novsync = 1, + .blank = NULL, + }; + + /* SH7760: + * 0xFE300800: 256 * 4byte xRGB palette ram + * 0xFE300C00: 42 bytes ctrl registers + */ + static struct resource sh7760_lcdc_res[] = { + [0] = { + .start = 0xFE300800, + .end = 0xFE300CFF, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 65, + .end = 65, + .flags = IORESOURCE_IRQ, + }, + }; + + static struct platform_device sh7760_lcdc_dev = { + .dev = { + .platform_data = &sh7760fb_nl6448, + }, + .name = "sh7760-lcdc", + .id = -1, + .resource = sh7760_lcdc_res, + 
.num_resources = ARRAY_SIZE(sh7760_lcdc_res), + }; diff --git a/Documentation/fb/sh7760fb.txt b/Documentation/fb/sh7760fb.txt deleted file mode 100644 index b994c3b10549..000000000000 --- a/Documentation/fb/sh7760fb.txt +++ /dev/null @@ -1,131 +0,0 @@ -SH7760/SH7763 integrated LCDC Framebuffer driver -================================================ - -0. Overview ------------ -The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which -supports (in theory) resolutions ranging from 1x1 to 1024x1024, -with color depths ranging from 1 to 16 bits, on STN, DSTN and TFT Panels. - -Caveats: -* Framebuffer memory must be a large chunk allocated at the top - of Area3 (HW requirement). Because of this requirement you should NOT - make the driver a module since at runtime it may become impossible to - get a large enough contiguous chunk of memory. - -* The driver does not support changing resolution while loaded - (displays aren't hotpluggable anyway) - -* Heavy flickering may be observed - a) if you're using 15/16bit color modes at >= 640x480 px resolutions, - b) during PCMCIA (or any other slow bus) activity. - -* Rotation works only 90degress clockwise, and only if horizontal - resolution is <= 320 pixels. - -files: drivers/video/sh7760fb.c - include/asm-sh/sh7760fb.h - Documentation/fb/sh7760fb.txt - -1. Platform setup ------------------ -SH7760: - Video data is fetched via the DMABRG DMA engine, so you have to - configure the SH DMAC for DMABRG mode (write 0x94808080 to the - DMARSRA register somewhere at boot). - - PFC registers PCCR and PCDR must be set to peripheral mode. - (write zeros to both). - -The driver does NOT do the above for you since board setup is, well, job -of the board setup code. - -2. Panel definitions --------------------- -The LCDC must explicitly be told about the type of LCD panel -attached. Data must be wrapped in a "struct sh7760fb_platdata" and -passed to the driver as platform_data. 
- -Suggest you take a closer look at the SH7760 Manual, Section 30. -(http://documentation.renesas.com/eng/products/mpumcu/e602291_sh7760.pdf) - -The following code illustrates what needs to be done to -get the framebuffer working on a 640x480 TFT: - -====================== cut here ====================================== - -#include <linux/fb.h> -#include <asm/sh7760fb.h> - -/* - * NEC NL6440bc26-01 640x480 TFT - * dotclock 25175 kHz - * Xres 640 Yres 480 - * Htotal 800 Vtotal 525 - * HsynStart 656 VsynStart 490 - * HsynLenn 30 VsynLenn 2 - * - * The linux framebuffer layer does not use the syncstart/synclen - * values but right/left/upper/lower margin values. The comments - * for the x_margin explain how to calculate those from given - * panel sync timings. - */ -static struct fb_videomode nl6448bc26 = { - .name = "NL6448BC26", - .refresh = 60, - .xres = 640, - .yres = 480, - .pixclock = 39683, /* in picoseconds! */ - .hsync_len = 30, - .vsync_len = 2, - .left_margin = 114, /* HTOT - (HSYNSLEN + HSYNSTART) */ - .right_margin = 16, /* HSYNSTART - XRES */ - .upper_margin = 33, /* VTOT - (VSYNLEN + VSYNSTART) */ - .lower_margin = 10, /* VSYNSTART - YRES */ - .sync = FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, - .vmode = FB_VMODE_NONINTERLACED, - .flag = 0, -}; - -static struct sh7760fb_platdata sh7760fb_nl6448 = { - .def_mode = &nl6448bc26, - .ldmtr = LDMTR_TFT_COLOR_16, /* 16bit TFT panel */ - .lddfr = LDDFR_8BPP, /* we want 8bit output */ - .ldpmmr = 0x0070, - .ldpspr = 0x0500, - .ldaclnr = 0, - .ldickr = LDICKR_CLKSRC(LCDC_CLKSRC_EXTERNAL) | - LDICKR_CLKDIV(1), - .rotate = 0, - .novsync = 1, - .blank = NULL, -}; - -/* SH7760: - * 0xFE300800: 256 * 4byte xRGB palette ram - * 0xFE300C00: 42 bytes ctrl registers - */ -static struct resource sh7760_lcdc_res[] = { - [0] = { - .start = 0xFE300800, - .end = 0xFE300CFF, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = 65, - .end = 65, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device 
sh7760_lcdc_dev = { - .dev = { - .platform_data = &sh7760fb_nl6448, - }, - .name = "sh7760-lcdc", - .id = -1, - .resource = sh7760_lcdc_res, - .num_resources = ARRAY_SIZE(sh7760_lcdc_res), -}; - -====================== cut here ====================================== diff --git a/Documentation/fb/sisfb.txt b/Documentation/fb/sisfb.rst index 2e68e503e72f..8f4e502ea12e 100644 --- a/Documentation/fb/sisfb.txt +++ b/Documentation/fb/sisfb.rst @@ -1,4 +1,4 @@ - +============== What is sisfb? ============== @@ -41,11 +41,11 @@ statement to add the parameters to the kernel command line. Please see lilo's parameters are given with the modprobe (or insmod) command. Example for sisfb as part of the static kernel: Add the following line to your -lilo.conf: +lilo.conf:: append="video=sisfb:mode:1024x768x16,mem:12288,rate:75" -Example for sisfb as a module: Start sisfb by typing +Example for sisfb as a module: Start sisfb by typing:: modprobe sisfb mode=1024x768x16 rate=75 mem=12288 @@ -57,7 +57,7 @@ described above or the vesa keyword instead of mode). If compiled as a module, the parameter format reads mode=none or mode=1024x768x16 (or whatever mode you want to use). Using a "=" for a ":" (and vice versa) is a huge difference! Additionally: If you give more than one argument to the in-kernel sisfb, the -arguments are separated with ",". For example: +arguments are separated with ",". For example:: video=sisfb:mode:1024x768x16,rate:75,mem:12288 @@ -73,6 +73,7 @@ supported options including some explanation. The desired display mode can be specified using the keyword "mode" with a parameter in one of the following formats: + - XxYxDepth or - XxY-Depth or - XxY-Depth@Rate or @@ -130,29 +131,30 @@ Configuration (Some) accepted options: -off - Disable sisfb. This option is only understood if sisfb is - in-kernel, not a module. -mem:X - size of memory for the console, rest will be used for DRI/DRM. X - is in kilobytes. 
On 300 series, the default is 4096, 8192 or +========= ================================================================== +off Disable sisfb. This option is only understood if sisfb is + in-kernel, not a module. +mem:X size of memory for the console, rest will be used for DRI/DRM. X + is in kilobytes. On 300 series, the default is 4096, 8192 or 16384 (each in kilobyte) depending on how much video ram the card - has. On 315/330 series, the default is the maximum available ram + has. On 315/330 series, the default is the maximum available ram (since DRI/DRM is not supported for these chipsets). -noaccel - do not use 2D acceleration engine. (Default: use acceleration) -noypan - disable y-panning and scroll by redrawing the entire screen. - This is much slower than y-panning. (Default: use y-panning) -vesa:X - selects startup videomode. X is number from 0 to 0x1FF and - represents the VESA mode number (can be given in decimal or +noaccel do not use 2D acceleration engine. (Default: use acceleration) +noypan disable y-panning and scroll by redrawing the entire screen. + This is much slower than y-panning. (Default: use y-panning) +vesa:X selects startup videomode. X is number from 0 to 0x1FF and + represents the VESA mode number (can be given in decimal or hexadecimal form, the latter prefixed with "0x"). -mode:X - selects startup videomode. Please see above for the format of - "X". +mode:X selects startup videomode. Please see above for the format of + "X". +========= ================================================================== Boolean options such as "noaccel" or "noypan" are to be given without a parameter if sisfb is in-kernel (for example "video=sisfb:noypan). If sisfb is a module, these are to be set to 1 (for example "modprobe sisfb noypan=1"). 
--- -Thomas Winischhofer <thomas@winischhofer.net> -May 27, 2004 +Thomas Winischhofer <thomas@winischhofer.net> +May 27, 2004 diff --git a/Documentation/fb/sm501.txt b/Documentation/fb/sm501.rst index 187f3b3ccb6c..03e02c8042a7 100644 --- a/Documentation/fb/sm501.txt +++ b/Documentation/fb/sm501.rst @@ -1,6 +1,11 @@ +======= +sm501fb +======= + Configuration: -You can pass the following kernel command line options to sm501 videoframebuffer: +You can pass the following kernel command line options to sm501 +videoframebuffer:: sm501fb.bpp= SM501 Display driver: Specify bits-per-pixel if not specified by 'mode' diff --git a/Documentation/fb/sm712fb.txt b/Documentation/fb/sm712fb.rst index c388442edf51..994dad3b0238 100644 --- a/Documentation/fb/sm712fb.txt +++ b/Documentation/fb/sm712fb.rst @@ -1,5 +1,6 @@ +================ What is sm712fb? -================= +================ This is a graphics framebuffer driver for Silicon Motion SM712 based processors. @@ -15,13 +16,16 @@ You should not compile-in vesafb. Currently supported video modes are: -[Graphic modes] +Graphic modes +------------- -bpp | 640x480 800x600 1024x768 1280x1024 -----+-------------------------------------------- - 8 | 0x301 0x303 0x305 0x307 - 16 | 0x311 0x314 0x317 0x31A - 24 | 0x312 0x315 0x318 0x31B +=== ======= ======= ======== ========= +bpp 640x480 800x600 1024x768 1280x1024 +=== ======= ======= ======== ========= + 8 0x301 0x303 0x305 0x307 + 16 0x311 0x314 0x317 0x31A + 24 0x312 0x315 0x318 0x31B +=== ======= ======= ======== ========= Missing Features ================ diff --git a/Documentation/fb/sstfb.rst b/Documentation/fb/sstfb.rst new file mode 100644 index 000000000000..8e8c1b940359 --- /dev/null +++ b/Documentation/fb/sstfb.rst @@ -0,0 +1,207 @@ +===== +sstfb +===== + +Introduction +============ + +This is a frame buffer device driver for 3dfx' Voodoo Graphics +(aka voodoo 1, aka sst1) and Voodoo² (aka Voodoo 2, aka CVG) based +video boards. 
It's highly experimental code, but is guaranteed to work +on my computer, with my "Maxi Gamer 3D" and "Maxi Gamer 3d²" boards, +and with me "between chair and keyboard". Some people tested other +combinations and it seems that it works. +The main page is located at <http://sstfb.sourceforge.net>, and if +you want the latest version, check out the CVS, as the driver is a work +in progress, I feel uncomfortable with releasing tarballs of something +not completely working...Don't worry, it's still more than usable +(I eat my own dog food) + +Please read the Bug section, and report any success or failure to me +(Ghozlane Toumi <gtoumi@laposte.net>). +BTW, If you have only one monitor , and you don't feel like playing +with the vga passthrou cable, I can only suggest borrowing a screen +somewhere... + + +Installation +============ + +This driver (should) work on ix86, with "late" 2.2.x kernel (tested +with x = 19) and "recent" 2.4.x kernel, as a module or compiled in. +It has been included in mainstream kernel since the infamous 2.4.10. +You can apply the patches found in `sstfb/kernel/*-2.{2|4}.x.patch`, +and copy sstfb.c to linux/drivers/video/, or apply a single patch, +`sstfb/patch-2.{2|4}.x-sstfb-yymmdd` to your linux source tree. + +Then configure your kernel as usual: choose "m" or "y" to 3Dfx Voodoo +Graphics in section "console". Compile, install, have fun... and please +drop me a report :) + + +Module Usage +============ + +.. warning:: + + #. You should read completely this section before issuing any command. + + #. If you have only one monitor to play with, once you insmod the + module, the 3dfx takes control of the output, so you'll have to + plug the monitor to the "normal" video board in order to issue + the commands, or you can blindly use sst_dbg_vgapass + in the tools directory (See Tools). The latest solution is pass the + parameter vgapass=1 when insmodding the driver. (See Kernel/Modules + Options) + +Module insertion +---------------- + + #. 
insmod sstfb.o + + you should see some strange output from the board: + a big blue square, a green and a red small squares and a vertical + white rectangle. why? the function's name is self-explanatory: + "sstfb_test()"... + (if you don't have a second monitor, you'll have to plug your monitor + directly to the 2D videocard to see what you're typing) + + #. con2fb /dev/fbx /dev/ttyx + + bind a tty to the new frame buffer. if you already have a frame + buffer driver, the voodoo fb will likely be /dev/fb1. if not, + the device will be /dev/fb0. You can check this by doing a + cat /proc/fb. You can find a copy of con2fb in tools/ directory. + if you don't have another fb device, this step is superfluous, + as the console subsystem automagicaly binds ttys to the fb. + #. switch to the virtual console you just mapped. "tadaaa" ... + +Module removal +-------------- + + #. con2fb /dev/fbx /dev/ttyx + + bind the tty to the old frame buffer so the module can be removed. + (how does it work with vgacon ? short answer : it doesn't work) + + #. rmmod sstfb + + +Kernel/Modules Options +---------------------- + +You can pass some options to the sstfb module, and via the kernel +command line when the driver is compiled in: +for module : insmod sstfb.o option1=value1 option2=value2 ... +in kernel : video=sstfb:option1,option2:value2,option3 ... + +sstfb supports the following options: + +=============== =============== =============================================== +Module Kernel Description +=============== =============== =============================================== +vgapass=0 vganopass Enable or disable VGA passthrou cable. +vgapass=1 vgapass When enabled, the monitor will get the signal + from the VGA board and not from the voodoo. + + Default: nopass + +mem=x mem:x Force frame buffer memory in MiB + allowed values: 0, 1, 2, 4. + + Default: 0 (= autodetect) + +inverse=1 inverse Supposed to enable inverse console. + doesn't work yet... 
+ +clipping=1 clipping Enable or disable clipping. +clipping=0 noclipping With clipping enabled, all offscreen + reads and writes are discarded. + + Default: enable clipping. + +gfxclk=x gfxclk:x Force graphic clock frequency (in MHz). + Be careful with this option, it may be + DANGEROUS. + + Default: auto + + - 50MHz for Voodoo 1, + - 75MHz for Voodoo 2. + +slowpci=0 fastpci Enable or disable fast PCI read/writes. +slowpci=1 slowpci Default : fastpci + +dev=x dev:x Attach the driver to device number x. + 0 is the first compatible board (in + lspci order) +=============== =============== =============================================== + +Tools +===== + +These tools are mostly for debugging purposes, but you can +find some of these interesting: + +- `con2fb`, maps a tty to a framebuffer:: + + con2fb /dev/fb1 /dev/tty5 + +- `sst_dbg_vgapass`, changes vga passthrou. You have to recompile the + driver with SST_DEBUG and SST_DEBUG_IOCTL set to 1:: + + sst_dbg_vgapass /dev/fb1 1 (enables vga cable) + sst_dbg_vgapass /dev/fb1 0 (disables vga cable) + +- `glide_reset`, resets the voodoo using glide + use this after rmmoding sstfb, if the module refuses to + reinsert. + +Bugs +==== + +- DO NOT use glide while the sstfb module is in, you'll most likely + hang your computer. +- If you see some artefacts (pixels not cleaning and stuff like that), + try turning off clipping (clipping=0), and/or using slowpci +- the driver doesn't detect the 4Mb frame buffer voodoos, it seems that + the 2 last Mbs wrap around. looking into that . +- The driver is 16 bpp only, 24/32 won't work. +- The driver is not your_favorite_toy-safe. this includes SMP... + + [Actually from inspection it seems to be safe - Alan] + +- When using XFree86 FBdev (X over fbdev) you may see strange color + patterns at the border of your windows (the pixels lose the lowest + byte -> basically the blue component and some of the green). 
I'm unable + to reproduce this with XFree86-3.3, but one of the testers has this + problem with XFree86-4. Apparently recent XFree86-4.x solves this + problem. +- I didn't really test changing the palette, so you may find some weird + things when playing with that. +- Sometimes the driver will not recognise the DAC, and the + initialisation will fail. This is specifically true for + voodoo 2 boards, but it should be solved in recent versions. Please + contact me. +- The 24/32 is not likely to work anytime soon, knowing that the + hardware does ... unusual things in 24/32 bpp. +- When used with another video board, current limitations of the linux + console subsystem can cause some troubles, specifically, you should + disable software scrollback, as it can oops badly ... + +Todo +==== + +- Get rid of the previous paragraph. +- Buy more coffee. +- test/port to other arch. +- try to add panning using tweaks with front and back buffer . +- try to implement accel on voodoo2, this board can actually do a + lot in 2D even if it was sold as a 3D only board ... + +Ghozlane Toumi <gtoumi@laposte.net> + + +Date: 2002/05/09 20:11:45 + +http://sstfb.sourceforge.net/README diff --git a/Documentation/fb/sstfb.txt b/Documentation/fb/sstfb.txt deleted file mode 100644 index 13db1075e4a5..000000000000 --- a/Documentation/fb/sstfb.txt +++ /dev/null @@ -1,174 +0,0 @@ - -Introduction - - This is a frame buffer device driver for 3dfx' Voodoo Graphics - (aka voodoo 1, aka sst1) and Voodoo² (aka Voodoo 2, aka CVG) based - video boards. It's highly experimental code, but is guaranteed to work - on my computer, with my "Maxi Gamer 3D" and "Maxi Gamer 3d²" boards, - and with me "between chair and keyboard". Some people tested other - combinations and it seems that it works. 
- The main page is located at <http://sstfb.sourceforge.net>, and if - you want the latest version, check out the CVS, as the driver is a work - in progress, I feel uncomfortable with releasing tarballs of something - not completely working...Don't worry, it's still more than usable - (I eat my own dog food) - - Please read the Bug section, and report any success or failure to me - (Ghozlane Toumi <gtoumi@laposte.net>). - BTW, If you have only one monitor , and you don't feel like playing - with the vga passthrou cable, I can only suggest borrowing a screen - somewhere... - - -Installation - - This driver (should) work on ix86, with "late" 2.2.x kernel (tested - with x = 19) and "recent" 2.4.x kernel, as a module or compiled in. - It has been included in mainstream kernel since the infamous 2.4.10. - You can apply the patches found in sstfb/kernel/*-2.{2|4}.x.patch, - and copy sstfb.c to linux/drivers/video/, or apply a single patch, - sstfb/patch-2.{2|4}.x-sstfb-yymmdd to your linux source tree. - - Then configure your kernel as usual: choose "m" or "y" to 3Dfx Voodoo - Graphics in section "console". Compile, install, have fun... and please - drop me a report :) - - -Module Usage - - Warnings. - # You should read completely this section before issuing any command. - # If you have only one monitor to play with, once you insmod the - module, the 3dfx takes control of the output, so you'll have to - plug the monitor to the "normal" video board in order to issue - the commands, or you can blindly use sst_dbg_vgapass - in the tools directory (See Tools). The latest solution is pass the - parameter vgapass=1 when insmodding the driver. (See Kernel/Modules - Options) - - Module insertion: - # insmod sstfb.o - you should see some strange output from the board: - a big blue square, a green and a red small squares and a vertical - white rectangle. why? the function's name is self-explanatory: - "sstfb_test()"... 
- (if you don't have a second monitor, you'll have to plug your monitor - directly to the 2D videocard to see what you're typing) - # con2fb /dev/fbx /dev/ttyx - bind a tty to the new frame buffer. if you already have a frame - buffer driver, the voodoo fb will likely be /dev/fb1. if not, - the device will be /dev/fb0. You can check this by doing a - cat /proc/fb. You can find a copy of con2fb in tools/ directory. - if you don't have another fb device, this step is superfluous, - as the console subsystem automagicaly binds ttys to the fb. - # switch to the virtual console you just mapped. "tadaaa" ... - - Module removal: - # con2fb /dev/fbx /dev/ttyx - bind the tty to the old frame buffer so the module can be removed. - (how does it work with vgacon ? short answer : it doesn't work) - # rmmod sstfb - - -Kernel/Modules Options - - You can pass some options to the sstfb module, and via the kernel - command line when the driver is compiled in: - for module : insmod sstfb.o option1=value1 option2=value2 ... - in kernel : video=sstfb:option1,option2:value2,option3 ... - - sstfb supports the following options : - -Module Kernel Description - -vgapass=0 vganopass Enable or disable VGA passthrou cable. -vgapass=1 vgapass When enabled, the monitor will get the signal - from the VGA board and not from the voodoo. - Default: nopass - -mem=x mem:x Force frame buffer memory in MiB - allowed values: 0, 1, 2, 4. - Default: 0 (= autodetect) - -inverse=1 inverse Supposed to enable inverse console. - doesn't work yet... - -clipping=1 clipping Enable or disable clipping. -clipping=0 noclipping With clipping enabled, all offscreen - reads and writes are discarded. - Default: enable clipping. - -gfxclk=x gfxclk:x Force graphic clock frequency (in MHz). - Be careful with this option, it may be - DANGEROUS. - Default: auto - 50Mhz for Voodoo 1, - 75MHz for Voodoo 2. - -slowpci=1 fastpci Enable or disable fast PCI read/writes. 
-slowpci=1 slowpci Default : fastpci - -dev=x dev:x Attach the driver to device number x. - 0 is the first compatible board (in - lspci order) - -Tools - - These tools are mostly for debugging purposes, but you can - find some of these interesting : - - con2fb , maps a tty to a fbramebuffer . - con2fb /dev/fb1 /dev/tty5 - - sst_dbg_vgapass , changes vga passthrou. You have to recompile the - driver with SST_DEBUG and SST_DEBUG_IOCTL set to 1 - sst_dbg_vgapass /dev/fb1 1 (enables vga cable) - sst_dbg_vgapass /dev/fb1 0 (disables vga cable) - - glide_reset , resets the voodoo using glide - use this after rmmoding sstfb, if the module refuses to - reinsert . - -Bugs - - - DO NOT use glide while the sstfb module is in, you'll most likely - hang your computer. - - If you see some artefacts (pixels not cleaning and stuff like that), - try turning off clipping (clipping=0), and/or using slowpci - - the driver don't detect the 4Mb frame buffer voodoos, it seems that - the 2 last Mbs wrap around. looking into that . - - The driver is 16 bpp only, 24/32 won't work. - - The driver is not your_favorite_toy-safe. this includes SMP... - [Actually from inspection it seems to be safe - Alan] - - When using XFree86 FBdev (X over fbdev) you may see strange color - patterns at the border of your windows (the pixels lose the lowest - byte -> basically the blue component and some of the green). I'm unable - to reproduce this with XFree86-3.3, but one of the testers has this - problem with XFree86-4. Apparently recent Xfree86-4.x solve this - problem. - - I didn't really test changing the palette, so you may find some weird - things when playing with that. - - Sometimes the driver will not recognise the DAC, and the - initialisation will fail. This is specifically true for - voodoo 2 boards, but it should be solved in recent versions. Please - contact me. - - The 24/32 is not likely to work anytime soon, knowing that the - hardware does ... unusual things in 24/32 bpp. 
- - When used with another video board, current limitations of the linux - console subsystem can cause some troubles, specifically, you should - disable software scrollback, as it can oops badly ... - -Todo - - - Get rid of the previous paragraph. - - Buy more coffee. - - test/port to other arch. - - try to add panning using tweeks with front and back buffer . - - try to implement accel on voodoo2, this board can actually do a - lot in 2D even if it was sold as a 3D only board ... - -ghoz. - --- -Ghozlane Toumi <gtoumi@laposte.net> - - -$Date: 2002/05/09 20:11:45 $ -http://sstfb.sourceforge.net/README diff --git a/Documentation/fb/tgafb.txt b/Documentation/fb/tgafb.rst index 250083ada8fb..0c50d2134aa4 100644 --- a/Documentation/fb/tgafb.txt +++ b/Documentation/fb/tgafb.rst @@ -1,15 +1,14 @@ -$Id: tgafb.txt,v 1.1.2.2 2000/04/04 06:50:18 mato Exp $ - +============== What is tgafb? -=============== +============== This is a driver for DECChip 21030 based graphics framebuffers, a.k.a. TGA cards, which are usually found in older Digital Alpha systems. The following models are supported: -ZLxP-E1 (8bpp, 2 MB VRAM) -ZLxP-E2 (32bpp, 8 MB VRAM) -ZLxP-E3 (32bpp, 16 MB VRAM, Zbuffer) +- ZLxP-E1 (8bpp, 2 MB VRAM) +- ZLxP-E2 (32bpp, 8 MB VRAM) +- ZLxP-E3 (32bpp, 16 MB VRAM, Zbuffer) This version is an almost complete rewrite of the code written by Geert Uytterhoeven, which was based on the original TGA console code written by @@ -18,7 +17,7 @@ Jay Estabrook. Major new features since Linux 2.0.x: * Support for multiple resolutions - * Support for fixed-frequency and other oddball monitors + * Support for fixed-frequency and other oddball monitors (by allowing the video mode to be set at boot time) User-visible changes since Linux 2.2.x: @@ -36,19 +35,22 @@ Configuration ============= You can pass kernel command line options to tgafb with -`video=tgafb:option1,option2:value2,option3' (multiple options should be -separated by comma, values are separated from options by `:'). 
+`video=tgafb:option1,option2:value2,option3` (multiple options should be +separated by comma, values are separated from options by `:`). + Accepted options: -font:X - default font to use. All fonts are supported, including the - SUN12x22 font which is very nice at high resolutions. +========== ============================================================ +font:X default font to use. All fonts are supported, including the + SUN12x22 font which is very nice at high resolutions. -mode:X - default video mode. The following video modes are supported: - 640x480-60, 800x600-56, 640x480-72, 800x600-60, 800x600-72, +mode:X default video mode. The following video modes are supported: + 640x480-60, 800x600-56, 640x480-72, 800x600-60, 800x600-72, 1024x768-60, 1152x864-60, 1024x768-70, 1024x768-76, 1152x864-70, 1280x1024-61, 1024x768-85, 1280x1024-70, 1152x864-84, 1280x1024-76, 1280x1024-85 - +========== ============================================================ + Known Issues ============ diff --git a/Documentation/fb/tridentfb.txt b/Documentation/fb/tridentfb.rst index 45d9de5b13a3..7921c9dee78c 100644 --- a/Documentation/fb/tridentfb.txt +++ b/Documentation/fb/tridentfb.rst @@ -1,3 +1,7 @@ +========= +Tridentfb +========= + Tridentfb is a framebuffer driver for some Trident chip based cards. The following list of chips is thought to be supported although not all are @@ -17,6 +21,7 @@ limited comparing to the range if acceleration is disabled (see list of parameters below). Known bugs: + 1. The driver randomly locks up on 3DImage975 chip with acceleration enabled. The same happens in X11 (Xorg). 2. The ramdac speeds require some more fine tuning. It is possible to @@ -26,28 +31,30 @@ Known bugs: How to use it? ============== -When booting you can pass the video parameter. -video=tridentfb +When booting you can pass the video parameter:: + + video=tridentfb -The parameters for tridentfb are concatenated with a ':' as in this example. 
+The parameters for tridentfb are concatenated with a ':' as in this example:: -video=tridentfb:800x600-16@75,noaccel + video=tridentfb:800x600-16@75,noaccel The second level parameters that tridentfb understands are: -noaccel - turns off acceleration (when it doesn't work for your card) +======== ===================================================================== +noaccel turns off acceleration (when it doesn't work for your card) -fp - use flat panel related stuff -crt - assume monitor is present instead of fp +fp use flat panel related stuff +crt assume monitor is present instead of fp -center - for flat panels and resolutions smaller than native size center the +center for flat panels and resolutions smaller than native size center the image, otherwise use stretch -memsize - integer value in KB, use if your card's memory size is misdetected. +memsize integer value in KB, use if your card's memory size is misdetected. look at the driver output to see what it says when initializing. -memdiff - integer value in KB, should be nonzero if your card reports +memdiff integer value in KB, should be nonzero if your card reports more memory than it actually has. For instance mine is 192K less than detection says in all three BIOS selectable situations 2M, 4M, 8M. Only use if your video memory is taken from main memory hence of @@ -56,12 +63,13 @@ memdiff - integer value in KB, should be nonzero if your card reports at the bottom this might help by not letting change to that mode anymore. -nativex - the width in pixels of the flat panel.If you know it (usually 1024 +nativex the width in pixels of the flat panel.If you know it (usually 1024 800 or 1280) and it is not what the driver seems to detect use it. 
-bpp - bits per pixel (8,16 or 32) -mode - a mode name like 800x600-8@75 as described in - Documentation/fb/modedb.txt +bpp bits per pixel (8,16 or 32) +mode a mode name like 800x600-8@75 as described in + Documentation/fb/modedb.rst +======== ===================================================================== Using insane values for the above parameters will probably result in driver misbehaviour so take care(for instance memsize=12345678 or memdiff=23784 or diff --git a/Documentation/fb/udlfb.txt b/Documentation/fb/udlfb.rst index c985cb65dd06..732b37db3504 100644 --- a/Documentation/fb/udlfb.txt +++ b/Documentation/fb/udlfb.rst @@ -1,6 +1,6 @@ - +============== What is udlfb? -=============== +============== This is a driver for DisplayLink USB 2.0 era graphics chips. @@ -100,6 +100,7 @@ options udlfb fb_defio=0 console=1 shadow=1 Accepted boolean options: +=============== ================================================================ fb_defio Make use of the fb_defio (CONFIG_FB_DEFERRED_IO) kernel module to track changed areas of the framebuffer by page faults. Standard fbdev applications that use mmap but that do not @@ -109,7 +110,7 @@ fb_defio Make use of the fb_defio (CONFIG_FB_DEFERRED_IO) kernel more stable, and higher performance. default: fb_defio=1 -console Allow fbcon to attach to udlfb provided framebuffers. +console Allow fbcon to attach to udlfb provided framebuffers. Can be disabled if fbcon and other clients (e.g. X with --shared-vt) are in conflict. default: console=1 @@ -119,6 +120,7 @@ shadow Allocate a 2nd framebuffer to shadow what's currently across do not transmit. Spends host memory to save USB transfers. Enabled by default. Only disable on very low memory systems. default: shadow=1 +=============== ================================================================ Sysfs Attributes ================ @@ -126,34 +128,35 @@ Sysfs Attributes Udlfb creates several files in /sys/class/graphics/fb? Where ? 
is the sequential framebuffer id of the particular DisplayLink device -edid If a valid EDID blob is written to this file (typically - by a udev rule), then udlfb will use this EDID as a - backup in case reading the actual EDID of the monitor - attached to the DisplayLink device fails. This is - especially useful for fixed panels, etc. that cannot - communicate their capabilities via EDID. Reading - this file returns the current EDID of the attached - monitor (or last backup value written). This is - useful to get the EDID of the attached monitor, - which can be passed to utilities like parse-edid. +======================== ======================================================== +edid If a valid EDID blob is written to this file (typically + by a udev rule), then udlfb will use this EDID as a + backup in case reading the actual EDID of the monitor + attached to the DisplayLink device fails. This is + especially useful for fixed panels, etc. that cannot + communicate their capabilities via EDID. Reading + this file returns the current EDID of the attached + monitor (or last backup value written). This is + useful to get the EDID of the attached monitor, + which can be passed to utilities like parse-edid. -metrics_bytes_rendered 32-bit count of pixel bytes rendered +metrics_bytes_rendered 32-bit count of pixel bytes rendered -metrics_bytes_identical 32-bit count of how many of those bytes were found to be - unchanged, based on a shadow framebuffer check +metrics_bytes_identical 32-bit count of how many of those bytes were found to be + unchanged, based on a shadow framebuffer check -metrics_bytes_sent 32-bit count of how many bytes were transferred over - USB to communicate the resulting changed pixels to the - hardware. Includes compression and protocol overhead +metrics_bytes_sent 32-bit count of how many bytes were transferred over + USB to communicate the resulting changed pixels to the + hardware. 
Includes compression and protocol overhead metrics_cpu_kcycles_used 32-bit count of CPU cycles used in processing the - above pixels (in thousands of cycles). + above pixels (in thousands of cycles). -metrics_reset Write-only. Any write to this file resets all metrics - above to zero. Note that the 32-bit counters above - roll over very quickly. To get reliable results, design - performance tests to start and finish in a very short - period of time (one minute or less is safe). +metrics_reset Write-only. Any write to this file resets all metrics + above to zero. Note that the 32-bit counters above + roll over very quickly. To get reliable results, design + performance tests to start and finish in a very short + period of time (one minute or less is safe). +======================== ======================================================== --- Bernie Thompson <bernie@plugable.com> diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.rst index aa924196c366..d1c2523fbb33 100644 --- a/Documentation/fb/uvesafb.txt +++ b/Documentation/fb/uvesafb.rst @@ -1,4 +1,4 @@ - +========================================================== uvesafb - A Generic Driver for VBE2+ compliant video cards ========================================================== @@ -49,7 +49,7 @@ The most important limitations are: uvesafb can be compiled either as a module, or directly into the kernel. In both cases it supports the same set of configuration options, which -are either given on the kernel command line or as module parameters, e.g.: +are either given on the kernel command line or as module parameters, e.g.:: video=uvesafb:1024x768-32,mtrr:3,ywrap (compiled into the kernel) @@ -57,85 +57,90 @@ are either given on the kernel command line or as module parameters, e.g.: Accepted options: +======= ========================================================= ypan Enable display panning using the VESA protected mode - interface. 
The visible screen is just a window of the - video memory, console scrolling is done by changing the - start of the window. This option is available on x86 - only and is the default option on that architecture. + interface. The visible screen is just a window of the + video memory, console scrolling is done by changing the + start of the window. This option is available on x86 + only and is the default option on that architecture. ywrap Same as ypan, but assumes your gfx board can wrap-around - the video memory (i.e. starts reading from top if it - reaches the end of video memory). Faster than ypan. - Available on x86 only. + the video memory (i.e. starts reading from top if it + reaches the end of video memory). Faster than ypan. + Available on x86 only. redraw Scroll by redrawing the affected part of the screen, this - is the default on non-x86. + is the default on non-x86. +======= ========================================================= (If you're using uvesafb as a module, the above three options are - used a parameter of the scroll option, e.g. scroll=ypan.) +used as a parameter of the scroll option, e.g. scroll=ypan.) -vgapal Use the standard VGA registers for palette changes. +=========== ==================================================================== +vgapal Use the standard VGA registers for palette changes. -pmipal Use the protected mode interface for palette changes. - This is the default if the protected mode interface is - available. Available on x86 only. +pmipal Use the protected mode interface for palette changes. + This is the default if the protected mode interface is + available. Available on x86 only. -mtrr:n Setup memory type range registers for the framebuffer - where n: - 0 - disabled (equivalent to nomtrr) - 3 - write-combining (default) +mtrr:n Setup memory type range registers for the framebuffer + where n: - Values other than 0 and 3 will result in a warning and will be - treated just like 3. 
+ - 0 - disabled (equivalent to nomtrr) + - 3 - write-combining (default) -nomtrr Do not use memory type range registers. + Values other than 0 and 3 will result in a warning and will be + treated just like 3. + +nomtrr Do not use memory type range registers. vremap:n - Remap 'n' MiB of video RAM. If 0 or not specified, remap memory - according to video mode. - -vtotal:n - If the video BIOS of your card incorrectly determines the total - amount of video RAM, use this option to override the BIOS (in MiB). - -<mode> The mode you want to set, in the standard modedb format. Refer to - modedb.txt for a detailed description. When uvesafb is compiled as - a module, the mode string should be provided as a value of the - 'mode_option' option. - -vbemode:x - Force the use of VBE mode x. The mode will only be set if it's - found in the VBE-provided list of supported modes. - NOTE: The mode number 'x' should be specified in VESA mode number - notation, not the Linux kernel one (eg. 257 instead of 769). - HINT: If you use this option because normal <mode> parameter does - not work for you and you use a X server, you'll probably want to - set the 'nocrtc' option to ensure that the video mode is properly - restored after console <-> X switches. - -nocrtc Do not use CRTC timings while setting the video mode. This option - has any effect only if the Video BIOS is VBE 3.0 compliant. Use it - if you have problems with modes set the standard way. Note that - using this option implies that any refresh rate adjustments will - be ignored and the refresh rate will stay at your BIOS default (60 Hz). - -noedid Do not try to fetch and use EDID-provided modes. - -noblank Disable hardware blanking. - -v86d:path - Set path to the v86d executable. This option is only available as - a module parameter, and not as a part of the video= string. If you - need to use it and have uvesafb built into the kernel, use - uvesafb.v86d="path". + Remap 'n' MiB of video RAM. 
If 0 or not specified, remap memory + according to video mode. + +vtotal:n If the video BIOS of your card incorrectly determines the total + amount of video RAM, use this option to override the BIOS (in MiB). + +<mode> The mode you want to set, in the standard modedb format. Refer to + Documentation/fb/modedb.rst for a detailed description. When uvesafb is compiled as + a module, the mode string should be provided as a value of the + 'mode_option' option. + +vbemode:x Force the use of VBE mode x. The mode will only be set if it's + found in the VBE-provided list of supported modes. + NOTE: The mode number 'x' should be specified in VESA mode number + notation, not the Linux kernel one (e.g. 257 instead of 769). + HINT: If you use this option because normal <mode> parameter does + not work for you and you use an X server, you'll probably want to + set the 'nocrtc' option to ensure that the video mode is properly + restored after console <-> X switches. + +nocrtc Do not use CRTC timings while setting the video mode. This option + has an effect only if the Video BIOS is VBE 3.0 compliant. Use it + if you have problems with modes set the standard way. Note that + using this option implies that any refresh rate adjustments will + be ignored and the refresh rate will stay at your BIOS default + (60 Hz). + +noedid Do not try to fetch and use EDID-provided modes. + +noblank Disable hardware blanking. + +v86d:path Set path to the v86d executable. This option is only available as + a module parameter, and not as a part of the video= string. If you + need to use it and have uvesafb built into the kernel, use + uvesafb.v86d="path". +=========== ==================================================================== Additionally, the following parameters may be provided. They all override the EDID-provided values and BIOS defaults. Refer to your monitor's specs to get the correct values for maxhf, maxvf and maxclk for your hardware. 
+=========== ====================================== maxhf:n Maximum horizontal frequency (in kHz). maxvf:n Maximum vertical frequency (in Hz). maxclk:n Maximum pixel clock (in MHz). +=========== ====================================== 4. The sysfs interface ---------------------- @@ -146,27 +151,26 @@ additional information. Driver attributes: /sys/bus/platform/drivers/uvesafb - - v86d (default: /sbin/v86d) + v86d + (default: /sbin/v86d) + Path to the v86d executable. v86d is started by uvesafb if an instance of the daemon isn't already running. Device attributes: /sys/bus/platform/drivers/uvesafb/uvesafb.0 - - nocrtc + nocrtc Use the default refresh rate (60 Hz) if set to 1. - - oem_product_name - - oem_product_rev - - oem_string - - oem_vendor + oem_product_name, oem_product_rev, oem_string, oem_vendor Information about the card and its maker. - - vbe_modes + vbe_modes A list of video modes supported by the Video BIOS along with their VBE mode numbers in hex. - - vbe_version + vbe_version A BCD value indicating the implemented VBE standard. 5. Miscellaneous @@ -176,9 +180,9 @@ Uvesafb will set a video mode with the default refresh rate and timings from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo. --- + Michal Januszewski <spock@gentoo.org> + Last updated: 2017-10-10 Documentation of the uvesafb options is loosely based on vesafb.txt. - diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.rst index 413bb73235be..2ed0dfb661cf 100644 --- a/Documentation/fb/vesafb.txt +++ b/Documentation/fb/vesafb.rst @@ -1,4 +1,4 @@ - +=============== What is vesafb? =============== @@ -40,30 +40,35 @@ The graphic modes are NOT in the list which you get if you boot with vga=ask and hit return. The mode you wish to use is derived from the VESA mode number. 
Here are those VESA mode numbers: - | 640x480 800x600 1024x768 1280x1024 -----+------------------------------------- -256 | 0x101 0x103 0x105 0x107 -32k | 0x110 0x113 0x116 0x119 -64k | 0x111 0x114 0x117 0x11A -16M | 0x112 0x115 0x118 0x11B +====== ======= ======= ======== ========= +colors 640x480 800x600 1024x768 1280x1024 +====== ======= ======= ======== ========= +256 0x101 0x103 0x105 0x107 +32k 0x110 0x113 0x116 0x119 +64k 0x111 0x114 0x117 0x11A +16M 0x112 0x115 0x118 0x11B +====== ======= ======= ======== ========= + The video mode number of the Linux kernel is the VESA mode number plus -0x200. - +0x200: + Linux_kernel_mode_number = VESA_mode_number + 0x200 So the table for the Kernel mode numbers are: - | 640x480 800x600 1024x768 1280x1024 -----+------------------------------------- -256 | 0x301 0x303 0x305 0x307 -32k | 0x310 0x313 0x316 0x319 -64k | 0x311 0x314 0x317 0x31A -16M | 0x312 0x315 0x318 0x31B +====== ======= ======= ======== ========= +colors 640x480 800x600 1024x768 1280x1024 +====== ======= ======= ======== ========= +256 0x301 0x303 0x305 0x307 +32k 0x310 0x313 0x316 0x319 +64k 0x311 0x314 0x317 0x31A +16M 0x312 0x315 0x318 0x31B +====== ======= ======= ======== ========= To enable one of those modes you have to specify "vga=ask" in the lilo.conf file and rerun LILO. Then you can type in the desired -mode at the "vga=ask" prompt. For example if you like to use +mode at the "vga=ask" prompt. For example if you like to use 1024x768x256 colors you have to say "305" at this prompt. If this does not work, this might be because your BIOS does not support @@ -72,10 +77,10 @@ Even if your board does, it might be the BIOS which does not. VESA BIOS Extensions v2.0 are required, 1.2 is NOT sufficient. You will get a "bad mode number" message if something goes wrong. -1. Note: LILO cannot handle hex, for booting directly with - "vga=mode-number" you have to transform the numbers to decimal. +1. 
Note: LILO cannot handle hex, for booting directly with + "vga=mode-number" you have to transform the numbers to decimal. 2. Note: Some newer versions of LILO appear to work with those hex values, - if you set the 0x in front of the numbers. + if you set the 0x in front of the numbers. X11 === @@ -120,62 +125,68 @@ Accepted options: inverse use inverse color map -ypan enable display panning using the VESA protected mode - interface. The visible screen is just a window of the - video memory, console scrolling is done by changing the - start of the window. - pro: * scrolling (fullscreen) is fast, because there is +========= ====================================================================== +ypan enable display panning using the VESA protected mode + interface. The visible screen is just a window of the + video memory, console scrolling is done by changing the + start of the window. + + pro: + + * scrolling (fullscreen) is fast, because there is no need to copy around data. * You'll get scrollback (the Shift-PgUp thing), the video memory can be used as scrollback buffer - kontra: * scrolling only parts of the screen causes some + + kontra: + + * scrolling only parts of the screen causes some ugly flicker effects (boot logo flickers for example). -ywrap Same as ypan, but assumes your gfx board can wrap-around - the video memory (i.e. starts reading from top if it - reaches the end of video memory). Faster than ypan. +ywrap Same as ypan, but assumes your gfx board can wrap-around + the video memory (i.e. starts reading from top if it + reaches the end of video memory). Faster than ypan. -redraw scroll by redrawing the affected part of the screen, this - is the safe (and slow) default. +redraw Scroll by redrawing the affected part of the screen, this + is the safe (and slow) default. -vgapal Use the standard vga registers for palette changes. - This is the default. -pmipal Use the protected mode interface for palette changes. 
+vgapal Use the standard vga registers for palette changes. + This is the default. +pmipal Use the protected mode interface for palette changes. -mtrr:n setup memory type range registers for the vesafb framebuffer - where n: - 0 - disabled (equivalent to nomtrr) (default) - 1 - uncachable - 2 - write-back - 3 - write-combining - 4 - write-through +mtrr:n Setup memory type range registers for the vesafb framebuffer + where n: - If you see the following in dmesg, choose the type that matches the - old one. In this example, use "mtrr:2". + - 0 - disabled (equivalent to nomtrr) (default) + - 1 - uncachable + - 2 - write-back + - 3 - write-combining + - 4 - write-through + + If you see the following in dmesg, choose the type that matches the + old one. In this example, use "mtrr:2". ... -mtrr: type mismatch for e0000000,8000000 old: write-back new: write-combining +mtrr: type mismatch for e0000000,8000000 old: write-back new: + write-combining ... -nomtrr disable mtrr +nomtrr disable mtrr vremap:n - remap 'n' MiB of video RAM. If 0 or not specified, remap memory - according to video mode. (2.5.66 patch/idea by Antonino Daplas - reversed to give override possibility (allocate more fb memory - than the kernel would) to 2.4 by tmb@iki.fi) + Remap 'n' MiB of video RAM. If 0 or not specified, remap memory + according to video mode. (2.5.66 patch/idea by Antonino Daplas + reversed to give override possibility (allocate more fb memory + than the kernel would) to 2.4 by tmb@iki.fi) -vtotal:n - if the video BIOS of your card incorrectly determines the total - amount of video RAM, use this option to override the BIOS (in MiB). +vtotal:n If the video BIOS of your card incorrectly determines the total + amount of video RAM, use this option to override the BIOS (in MiB). +========= ====================================================================== Have fun! 
- Gerd - --- Gerd Knorr <kraxel@goldbach.in-berlin.de> -Minor (mostly typo) changes +Minor (mostly typo) changes by Nico Schmoigl <schmoigl@rumms.uni-mannheim.de> diff --git a/Documentation/fb/viafb.rst b/Documentation/fb/viafb.rst new file mode 100644 index 000000000000..8eb7a3bb068c --- /dev/null +++ b/Documentation/fb/viafb.rst @@ -0,0 +1,297 @@ +======================================================= +VIA Integration Graphic Chip Console Framebuffer Driver +======================================================= + +Platform +-------- + The console framebuffer driver is for graphics chips of + VIA UniChrome Family + (CLE266, PM800 / CN400 / CN300, + P4M800CE / P4M800Pro / CN700 / VN800, + CX700 / VX700, K8M890, P4M890, + CN896 / P4M900, VX800, VX855) + +Driver features +--------------- + Device: CRT, LCD, DVI + + Support viafb_mode:: + + CRT: + 640x480(60, 75, 85, 100, 120 Hz), 720x480(60 Hz), + 720x576(60 Hz), 800x600(60, 75, 85, 100, 120 Hz), + 848x480(60 Hz), 856x480(60 Hz), 1024x512(60 Hz), + 1024x768(60, 75, 85, 100 Hz), 1152x864(75 Hz), + 1280x768(60 Hz), 1280x960(60 Hz), 1280x1024(60, 75, 85 Hz), + 1440x1050(60 Hz), 1600x1200(60, 75 Hz), 1280x720(60 Hz), + 1920x1080(60 Hz), 1400x1050(60 Hz), 800x480(60 Hz) + + color depth: 8 bpp, 16 bpp, 32 bpp supports. + + Support 2D hardware accelerator. 
+ +Using the viafb module +---------------------- + Start viafb with default settings:: + + #modprobe viafb + + Start viafb with user options:: + + #modprobe viafb viafb_mode=800x600 viafb_bpp=16 viafb_refresh=60 + viafb_active_dev=CRT+DVI viafb_dvi_port=DVP1 + viafb_mode1=1024x768 viafb_bpp=16 viafb_refresh1=60 + viafb_SAMM_ON=1 + + viafb_mode: + - 640x480 (default) + - 720x480 + - 800x600 + - 1024x768 + + viafb_bpp: + - 8, 16, 32 (default:32) + + viafb_refresh: + - 60, 75, 85, 100, 120 (default:60) + + viafb_lcd_dsp_method: + - 0 : expansion (default) + - 1 : centering + + viafb_lcd_mode: + 0 : LCD panel with LSB data format input (default) + 1 : LCD panel with MSB data format input + + viafb_lcd_panel_id: + - 0 : Resolution: 640x480, Channel: single, Dithering: Enable + - 1 : Resolution: 800x600, Channel: single, Dithering: Enable + - 2 : Resolution: 1024x768, Channel: single, Dithering: Enable (default) + - 3 : Resolution: 1280x768, Channel: single, Dithering: Enable + - 4 : Resolution: 1280x1024, Channel: dual, Dithering: Enable + - 5 : Resolution: 1400x1050, Channel: dual, Dithering: Enable + - 6 : Resolution: 1600x1200, Channel: dual, Dithering: Enable + + - 8 : Resolution: 800x480, Channel: single, Dithering: Enable + - 9 : Resolution: 1024x768, Channel: dual, Dithering: Enable + - 10: Resolution: 1024x768, Channel: single, Dithering: Disable + - 11: Resolution: 1024x768, Channel: dual, Dithering: Disable + - 12: Resolution: 1280x768, Channel: single, Dithering: Disable + - 13: Resolution: 1280x1024, Channel: dual, Dithering: Disable + - 14: Resolution: 1400x1050, Channel: dual, Dithering: Disable + - 15: Resolution: 1600x1200, Channel: dual, Dithering: Disable + - 16: Resolution: 1366x768, Channel: single, Dithering: Disable + - 17: Resolution: 1024x600, Channel: single, Dithering: Enable + - 18: Resolution: 1280x768, Channel: dual, Dithering: Enable + - 19: Resolution: 1280x800, Channel: single, Dithering: Enable + + viafb_accel: + - 0 : No 2D Hardware 
Acceleration + - 1 : 2D Hardware Acceleration (default) + + viafb_SAMM_ON: + - 0 : viafb_SAMM_ON disable (default) + - 1 : viafb_SAMM_ON enable + + viafb_mode1: (secondary display device) + - 640x480 (default) + - 720x480 + - 800x600 + - 1024x768 + + viafb_bpp1: (secondary display device) + - 8, 16, 32 (default:32) + + viafb_refresh1: (secondary display device) + - 60, 75, 85, 100, 120 (default:60) + + viafb_active_dev: + This option is used to specify active devices.(CRT, DVI, CRT+LCD...) + DVI stands for DVI or HDMI, E.g., If you want to enable HDMI, + set viafb_active_dev=DVI. In SAMM case, the previous of + viafb_active_dev is primary device, and the following is + secondary device. + + For example: + + To enable one device, such as DVI only, we can use:: + + modprobe viafb viafb_active_dev=DVI + + To enable two devices, such as CRT+DVI:: + + modprobe viafb viafb_active_dev=CRT+DVI; + + For DuoView case, we can use:: + + modprobe viafb viafb_active_dev=CRT+DVI + + OR:: + + modprobe viafb viafb_active_dev=DVI+CRT... + + For SAMM case: + + If CRT is primary and DVI is secondary, we should use:: + + modprobe viafb viafb_active_dev=CRT+DVI viafb_SAMM_ON=1... + + If DVI is primary and CRT is secondary, we should use:: + + modprobe viafb viafb_active_dev=DVI+CRT viafb_SAMM_ON=1... + + viafb_display_hardware_layout: + This option is used to specify display hardware layout for CX700 chip. + + - 1 : LCD only + - 2 : DVI only + - 3 : LCD+DVI (default) + - 4 : LCD1+LCD2 (internal + internal) + - 16: LCD1+ExternalLCD2 (internal + external) + + viafb_second_size: + This option is used to set second device memory size(MB) in SAMM case. + The minimal size is 16. + + viafb_platform_epia_dvi: + This option is used to enable DVI on EPIA - M + + - 0 : No DVI on EPIA - M (default) + - 1 : DVI on EPIA - M + + viafb_bus_width: + When using 24 - Bit Bus Width Digital Interface, + this option should be set. 
+ + - 12: 12-Bit LVDS or 12-Bit TMDS (default) + - 24: 24-Bit LVDS or 24-Bit TMDS + + viafb_device_lcd_dualedge: + When using Dual Edge Panel, this option should be set. + + - 0 : No Dual Edge Panel (default) + - 1 : Dual Edge Panel + + viafb_lcd_port: + This option is used to specify LCD output port, + available values are "DVP0" "DVP1" "DFP_HIGHLOW" "DFP_HIGH" "DFP_LOW". + + for external LCD + external DVI on CX700(External LCD is on DVP0), + we should use:: + + modprobe viafb viafb_lcd_port=DVP0... + +Notes: + 1. CRT may not display properly for DuoView CRT & DVI display at + the "640x480" PAL mode with DVI overscan enabled. + 2. SAMM stands for single adapter multi monitors. It is different from + multi-head since SAMM supports multiple monitors at the driver layer, thus fbcon + layer doesn't even know about it; SAMM's second screen doesn't have a + device node file, thus a user mode application can't access it directly. + When SAMM is enabled, viafb_mode and viafb_mode1, viafb_bpp and + viafb_bpp1, viafb_refresh and viafb_refresh1 can be different. + 3. When console is depending on viafbinfo1, dynamically change resolution + and bpp, need to call VIAFB specified ioctl interface VIAFB_SET_DEVICE + instead of calling common ioctl function FBIOPUT_VSCREENINFO since + viafb doesn't support multi-head well, or it will cause screen crash. + + +Configure viafb with "fbset" tool +--------------------------------- + + "fbset" is an inbox utility of Linux. + + 1. Inquire current viafb information, type:: + + # fbset -i + + 2. Set various resolutions and viafb_refresh rates:: + + # fbset <resolution-vertical_sync> + + example:: + + # fbset "1024x768-75" + + or:: + + # fbset -g 1024 768 1024 768 32 + + Check the file "/etc/fb.modes" to find display modes available. + + 3.
Set the color depth:: + + # fbset -depth <value> + + example:: + + # fbset -depth 16 + + +Configure viafb via /proc +------------------------- + The following files exist in /proc/viafb + + supported_output_devices + This read-only file contains a full ',' separated list containing all + output devices that could be available on your platform. It is likely + that not all of those have a connector on your hardware but it should + provide a good starting point to figure out which of those names match + a real connector. + + Example:: + + # cat /proc/viafb/supported_output_devices + + iga1/output_devices, iga2/output_devices + These two files are readable and writable. iga1 and iga2 are the two + independent units that produce the screen image. Those images can be + forwarded to one or more output devices. Reading those files is a way + to query which output devices are currently used by an iga. + + Example:: + + # cat /proc/viafb/iga1/output_devices + + If there are no output devices printed the output of this iga is lost. + This can happen for example if only one (the other) iga is used. + Writing to these files allows adjusting the output devices during + runtime. One can add new devices, remove existing ones or switch + between igas. Essentially you can write a ',' separated list of device + names (or a single one) in the same format as the output to those + files. You can add a '+' or '-' as a prefix allowing simple addition + and removal of devices. So a prefix '+' adds the devices from your list + to the already existing ones, '-' removes the listed devices from the + existing ones and if no prefix is given it replaces all existing ones + with the listed ones. If you remove devices they are expected to turn + off. If you add devices that are already part of the other iga they are + removed there and added to the new one. 
+ + Examples: + + Add CRT as output device to iga1:: + + # echo +CRT > /proc/viafb/iga1/output_devices + + Remove (turn off) DVP1 and LVDS1 as output devices of iga2:: + + # echo -DVP1,LVDS1 > /proc/viafb/iga2/output_devices + + Replace all iga1 output devices by CRT:: + + # echo CRT > /proc/viafb/iga1/output_devices + + +Bootup with viafb +----------------- + +Add the following line to your grub.conf:: + + append = "video=viafb:viafb_mode=1024x768,viafb_bpp=32,viafb_refresh=85" + + +VIA Framebuffer modes +===================== + +.. include:: viafb.modes + :literal: diff --git a/Documentation/fb/viafb.txt b/Documentation/fb/viafb.txt deleted file mode 100644 index 1cb2462a71ce..000000000000 --- a/Documentation/fb/viafb.txt +++ /dev/null @@ -1,252 +0,0 @@ - - VIA Integration Graphic Chip Console Framebuffer Driver - -[Platform] ------------------------ - The console framebuffer driver is for graphics chips of - VIA UniChrome Family(CLE266, PM800 / CN400 / CN300, - P4M800CE / P4M800Pro / CN700 / VN800, - CX700 / VX700, K8M890, P4M890, - CN896 / P4M900, VX800, VX855) - -[Driver features] ------------------------- - Device: CRT, LCD, DVI - - Support viafb_mode: - CRT: - 640x480(60, 75, 85, 100, 120 Hz), 720x480(60 Hz), - 720x576(60 Hz), 800x600(60, 75, 85, 100, 120 Hz), - 848x480(60 Hz), 856x480(60 Hz), 1024x512(60 Hz), - 1024x768(60, 75, 85, 100 Hz), 1152x864(75 Hz), - 1280x768(60 Hz), 1280x960(60 Hz), 1280x1024(60, 75, 85 Hz), - 1440x1050(60 Hz), 1600x1200(60, 75 Hz), 1280x720(60 Hz), - 1920x1080(60 Hz), 1400x1050(60 Hz), 800x480(60 Hz) - - color depth: 8 bpp, 16 bpp, 32 bpp supports. - - Support 2D hardware accelerator. 
- -[Using the viafb module] --- -- -------------------- - Start viafb with default settings: - #modprobe viafb - - Start viafb with user options: - #modprobe viafb viafb_mode=800x600 viafb_bpp=16 viafb_refresh=60 - viafb_active_dev=CRT+DVI viafb_dvi_port=DVP1 - viafb_mode1=1024x768 viafb_bpp=16 viafb_refresh1=60 - viafb_SAMM_ON=1 - - viafb_mode: - 640x480 (default) - 720x480 - 800x600 - 1024x768 - ...... - - viafb_bpp: - 8, 16, 32 (default:32) - - viafb_refresh: - 60, 75, 85, 100, 120 (default:60) - - viafb_lcd_dsp_method: - 0 : expansion (default) - 1 : centering - - viafb_lcd_mode: - 0 : LCD panel with LSB data format input (default) - 1 : LCD panel with MSB data format input - - viafb_lcd_panel_id: - 0 : Resolution: 640x480, Channel: single, Dithering: Enable - 1 : Resolution: 800x600, Channel: single, Dithering: Enable - 2 : Resolution: 1024x768, Channel: single, Dithering: Enable (default) - 3 : Resolution: 1280x768, Channel: single, Dithering: Enable - 4 : Resolution: 1280x1024, Channel: dual, Dithering: Enable - 5 : Resolution: 1400x1050, Channel: dual, Dithering: Enable - 6 : Resolution: 1600x1200, Channel: dual, Dithering: Enable - - 8 : Resolution: 800x480, Channel: single, Dithering: Enable - 9 : Resolution: 1024x768, Channel: dual, Dithering: Enable - 10: Resolution: 1024x768, Channel: single, Dithering: Disable - 11: Resolution: 1024x768, Channel: dual, Dithering: Disable - 12: Resolution: 1280x768, Channel: single, Dithering: Disable - 13: Resolution: 1280x1024, Channel: dual, Dithering: Disable - 14: Resolution: 1400x1050, Channel: dual, Dithering: Disable - 15: Resolution: 1600x1200, Channel: dual, Dithering: Disable - 16: Resolution: 1366x768, Channel: single, Dithering: Disable - 17: Resolution: 1024x600, Channel: single, Dithering: Enable - 18: Resolution: 1280x768, Channel: dual, Dithering: Enable - 19: Resolution: 1280x800, Channel: single, Dithering: Enable - - viafb_accel: - 0 : No 2D Hardware Acceleration - 1 : 2D Hardware Acceleration 
(default) - - viafb_SAMM_ON: - 0 : viafb_SAMM_ON disable (default) - 1 : viafb_SAMM_ON enable - - viafb_mode1: (secondary display device) - 640x480 (default) - 720x480 - 800x600 - 1024x768 - ... ... - - viafb_bpp1: (secondary display device) - 8, 16, 32 (default:32) - - viafb_refresh1: (secondary display device) - 60, 75, 85, 100, 120 (default:60) - - viafb_active_dev: - This option is used to specify active devices.(CRT, DVI, CRT+LCD...) - DVI stands for DVI or HDMI, E.g., If you want to enable HDMI, - set viafb_active_dev=DVI. In SAMM case, the previous of - viafb_active_dev is primary device, and the following is - secondary device. - - For example: - To enable one device, such as DVI only, we can use: - modprobe viafb viafb_active_dev=DVI - To enable two devices, such as CRT+DVI: - modprobe viafb viafb_active_dev=CRT+DVI; - - For DuoView case, we can use: - modprobe viafb viafb_active_dev=CRT+DVI - OR - modprobe viafb viafb_active_dev=DVI+CRT... - - For SAMM case: - If CRT is primary and DVI is secondary, we should use: - modprobe viafb viafb_active_dev=CRT+DVI viafb_SAMM_ON=1... - If DVI is primary and CRT is secondary, we should use: - modprobe viafb viafb_active_dev=DVI+CRT viafb_SAMM_ON=1... - - viafb_display_hardware_layout: - This option is used to specify display hardware layout for CX700 chip. - 1 : LCD only - 2 : DVI only - 3 : LCD+DVI (default) - 4 : LCD1+LCD2 (internal + internal) - 16: LCD1+ExternalLCD2 (internal + external) - - viafb_second_size: - This option is used to set second device memory size(MB) in SAMM case. - The minimal size is 16. - - viafb_platform_epia_dvi: - This option is used to enable DVI on EPIA - M - 0 : No DVI on EPIA - M (default) - 1 : DVI on EPIA - M - - viafb_bus_width: - When using 24 - Bit Bus Width Digital Interface, - this option should be set. - 12: 12-Bit LVDS or 12-Bit TMDS (default) - 24: 24-Bit LVDS or 24-Bit TMDS - - viafb_device_lcd_dualedge: - When using Dual Edge Panel, this option should be set. 
- 0 : No Dual Edge Panel (default) - 1 : Dual Edge Panel - - viafb_lcd_port: - This option is used to specify LCD output port, - available values are "DVP0" "DVP1" "DFP_HIGHLOW" "DFP_HIGH" "DFP_LOW". - for external LCD + external DVI on CX700(External LCD is on DVP0), - we should use: - modprobe viafb viafb_lcd_port=DVP0... - -Notes: - 1. CRT may not display properly for DuoView CRT & DVI display at - the "640x480" PAL mode with DVI overscan enabled. - 2. SAMM stands for single adapter multi monitors. It is different from - multi-head since SAMM support multi monitor at driver layers, thus fbcon - layer doesn't even know about it; SAMM's second screen doesn't have a - device node file, thus a user mode application can't access it directly. - When SAMM is enabled, viafb_mode and viafb_mode1, viafb_bpp and - viafb_bpp1, viafb_refresh and viafb_refresh1 can be different. - 3. When console is depending on viafbinfo1, dynamically change resolution - and bpp, need to call VIAFB specified ioctl interface VIAFB_SET_DEVICE - instead of calling common ioctl function FBIOPUT_VSCREENINFO since - viafb doesn't support multi-head well, or it will cause screen crush. - - -[Configure viafb with "fbset" tool] ------------------------------------ - "fbset" is an inbox utility of Linux. - 1. Inquire current viafb information, type, - # fbset -i - - 2. Set various resolutions and viafb_refresh rates, - # fbset <resolution-vertical_sync> - - example, - # fbset "1024x768-75" - or - # fbset -g 1024 768 1024 768 32 - Check the file "/etc/fb.modes" to find display modes available. - - 3. Set the color depth, - # fbset -depth <value> - - example, - # fbset -depth 16 - - -[Configure viafb via /proc] ---------------------------- - The following files exist in /proc/viafb - - supported_output_devices - - This read-only file contains a full ',' separated list containing all - output devices that could be available on your platform. 
It is likely - that not all of those have a connector on your hardware but it should - provide a good starting point to figure out which of those names match - a real connector. - Example: - # cat /proc/viafb/supported_output_devices - - iga1/output_devices - iga2/output_devices - - These two files are readable and writable. iga1 and iga2 are the two - independent units that produce the screen image. Those images can be - forwarded to one or more output devices. Reading those files is a way - to query which output devices are currently used by an iga. - Example: - # cat /proc/viafb/iga1/output_devices - If there are no output devices printed the output of this iga is lost. - This can happen for example if only one (the other) iga is used. - Writing to these files allows adjusting the output devices during - runtime. One can add new devices, remove existing ones or switch - between igas. Essentially you can write a ',' separated list of device - names (or a single one) in the same format as the output to those - files. You can add a '+' or '-' as a prefix allowing simple addition - and removal of devices. So a prefix '+' adds the devices from your list - to the already existing ones, '-' removes the listed devices from the - existing ones and if no prefix is given it replaces all existing ones - with the listed ones. If you remove devices they are expected to turn - off. If you add devices that are already part of the other iga they are - removed there and added to the new one. 
- Examples: - Add CRT as output device to iga1 - # echo +CRT > /proc/viafb/iga1/output_devices - - Remove (turn off) DVP1 and LVDS1 as output devices of iga2 - # echo -DVP1,LVDS1 > /proc/viafb/iga2/output_devices - - Replace all iga1 output devices by CRT - # echo CRT > /proc/viafb/iga1/output_devices - - -[Bootup with viafb]: --------------------- - Add the following line to your grub.conf: - append = "video=viafb:viafb_mode=1024x768,viafb_bpp=32,viafb_refresh=85" - diff --git a/Documentation/fb/vt8623fb.txt b/Documentation/fb/vt8623fb.rst index f654576c56b7..ba1730937dd8 100644 --- a/Documentation/fb/vt8623fb.txt +++ b/Documentation/fb/vt8623fb.rst @@ -1,13 +1,13 @@ - - vt8623fb - fbdev driver for graphics core in VIA VT8623 chipset - =============================================================== +=============================================================== +vt8623fb - fbdev driver for graphics core in VIA VT8623 chipset +=============================================================== Supported Hardware ================== - VIA VT8623 [CLE266] chipset and its graphics core - (known as CastleRock or Unichrome) +VIA VT8623 [CLE266] chipset and its graphics core +(known as CastleRock or Unichrome) I tested vt8623fb on VIA EPIA ML-6000 diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt index 9999ea521f3e..32bbdfc64c32 100644 --- a/Documentation/features/debug/stackprotector/arch-support.txt +++ b/Documentation/features/debug/stackprotector/arch-support.txt @@ -22,7 +22,7 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | TODO | + | powerpc: | ok | | riscv: | TODO | | s390: | TODO | | sh: | ok | diff --git a/Documentation/filesystems/api-summary.rst b/Documentation/filesystems/api-summary.rst index aa51ffcfa029..bbb0c1c0e5cf 100644 --- a/Documentation/filesystems/api-summary.rst +++ b/Documentation/filesystems/api-summary.rst @@ -89,9 +89,6 @@ Other 
Functions .. kernel-doc:: fs/direct-io.c :export: -.. kernel-doc:: fs/file_table.c - :export: - .. kernel-doc:: fs/libfs.c :export: diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst index 3be3e54d480d..705d813d558f 100644 --- a/Documentation/filesystems/ext4/index.rst +++ b/Documentation/filesystems/ext4/index.rst @@ -8,7 +8,7 @@ ext4 Data Structures and Algorithms :maxdepth: 6 :numbered: - about.rst - overview.rst - globals.rst - dynamic.rst + about + overview + globals + dynamic diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 1131c34d77f6..2de2fe2ab078 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -16,7 +16,8 @@ algorithms work. .. toctree:: :maxdepth: 2 - path-lookup.rst + vfs + path-lookup api-summary splice @@ -31,13 +32,3 @@ filesystem implementations. journalling fscrypt - -Filesystem-specific documentation -================================= - -Documentation for individual filesystem types can be found here. - -.. toctree:: - :maxdepth: 2 - - binderfs.rst diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 3bd1148d8bb6..2813a19389fe 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -330,14 +330,14 @@ unreferenced dentries, and is now only called when the dentry refcount goes to [mandatory] .d_compare() calling convention and locking rules are significantly -changed. Read updated documentation in Documentation/filesystems/vfs.txt (and +changed. Read updated documentation in Documentation/filesystems/vfs.rst (and look at examples of other filesystems) for guidance. --- [mandatory] .d_hash() calling convention and locking rules are significantly -changed. Read updated documentation in Documentation/filesystems/vfs.txt (and +changed. 
Read updated documentation in Documentation/filesystems/vfs.rst (and look at examples of other filesystems) for guidance. --- @@ -377,12 +377,12 @@ where possible. the filesystem provides it), which requires dropping out of rcu-walk mode. This may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be returned if the filesystem cannot handle rcu-walk. See -Documentation/filesystems/vfs.txt for more details. +Documentation/filesystems/vfs.rst for more details. permission is an inode permission check that is called on many or all directory inodes on the way down a path walk (to check for exec permission). It must now be rcu-walk aware (mask & MAY_NOT_BLOCK). See -Documentation/filesystems/vfs.txt for more details. +Documentation/filesystems/vfs.rst for more details. -- [mandatory] @@ -625,7 +625,7 @@ in your dentry operations instead. -- [mandatory] ->clone_file_range() and ->dedupe_file_range have been replaced with - ->remap_file_range(). See Documentation/filesystems/vfs.txt for more + ->remap_file_range(). See Documentation/filesystems/vfs.rst for more information. -- [recommended] diff --git a/Documentation/filesystems/ubifs-authentication.md b/Documentation/filesystems/ubifs-authentication.md index 028b3e2e25f9..23e698167141 100644 --- a/Documentation/filesystems/ubifs-authentication.md +++ b/Documentation/filesystems/ubifs-authentication.md @@ -417,9 +417,9 @@ will then have to be provided beforehand in the normal way. 
[DMC-CBC-ATTACK] http://www.jakoblell.com/blog/2013/12/22/practical-malleability-attack-against-cbc-encrypted-luks-partitions/ -[DM-INTEGRITY] https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.txt +[DM-INTEGRITY] https://www.kernel.org/doc/Documentation/device-mapper/dm-integrity.rst -[DM-VERITY] https://www.kernel.org/doc/Documentation/device-mapper/verity.txt +[DM-VERITY] https://www.kernel.org/doc/Documentation/device-mapper/verity.rst [FSCRYPT-POLICY2] https://www.spinics.net/lists/linux-ext4/msg58710.html diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst new file mode 100644 index 000000000000..0f85ab21c2ca --- /dev/null +++ b/Documentation/filesystems/vfs.rst @@ -0,0 +1,1428 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Overview of the Linux Virtual File System +========================================= + +Original author: Richard Gooch <rgooch@atnf.csiro.au> + +- Copyright (C) 1999 Richard Gooch +- Copyright (C) 2005 Pekka Enberg + + +Introduction +============ + +The Virtual File System (also known as the Virtual Filesystem Switch) is +the software layer in the kernel that provides the filesystem interface +to userspace programs. It also provides an abstraction within the +kernel which allows different filesystem implementations to coexist. + +VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so on +are called from a process context. Filesystem locking is described in +the document Documentation/filesystems/Locking. + + +Directory Entry Cache (dcache) +------------------------------ + +The VFS implements the open(2), stat(2), chmod(2), and similar system +calls. The pathname argument that is passed to them is used by the VFS +to search through the directory entry cache (also known as the dentry +cache or dcache). This provides a very fast look-up mechanism to +translate a pathname (filename) into a specific dentry. 
Dentries live +in RAM and are never saved to disc: they exist only for performance. + +The dentry cache is meant to be a view into your entire filespace. As +most computers cannot fit all dentries in the RAM at the same time, some +bits of the cache are missing. In order to resolve your pathname into a +dentry, the VFS may have to resort to creating dentries along the way, +and then loading the inode. This is done by looking up the inode. + + +The Inode Object +---------------- + +An individual dentry usually has a pointer to an inode. Inodes are +filesystem objects such as regular files, directories, FIFOs and other +beasts. They live either on the disc (for block device filesystems) or +in the memory (for pseudo filesystems). Inodes that live on the disc +are copied into the memory when required and changes to the inode are +written back to disc. A single inode can be pointed to by multiple +dentries (hard links, for example, do this). + +To look up an inode requires that the VFS calls the lookup() method of +the parent directory inode. This method is installed by the specific +filesystem implementation that the inode lives in. Once the VFS has the +required dentry (and hence the inode), we can do all those boring things +like open(2) the file, or stat(2) it to peek at the inode data. The +stat(2) operation is fairly simple: once the VFS has the dentry, it +peeks at the inode data and passes some of it back to userspace. + + +The File Object +--------------- + +Opening a file requires another operation: allocation of a file +structure (this is the kernel-side implementation of file descriptors). +The freshly allocated file structure is initialized with a pointer to +the dentry and a set of file operation member functions. These are +taken from the inode data. The open() file method is then called so the +specific filesystem implementation can do its work. You can see that +this is another switch performed by the VFS. 
The file structure is +placed into the file descriptor table for the process. + +Reading, writing and closing files (and other assorted VFS operations) +is done by using the userspace file descriptor to grab the appropriate +file structure, and then calling the required file structure method to +do whatever is required. For as long as the file is open, it keeps the +dentry in use, which in turn means that the VFS inode is still in use. + + +Registering and Mounting a Filesystem +===================================== + +To register and unregister a filesystem, use the following API +functions: + +.. code-block:: c + + #include <linux/fs.h> + + extern int register_filesystem(struct file_system_type *); + extern int unregister_filesystem(struct file_system_type *); + +The passed struct file_system_type describes your filesystem. When a +request is made to mount a filesystem onto a directory in your +namespace, the VFS will call the appropriate mount() method for the +specific filesystem. New vfsmount referring to the tree returned by +->mount() will be attached to the mountpoint, so that when pathname +resolution reaches the mountpoint it will jump into the root of that +vfsmount. + +You can see all filesystems that are registered to the kernel in the +file /proc/filesystems. + + +struct file_system_type +----------------------- + +This describes the filesystem. As of kernel 2.6.39, the following +members are defined: + +.. code-block:: c + + struct file_system_type { + const char *name; + int fs_flags; + struct dentry *(*mount) (struct file_system_type *, int, + const char *, void *); + void (*kill_sb) (struct super_block *); + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; + struct lock_class_key s_lock_key; + struct lock_class_key s_umount_key; + }; + +``name`` + the name of the filesystem type, such as "ext2", "iso9660", + "msdos" and so on + +``fs_flags`` + various flags (e.g. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.)
+ +``mount`` + the method to call when a new instance of this filesystem should + be mounted + +``kill_sb`` + the method to call when an instance of this filesystem should be + shut down + + +``owner`` + for internal VFS use: you should initialize this to THIS_MODULE + in most cases. + +``next`` + for internal VFS use: you should initialize this to NULL + + s_lock_key, s_umount_key: lockdep-specific + +The mount() method has the following arguments: + +``struct file_system_type *fs_type`` + describes the filesystem, partly initialized by the specific + filesystem code + +``int flags`` + mount flags + +``const char *dev_name`` + the device name we are mounting. + +``void *data`` + arbitrary mount options, usually comes as an ASCII string (see + "Mount Options" section) + +The mount() method must return the root dentry of the tree requested by +caller. An active reference to its superblock must be grabbed and the +superblock must be locked. On failure it should return ERR_PTR(error). + +The arguments match those of mount(2) and their interpretation depends +on filesystem type. E.g. for block filesystems, dev_name is interpreted +as block device name, that device is opened and if it contains a +suitable filesystem image the method creates and initializes struct +super_block accordingly, returning its root dentry to caller. + +->mount() may choose to return a subtree of existing filesystem - it +doesn't have to create a new one. The main result from the caller's +point of view is a reference to dentry at the root of (sub)tree to be +attached; creation of new superblock is a common side effect. + +The most interesting member of the superblock structure that the mount() +method fills in is the "s_op" field. This is a pointer to a "struct +super_operations" which describes the next level of the filesystem +implementation. + +Usually, a filesystem uses one of the generic mount() implementations +and provides a fill_super() callback instead. 
The generic variants are: + +``mount_bdev`` + mount a filesystem residing on a block device + +``mount_nodev`` + mount a filesystem that is not backed by a device + +``mount_single`` + mount a filesystem which shares the instance between all mounts + +A fill_super() callback implementation has the following arguments: + +``struct super_block *sb`` + the superblock structure. The callback must initialize this + properly. + +``void *data`` + arbitrary mount options, usually comes as an ASCII string (see + "Mount Options" section) + +``int silent`` + whether or not to be silent on error + + +The Superblock Object +===================== + +A superblock object represents a mounted filesystem. + + +struct super_operations +----------------------- + +This describes how the VFS can manipulate the superblock of your +filesystem. As of kernel 2.6.22, the following members are defined: + +.. code-block:: c + + struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *); + + void (*dirty_inode) (struct inode *, int flags); + int (*write_inode) (struct inode *, int); + void (*drop_inode) (struct inode *); + void (*delete_inode) (struct inode *); + void (*put_super) (struct super_block *); + int (*sync_fs)(struct super_block *sb, int wait); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); + int (*statfs) (struct dentry *, struct kstatfs *); + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); + + int (*show_options)(struct seq_file *, struct dentry *); + + ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); + ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + int (*nr_cached_objects)(struct super_block *); + void (*free_cached_objects)(struct super_block *, int); + }; + +All methods are called without any locks being held, unless 
otherwise +noted. This means that most methods can block safely. All methods are +only called from a process context (i.e. not from an interrupt handler +or bottom half). + +``alloc_inode`` + this method is called by alloc_inode() to allocate memory for + struct inode and initialize it. If this function is not + defined, a simple 'struct inode' is allocated. Normally + alloc_inode will be used to allocate a larger structure which + contains a 'struct inode' embedded within it. + +``destroy_inode`` + this method is called by destroy_inode() to release resources + allocated for struct inode. It is only required if + ->alloc_inode was defined and simply undoes anything done by + ->alloc_inode. + +``dirty_inode`` + this method is called by the VFS to mark an inode dirty. + +``write_inode`` + this method is called when the VFS needs to write an inode to + disc. The second parameter indicates whether the write should + be synchronous or not, not all filesystems check this flag. + +``drop_inode`` + called when the last access to the inode is dropped, with the + inode->i_lock spinlock held. + + This method should be either NULL (normal UNIX filesystem + semantics) or "generic_delete_inode" (for filesystems that do + not want to cache inodes - causing "delete_inode" to always be + called regardless of the value of i_nlink) + + The "generic_delete_inode()" behavior is equivalent to the old + practice of using "force_delete" in the put_inode() case, but + does not have the races that the "force_delete()" approach had. + +``delete_inode`` + called when the VFS wants to delete an inode + +``put_super`` + called when the VFS wishes to free the superblock + (i.e. unmount). This is called with the superblock lock held + +``sync_fs`` + called when VFS is writing out all dirty data associated with a + superblock. The second parameter indicates whether the method + should wait until the write out has been completed. Optional. 
+ +``freeze_fs`` + called when VFS is locking a filesystem and forcing it into a + consistent state. This method is currently used by the Logical + Volume Manager (LVM). + +``unfreeze_fs`` + called when VFS is unlocking a filesystem and making it writable + again. + +``statfs`` + called when the VFS needs to get filesystem statistics. + +``remount_fs`` + called when the filesystem is remounted. This is called with + the kernel lock held + +``clear_inode`` + called when the VFS clears the inode. Optional + +``umount_begin`` + called when the VFS is unmounting a filesystem. + +``show_options`` + called by the VFS to show mount options for /proc/<pid>/mounts. + (see "Mount Options" section) + +``quota_read`` + called by the VFS to read from filesystem quota file. + +``quota_write`` + called by the VFS to write to filesystem quota file. + +``nr_cached_objects`` + called by the sb cache shrinking function for the filesystem to + return the number of freeable cached objects it contains. + Optional. + +``free_cached_objects`` + called by the sb cache shrinking function for the filesystem to + scan the number of objects indicated to try to free them. + Optional, but any filesystem implementing this method needs to + also implement ->nr_cached_objects for it to be called + correctly. + + We can't do anything with any errors that the filesystem might + have encountered, hence the void return type. This will never be + called if the VM is trying to reclaim under GFP_NOFS conditions, + hence this method does not need to handle that situation itself. + + Implementations must include conditional reschedule calls inside + any scanning loop that is done. This allows the VFS to + determine appropriate scan batch sizes without having to worry + about whether implementations will cause holdoff problems due to + large scan batch sizes. + +Whoever sets up the inode is responsible for filling in the "i_op" +field.
This is a pointer to a "struct inode_operations" which describes +the methods that can be performed on individual inodes. + + +struct xattr_handlers +--------------------- + +On filesystems that support extended attributes (xattrs), the s_xattr +superblock field points to a NULL-terminated array of xattr handlers. +Extended attributes are name:value pairs. + +``name`` + Indicates that the handler matches attributes with the specified + name (such as "system.posix_acl_access"); the prefix field must + be NULL. + +``prefix`` + Indicates that the handler matches all attributes with the + specified name prefix (such as "user."); the name field must be + NULL. + +``list`` + Determine if attributes matching this xattr handler should be + listed for a particular dentry. Used by some listxattr + implementations like generic_listxattr. + +``get`` + Called by the VFS to get the value of a particular extended + attribute. This method is called by the getxattr(2) system + call. + +``set`` + Called by the VFS to set the value of a particular extended + attribute. When the new value is NULL, called to remove a + particular extended attribute. This method is called by the + setxattr(2) and removexattr(2) system calls. + +When none of the xattr handlers of a filesystem match the specified +attribute name or when a filesystem doesn't support extended attributes, +the various ``*xattr(2)`` system calls return -EOPNOTSUPP. + + +The Inode Object +================ + +An inode object represents an object within the filesystem. + + +struct inode_operations +----------------------- + +This describes how the VFS can manipulate an inode in your filesystem. +As of kernel 2.6.22, the following members are defined: + +..
code-block:: c + + struct inode_operations { + int (*create) (struct inode *,struct dentry *, umode_t, bool); + struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); + int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*unlink) (struct inode *,struct dentry *); + int (*symlink) (struct inode *,struct dentry *,const char *); + int (*mkdir) (struct inode *,struct dentry *,umode_t); + int (*rmdir) (struct inode *,struct dentry *); + int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); + int (*readlink) (struct dentry *, char __user *,int); + const char *(*get_link) (struct dentry *, struct inode *, + struct delayed_call *); + int (*permission) (struct inode *, int); + int (*get_acl)(struct inode *, int); + int (*setattr) (struct dentry *, struct iattr *); + int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + void (*update_time)(struct inode *, struct timespec *, int); + int (*atomic_open)(struct inode *, struct dentry *, struct file *, + unsigned open_flag, umode_t create_mode); + int (*tmpfile) (struct inode *, struct dentry *, umode_t); + }; + +Again, all methods are called without any locks being held, unless +otherwise noted. + +``create`` + called by the open(2) and creat(2) system calls. Only required + if you want to support regular files. The dentry you get should + not have an inode (i.e. it should be a negative dentry). Here + you will probably call d_instantiate() with the dentry and the + newly created inode + +``lookup`` + called when the VFS needs to look up an inode in a parent + directory. The name to look for is found in the dentry. This + method must call d_add() to insert the found inode into the + dentry. The "i_count" field in the inode structure should be + incremented. 
If the named inode does not exist a NULL inode + should be inserted into the dentry (this is called a negative + dentry). Returning an error code from this routine must only be + done on a real error, otherwise creating inodes with system + calls like creat(2), mknod(2), mkdir(2) and so on will fail. + If you wish to overload the dentry methods then you should + initialise the "d_op" field in the dentry; this is a pointer to + a struct "dentry_operations". This method is called with the + directory inode semaphore held + +``link`` + called by the link(2) system call. Only required if you want to + support hard links. You will probably need to call + d_instantiate() just as you would in the create() method + +``unlink`` + called by the unlink(2) system call. Only required if you want + to support deleting inodes + +``symlink`` + called by the symlink(2) system call. Only required if you want + to support symlinks. You will probably need to call + d_instantiate() just as you would in the create() method + +``mkdir`` + called by the mkdir(2) system call. Only required if you want + to support creating subdirectories. You will probably need to + call d_instantiate() just as you would in the create() method + +``rmdir`` + called by the rmdir(2) system call. Only required if you want + to support deleting subdirectories + +``mknod`` + called by the mknod(2) system call to create a device (char, + block) inode or a named pipe (FIFO) or socket. Only required if + you want to support creating these types of inodes. You will + probably need to call d_instantiate() just as you would in the + create() method + +``rename`` + called by the rename(2) system call to rename the object to have + the parent and name given by the second inode and dentry. + + The filesystem must return -EINVAL for any unsupported or + unknown flags.
Currently the following flags are implemented: + (1) RENAME_NOREPLACE: this flag indicates that if the target of + the rename exists the rename should fail with -EEXIST instead of + replacing the target. The VFS already checks for existence, so + for local filesystems the RENAME_NOREPLACE implementation is + equivalent to plain rename. + (2) RENAME_EXCHANGE: exchange source and target. Both must + exist; this is checked by the VFS. Unlike plain rename, source + and target may be of different type. + +``get_link`` + called by the VFS to follow a symbolic link to the inode it + points to. Only required if you want to support symbolic links. + This method returns the symlink body to traverse (and possibly + resets the current position with nd_jump_link()). If the body + won't go away until the inode is gone, nothing else is needed; + if it needs to be otherwise pinned, arrange for its release by + having get_link(..., ..., done) do set_delayed_call(done, + destructor, argument). In that case destructor(argument) will + be called once VFS is done with the body you've returned. May + be called in RCU mode; that is indicated by NULL dentry + argument. If request can't be handled without leaving RCU mode, + have it return ERR_PTR(-ECHILD). + + If the filesystem stores the symlink target in ->i_link, the + VFS may use it directly without calling ->get_link(); however, + ->get_link() must still be provided. ->i_link must not be + freed until after an RCU grace period. Writing to ->i_link + post-iget() time requires a 'release' memory barrier. + +``readlink`` + this is now just an override for use by readlink(2) for the + cases when ->get_link uses nd_jump_link() or object is not in + fact a symlink. Normally filesystems should only implement + ->get_link for symlinks and readlink(2) will automatically use + that. + +``permission`` + called by the VFS to check for access rights on a POSIX-like + filesystem. + + May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). 
If in + rcu-walk mode, the filesystem must check the permission without + blocking or storing to the inode. + + If a situation is encountered that rcu-walk cannot handle, + return + -ECHILD and it will be called again in ref-walk mode. + +``setattr`` + called by the VFS to set attributes for a file. This method is + called by chmod(2) and related system calls. + +``getattr`` + called by the VFS to get attributes of a file. This method is + called by stat(2) and related system calls. + +``listxattr`` + called by the VFS to list all extended attributes for a given + file. This method is called by the listxattr(2) system call. + +``update_time`` + called by the VFS to update a specific time or the i_version of + an inode. If this is not defined the VFS will update the inode + itself and call mark_inode_dirty_sync. + +``atomic_open`` + called on the last component of an open. Using this optional + method the filesystem can look up, possibly create and open the + file in one atomic operation. If it wants to leave actual + opening to the caller (e.g. if the file turned out to be a + symlink, device, or just something filesystem won't do atomic + open for), it may signal this by returning finish_no_open(file, + dentry). This method is only called if the last component is + negative or needs lookup. Cached positive dentries are still + handled by f_op->open(). If the file was created, FMODE_CREATED + flag should be set in file->f_mode. In case of O_EXCL the + method must only succeed if the file didn't exist and hence + FMODE_CREATED shall always be set on success. + +``tmpfile`` + called in the end of O_TMPFILE open(). Optional, equivalent to + atomically creating, opening and unlinking a file in given + directory. + + +The Address Space Object +======================== + +The address space object is used to group and manage pages in the page +cache. 
It can be used to keep track of the pages in a file (or anything +else) and also track the mapping of sections of the file into process +address spaces. + +There are a number of distinct yet related services that an +address-space can provide. These include communicating memory pressure, +page lookup by address, and keeping track of pages tagged as Dirty or +Writeback. + +The first can be used independently to the others. The VM can try to +either write dirty pages in order to clean them, or release clean pages +in order to reuse them. To do this it can call the ->writepage method +on dirty pages, and ->releasepage on clean pages with PagePrivate set. +Clean pages without PagePrivate and with no external references will be +released without notice being given to the address_space. + +To achieve this functionality, pages need to be placed on an LRU with +lru_cache_add and mark_page_active needs to be called whenever the page +is used. + +Pages are normally kept in a radix tree index by ->index. This tree +maintains information about the PG_Dirty and PG_Writeback status of each +page, so that pages with either of these flags can be found quickly. + +The Dirty tag is primarily used by mpage_writepages - the default +->writepages method. It uses the tag to find dirty pages to call +->writepage on. If mpage_writepages is not used (i.e. the address +provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost +unused. write_inode_now and sync_inode do use it (through +__sync_single_inode) to check if ->writepages has been successful in +writing out the whole address_space. + +The Writeback tag is used by filemap*wait* and sync_page* functions, via +filemap_fdatawait_range, to wait for all writeback to complete. + +An address_space handler may attach extra information to a page, +typically using the 'private' field in the 'struct page'. If such +information is attached, the PG_Private flag should be set. 
This will +cause various VM routines to make extra calls into the address_space +handler to deal with that data. + +An address space acts as an intermediate between storage and +application. Data is read into the address space a whole page at a +time, and provided to the application either by copying of the page, or +by memory-mapping the page. Data is written into the address space by +the application, and then written-back to storage typically in whole +pages, however the address_space has finer control of write sizes. + +The read process essentially only requires 'readpage'. The write +process is more complicated and uses write_begin/write_end or +set_page_dirty to write data into the address_space, and writepage and +writepages to writeback data to storage. + +Adding and removing pages to/from an address_space is protected by the +inode's i_mutex. + +When data is written to a page, the PG_Dirty flag should be set. It +typically remains set until writepage asks for it to be written. This +should clear PG_Dirty and set PG_Writeback. It can be actually written +at any point after PG_Dirty is clear. Once it is known to be safe, +PG_Writeback is cleared. + +Writeback makes use of a writeback_control structure to direct the +operations. This gives the writepage and writepages operations some +information about the nature of and reason for the writeback request, +and the constraints under which it is being done. It is also used to +return information back to the caller about the result of a writepage or +writepages request. + + +Handling errors during writeback +-------------------------------- + +Most applications that do buffered I/O will periodically call a file +synchronization call (fsync, fdatasync, msync or sync_file_range) to +ensure that data written has made it to the backing store. When there +is an error during writeback, they expect that error to be reported when +a file sync request is made.
After an error has been reported on one +request, subsequent requests on the same file descriptor should return +0, unless further writeback errors have occurred since the previous file +synchronization. + +Ideally, the kernel would report errors only on file descriptions on +which writes were done that subsequently failed to be written back. The +generic pagecache infrastructure does not track the file descriptions +that have dirtied each individual page however, so determining which +file descriptors should get back an error is not possible. + +Instead, the generic writeback error tracking infrastructure in the +kernel settles for reporting errors to fsync on all file descriptions +that were open at the time that the error occurred. In a situation with +multiple writers, all of them will get back an error on a subsequent +fsync, even if all of the writes done through that particular file +descriptor succeeded (or even if there were no writes on that file +descriptor at all). + +Filesystems that wish to use this infrastructure should call +mapping_set_error to record the error in the address_space when it +occurs. Then, after writing back data from the pagecache in their +file->fsync operation, they should call file_check_and_advance_wb_err to +ensure that the struct file's error cursor has advanced to the correct +point in the stream of errors emitted by the backing device(s). + + +struct address_space_operations +------------------------------- + +This describes how the VFS can manipulate mapping of a file to page +cache in your filesystem. The following members are defined: + +..
code-block:: c + + struct address_space_operations { + int (*writepage)(struct page *page, struct writeback_control *wbc); + int (*readpage)(struct file *, struct page *); + int (*writepages)(struct address_space *, struct writeback_control *); + int (*set_page_dirty)(struct page *page); + int (*readpages)(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + sector_t (*bmap)(struct address_space *, sector_t); + void (*invalidatepage) (struct page *, unsigned int, unsigned int); + int (*releasepage) (struct page *, int); + void (*freepage)(struct page *); + ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); + /* isolate a page for migration */ + bool (*isolate_page) (struct page *, isolate_mode_t); + /* migrate the contents of a page to the specified target */ + int (*migratepage) (struct page *, struct page *); + /* put migration-failed page back to right list */ + void (*putback_page) (struct page *); + int (*launder_page) (struct page *); + + int (*is_partially_uptodate) (struct page *, unsigned long, + unsigned long); + void (*is_dirty_writeback) (struct page *, bool *, bool *); + int (*error_remove_page) (struct mapping *mapping, struct page *page); + int (*swap_activate)(struct file *); + int (*swap_deactivate)(struct file *); + }; + +``writepage`` + called by the VM to write a dirty page to backing store. This + may happen for data integrity reasons (i.e. 'sync'), or to free + up memory (flush). The difference can be seen in + wbc->sync_mode. The PG_Dirty flag has been cleared and + PageLocked is true. 
writepage should start writeout, should set + PG_Writeback, and should make sure the page is unlocked, either + synchronously or asynchronously when the write operation + completes. + + If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to + try too hard if there are problems, and may choose to write out + other pages from the mapping if that is easier (e.g. due to + internal dependencies). If it chooses not to start writeout, it + should return AOP_WRITEPAGE_ACTIVATE so that the VM will not + keep calling ->writepage on that page. + + See the file "Locking" for more details. + +``readpage`` + called by the VM to read a page from backing store. The page + will be Locked when readpage is called, and should be unlocked + and marked uptodate once the read completes. If ->readpage + discovers that it needs to unlock the page for some reason, it + can do so, and then return AOP_TRUNCATED_PAGE. In this case, + the page will be relocated, relocked and if that all succeeds, + ->readpage will be called again. + +``writepages`` + called by the VM to write out pages associated with the + address_space object. If wbc->sync_mode is WB_SYNC_ALL, then + the writeback_control will specify a range of pages that must be + written out. If it is WB_SYNC_NONE, then a nr_to_write is + given and that many pages should be written if possible. If no + ->writepages is given, then mpage_writepages is used instead. + This will choose pages from the address space that are tagged as + DIRTY and will pass them to ->writepage. + +``set_page_dirty`` + called by the VM to set a page dirty. This is particularly + needed if an address space attaches private data to a page, and + that data needs to be updated when a page is dirtied. This is + called, for example, when a memory mapped page gets modified. + If defined, it should set the PageDirty flag, and the + PAGECACHE_TAG_DIRTY tag in the radix tree.
+ +``readpages`` + called by the VM to read pages associated with the address_space + object. This is essentially just a vector version of readpage. + Instead of just one page, several pages are requested. + readpages is only used for read-ahead, so read errors are + ignored. If anything goes wrong, feel free to give up. + +``write_begin`` + Called by the generic buffered write code to ask the filesystem + to prepare to write len bytes at the given offset in the file. + The address_space should check that the write will be able to + complete, by allocating space if necessary and doing any other + internal housekeeping. If the write will update parts of any + basic-blocks on storage, then those blocks should be pre-read + (if they haven't been read already) so that the updated blocks + can be written out properly. + + The filesystem must return the locked pagecache page for the + specified offset, in ``*pagep``, for the caller to write into. + + It must be able to cope with short writes (where the length + passed to write_begin is greater than the number of bytes copied + into the page). + + flags is a field for AOP_FLAG_xxx flags, described in + include/linux/fs.h. + + A void * may be returned in fsdata, which then gets passed into + write_end. + + Returns 0 on success; < 0 on failure (which is the error code), + in which case write_end is not called. + +``write_end`` + After a successful write_begin, and data copy, write_end must be + called. len is the original len passed to write_begin, and + copied is the amount that was able to be copied. + + The filesystem must take care of unlocking the page and + releasing its refcount, and updating i_size. + + Returns < 0 on failure, otherwise the number of bytes (<= + 'copied') that were able to be copied into pagecache. + +``bmap`` + called by the VFS to map a logical block offset within object to + physical block number. This method is used by the FIBMAP ioctl + and for working with swap-files.
To be able to swap to a file, + the file must have a stable mapping to a block device. The swap + system does not go through the filesystem but instead uses bmap + to find out where the blocks in the file are and uses those + addresses directly. + +``invalidatepage`` + If a page has PagePrivate set, then invalidatepage will be + called when part or all of the page is to be removed from the + address space. This generally corresponds to either a + truncation, punch hole or a complete invalidation of the address + space (in the latter case 'offset' will always be 0 and 'length' + will be PAGE_SIZE). Any private data associated with the page + should be updated to reflect this truncation. If offset is 0 + and length is PAGE_SIZE, then the private data should be + released, because the page must be able to be completely + discarded. This may be done by calling the ->releasepage + function, but in this case the release MUST succeed. + +``releasepage`` + releasepage is called on PagePrivate pages to indicate that the + page should be freed if possible. ->releasepage should remove + any private data from the page and clear the PagePrivate flag. + If releasepage() fails for some reason, it must indicate failure + with a 0 return value. releasepage() is used in two distinct + though related cases. The first is when the VM finds a clean + page with no active users and wants to make it a free page. If + ->releasepage succeeds, the page will be removed from the + address_space and become free. + + The second case is when a request has been made to invalidate + some or all pages in an address_space. This can happen through + the fadvise(POSIX_FADV_DONTNEED) system call or by the + filesystem explicitly requesting it as nfs and 9fs do (when they + believe the cache may be out of date with storage) by calling + invalidate_inode_pages2(). If the filesystem makes such a call, + and needs to be certain that all pages are invalidated, then its + releasepage will need to ensure this. 
Possibly it can clear the + PageUptodate bit if it cannot free private data yet. + +``freepage`` + freepage is called once the page is no longer visible in the + page cache in order to allow the cleanup of any private data. + Since it may be called by the memory reclaimer, it should not + assume that the original address_space mapping still exists, and + it should not block. + +``direct_IO`` + called by the generic read/write routines to perform direct_IO - + that is IO requests which bypass the page cache and transfer + data directly between the storage and the application's address + space. + +``isolate_page`` + Called by the VM when isolating a movable non-lru page. If page + is successfully isolated, VM marks the page as PG_isolated via + __SetPageIsolated. + +``migrate_page`` + This is used to compact the physical memory usage. If the VM + wants to relocate a page (maybe off a memory card that is + signalling imminent failure) it will pass a new page and an old + page to this function. migrate_page should transfer any private + data across and update any references that it has to the page. + +``putback_page`` + Called by the VM when isolated page's migration fails. + +``launder_page`` + Called before freeing a page - it writes back the dirty page. + To prevent redirtying the page, it is kept locked during the + whole operation. + +``is_partially_uptodate`` + Called by the VM when reading a file through the pagecache when + the underlying blocksize != pagesize. If the required block is + up to date then the read can complete without needing the IO to + bring the whole page up to date. + +``is_dirty_writeback`` + Called by the VM when attempting to reclaim a page. The VM uses + dirty and writeback information to determine if it needs to + stall to allow flushers a chance to complete some IO. 
+ Ordinarily it can use PageDirty and PageWriteback but some + filesystems have more complex state (unstable pages in NFS + prevent reclaim) or do not set those flags due to locking + problems. This callback allows a filesystem to indicate to the + VM if a page should be treated as dirty or writeback for the + purposes of stalling. + +``error_remove_page`` + normally set to generic_error_remove_page if truncation is ok + for this address space. Used for memory failure handling. + Setting this implies you deal with pages going away under you, + unless you have them locked or reference counts increased. + +``swap_activate`` + Called when swapon is used on a file to allocate space if + necessary and pin the block lookup information in memory. A + return value of zero indicates success, in which case this file + can be used to back swapspace. + +``swap_deactivate`` + Called during swapoff on files where swap_activate was + successful. + + +The File Object +=============== + +A file object represents a file opened by a process. This is also known +as an "open file description" in POSIX parlance. + + +struct file_operations +---------------------- + +This describes how the VFS can manipulate an open file. As of kernel +4.18, the following members are defined: + +.. 
code-block:: c + + struct file_operations { + struct module *owner; + loff_t (*llseek) (struct file *, loff_t, int); + ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); + ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); + ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); + ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); + int (*iopoll)(struct kiocb *kiocb, bool spin); + int (*iterate) (struct file *, struct dir_context *); + int (*iterate_shared) (struct file *, struct dir_context *); + __poll_t (*poll) (struct file *, struct poll_table_struct *); + long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); + long (*compat_ioctl) (struct file *, unsigned int, unsigned long); + int (*mmap) (struct file *, struct vm_area_struct *); + int (*open) (struct inode *, struct file *); + int (*flush) (struct file *, fl_owner_t id); + int (*release) (struct inode *, struct file *); + int (*fsync) (struct file *, loff_t, loff_t, int datasync); + int (*fasync) (int, struct file *, int); + int (*lock) (struct file *, int, struct file_lock *); + ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + int (*check_flags)(int); + int (*flock) (struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); + int (*setlease)(struct file *, long, struct file_lock **, void **); + long (*fallocate)(struct file *file, int mode, loff_t offset, + loff_t len); + void (*show_fdinfo)(struct seq_file *m, struct file *f); + #ifndef CONFIG_MMU + unsigned (*mmap_capabilities)(struct file *); + #endif + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); + loff_t 
(*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); + int (*fadvise)(struct file *, loff_t, loff_t, int); + }; + +Again, all methods are called without any locks being held, unless +otherwise noted. + +``llseek`` + called when the VFS needs to move the file position index + +``read`` + called by read(2) and related system calls + +``read_iter`` + possibly asynchronous read with iov_iter as destination + +``write`` + called by write(2) and related system calls + +``write_iter`` + possibly asynchronous write with iov_iter as source + +``iopoll`` + called when aio wants to poll for completions on HIPRI iocbs + +``iterate`` + called when the VFS needs to read the directory contents + +``iterate_shared`` + called when the VFS needs to read the directory contents when + filesystem supports concurrent dir iterators + +``poll`` + called by the VFS when a process wants to check if there is + activity on this file and (optionally) go to sleep until there + is activity. Called by the select(2) and poll(2) system calls + +``unlocked_ioctl`` + called by the ioctl(2) system call. + +``compat_ioctl`` + called by the ioctl(2) system call when 32 bit system calls are + used on 64 bit kernels. + +``mmap`` + called by the mmap(2) system call + +``open`` + called by the VFS when an inode should be opened. When the VFS + opens a file, it creates a new "struct file". It then calls the + open method for the newly allocated file structure. You might + think that the open method really belongs in "struct + inode_operations", and you may be right. I think it's done the + way it is because it makes filesystems simpler to implement. 
+ The open() method is a good place to initialize the + "private_data" member in the file structure if you want to point + to a device structure + +``flush`` + called by the close(2) system call to flush a file + +``release`` + called when the last reference to an open file is closed + +``fsync`` + called by the fsync(2) system call. Also see the section above + entitled "Handling errors during writeback". + +``fasync`` + called by the fcntl(2) system call when asynchronous + (non-blocking) mode is enabled for a file + +``lock`` + called by the fcntl(2) system call for F_GETLK, F_SETLK, and + F_SETLKW commands + +``get_unmapped_area`` + called by the mmap(2) system call + +``check_flags`` + called by the fcntl(2) system call for F_SETFL command + +``flock`` + called by the flock(2) system call + +``splice_write`` + called by the VFS to splice data from a pipe to a file. This + method is used by the splice(2) system call + +``splice_read`` + called by the VFS to splice data from file to a pipe. This + method is used by the splice(2) system call + +``setlease`` + called by the VFS to set or release a file lock lease. setlease + implementations should call generic_setlease to record or remove + the lease in the inode after setting it. + +``fallocate`` + called by the VFS to preallocate blocks or punch a hole. + +``copy_file_range`` + called by the copy_file_range(2) system call. + +``remap_file_range`` + called by the ioctl(2) system call for FICLONERANGE and FICLONE + and FIDEDUPERANGE commands to remap file ranges. An + implementation should remap len bytes at pos_in of the source + file into the dest file at pos_out. Implementations must handle + callers passing in len == 0; this means "remap to the end of the + source file". The return value should the number of bytes + remapped, or the usual negative error code if errors occurred + before any bytes were remapped. The remap_flags parameter + accepts REMAP_FILE_* flags. 
If REMAP_FILE_DEDUP is set then the + implementation must only remap if the requested file ranges have + identical contents. If REMAP_CAN_SHORTEN is set, the caller is + ok with the implementation shortening the request length to + satisfy alignment or EOF requirements (or any other reason). + +``fadvise`` + possibly called by the fadvise64() system call. + +Note that the file operations are implemented by the specific +filesystem in which the inode resides. When opening a device node +(character or block special) most filesystems will call special +support routines in the VFS which will locate the required device +driver information. These support routines replace the filesystem file +operations with those for the device driver, and then proceed to call +the new open() method for the file. This is how opening a device file +in the filesystem eventually ends up calling the device driver open() +method. + + +Directory Entry Cache (dcache) +============================== + + +struct dentry_operations +------------------------ + +This describes how a filesystem can overload the standard dentry +operations. Dentries and the dcache are the domain of the VFS and the +individual filesystem implementations. Device drivers have no business +here. These methods may be set to NULL, as they are either optional or +the VFS uses a default. As of kernel 2.6.22, the following members are +defined: + +.. 
code-block:: c + + struct dentry_operations { + int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_weak_revalidate)(struct dentry *, unsigned int); + int (*d_hash)(const struct dentry *, struct qstr *); + int (*d_compare)(const struct dentry *, + unsigned int, const char *, const struct qstr *); + int (*d_delete)(const struct dentry *); + int (*d_init)(struct dentry *); + void (*d_release)(struct dentry *); + void (*d_iput)(struct dentry *, struct inode *); + char *(*d_dname)(struct dentry *, char *, int); + struct vfsmount *(*d_automount)(struct path *); + int (*d_manage)(const struct path *, bool); + struct dentry *(*d_real)(struct dentry *, const struct inode *); + }; + +``d_revalidate`` + called when the VFS needs to revalidate a dentry. This is + called whenever a name look-up finds a dentry in the dcache. + Most local filesystems leave this as NULL, because all their + dentries in the dcache are valid. Network filesystems are + different since things can change on the server without the + client necessarily being aware of it. + + This function should return a positive value if the dentry is + still valid, and zero or a negative error code if it isn't. + + d_revalidate may be called in rcu-walk mode (flags & + LOOKUP_RCU). If in rcu-walk mode, the filesystem must + revalidate the dentry without blocking or storing to the dentry, + d_parent and d_inode should not be used without care (because + they can change and, in d_inode case, even become NULL under + us). + + If a situation is encountered that rcu-walk cannot handle, + return + -ECHILD and it will be called again in ref-walk mode. + +``_weak_revalidate`` + called when the VFS needs to revalidate a "jumped" dentry. This + is called when a path-walk ends at dentry that was not acquired + by doing a lookup in the parent directory. This includes "/", + "." and "..", as well as procfs-style symlinks and mountpoint + traversal. 
+ + In this case, we are less concerned with whether the dentry is + still fully correct, but rather that the inode is still valid. + As with d_revalidate, most local filesystems will set this to + NULL since their dcache entries are always valid. + + This function has the same return code semantics as + d_revalidate. + + d_weak_revalidate is only called after leaving rcu-walk mode. + +``d_hash`` + called when the VFS adds a dentry to the hash table. The first + dentry passed to d_hash is the parent directory that the name is + to be hashed into. + + Same locking and synchronisation rules as d_compare regarding + what is safe to dereference etc. + +``d_compare`` + called to compare a dentry name with a given name. The first + dentry is the parent of the dentry to be compared, the second is + the child dentry. len and name string are properties of the + dentry to be compared. qstr is the name to compare it with. + + Must be constant and idempotent, and should not take locks if + possible, and should not or store into the dentry. Should not + dereference pointers outside the dentry without lots of care + (eg. d_parent, d_inode, d_name should not be used). + + However, our vfsmount is pinned, and RCU held, so the dentries + and inodes won't disappear, neither will our sb or filesystem + module. ->d_sb may be used. + + It is a tricky calling convention because it needs to be called + under "rcu-walk", ie. without any locks or references on things. + +``d_delete`` + called when the last reference to a dentry is dropped and the + dcache is deciding whether or not to cache it. Return 1 to + delete immediately, or 0 to cache the dentry. Default is NULL + which means to always cache a reachable dentry. d_delete must + be constant and idempotent. + +``d_init`` + called when a dentry is allocated + +``d_release`` + called when a dentry is really deallocated + +``d_iput`` + called when a dentry loses its inode (just prior to its being + deallocated). 
The default when this is NULL is that the VFS + calls iput(). If you define this method, you must call iput() + yourself + +``d_dname`` + called when the pathname of a dentry should be generated. + Useful for some pseudo filesystems (sockfs, pipefs, ...) to + delay pathname generation. (Instead of doing it when dentry is + created, it's done only when the path is needed.). Real + filesystems probably dont want to use it, because their dentries + are present in global dcache hash, so their hash should be an + invariant. As no lock is held, d_dname() should not try to + modify the dentry itself, unless appropriate SMP safety is used. + CAUTION : d_path() logic is quite tricky. The correct way to + return for example "Hello" is to put it at the end of the + buffer, and returns a pointer to the first char. + dynamic_dname() helper function is provided to take care of + this. + + Example : + +.. code-block:: c + + static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) + { + return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", + dentry->d_inode->i_ino); + } + +``d_automount`` + called when an automount dentry is to be traversed (optional). + This should create a new VFS mount record and return the record + to the caller. The caller is supplied with a path parameter + giving the automount directory to describe the automount target + and the parent VFS mount record to provide inheritable mount + parameters. NULL should be returned if someone else managed to + make the automount first. If the vfsmount creation failed, then + an error code should be returned. If -EISDIR is returned, then + the directory will be treated as an ordinary directory and + returned to pathwalk to continue walking. + + If a vfsmount is returned, the caller will attempt to mount it + on the mountpoint and will remove the vfsmount from its + expiration list in the case of failure. 
The vfsmount should be + returned with 2 refs on it to prevent automatic expiration - the + caller will clean up the additional ref. + + This function is only used if DCACHE_NEED_AUTOMOUNT is set on + the dentry. This is set by __d_instantiate() if S_AUTOMOUNT is + set on the inode being added. + +``d_manage`` + called to allow the filesystem to manage the transition from a + dentry (optional). This allows autofs, for example, to hold up + clients waiting to explore behind a 'mountpoint' while letting + the daemon go past and construct the subtree there. 0 should be + returned to let the calling process continue. -EISDIR can be + returned to tell pathwalk to use this directory as an ordinary + directory and to ignore anything mounted on it and not to check + the automount flag. Any other error code will abort pathwalk + completely. + + If the 'rcu_walk' parameter is true, then the caller is doing a + pathwalk in RCU-walk mode. Sleeping is not permitted in this + mode, and the caller can be asked to leave it and call again by + returning -ECHILD. -EISDIR may also be returned to tell + pathwalk to ignore d_automount or any mounts. + + This function is only used if DCACHE_MANAGE_TRANSIT is set on + the dentry being transited from. + +``d_real`` + overlay/union type filesystems implement this method to return + one of the underlying dentries hidden by the overlay. It is + used in two different modes: + + Called from file_dentry() it returns the real dentry matching + the inode argument. The real dentry may be from a lower layer + already copied up, but still referenced from the file. This + mode is selected with a non-NULL inode argument. + + With NULL inode the topmost real underlying dentry is returned. + +Each dentry has a pointer to its parent dentry, as well as a hash list +of child dentries. Child dentries are basically like files in a +directory. 
+ + +Directory Entry Cache API +-------------------------- + +There are a number of functions defined which permit a filesystem to +manipulate dentries: + +``dget`` + open a new handle for an existing dentry (this just increments + the usage count) + +``dput`` + close a handle for a dentry (decrements the usage count). If + the usage count drops to 0, and the dentry is still in its + parent's hash, the "d_delete" method is called to check whether + it should be cached. If it should not be cached, or if the + dentry is not hashed, it is deleted. Otherwise cached dentries + are put into an LRU list to be reclaimed on memory shortage. + +``d_drop`` + this unhashes a dentry from its parents hash list. A subsequent + call to dput() will deallocate the dentry if its usage count + drops to 0 + +``d_delete`` + delete a dentry. If there are no other open references to the + dentry then the dentry is turned into a negative dentry (the + d_iput() method is called). If there are other references, then + d_drop() is called instead + +``d_add`` + add a dentry to its parents hash list and then calls + d_instantiate() + +``d_instantiate`` + add a dentry to the alias hash list for the inode and updates + the "d_inode" member. The "i_count" member in the inode + structure should be set/incremented. If the inode pointer is + NULL, the dentry is called a "negative dentry". This function + is commonly called when an inode is created for an existing + negative dentry + +``d_lookup`` + look up a dentry given its parent and path name component It + looks up the child of that given name from the dcache hash + table. If it is found, the reference count is incremented and + the dentry is returned. The caller must use dput() to free the + dentry when it finishes using it. + + +Mount Options +============= + + +Parsing options +--------------- + +On mount and remount the filesystem is passed a string containing a +comma separated list of mount options. 
The options can have either of +these forms: + + option + option=value + +The <linux/parser.h> header defines an API that helps parse these +options. There are plenty of examples on how to use it in existing +filesystems. + + +Showing options +--------------- + +If a filesystem accepts mount options, it must define show_options() to +show all the currently active options. The rules are: + + - options MUST be shown which are not default or their values differ + from the default + + - options MAY be shown which are enabled by default or have their + default value + +Options used only internally between a mount helper and the kernel (such +as file descriptors), or which only have an effect during the mounting +(such as ones controlling the creation of a journal) are exempt from the +above rules. + +The underlying reason for the above rules is to make sure, that a mount +can be accurately replicated (e.g. umounting and mounting again) based +on the information found in /proc/mounts. + + +Resources +========= + +(Note some of these resources are not up-to-date with the latest kernel + version.) + +Creating Linux virtual filesystems. 2002 + <http://lwn.net/Articles/13325/> + +The Linux Virtual File-system Layer by Neil Brown. 1999 + <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html> + +A tour of the Linux VFS by Michael K. Johnson. 1996 + <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html> + +A small trail through the Linux kernel by Andries Brouwer. 2001 + <http://www.win.tue.nl/~aeb/linux/vfs/trail.html> diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt deleted file mode 100644 index 57fc576b1f3e..000000000000 --- a/Documentation/filesystems/vfs.txt +++ /dev/null @@ -1,1268 +0,0 @@ - - Overview of the Linux Virtual File System - - Original author: Richard Gooch <rgooch@atnf.csiro.au> - - Copyright (C) 1999 Richard Gooch - Copyright (C) 2005 Pekka Enberg - - This file is released under the GPLv2. 
- - -Introduction -============ - -The Virtual File System (also known as the Virtual Filesystem Switch) -is the software layer in the kernel that provides the filesystem -interface to userspace programs. It also provides an abstraction -within the kernel which allows different filesystem implementations to -coexist. - -VFS system calls open(2), stat(2), read(2), write(2), chmod(2) and so -on are called from a process context. Filesystem locking is described -in the document Documentation/filesystems/Locking. - - -Directory Entry Cache (dcache) ------------------------------- - -The VFS implements the open(2), stat(2), chmod(2), and similar system -calls. The pathname argument that is passed to them is used by the VFS -to search through the directory entry cache (also known as the dentry -cache or dcache). This provides a very fast look-up mechanism to -translate a pathname (filename) into a specific dentry. Dentries live -in RAM and are never saved to disc: they exist only for performance. - -The dentry cache is meant to be a view into your entire filespace. As -most computers cannot fit all dentries in the RAM at the same time, -some bits of the cache are missing. In order to resolve your pathname -into a dentry, the VFS may have to resort to creating dentries along -the way, and then loading the inode. This is done by looking up the -inode. - - -The Inode Object ----------------- - -An individual dentry usually has a pointer to an inode. Inodes are -filesystem objects such as regular files, directories, FIFOs and other -beasts. They live either on the disc (for block device filesystems) -or in the memory (for pseudo filesystems). Inodes that live on the -disc are copied into the memory when required and changes to the inode -are written back to disc. A single inode can be pointed to by multiple -dentries (hard links, for example, do this). - -To look up an inode requires that the VFS calls the lookup() method of -the parent directory inode. 
This method is installed by the specific -filesystem implementation that the inode lives in. Once the VFS has -the required dentry (and hence the inode), we can do all those boring -things like open(2) the file, or stat(2) it to peek at the inode -data. The stat(2) operation is fairly simple: once the VFS has the -dentry, it peeks at the inode data and passes some of it back to -userspace. - - -The File Object ---------------- - -Opening a file requires another operation: allocation of a file -structure (this is the kernel-side implementation of file -descriptors). The freshly allocated file structure is initialized with -a pointer to the dentry and a set of file operation member functions. -These are taken from the inode data. The open() file method is then -called so the specific filesystem implementation can do its work. You -can see that this is another switch performed by the VFS. The file -structure is placed into the file descriptor table for the process. - -Reading, writing and closing files (and other assorted VFS operations) -is done by using the userspace file descriptor to grab the appropriate -file structure, and then calling the required file structure method to -do whatever is required. For as long as the file is open, it keeps the -dentry in use, which in turn means that the VFS inode is still in use. - - -Registering and Mounting a Filesystem -===================================== - -To register and unregister a filesystem, use the following API -functions: - - #include <linux/fs.h> - - extern int register_filesystem(struct file_system_type *); - extern int unregister_filesystem(struct file_system_type *); - -The passed struct file_system_type describes your filesystem. When a -request is made to mount a filesystem onto a directory in your namespace, -the VFS will call the appropriate mount() method for the specific -filesystem. 
New vfsmount referring to the tree returned by ->mount() -will be attached to the mountpoint, so that when pathname resolution -reaches the mountpoint it will jump into the root of that vfsmount. - -You can see all filesystems that are registered to the kernel in the -file /proc/filesystems. - - -struct file_system_type ------------------------ - -This describes the filesystem. As of kernel 2.6.39, the following -members are defined: - -struct file_system_type { - const char *name; - int fs_flags; - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); - void (*kill_sb) (struct super_block *); - struct module *owner; - struct file_system_type * next; - struct list_head fs_supers; - struct lock_class_key s_lock_key; - struct lock_class_key s_umount_key; -}; - - name: the name of the filesystem type, such as "ext2", "iso9660", - "msdos" and so on - - fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) - - mount: the method to call when a new instance of this - filesystem should be mounted - - kill_sb: the method to call when an instance of this filesystem - should be shut down - - owner: for internal VFS use: you should initialize this to THIS_MODULE in - most cases. - - next: for internal VFS use: you should initialize this to NULL - - s_lock_key, s_umount_key: lockdep-specific - -The mount() method has the following arguments: - - struct file_system_type *fs_type: describes the filesystem, partly initialized - by the specific filesystem code - - int flags: mount flags - - const char *dev_name: the device name we are mounting. - - void *data: arbitrary mount options, usually comes as an ASCII - string (see "Mount Options" section) - -The mount() method must return the root dentry of the tree requested by -caller. An active reference to its superblock must be grabbed and the -superblock must be locked. On failure it should return ERR_PTR(error). 
- -The arguments match those of mount(2) and their interpretation -depends on filesystem type. E.g. for block filesystems, dev_name is -interpreted as block device name, that device is opened and if it -contains a suitable filesystem image the method creates and initializes -struct super_block accordingly, returning its root dentry to caller. - -->mount() may choose to return a subtree of existing filesystem - it -doesn't have to create a new one. The main result from the caller's -point of view is a reference to dentry at the root of (sub)tree to -be attached; creation of new superblock is a common side effect. - -The most interesting member of the superblock structure that the -mount() method fills in is the "s_op" field. This is a pointer to -a "struct super_operations" which describes the next level of the -filesystem implementation. - -Usually, a filesystem uses one of the generic mount() implementations -and provides a fill_super() callback instead. The generic variants are: - - mount_bdev: mount a filesystem residing on a block device - - mount_nodev: mount a filesystem that is not backed by a device - - mount_single: mount a filesystem which shares the instance between - all mounts - -A fill_super() callback implementation has the following arguments: - - struct super_block *sb: the superblock structure. The callback - must initialize this properly. - - void *data: arbitrary mount options, usually comes as an ASCII - string (see "Mount Options" section) - - int silent: whether or not to be silent on error - - -The Superblock Object -===================== - -A superblock object represents a mounted filesystem. - - -struct super_operations ------------------------ - -This describes how the VFS can manipulate the superblock of your -filesystem. 
As of kernel 2.6.22, the following members are defined: - -struct super_operations { - struct inode *(*alloc_inode)(struct super_block *sb); - void (*destroy_inode)(struct inode *); - - void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, int); - void (*drop_inode) (struct inode *); - void (*delete_inode) (struct inode *); - void (*put_super) (struct super_block *); - int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_fs) (struct super_block *); - int (*unfreeze_fs) (struct super_block *); - int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); - void (*clear_inode) (struct inode *); - void (*umount_begin) (struct super_block *); - - int (*show_options)(struct seq_file *, struct dentry *); - - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - int (*nr_cached_objects)(struct super_block *); - void (*free_cached_objects)(struct super_block *, int); -}; - -All methods are called without any locks being held, unless otherwise -noted. This means that most methods can block safely. All methods are -only called from a process context (i.e. not from an interrupt handler -or bottom half). - - alloc_inode: this method is called by alloc_inode() to allocate memory - for struct inode and initialize it. If this function is not - defined, a simple 'struct inode' is allocated. Normally - alloc_inode will be used to allocate a larger structure which - contains a 'struct inode' embedded within it. - - destroy_inode: this method is called by destroy_inode() to release - resources allocated for struct inode. It is only required if - ->alloc_inode was defined and simply undoes anything done by - ->alloc_inode. - - dirty_inode: this method is called by the VFS to mark an inode dirty. - - write_inode: this method is called when the VFS needs to write an - inode to disc. 
The second parameter indicates whether the write - should be synchronous or not, not all filesystems check this flag. - - drop_inode: called when the last access to the inode is dropped, - with the inode->i_lock spinlock held. - - This method should be either NULL (normal UNIX filesystem - semantics) or "generic_delete_inode" (for filesystems that do not - want to cache inodes - causing "delete_inode" to always be - called regardless of the value of i_nlink) - - The "generic_delete_inode()" behavior is equivalent to the - old practice of using "force_delete" in the put_inode() case, - but does not have the races that the "force_delete()" approach - had. - - delete_inode: called when the VFS wants to delete an inode - - put_super: called when the VFS wishes to free the superblock - (i.e. unmount). This is called with the superblock lock held - - sync_fs: called when VFS is writing out all dirty data associated with - a superblock. The second parameter indicates whether the method - should wait until the write out has been completed. Optional. - - freeze_fs: called when VFS is locking a filesystem and - forcing it into a consistent state. This method is currently - used by the Logical Volume Manager (LVM). - - unfreeze_fs: called when VFS is unlocking a filesystem and making it writable - again. - - statfs: called when the VFS needs to get filesystem statistics. - - remount_fs: called when the filesystem is remounted. This is called - with the kernel lock held - - clear_inode: called then the VFS clears the inode. Optional - - umount_begin: called when the VFS is unmounting a filesystem. - - show_options: called by the VFS to show mount options for - /proc/<pid>/mounts. (see "Mount Options" section) - - quota_read: called by the VFS to read from filesystem quota file. - - quota_write: called by the VFS to write to filesystem quota file. 
- - nr_cached_objects: called by the sb cache shrinking function for the - filesystem to return the number of freeable cached objects it contains. - Optional. - - free_cache_objects: called by the sb cache shrinking function for the - filesystem to scan the number of objects indicated to try to free them. - Optional, but any filesystem implementing this method needs to also - implement ->nr_cached_objects for it to be called correctly. - - We can't do anything with any errors that the filesystem might - encountered, hence the void return type. This will never be called if - the VM is trying to reclaim under GFP_NOFS conditions, hence this - method does not need to handle that situation itself. - - Implementations must include conditional reschedule calls inside any - scanning loop that is done. This allows the VFS to determine - appropriate scan batch sizes without having to worry about whether - implementations will cause holdoff problems due to large scan batch - sizes. - -Whoever sets up the inode is responsible for filling in the "i_op" field. This -is a pointer to a "struct inode_operations" which describes the methods that -can be performed on individual inodes. - -struct xattr_handlers ---------------------- - -On filesystems that support extended attributes (xattrs), the s_xattr -superblock field points to a NULL-terminated array of xattr handlers. Extended -attributes are name:value pairs. - - name: Indicates that the handler matches attributes with the specified name - (such as "system.posix_acl_access"); the prefix field must be NULL. - - prefix: Indicates that the handler matches all attributes with the specified - name prefix (such as "user."); the name field must be NULL. - - list: Determine if attributes matching this xattr handler should be listed - for a particular dentry. Used by some listxattr implementations like - generic_listxattr. - - get: Called by the VFS to get the value of a particular extended attribute. 
- This method is called by the getxattr(2) system call. - - set: Called by the VFS to set the value of a particular extended attribute. - When the new value is NULL, called to remove a particular extended - attribute. This method is called by the the setxattr(2) and - removexattr(2) system calls. - -When none of the xattr handlers of a filesystem match the specified attribute -name or when a filesystem doesn't support extended attributes, the various -*xattr(2) system calls return -EOPNOTSUPP. - - -The Inode Object -================ - -An inode object represents an object within the filesystem. - - -struct inode_operations ------------------------ - -This describes how the VFS can manipulate an inode in your -filesystem. As of kernel 2.6.22, the following members are defined: - -struct inode_operations { - int (*create) (struct inode *,struct dentry *, umode_t, bool); - struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - int (*link) (struct dentry *,struct inode *,struct dentry *); - int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,umode_t); - int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *, unsigned int); - int (*readlink) (struct dentry *, char __user *,int); - const char *(*get_link) (struct dentry *, struct inode *, - struct delayed_call *); - int (*permission) (struct inode *, int); - int (*get_acl)(struct inode *, int); - int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); - ssize_t (*listxattr) (struct dentry *, char *, size_t); - void (*update_time)(struct inode *, struct timespec *, int); - int (*atomic_open)(struct inode *, struct dentry *, struct file *, - unsigned open_flag, umode_t create_mode); - int 
(*tmpfile) (struct inode *, struct dentry *, umode_t); -}; - -Again, all methods are called without any locks being held, unless -otherwise noted. - - create: called by the open(2) and creat(2) system calls. Only - required if you want to support regular files. The dentry you - get should not have an inode (i.e. it should be a negative - dentry). Here you will probably call d_instantiate() with the - dentry and the newly created inode - - lookup: called when the VFS needs to look up an inode in a parent - directory. The name to look for is found in the dentry. This - method must call d_add() to insert the found inode into the - dentry. The "i_count" field in the inode structure should be - incremented. If the named inode does not exist a NULL inode - should be inserted into the dentry (this is called a negative - dentry). Returning an error code from this routine must only - be done on a real error, otherwise creating inodes with system - calls like creat(2), mknod(2), mkdir(2) and so on will fail. - If you wish to overload the dentry methods then you should - initialise the "d_op" field in the dentry; this is a pointer - to a struct "dentry_operations". - This method is called with the directory inode semaphore held - - link: called by the link(2) system call. Only required if you want - to support hard links. You will probably need to call - d_instantiate() just as you would in the create() method - - unlink: called by the unlink(2) system call. Only required if you - want to support deleting inodes - - symlink: called by the symlink(2) system call. Only required if you - want to support symlinks. You will probably need to call - d_instantiate() just as you would in the create() method - - mkdir: called by the mkdir(2) system call. Only required if you want - to support creating subdirectories. You will probably need to - call d_instantiate() just as you would in the create() method - - rmdir: called by the rmdir(2) system call. 
Only required if you want - to support deleting subdirectories - - mknod: called by the mknod(2) system call to create a device (char, - block) inode or a named pipe (FIFO) or socket. Only required - if you want to support creating these types of inodes. You - will probably need to call d_instantiate() just as you would - in the create() method - - rename: called by the rename(2) system call to rename the object to - have the parent and name given by the second inode and dentry. - - The filesystem must return -EINVAL for any unsupported or - unknown flags. Currently the following flags are implemented: - (1) RENAME_NOREPLACE: this flag indicates that if the target - of the rename exists the rename should fail with -EEXIST - instead of replacing the target. The VFS already checks for - existence, so for local filesystems the RENAME_NOREPLACE - implementation is equivalent to plain rename. - (2) RENAME_EXCHANGE: exchange source and target. Both must - exist; this is checked by the VFS. Unlike plain rename, - source and target may be of different type. - - get_link: called by the VFS to follow a symbolic link to the - inode it points to. Only required if you want to support - symbolic links. This method returns the symlink body - to traverse (and possibly resets the current position with - nd_jump_link()). If the body won't go away until the inode - is gone, nothing else is needed; if it needs to be otherwise - pinned, arrange for its release by having get_link(..., ..., done) - do set_delayed_call(done, destructor, argument). - In that case destructor(argument) will be called once VFS is - done with the body you've returned. - May be called in RCU mode; that is indicated by NULL dentry - argument. If request can't be handled without leaving RCU mode, - have it return ERR_PTR(-ECHILD). - - If the filesystem stores the symlink target in ->i_link, the - VFS may use it directly without calling ->get_link(); however, - ->get_link() must still be provided. 
->i_link must not be - freed until after an RCU grace period. Writing to ->i_link - post-iget() time requires a 'release' memory barrier. - - readlink: this is now just an override for use by readlink(2) for the - cases when ->get_link uses nd_jump_link() or object is not in - fact a symlink. Normally filesystems should only implement - ->get_link for symlinks and readlink(2) will automatically use - that. - - permission: called by the VFS to check for access rights on a POSIX-like - filesystem. - - May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk - mode, the filesystem must check the permission without blocking or - storing to the inode. - - If a situation is encountered that rcu-walk cannot handle, return - -ECHILD and it will be called again in ref-walk mode. - - setattr: called by the VFS to set attributes for a file. This method - is called by chmod(2) and related system calls. - - getattr: called by the VFS to get attributes of a file. This method - is called by stat(2) and related system calls. - - listxattr: called by the VFS to list all extended attributes for a - given file. This method is called by the listxattr(2) system call. - - update_time: called by the VFS to update a specific time or the i_version of - an inode. If this is not defined the VFS will update the inode itself - and call mark_inode_dirty_sync. - - atomic_open: called on the last component of an open. Using this optional - method the filesystem can look up, possibly create and open the file in - one atomic operation. If it wants to leave actual opening to the - caller (e.g. if the file turned out to be a symlink, device, or just - something filesystem won't do atomic open for), it may signal this by - returning finish_no_open(file, dentry). This method is only called if - the last component is negative or needs lookup. Cached positive dentries - are still handled by f_op->open(). If the file was created, - FMODE_CREATED flag should be set in file->f_mode. 
In case of O_EXCL - the method must only succeed if the file didn't exist and hence FMODE_CREATED - shall always be set on success. - - tmpfile: called at the end of O_TMPFILE open(). Optional, equivalent to - atomically creating, opening and unlinking a file in given directory. - -The Address Space Object -======================== - -The address space object is used to group and manage pages in the page -cache. It can be used to keep track of the pages in a file (or -anything else) and also track the mapping of sections of the file into -process address spaces. - -There are a number of distinct yet related services that an -address-space can provide. These include communicating memory -pressure, page lookup by address, and keeping track of pages tagged as -Dirty or Writeback. - -The first can be used independently of the others. The VM can try to -either write dirty pages in order to clean them, or release clean -pages in order to reuse them. To do this it can call the ->writepage -method on dirty pages, and ->releasepage on clean pages with -PagePrivate set. Clean pages without PagePrivate and with no external -references will be released without notice being given to the -address_space. - -To achieve this functionality, pages need to be placed on an LRU with -lru_cache_add and mark_page_active needs to be called whenever the -page is used. - -Pages are normally kept in a radix tree indexed by ->index. This tree -maintains information about the PG_Dirty and PG_Writeback status of -each page, so that pages with either of these flags can be found -quickly. - -The Dirty tag is primarily used by mpage_writepages - the default -->writepages method. It uses the tag to find dirty pages to call -->writepage on. If mpage_writepages is not used (i.e. the address space -provides its own ->writepages), the PAGECACHE_TAG_DIRTY tag is -almost unused. 
write_inode_now and sync_inode do use it (through -__sync_single_inode) to check if ->writepages has been successful in -writing out the whole address_space. - -The Writeback tag is used by filemap*wait* and sync_page* functions, -via filemap_fdatawait_range, to wait for all writeback to complete. - -An address_space handler may attach extra information to a page, -typically using the 'private' field in the 'struct page'. If such -information is attached, the PG_Private flag should be set. This will -cause various VM routines to make extra calls into the address_space -handler to deal with that data. - -An address space acts as an intermediate between storage and -application. Data is read into the address space a whole page at a -time, and provided to the application either by copying of the page, -or by memory-mapping the page. -Data is written into the address space by the application, and then -written-back to storage typically in whole pages, however the -address_space has finer control of write sizes. - -The read process essentially only requires 'readpage'. The write -process is more complicated and uses write_begin/write_end or -set_page_dirty to write data into the address_space, and writepage -and writepages to writeback data to storage. - -Adding and removing pages to/from an address_space is protected by the -inode's i_mutex. - -When data is written to a page, the PG_Dirty flag should be set. It -typically remains set until writepage asks for it to be written. This -should clear PG_Dirty and set PG_Writeback. It can be actually -written at any point after PG_Dirty is clear. Once it is known to be -safe, PG_Writeback is cleared. - -Writeback makes use of a writeback_control structure to direct the -operations. This gives the writepage and writepages operations some -information about the nature of and reason for the writeback request, -and the constraints under which it is being done. 
It is also used to -return information back to the caller about the result of a writepage or -writepages request. - -Handling errors during writeback --------------------------------- -Most applications that do buffered I/O will periodically call a file -synchronization call (fsync, fdatasync, msync or sync_file_range) to -ensure that data written has made it to the backing store. When there -is an error during writeback, they expect that error to be reported when -a file sync request is made. After an error has been reported on one -request, subsequent requests on the same file descriptor should return -0, unless further writeback errors have occurred since the previous file -synchronization. - -Ideally, the kernel would report errors only on file descriptions on -which writes were done that subsequently failed to be written back. The -generic pagecache infrastructure does not track the file descriptions -that have dirtied each individual page however, so determining which -file descriptors should get back an error is not possible. - -Instead, the generic writeback error tracking infrastructure in the -kernel settles for reporting errors to fsync on all file descriptions -that were open at the time that the error occurred. In a situation with -multiple writers, all of them will get back an error on a subsequent fsync, -even if all of the writes done through that particular file descriptor -succeeded (or even if there were no writes on that file descriptor at all). - -Filesystems that wish to use this infrastructure should call -mapping_set_error to record the error in the address_space when it -occurs. Then, after writing back data from the pagecache in their -file->fsync operation, they should call file_check_and_advance_wb_err to -ensure that the struct file's error cursor has advanced to the correct -point in the stream of errors emitted by the backing device(s). 
- -struct address_space_operations -------------------------------- - -This describes how the VFS can manipulate mapping of a file to page cache in -your filesystem. The following members are defined: - -struct address_space_operations { - int (*writepage)(struct page *page, struct writeback_control *wbc); - int (*readpage)(struct file *, struct page *); - int (*writepages)(struct address_space *, struct writeback_control *); - int (*set_page_dirty)(struct page *page); - int (*readpages)(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); - int (*write_begin)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); - int (*write_end)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); - sector_t (*bmap)(struct address_space *, sector_t); - void (*invalidatepage) (struct page *, unsigned int, unsigned int); - int (*releasepage) (struct page *, int); - void (*freepage)(struct page *); - ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); - /* isolate a page for migration */ - bool (*isolate_page) (struct page *, isolate_mode_t); - /* migrate the contents of a page to the specified target */ - int (*migratepage) (struct page *, struct page *); - /* put migration-failed page back to right list */ - void (*putback_page) (struct page *); - int (*launder_page) (struct page *); - - int (*is_partially_uptodate) (struct page *, unsigned long, - unsigned long); - void (*is_dirty_writeback) (struct page *, bool *, bool *); - int (*error_remove_page) (struct address_space *mapping, struct page *page); - int (*swap_activate)(struct file *); - int (*swap_deactivate)(struct file *); -}; - - writepage: called by the VM to write a dirty page to backing store. - This may happen for data integrity reasons (i.e. 'sync'), or - to free up memory (flush). 
The difference can be seen in - wbc->sync_mode. - The PG_Dirty flag has been cleared and PageLocked is true. - writepage should start writeout, should set PG_Writeback, - and should make sure the page is unlocked, either synchronously - or asynchronously when the write operation completes. - - If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to - try too hard if there are problems, and may choose to write out - other pages from the mapping if that is easier (e.g. due to - internal dependencies). If it chooses not to start writeout, it - should return AOP_WRITEPAGE_ACTIVATE so that the VM will not keep - calling ->writepage on that page. - - See the file "Locking" for more details. - - readpage: called by the VM to read a page from backing store. - The page will be Locked when readpage is called, and should be - unlocked and marked uptodate once the read completes. - If ->readpage discovers that it needs to unlock the page for - some reason, it can do so, and then return AOP_TRUNCATED_PAGE. - In this case, the page will be relocated, relocked and if - that all succeeds, ->readpage will be called again. - - writepages: called by the VM to write out pages associated with the - address_space object. If wbc->sync_mode is WB_SYNC_ALL, then - the writeback_control will specify a range of pages that must be - written out. If it is WB_SYNC_NONE, then a nr_to_write is given - and that many pages should be written if possible. - If no ->writepages is given, then mpage_writepages is used - instead. This will choose pages from the address space that are - tagged as DIRTY and will pass them to ->writepage. - - set_page_dirty: called by the VM to set a page dirty. - This is particularly needed if an address space attaches - private data to a page, and that data needs to be updated when - a page is dirtied. This is called, for example, when a memory - mapped page gets modified. 
- If defined, it should set the PageDirty flag, and the - PAGECACHE_TAG_DIRTY tag in the radix tree. - - readpages: called by the VM to read pages associated with the address_space - object. This is essentially just a vector version of - readpage. Instead of just one page, several pages are - requested. - readpages is only used for read-ahead, so read errors are - ignored. If anything goes wrong, feel free to give up. - - write_begin: - Called by the generic buffered write code to ask the filesystem to - prepare to write len bytes at the given offset in the file. The - address_space should check that the write will be able to complete, - by allocating space if necessary and doing any other internal - housekeeping. If the write will update parts of any basic-blocks on - storage, then those blocks should be pre-read (if they haven't been - read already) so that the updated blocks can be written out properly. - - The filesystem must return the locked pagecache page for the specified - offset, in *pagep, for the caller to write into. - - It must be able to cope with short writes (where the length passed to - write_begin is greater than the number of bytes copied into the page). - - flags is a field for AOP_FLAG_xxx flags, described in - include/linux/fs.h. - - A void * may be returned in fsdata, which then gets passed into - write_end. - - Returns 0 on success; < 0 on failure (which is the error code), in - which case write_end is not called. - - write_end: After a successful write_begin, and data copy, write_end must - be called. len is the original len passed to write_begin, and copied - is the amount that was able to be copied. - - The filesystem must take care of unlocking the page and releasing its - refcount, and updating i_size. - - Returns < 0 on failure, otherwise the number of bytes (<= 'copied') - that were able to be copied into pagecache. - - bmap: called by the VFS to map a logical block offset within object to - physical block number. 
This method is used by the FIBMAP - ioctl and for working with swap-files. To be able to swap to - a file, the file must have a stable mapping to a block - device. The swap system does not go through the filesystem - but instead uses bmap to find out where the blocks in the file - are and uses those addresses directly. - - invalidatepage: If a page has PagePrivate set, then invalidatepage - will be called when part or all of the page is to be removed - from the address space. This generally corresponds to either a - truncation, punch hole or a complete invalidation of the address - space (in the latter case 'offset' will always be 0 and 'length' - will be PAGE_SIZE). Any private data associated with the page - should be updated to reflect this truncation. If offset is 0 and - length is PAGE_SIZE, then the private data should be released, - because the page must be able to be completely discarded. This may - be done by calling the ->releasepage function, but in this case the - release MUST succeed. - - releasepage: releasepage is called on PagePrivate pages to indicate - that the page should be freed if possible. ->releasepage - should remove any private data from the page and clear the - PagePrivate flag. If releasepage() fails for some reason, it must - indicate failure with a 0 return value. - releasepage() is used in two distinct though related cases. The - first is when the VM finds a clean page with no active users and - wants to make it a free page. If ->releasepage succeeds, the - page will be removed from the address_space and become free. - - The second case is when a request has been made to invalidate - some or all pages in an address_space. This can happen - through the fadvise(POSIX_FADV_DONTNEED) system call or by the - filesystem explicitly requesting it as nfs and 9fs do (when - they believe the cache may be out of date with storage) by - calling invalidate_inode_pages2(). 
- If the filesystem makes such a call, and needs to be certain - that all pages are invalidated, then its releasepage will - need to ensure this. Possibly it can clear the PageUptodate - bit if it cannot free private data yet. - - freepage: freepage is called once the page is no longer visible in - the page cache in order to allow the cleanup of any private - data. Since it may be called by the memory reclaimer, it - should not assume that the original address_space mapping still - exists, and it should not block. - - direct_IO: called by the generic read/write routines to perform - direct_IO - that is IO requests which bypass the page cache - and transfer data directly between the storage and the - application's address space. - - isolate_page: Called by the VM when isolating a movable non-lru page. - If page is successfully isolated, VM marks the page as PG_isolated - via __SetPageIsolated. - - migrate_page: This is used to compact the physical memory usage. - If the VM wants to relocate a page (maybe off a memory card - that is signalling imminent failure) it will pass a new page - and an old page to this function. migrate_page should - transfer any private data across and update any references - that it has to the page. - - putback_page: Called by the VM when isolated page's migration fails. - - launder_page: Called before freeing a page - it writes back the dirty page. To - prevent redirtying the page, it is kept locked during the whole - operation. - - is_partially_uptodate: Called by the VM when reading a file through the - pagecache when the underlying blocksize != pagesize. If the required - block is up to date then the read can complete without needing the IO - to bring the whole page up to date. - - is_dirty_writeback: Called by the VM when attempting to reclaim a page. - The VM uses dirty and writeback information to determine if it needs - to stall to allow flushers a chance to complete some IO. 
Ordinarily - it can use PageDirty and PageWriteback but some filesystems have - more complex state (unstable pages in NFS prevent reclaim) or - do not set those flags due to locking problems. This callback - allows a filesystem to indicate to the VM if a page should be - treated as dirty or writeback for the purposes of stalling. - - error_remove_page: normally set to generic_error_remove_page if truncation - is ok for this address space. Used for memory failure handling. - Setting this implies you deal with pages going away under you, - unless you have them locked or reference counts increased. - - swap_activate: Called when swapon is used on a file to allocate - space if necessary and pin the block lookup information in - memory. A return value of zero indicates success, - in which case this file can be used to back swapspace. - - swap_deactivate: Called during swapoff on files where swap_activate - was successful. - - -The File Object -=============== - -A file object represents a file opened by a process. This is also known -as an "open file description" in POSIX parlance. - - -struct file_operations ----------------------- - -This describes how the VFS can manipulate an open file. 
As of kernel -4.18, the following members are defined: - -struct file_operations { - struct module *owner; - loff_t (*llseek) (struct file *, loff_t, int); - ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); - ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); - ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); - ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iopoll)(struct kiocb *kiocb, bool spin); - int (*iterate) (struct file *, struct dir_context *); - int (*iterate_shared) (struct file *, struct dir_context *); - __poll_t (*poll) (struct file *, struct poll_table_struct *); - long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); - long (*compat_ioctl) (struct file *, unsigned int, unsigned long); - int (*mmap) (struct file *, struct vm_area_struct *); - int (*open) (struct inode *, struct file *); - int (*flush) (struct file *, fl_owner_t id); - int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, loff_t, loff_t, int datasync); - int (*fasync) (int, struct file *, int); - int (*lock) (struct file *, int, struct file_lock *); - ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - int (*check_flags)(int); - int (*flock) (struct file *, int, struct file_lock *); - ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); - ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); - int (*setlease)(struct file *, long, struct file_lock **, void **); - long (*fallocate)(struct file *file, int mode, loff_t offset, - loff_t len); - void (*show_fdinfo)(struct seq_file *m, struct file *f); -#ifndef CONFIG_MMU - unsigned (*mmap_capabilities)(struct file *); -#endif - ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, 
unsigned int); - loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags); - int (*fadvise)(struct file *, loff_t, loff_t, int); -}; - -Again, all methods are called without any locks being held, unless -otherwise noted. - - llseek: called when the VFS needs to move the file position index - - read: called by read(2) and related system calls - - read_iter: possibly asynchronous read with iov_iter as destination - - write: called by write(2) and related system calls - - write_iter: possibly asynchronous write with iov_iter as source - - iopoll: called when aio wants to poll for completions on HIPRI iocbs - - iterate: called when the VFS needs to read the directory contents - - iterate_shared: called when the VFS needs to read the directory contents - when filesystem supports concurrent dir iterators - - poll: called by the VFS when a process wants to check if there is - activity on this file and (optionally) go to sleep until there - is activity. Called by the select(2) and poll(2) system calls - - unlocked_ioctl: called by the ioctl(2) system call. - - compat_ioctl: called by the ioctl(2) system call when 32 bit system calls - are used on 64 bit kernels. - - mmap: called by the mmap(2) system call - - open: called by the VFS when an inode should be opened. When the VFS - opens a file, it creates a new "struct file". It then calls the - open method for the newly allocated file structure. You might - think that the open method really belongs in - "struct inode_operations", and you may be right. I think it's - done the way it is because it makes filesystems simpler to - implement. 
The open() method is a good place to initialize the - "private_data" member in the file structure if you want to point - to a device structure - - flush: called by the close(2) system call to flush a file - - release: called when the last reference to an open file is closed - - fsync: called by the fsync(2) system call. Also see the section above - entitled "Handling errors during writeback". - - fasync: called by the fcntl(2) system call when asynchronous - (non-blocking) mode is enabled for a file - - lock: called by the fcntl(2) system call for F_GETLK, F_SETLK, and F_SETLKW - commands - - get_unmapped_area: called by the mmap(2) system call - - check_flags: called by the fcntl(2) system call for F_SETFL command - - flock: called by the flock(2) system call - - splice_write: called by the VFS to splice data from a pipe to a file. This - method is used by the splice(2) system call - - splice_read: called by the VFS to splice data from file to a pipe. This - method is used by the splice(2) system call - - setlease: called by the VFS to set or release a file lock lease. setlease - implementations should call generic_setlease to record or remove - the lease in the inode after setting it. - - fallocate: called by the VFS to preallocate blocks or punch a hole. - - copy_file_range: called by the copy_file_range(2) system call. - - remap_file_range: called by the ioctl(2) system call for FICLONERANGE and - FICLONE and FIDEDUPERANGE commands to remap file ranges. An - implementation should remap len bytes at pos_in of the source file into - the dest file at pos_out. Implementations must handle callers passing - in len == 0; this means "remap to the end of the source file". The - return value should be the number of bytes remapped, or the usual - negative error code if errors occurred before any bytes were remapped. - The remap_flags parameter accepts REMAP_FILE_* flags. 
If - REMAP_FILE_DEDUP is set then the implementation must only remap if the - requested file ranges have identical contents. If REMAP_FILE_CAN_SHORTEN is - set, the caller is ok with the implementation shortening the request - length to satisfy alignment or EOF requirements (or any other reason). - - fadvise: possibly called by the fadvise64() system call. - -Note that the file operations are implemented by the specific -filesystem in which the inode resides. When opening a device node -(character or block special) most filesystems will call special -support routines in the VFS which will locate the required device -driver information. These support routines replace the filesystem file -operations with those for the device driver, and then proceed to call -the new open() method for the file. This is how opening a device file -in the filesystem eventually ends up calling the device driver open() -method. - - -Directory Entry Cache (dcache) -============================== - - -struct dentry_operations ------------------------- - -This describes how a filesystem can overload the standard dentry -operations. Dentries and the dcache are the domain of the VFS and the -individual filesystem implementations. Device drivers have no business -here. These methods may be set to NULL, as they are either optional or -the VFS uses a default. 
As of kernel 2.6.22, the following members are -defined: - -struct dentry_operations { - int (*d_revalidate)(struct dentry *, unsigned int); - int (*d_weak_revalidate)(struct dentry *, unsigned int); - int (*d_hash)(const struct dentry *, struct qstr *); - int (*d_compare)(const struct dentry *, - unsigned int, const char *, const struct qstr *); - int (*d_delete)(const struct dentry *); - int (*d_init)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); - char *(*d_dname)(struct dentry *, char *, int); - struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); -}; - - d_revalidate: called when the VFS needs to revalidate a dentry. This - is called whenever a name look-up finds a dentry in the - dcache. Most local filesystems leave this as NULL, because all their - dentries in the dcache are valid. Network filesystems are different - since things can change on the server without the client necessarily - being aware of it. - - This function should return a positive value if the dentry is still - valid, and zero or a negative error code if it isn't. - - d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU). - If in rcu-walk mode, the filesystem must revalidate the dentry without - blocking or storing to the dentry, d_parent and d_inode should not be - used without care (because they can change and, in d_inode case, even - become NULL under us). - - If a situation is encountered that rcu-walk cannot handle, return - -ECHILD and it will be called again in ref-walk mode. - - d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry. - This is called when a path-walk ends at dentry that was not acquired by - doing a lookup in the parent directory. This includes "/", "." and "..", - as well as procfs-style symlinks and mountpoint traversal. 
- - In this case, we are less concerned with whether the dentry is still - fully correct, but rather that the inode is still valid. As with - d_revalidate, most local filesystems will set this to NULL since their - dcache entries are always valid. - - This function has the same return code semantics as d_revalidate. - - d_weak_revalidate is only called after leaving rcu-walk mode. - - d_hash: called when the VFS adds a dentry to the hash table. The first - dentry passed to d_hash is the parent directory that the name is - to be hashed into. - - Same locking and synchronisation rules as d_compare regarding - what is safe to dereference etc. - - d_compare: called to compare a dentry name with a given name. The first - dentry is the parent of the dentry to be compared, the second is - the child dentry. len and name string are properties of the dentry - to be compared. qstr is the name to compare it with. - - Must be constant and idempotent, and should not take locks if - possible, and should not store into the dentry. - Should not dereference pointers outside the dentry without - lots of care (eg. d_parent, d_inode, d_name should not be used). - - However, our vfsmount is pinned, and RCU held, so the dentries and - inodes won't disappear, neither will our sb or filesystem module. - ->d_sb may be used. - - It is a tricky calling convention because it needs to be called under - "rcu-walk", ie. without any locks or references on things. - - d_delete: called when the last reference to a dentry is dropped and the - dcache is deciding whether or not to cache it. Return 1 to delete - immediately, or 0 to cache the dentry. Default is NULL which means to - always cache a reachable dentry. d_delete must be constant and - idempotent. - - d_init: called when a dentry is allocated - - d_release: called when a dentry is really deallocated - - d_iput: called when a dentry loses its inode (just prior to its - being deallocated). 
The default when this is NULL is that the - VFS calls iput(). If you define this method, you must call - iput() yourself - - d_dname: called when the pathname of a dentry should be generated. - Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay - pathname generation. (Instead of doing it when dentry is created, - it's done only when the path is needed.). Real filesystems probably - dont want to use it, because their dentries are present in global - dcache hash, so their hash should be an invariant. As no lock is - held, d_dname() should not try to modify the dentry itself, unless - appropriate SMP safety is used. CAUTION : d_path() logic is quite - tricky. The correct way to return for example "Hello" is to put it - at the end of the buffer, and returns a pointer to the first char. - dynamic_dname() helper function is provided to take care of this. - - Example : - - static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) - { - return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", - dentry->d_inode->i_ino); - } - - d_automount: called when an automount dentry is to be traversed (optional). - This should create a new VFS mount record and return the record to the - caller. The caller is supplied with a path parameter giving the - automount directory to describe the automount target and the parent - VFS mount record to provide inheritable mount parameters. NULL should - be returned if someone else managed to make the automount first. If - the vfsmount creation failed, then an error code should be returned. - If -EISDIR is returned, then the directory will be treated as an - ordinary directory and returned to pathwalk to continue walking. - - If a vfsmount is returned, the caller will attempt to mount it on the - mountpoint and will remove the vfsmount from its expiration list in - the case of failure. The vfsmount should be returned with 2 refs on - it to prevent automatic expiration - the caller will clean up the - additional ref. 
- - This function is only used if DCACHE_NEED_AUTOMOUNT is set on the - dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the - inode being added. - - d_manage: called to allow the filesystem to manage the transition from a - dentry (optional). This allows autofs, for example, to hold up clients - waiting to explore behind a 'mountpoint' while letting the daemon go - past and construct the subtree there. 0 should be returned to let the - calling process continue. -EISDIR can be returned to tell pathwalk to - use this directory as an ordinary directory and to ignore anything - mounted on it and not to check the automount flag. Any other error - code will abort pathwalk completely. - - If the 'rcu_walk' parameter is true, then the caller is doing a - pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, - and the caller can be asked to leave it and call again by returning - -ECHILD. -EISDIR may also be returned to tell pathwalk to - ignore d_automount or any mounts. - - This function is only used if DCACHE_MANAGE_TRANSIT is set on the - dentry being transited from. - - d_real: overlay/union type filesystems implement this method to return one of - the underlying dentries hidden by the overlay. It is used in two - different modes: - - Called from file_dentry() it returns the real dentry matching the inode - argument. The real dentry may be from a lower layer already copied up, - but still referenced from the file. This mode is selected with a - non-NULL inode argument. - - With NULL inode the topmost real underlying dentry is returned. - -Each dentry has a pointer to its parent dentry, as well as a hash list -of child dentries. Child dentries are basically like files in a -directory. 
- - -Directory Entry Cache API --------------------------- - -There are a number of functions defined which permit a filesystem to -manipulate dentries: - - dget: open a new handle for an existing dentry (this just increments - the usage count) - - dput: close a handle for a dentry (decrements the usage count). If - the usage count drops to 0, and the dentry is still in its - parent's hash, the "d_delete" method is called to check whether - it should be cached. If it should not be cached, or if the dentry - is not hashed, it is deleted. Otherwise cached dentries are put - into an LRU list to be reclaimed on memory shortage. - - d_drop: this unhashes a dentry from its parents hash list. A - subsequent call to dput() will deallocate the dentry if its - usage count drops to 0 - - d_delete: delete a dentry. If there are no other open references to - the dentry then the dentry is turned into a negative dentry - (the d_iput() method is called). If there are other - references, then d_drop() is called instead - - d_add: add a dentry to its parents hash list and then calls - d_instantiate() - - d_instantiate: add a dentry to the alias hash list for the inode and - updates the "d_inode" member. The "i_count" member in the - inode structure should be set/incremented. If the inode - pointer is NULL, the dentry is called a "negative - dentry". This function is commonly called when an inode is - created for an existing negative dentry - - d_lookup: look up a dentry given its parent and path name component - It looks up the child of that given name from the dcache - hash table. If it is found, the reference count is incremented - and the dentry is returned. The caller must use dput() - to free the dentry when it finishes using it. - -Mount Options -============= - -Parsing options ---------------- - -On mount and remount the filesystem is passed a string containing a -comma separated list of mount options. 
The options can have either of -these forms: - - option - option=value - -The <linux/parser.h> header defines an API that helps parse these -options. There are plenty of examples on how to use it in existing -filesystems. - -Showing options ---------------- - -If a filesystem accepts mount options, it must define show_options() -to show all the currently active options. The rules are: - - - options MUST be shown which are not default or their values differ - from the default - - - options MAY be shown which are enabled by default or have their - default value - -Options used only internally between a mount helper and the kernel -(such as file descriptors), or which only have an effect during the -mounting (such as ones controlling the creation of a journal) are exempt -from the above rules. - -The underlying reason for the above rules is to make sure, that a -mount can be accurately replicated (e.g. umounting and mounting again) -based on the information found in /proc/mounts. - -Resources -========= - -(Note some of these resources are not up-to-date with the latest kernel - version.) - -Creating Linux virtual filesystems. 2002 - <http://lwn.net/Articles/13325/> - -The Linux Virtual File-system Layer by Neil Brown. 1999 - <http://www.cse.unsw.edu.au/~neilb/oss/linux-commentary/vfs.html> - -A tour of the Linux VFS by Michael K. Johnson. 1996 - <http://www.tldp.org/LDP/khg/HyperNews/get/fs/vfstour.html> - -A small trail through the Linux kernel by Andries Brouwer. 
2001 - <http://www.win.tue.nl/~aeb/linux/vfs/trail.html> diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt index 2ce36439c09f..9a6dd289b17b 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.txt +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt @@ -34,7 +34,7 @@ transaction: D A+B+C+D X+n+m+o <object written to disk> E E Y (> X+n+m+o) - F E+F Yٍ+p + F E+F Y+p In other words, each time an object is relogged, the new transaction contains the aggregation of all the previous changes currently held only in the log. diff --git a/Documentation/firmware-guide/acpi/enumeration.rst b/Documentation/firmware-guide/acpi/enumeration.rst index 850be9696931..1252617b520f 100644 --- a/Documentation/firmware-guide/acpi/enumeration.rst +++ b/Documentation/firmware-guide/acpi/enumeration.rst @@ -339,7 +339,7 @@ a code like this:: There are also devm_* versions of these functions which release the descriptors once the device is released. -See Documentation/acpi/gpio-properties.txt for more information about the +See Documentation/firmware-guide/acpi/gpio-properties.rst for more information about the _DSD binding related to GPIOs. MFD devices diff --git a/Documentation/firmware-guide/acpi/method-tracing.rst b/Documentation/firmware-guide/acpi/method-tracing.rst index d0b077b73f5f..0aa7e2c5d32a 100644 --- a/Documentation/firmware-guide/acpi/method-tracing.rst +++ b/Documentation/firmware-guide/acpi/method-tracing.rst @@ -68,7 +68,7 @@ c. Filter out the debug layer/level matched logs when the specified Where: 0xXXXXXXXX/0xYYYYYYYY - Refer to Documentation/acpi/debug.txt for possible debug layer/level + Refer to Documentation/firmware-guide/acpi/debug.rst for possible debug layer/level masking values. \PPPP.AAAA.TTTT.HHHH Full path of a control method that can be found in the ACPI namespace. 
diff --git a/Documentation/fpga/dfl.txt b/Documentation/fpga/dfl.rst index 6df4621c3f2a..2f125abd777f 100644 --- a/Documentation/fpga/dfl.txt +++ b/Documentation/fpga/dfl.rst @@ -1,9 +1,12 @@ -=============================================================================== - FPGA Device Feature List (DFL) Framework Overview -------------------------------------------------------------------------------- - Enno Luebbers <enno.luebbers@intel.com> - Xiao Guangrong <guangrong.xiao@linux.intel.com> - Wu Hao <hao.wu@intel.com> +================================================= +FPGA Device Feature List (DFL) Framework Overview +================================================= + +Authors: + +- Enno Luebbers <enno.luebbers@intel.com> +- Xiao Guangrong <guangrong.xiao@linux.intel.com> +- Wu Hao <hao.wu@intel.com> The Device Feature List (DFL) FPGA framework (and drivers according to this this framework) hides the very details of low layer hardwares and provides @@ -19,7 +22,7 @@ Device Feature List (DFL) defines a linked list of feature headers within the device MMIO space to provide an extensible way of adding features. Software can walk through these predefined data structures to enumerate FPGA features: FPGA Interface Unit (FIU), Accelerated Function Unit (AFU) and Private Features, -as illustrated below: +as illustrated below:: Header Header Header Header +----------+ +-->+----------+ +-->+----------+ +-->+----------+ @@ -81,9 +84,9 @@ and release it using close(). 
The following functions are exposed through ioctls: - Get driver API version (DFL_FPGA_GET_API_VERSION) - Check for extensions (DFL_FPGA_CHECK_EXTENSION) - Program bitstream (DFL_FPGA_FME_PORT_PR) +- Get driver API version (DFL_FPGA_GET_API_VERSION) +- Check for extensions (DFL_FPGA_CHECK_EXTENSION) +- Program bitstream (DFL_FPGA_FME_PORT_PR) More functions are exposed through sysfs (/sys/class/fpga_region/regionX/dfl-fme.n/): @@ -118,18 +121,19 @@ port by using open() on the port device node and release it using close(). The following functions are exposed through ioctls: - Get driver API version (DFL_FPGA_GET_API_VERSION) - Check for extensions (DFL_FPGA_CHECK_EXTENSION) - Get port info (DFL_FPGA_PORT_GET_INFO) - Get MMIO region info (DFL_FPGA_PORT_GET_REGION_INFO) - Map DMA buffer (DFL_FPGA_PORT_DMA_MAP) - Unmap DMA buffer (DFL_FPGA_PORT_DMA_UNMAP) - Reset AFU (*DFL_FPGA_PORT_RESET) +- Get driver API version (DFL_FPGA_GET_API_VERSION) +- Check for extensions (DFL_FPGA_CHECK_EXTENSION) +- Get port info (DFL_FPGA_PORT_GET_INFO) +- Get MMIO region info (DFL_FPGA_PORT_GET_REGION_INFO) +- Map DMA buffer (DFL_FPGA_PORT_DMA_MAP) +- Unmap DMA buffer (DFL_FPGA_PORT_DMA_UNMAP) +- Reset AFU (DFL_FPGA_PORT_RESET) -*DFL_FPGA_PORT_RESET: reset the FPGA Port and its AFU. Userspace can do Port -reset at any time, e.g. during DMA or Partial Reconfiguration. But it should -never cause any system level issue, only functional failure (e.g. DMA or PR -operation failure) and be recoverable from the failure. +DFL_FPGA_PORT_RESET: + reset the FPGA Port and its AFU. Userspace can do Port + reset at any time, e.g. during DMA or Partial Reconfiguration. But it should + never cause any system level issue, only functional failure (e.g. DMA or PR + operation failure) and be recoverable from the failure. User-space applications can also mmap() accelerator MMIO regions. 
@@ -143,6 +147,8 @@ More functions are exposed through sysfs: DFL Framework Overview ====================== +:: + +----------+ +--------+ +--------+ +--------+ | FME | | AFU | | AFU | | AFU | | Module | | Module | | Module | | Module | @@ -151,7 +157,7 @@ DFL Framework Overview | FPGA Container Device | Device Feature List | (FPGA Base Region) | Framework +-----------------------+ --------------------------------------------------------------------- + ------------------------------------------------------------------ +----------------------------+ | FPGA DFL Device Module | | (e.g. PCIE/Platform Device)| @@ -220,7 +226,7 @@ the sysfs hierarchy under /sys/class/fpga_region. In the example below, two DFL based FPGA devices are installed in the host. Each fpga device has one FME and two ports (AFUs). -FPGA regions are created under /sys/class/fpga_region/ +FPGA regions are created under /sys/class/fpga_region/:: /sys/class/fpga_region/region0 /sys/class/fpga_region/region1 @@ -231,7 +237,7 @@ Application needs to search each regionX folder, if feature device is found, (e.g. "dfl-port.n" or "dfl-fme.m" is found), then it's the base fpga region which represents the FPGA device. -Each base region has one FME and two ports (AFUs) as child devices: +Each base region has one FME and two ports (AFUs) as child devices:: /sys/class/fpga_region/region0/dfl-fme.0 /sys/class/fpga_region/region0/dfl-port.0 @@ -243,7 +249,7 @@ Each base region has one FME and two ports (AFUs) as child devices: /sys/class/fpga_region/region3/dfl-port.3 ... -In general, the FME/AFU sysfs interfaces are named as follows: +In general, the FME/AFU sysfs interfaces are named as follows:: /sys/class/fpga_region/<regionX>/<dfl-fme.n>/ /sys/class/fpga_region/<regionX>/<dfl-port.m>/ @@ -251,7 +257,7 @@ In general, the FME/AFU sysfs interfaces are named as follows: with 'n' consecutively numbering all FMEs and 'm' consecutively numbering all ports. 
-The device nodes used for ioctl() or mmap() can be referenced through: +The device nodes used for ioctl() or mmap() can be referenced through:: /sys/class/fpga_region/<regionX>/<dfl-fme.n>/dev /sys/class/fpga_region/<regionX>/<dfl-port.n>/dev diff --git a/Documentation/fpga/index.rst b/Documentation/fpga/index.rst new file mode 100644 index 000000000000..2c87d1ea084f --- /dev/null +++ b/Documentation/fpga/index.rst @@ -0,0 +1,17 @@ +:orphan: + +==== +fpga +==== + +.. toctree:: + :maxdepth: 1 + + dfl + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/gpu/msm-crash-dump.rst b/Documentation/gpu/msm-crash-dump.rst index 757cd257e0d8..240ef200f76c 100644 --- a/Documentation/gpu/msm-crash-dump.rst +++ b/Documentation/gpu/msm-crash-dump.rst @@ -1,3 +1,5 @@ +:orphan: + ===================== MSM Crash Dump Format ===================== diff --git a/Documentation/hid/hid-transport.txt b/Documentation/hid/hid-transport.txt index 3dcba9fd4a3a..4f41d67f1b4b 100644 --- a/Documentation/hid/hid-transport.txt +++ b/Documentation/hid/hid-transport.txt @@ -194,9 +194,9 @@ with HID core: goto err_<...>; } - strlcpy(hid->name, <device-name-src>, 127); - strlcpy(hid->phys, <device-phys-src>, 63); - strlcpy(hid->uniq, <device-uniq-src>, 63); + strscpy(hid->name, <device-name-src>, sizeof(hid->name)); + strscpy(hid->phys, <device-phys-src>, sizeof(hid->phys)); + strscpy(hid->uniq, <device-uniq-src>, sizeof(hid->uniq)); hid->ll_driver = &custom_ll_driver; hid->bus = <device-bus>; diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices index 0d85ac1935b7..345e9ea8281a 100644 --- a/Documentation/i2c/instantiating-devices +++ b/Documentation/i2c/instantiating-devices @@ -85,7 +85,7 @@ Method 1c: Declare the I2C devices via ACPI ------------------------------------------- ACPI can also describe I2C devices. 
There is special documentation for this -which is currently located at Documentation/acpi/enumeration.txt. +which is currently located at Documentation/firmware-guide/acpi/enumeration.rst. Method 2: Instantiate the devices explicitly @@ -137,7 +137,7 @@ static int usb_hcd_nxp_probe(struct platform_device *pdev) (...) i2c_adap = i2c_get_adapter(2); memset(&i2c_info, 0, sizeof(struct i2c_board_info)); - strlcpy(i2c_info.type, "isp1301_nxp", I2C_NAME_SIZE); + strscpy(i2c_info.type, "isp1301_nxp", sizeof(i2c_info.type)); isp1301_i2c_client = i2c_new_probed_device(i2c_adap, &i2c_info, normal_i2c, NULL); i2c_put_adapter(i2c_adap); diff --git a/Documentation/i2c/upgrading-clients b/Documentation/i2c/upgrading-clients index ccba3ffd6e80..96392cc5b5c7 100644 --- a/Documentation/i2c/upgrading-clients +++ b/Documentation/i2c/upgrading-clients @@ -43,7 +43,7 @@ static int example_attach(struct i2c_adapter *adap, int addr, int kind) example->client.adapter = adap; i2c_set_clientdata(&state->i2c_client, state); - strlcpy(client->i2c_client.name, "example", I2C_NAME_SIZE); + strscpy(client->i2c_client.name, "example", sizeof(client->i2c_client.name)); ret = i2c_attach_client(&state->i2c_client); if (ret < 0) { @@ -138,7 +138,7 @@ can be removed: - example->client.flags = 0; - example->client.adapter = adap; - -- strlcpy(client->i2c_client.name, "example", I2C_NAME_SIZE); +- strscpy(client->i2c_client.name, "example", sizeof(client->i2c_client.name)); The i2c_set_clientdata is now: diff --git a/Documentation/ide/changelogs.rst b/Documentation/ide/changelogs.rst new file mode 100644 index 000000000000..fdf9d0fb8027 --- /dev/null +++ b/Documentation/ide/changelogs.rst @@ -0,0 +1,17 @@ +Changelog for ide cd +-------------------- + + .. include:: ChangeLog.ide-cd.1994-2004 + :literal: + +Changelog for ide floppy +------------------------ + + .. include:: ChangeLog.ide-floppy.1996-2002 + :literal: + +Changelog for ide tape +---------------------- + + .. 
include:: ChangeLog.ide-tape.1995-2002 + :literal: diff --git a/Documentation/ide/ide-tape.txt b/Documentation/ide/ide-tape.rst index 3f348a0b21d8..3e061d9c0e38 100644 --- a/Documentation/ide/ide-tape.txt +++ b/Documentation/ide/ide-tape.rst @@ -1,4 +1,6 @@ -IDE ATAPI streaming tape driver. +=============================== +IDE ATAPI streaming tape driver +=============================== This driver is a part of the Linux ide driver. @@ -10,14 +12,14 @@ to the request-list of the block device, and waits for their completion. The block device major and minor numbers are determined from the tape's relative position in the ide interfaces, as explained in ide.c. -The character device interface consists of the following devices: +The character device interface consists of the following devices:: -ht0 major 37, minor 0 first IDE tape, rewind on close. -ht1 major 37, minor 1 second IDE tape, rewind on close. -... -nht0 major 37, minor 128 first IDE tape, no rewind on close. -nht1 major 37, minor 129 second IDE tape, no rewind on close. -... + ht0 major 37, minor 0 first IDE tape, rewind on close. + ht1 major 37, minor 1 second IDE tape, rewind on close. + ... + nht0 major 37, minor 128 first IDE tape, no rewind on close. + nht1 major 37, minor 129 second IDE tape, no rewind on close. + ... The general magnetic tape commands compatible interface, as defined by include/linux/mtio.h, is accessible through the character device. @@ -40,9 +42,10 @@ Testing was done with a 2 GB CONNER CTMA 4000 IDE ATAPI Streaming Tape Drive. Here are some words from the first releases of hd.c, which are quoted in ide.c and apply here as well: -| Special care is recommended. Have Fun! +* Special care is recommended. Have Fun! -Possible improvements: +Possible improvements +===================== 1. Support for the ATAPI overlap protocol. 
diff --git a/Documentation/ide/ide.txt b/Documentation/ide/ide.rst index 7aca987c23d9..88bdcba92f7d 100644 --- a/Documentation/ide/ide.txt +++ b/Documentation/ide/ide.rst @@ -1,41 +1,43 @@ - - Information regarding the Enhanced IDE drive in Linux 2.6 - -============================================================================== - +============================================ +Information regarding the Enhanced IDE drive +============================================ The hdparm utility can be used to control various IDE features on a running system. It is packaged separately. Please Look for it on popular linux FTP sites. +------------------------------------------------------------------------------- + +.. important:: + + BUGGY IDE CHIPSETS CAN CORRUPT DATA!! + + PCI versions of the CMD640 and RZ1000 interfaces are now detected + automatically at startup when PCI BIOS support is configured. + + Linux disables the "prefetch" ("readahead") mode of the RZ1000 + to prevent data corruption possible due to hardware design flaws. + + For the CMD640, linux disables "IRQ unmasking" (hdparm -u1) on any + drive for which the "prefetch" mode of the CMD640 is turned on. + If "prefetch" is disabled (hdparm -p8), then "IRQ unmasking" can be + used again. + + For the CMD640, linux disables "32bit I/O" (hdparm -c1) on any drive + for which the "prefetch" mode of the CMD640 is turned off. + If "prefetch" is enabled (hdparm -p9), then "32bit I/O" can be + used again. + + The CMD640 is also used on some Vesa Local Bus (VLB) cards, and is *NOT* + automatically detected by Linux. For safe, reliable operation with such + interfaces, one *MUST* use the "cmd640.probe_vlb" kernel option. + + Use of the "serialize" option is no longer necessary. +------------------------------------------------------------------------------- -*** IMPORTANT NOTICES: BUGGY IDE CHIPSETS CAN CORRUPT DATA!! 
-*** ================= -*** PCI versions of the CMD640 and RZ1000 interfaces are now detected -*** automatically at startup when PCI BIOS support is configured. -*** -*** Linux disables the "prefetch" ("readahead") mode of the RZ1000 -*** to prevent data corruption possible due to hardware design flaws. -*** -*** For the CMD640, linux disables "IRQ unmasking" (hdparm -u1) on any -*** drive for which the "prefetch" mode of the CMD640 is turned on. -*** If "prefetch" is disabled (hdparm -p8), then "IRQ unmasking" can be -*** used again. -*** -*** For the CMD640, linux disables "32bit I/O" (hdparm -c1) on any drive -*** for which the "prefetch" mode of the CMD640 is turned off. -*** If "prefetch" is enabled (hdparm -p9), then "32bit I/O" can be -*** used again. -*** -*** The CMD640 is also used on some Vesa Local Bus (VLB) cards, and is *NOT* -*** automatically detected by Linux. For safe, reliable operation with such -*** interfaces, one *MUST* use the "cmd640.probe_vlb" kernel option. -*** -*** Use of the "serialize" option is no longer necessary. - -================================================================================ -Common pitfalls: +Common pitfalls +=============== - 40-conductor IDE cables are capable of transferring data in DMA modes up to udma2, but no faster. @@ -49,19 +51,18 @@ Common pitfalls: - Even better try to stick to the same vendor and device type on the same cable. -================================================================================ - -This is the multiple IDE interface driver, as evolved from hd.c. +This is the multiple IDE interface driver, as evolved from hd.c +=============================================================== It supports up to 9 IDE interfaces per default, on one or more IRQs (usually -14 & 15). There can be up to two drives per interface, as per the ATA-6 spec. +14 & 15). 
There can be up to two drives per interface, as per the ATA-6 spec.:: -Primary: ide0, port 0x1f0; major=3; hda is minor=0; hdb is minor=64 -Secondary: ide1, port 0x170; major=22; hdc is minor=0; hdd is minor=64 -Tertiary: ide2, port 0x1e8; major=33; hde is minor=0; hdf is minor=64 -Quaternary: ide3, port 0x168; major=34; hdg is minor=0; hdh is minor=64 -fifth.. ide4, usually PCI, probed -sixth.. ide5, usually PCI, probed + Primary: ide0, port 0x1f0; major=3; hda is minor=0; hdb is minor=64 + Secondary: ide1, port 0x170; major=22; hdc is minor=0; hdd is minor=64 + Tertiary: ide2, port 0x1e8; major=33; hde is minor=0; hdf is minor=64 + Quaternary: ide3, port 0x168; major=34; hdg is minor=0; hdh is minor=64 + fifth.. ide4, usually PCI, probed + sixth.. ide5, usually PCI, probed To access devices on interfaces > ide0, device entries please make sure that device files for them are present in /dev. If not, please create such @@ -80,12 +81,15 @@ seldom occurs. Be careful, and if in doubt, don't do it! Drives are normally found by auto-probing and/or examining the CMOS/BIOS data. For really weird situations, the apparent (fdisk) geometry can also be specified -on the kernel "command line" using LILO. The format of such lines is: +on the kernel "command line" using LILO. The format of such lines is:: ide_core.chs=[interface_number.device_number]:cyls,heads,sects -or ide_core.cdrom=[interface_number.device_number] -For example: +or:: + + ide_core.cdrom=[interface_number.device_number] + +For example:: ide_core.chs=1.0:1050,32,64 ide_core.cdrom=1.1 @@ -96,10 +100,12 @@ geometry for partitioning purposes (fdisk). If the auto-probing during boot time confuses a drive (ie. the drive works with hd.c but not with ide.c), then a command line option may be specified for each drive for which you'd like the drive to skip the hardware -probe/identification sequence.
For example:: ide_core.noprobe=0.1 -or + +or:: + ide_core.chs=1.0:768,16,32 ide_core.noprobe=1.0 @@ -115,22 +121,24 @@ Such drives will be identified at boot time, just like a hard disk. If for some reason your cdrom drive is *not* found at boot time, you can force the probe to look harder by supplying a kernel command line parameter -via LILO, such as: +via LILO, such as:: ide_core.cdrom=1.0 /* "master" on second interface (hdc) */ -or + +or:: + ide_core.cdrom=1.1 /* "slave" on second interface (hdd) */ For example, a GW2000 system might have a hard drive on the primary interface (/dev/hda) and an IDE cdrom drive on the secondary interface -(/dev/hdc). To mount a CD in the cdrom drive, one would use something like: +(/dev/hdc). To mount a CD in the cdrom drive, one would use something like:: ln -sf /dev/hdc /dev/cdrom mkdir /mnt/cdrom mount /dev/cdrom /mnt/cdrom -t iso9660 -o ro If, after doing all of the above, mount doesn't work and you see -errors from the driver (with dmesg) complaining about `status=0xff', +errors from the driver (with dmesg) complaining about `status=0xff`, this means that the hardware is not responding to the driver's attempts to read it. One of the following is probably the problem: @@ -165,7 +173,7 @@ drivers can always be compiled as loadable modules, the chipset drivers can only be compiled into the kernel, and the core code (ide.c) can be compiled as a loadable module provided no chipset support is needed. -When using ide.c as a module in combination with kmod, add: +When using ide.c as a module in combination with kmod, add:: alias block-major-3 ide-probe @@ -176,10 +184,8 @@ driver using the "options=" keyword to insmod, while replacing any ',' with ';'.
-================================================================================ - Summary of ide driver parameters for kernel command line --------------------------------------------------------- +======================================================== For legacy IDE VLB host drivers (ali14xx/dtc2278/ht6560b/qd65xx/umc8672) you need to explicitly enable probing by using "probe" kernel parameter, @@ -226,28 +232,31 @@ Other kernel parameters for ide_core are: * "chs=[interface_number.device_number]" to force device as a disk (using CHS) -================================================================================ Some Terminology ----------------- -IDE = Integrated Drive Electronics, meaning that each drive has a built-in -controller, which is why an "IDE interface card" is not a "controller card". +================ -ATA = AT (the old IBM 286 computer) Attachment Interface, a draft American -National Standard for connecting hard drives to PCs. This is the official -name for "IDE". +IDE + Integrated Drive Electronics, meaning that each drive has a built-in + controller, which is why an "IDE interface card" is not a "controller card". -The latest standards define some enhancements, known as the ATA-6 spec, -which grew out of vendor-specific "Enhanced IDE" (EIDE) implementations. +ATA + AT (the old IBM 286 computer) Attachment Interface, a draft American + National Standard for connecting hard drives to PCs. This is the official + name for "IDE". -ATAPI = ATA Packet Interface, a new protocol for controlling the drives, -similar to SCSI protocols, created at the same time as the ATA2 standard. -ATAPI is currently used for controlling CDROM, TAPE and FLOPPY (ZIP or -LS120/240) devices, removable R/W cartridges, and for high capacity hard disk -drives. + The latest standards define some enhancements, known as the ATA-6 spec, + which grew out of vendor-specific "Enhanced IDE" (EIDE) implementations. 
+ +ATAPI + ATA Packet Interface, a new protocol for controlling the drives, + similar to SCSI protocols, created at the same time as the ATA2 standard. + ATAPI is currently used for controlling CDROM, TAPE and FLOPPY (ZIP or + LS120/240) devices, removable R/W cartridges, and for high capacity hard disk + drives. mlord@pobox.com --- + Wed Apr 17 22:52:44 CEST 2002 edited by Marcin Dalecki, the current maintainer. diff --git a/Documentation/ide/index.rst b/Documentation/ide/index.rst new file mode 100644 index 000000000000..45bc12d3957f --- /dev/null +++ b/Documentation/ide/index.rst @@ -0,0 +1,21 @@ +:orphan: + +================================== +Integrated Drive Electronics (IDE) +================================== + +.. toctree:: + :maxdepth: 1 + + ide + ide-tape + warm-plug-howto + + changelogs + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/ide/warm-plug-howto.txt b/Documentation/ide/warm-plug-howto.rst index 98152bcd515a..c245242ef2f1 100644 --- a/Documentation/ide/warm-plug-howto.txt +++ b/Documentation/ide/warm-plug-howto.rst @@ -1,14 +1,14 @@ - +=================== IDE warm-plug HOWTO =================== -To warm-plug devices on a port 'idex': +To warm-plug devices on a port 'idex':: -# echo -n "1" > /sys/class/ide_port/idex/delete_devices + # echo -n "1" > /sys/class/ide_port/idex/delete_devices -unplug old device(s) and plug new device(s) +unplug old device(s) and plug new device(s):: -# echo -n "1" > /sys/class/ide_port/idex/scan + # echo -n "1" > /sys/class/ide_port/idex/scan done diff --git a/Documentation/index.rst b/Documentation/index.rst index a7566ef62411..781042b4579d 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -112,7 +112,6 @@ implementation. .. 
toctree:: :maxdepth: 2 - x86/index sh/index x86/index diff --git a/Documentation/interconnect/interconnect.rst b/Documentation/interconnect/interconnect.rst index b8107dcc4cd3..56e331dab70e 100644 --- a/Documentation/interconnect/interconnect.rst +++ b/Documentation/interconnect/interconnect.rst @@ -1,5 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 +:orphan: + ===================================== GENERIC SYSTEM INTERCONNECT SUBSYSTEM ===================================== @@ -89,6 +91,5 @@ Interconnect consumers Interconnect consumers are the clients which use the interconnect APIs to get paths between endpoints and set their bandwidth/latency/QoS requirements -for these interconnect paths. - -.. kernel-doc:: include/linux/interconnect.h +for these interconnect paths. These interfaces are not currently +documented. diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt index 49df45f90e8a..5d63b18bd6d1 100644 --- a/Documentation/iostats.txt +++ b/Documentation/iostats.txt @@ -97,6 +97,10 @@ Field 9 -- # of I/Os currently in progress Field 10 -- # of milliseconds spent doing I/Os This field increases so long as field 9 is nonzero. + Since 5.0 this field counts jiffies when at least one request was + started or completed. If request runs more than 2 jiffies then some + I/O time will not be accounted unless there are other requests. + Field 11 -- weighted # of milliseconds spent doing I/Os This field is incremented at each I/O start, I/O completion, I/O merge, or read of these stats by the number of I/Os in progress diff --git a/Documentation/kbuild/headers_install.txt b/Documentation/kbuild/headers_install.rst index f0153adb95e2..1ab7294e41ac 100644 --- a/Documentation/kbuild/headers_install.txt +++ b/Documentation/kbuild/headers_install.rst @@ -1,3 +1,4 @@ +============================================= Exporting kernel headers for use by userspace ============================================= @@ -22,14 +23,14 @@ older kernel. 
The "make headers_install" command can be run in the top level directory of the kernel source code (or using a standard out-of-tree build). It takes two -optional arguments: +optional arguments:: make headers_install ARCH=i386 INSTALL_HDR_PATH=/usr ARCH indicates which architecture to produce headers for, and defaults to the current architecture. The linux/asm directory of the exported kernel headers is platform-specific, to see a complete list of supported architectures use -the command: +the command:: ls -d include/asm-* | sed 's/.*-//' diff --git a/Documentation/kbuild/index.rst b/Documentation/kbuild/index.rst new file mode 100644 index 000000000000..42d4cbe4460c --- /dev/null +++ b/Documentation/kbuild/index.rst @@ -0,0 +1,27 @@ +:orphan: + +=================== +Kernel Build System +=================== + +.. toctree:: + :maxdepth: 1 + + kconfig-language + kconfig-macro-language + + kbuild + kconfig + makefiles + modules + + headers_install + + issues + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/kbuild/issues.rst b/Documentation/kbuild/issues.rst new file mode 100644 index 000000000000..9fdded4b681c --- /dev/null +++ b/Documentation/kbuild/issues.rst @@ -0,0 +1,11 @@ +Recursion issue #1 +------------------ + + .. include:: Kconfig.recursion-issue-01 + :literal: + +Recursion issue #2 +------------------ + + .. include:: Kconfig.recursion-issue-02 + :literal: diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.rst index 9c230ea71963..e774e760522d 100644 --- a/Documentation/kbuild/kbuild.txt +++ b/Documentation/kbuild/kbuild.rst @@ -1,13 +1,19 @@ +====== +Kbuild +====== + + Output files +============ modules.order --------------------------------------------------- +------------- This file records the order in which modules appear in Makefiles. This is used by modprobe to deterministically resolve aliases that match multiple modules. 
modules.builtin --------------------------------------------------- +--------------- This file lists all modules that are built into the kernel. This is used by modprobe to not fail when trying to load something builtin. @@ -18,84 +24,90 @@ Unlike modinfo of a separate module, all fields are prefixed with module name. Environment variables +===================== KCPPFLAGS --------------------------------------------------- +--------- Additional options to pass when preprocessing. The preprocessing options will be used in all cases where kbuild does preprocessing including building C files and assembler files. KAFLAGS --------------------------------------------------- +------- Additional options to the assembler (for built-in and modules). AFLAGS_MODULE --------------------------------------------------- +------------- Additional module specific options to use for $(AS). AFLAGS_KERNEL --------------------------------------------------- +------------- Additional options for $(AS) when used for assembler code for code that is compiled as built-in. KCFLAGS --------------------------------------------------- +------- Additional options to the C compiler (for built-in and modules). CFLAGS_KERNEL --------------------------------------------------- +------------- Additional options for $(CC) when used to compile code that is compiled as built-in. CFLAGS_MODULE --------------------------------------------------- +------------- Additional module specific options to use for $(CC). LDFLAGS_MODULE --------------------------------------------------- +-------------- Additional options used for $(LD) when linking modules. HOSTCFLAGS --------------------------------------------------- +---------- Additional flags to be passed to $(HOSTCC) when building host programs. HOSTCXXFLAGS --------------------------------------------------- +------------ Additional flags to be passed to $(HOSTCXX) when building host programs. 
HOSTLDFLAGS --------------------------------------------------- +----------- Additional flags to be passed when linking host programs. HOSTLDLIBS --------------------------------------------------- +---------- Additional libraries to link against when building host programs. KBUILD_KCONFIG --------------------------------------------------- +-------------- Set the top-level Kconfig file to the value of this environment variable. The default name is "Kconfig". KBUILD_VERBOSE --------------------------------------------------- +-------------- Set the kbuild verbosity. Can be assigned same values as "V=...". + See make help for the full list. + Setting "V=..." takes precedence over KBUILD_VERBOSE. KBUILD_EXTMOD --------------------------------------------------- +------------- Set the directory to look for the kernel source when building external modules. + Setting "M=..." takes precedence over KBUILD_EXTMOD. KBUILD_OUTPUT --------------------------------------------------- +------------- Specify the output directory when building the kernel. + The output directory can also be specified using "O=...". + Setting "O=..." takes precedence over KBUILD_OUTPUT. KBUILD_DEBARCH --------------------------------------------------- +-------------- For the deb-pkg target, allows overriding the normal heuristics deployed by deb-pkg. Normally deb-pkg attempts to guess the right architecture based on the UTS_MACHINE variable, and on some architectures also the kernel config. @@ -103,44 +115,48 @@ The value of KBUILD_DEBARCH is assumed (not checked) to be a valid Debian architecture. ARCH --------------------------------------------------- +---- Set ARCH to the architecture to be built. + In most cases the name of the architecture is the same as the directory name found in the arch/ directory. + But some architectures such as x86 and sparc have aliases. 
-x86: i386 for 32 bit, x86_64 for 64 bit -sh: sh for 32 bit, sh64 for 64 bit -sparc: sparc32 for 32 bit, sparc64 for 64 bit + +- x86: i386 for 32 bit, x86_64 for 64 bit +- sh: sh for 32 bit, sh64 for 64 bit +- sparc: sparc32 for 32 bit, sparc64 for 64 bit CROSS_COMPILE --------------------------------------------------- +------------- Specify an optional fixed part of the binutils filename. CROSS_COMPILE can be a part of the filename or the full path. CROSS_COMPILE is also used for ccache in some setups. CF --------------------------------------------------- +-- Additional options for sparse. -CF is often used on the command-line like this: + +CF is often used on the command-line like this:: make CF=-Wbitwise C=2 INSTALL_PATH --------------------------------------------------- +------------ INSTALL_PATH specifies where to place the updated kernel and system map images. Default is /boot, but you can set it to other values. INSTALLKERNEL --------------------------------------------------- +------------- Install script called when using "make install". The default name is "installkernel". The script will be called with the following arguments: - $1 - kernel version - $2 - kernel image file - $3 - kernel map file - $4 - default install path (use root directory if blank) + - $1 - kernel version + - $2 - kernel image file + - $3 - kernel map file + - $4 - default install path (use root directory if blank) The implementation of "make install" is architecture specific and it may differ from the above. @@ -149,32 +165,33 @@ INSTALLKERNEL is provided to enable the possibility to specify a custom installer when cross compiling a kernel. MODLIB --------------------------------------------------- +------ Specify where to install modules. -The default value is: +The default value is:: $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE) The value can be overridden in which case the default value is ignored. 
INSTALL_MOD_PATH --------------------------------------------------- +---------------- INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory relocations required by build roots. This is not defined in the makefile but the argument can be passed to make if needed. INSTALL_MOD_STRIP --------------------------------------------------- +----------------- INSTALL_MOD_STRIP, if defined, will cause modules to be stripped after they are installed. If INSTALL_MOD_STRIP is '1', then the default option --strip-debug will be used. Otherwise, INSTALL_MOD_STRIP value will be used as the options to the strip command. INSTALL_HDR_PATH --------------------------------------------------- +---------------- INSTALL_HDR_PATH specifies where to install user space headers when executing "make headers_*". -The default value is: + +The default value is:: $(objtree)/usr @@ -184,65 +201,65 @@ The output directory is often set using "O=..." on the commandline. The value can be overridden in which case the default value is ignored. KBUILD_SIGN_PIN --------------------------------------------------- +--------------- This variable allows a passphrase or PIN to be passed to the sign-file utility when signing kernel modules, if the private key requires such. KBUILD_MODPOST_WARN --------------------------------------------------- +------------------- KBUILD_MODPOST_WARN can be set to avoid errors in case of undefined symbols in the final module linking stage. It changes such errors into warnings. KBUILD_MODPOST_NOFINAL --------------------------------------------------- +---------------------- KBUILD_MODPOST_NOFINAL can be set to skip the final link of modules. This is solely useful to speed up test compiles. KBUILD_EXTRA_SYMBOLS --------------------------------------------------- +-------------------- For modules that use symbols from other modules. See more details in modules.txt. 
ALLSOURCE_ARCHS --------------------------------------------------- +--------------- For tags/TAGS/cscope targets, you can specify more than one arch -to be included in the databases, separated by blank space. E.g.: +to be included in the databases, separated by blank space. E.g.:: $ make ALLSOURCE_ARCHS="x86 mips arm" tags -To get all available archs you can also specify all. E.g.: +To get all available archs you can also specify all. E.g.:: $ make ALLSOURCE_ARCHS=all tags KBUILD_ENABLE_EXTRA_GCC_CHECKS --------------------------------------------------- +------------------------------ If enabled over the make command line with "W=1", it turns on additional gcc -W... options for more extensive build-time checking. KBUILD_BUILD_TIMESTAMP --------------------------------------------------- +---------------------- Setting this to a date string overrides the timestamp used in the UTS_VERSION definition (uname -v in the running kernel). The value has to be a string that can be passed to date -d. The default value is the output of the date command at one point during build. KBUILD_BUILD_USER, KBUILD_BUILD_HOST --------------------------------------------------- +------------------------------------ These two variables allow to override the user@host string displayed during boot and in /proc/version. The default value is the output of the commands whoami and host, respectively. KBUILD_LDS --------------------------------------------------- +---------- The linker script with full path. Assigned by the top-level Makefile. KBUILD_VMLINUX_OBJS --------------------------------------------------- +------------------- All object files for vmlinux. They are linked to vmlinux in the same order as listed in KBUILD_VMLINUX_OBJS. KBUILD_VMLINUX_LIBS --------------------------------------------------- +------------------- All .a "lib" files for vmlinux. KBUILD_VMLINUX_OBJS and KBUILD_VMLINUX_LIBS together specify all the object files used to link vmlinux. 
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.rst index 864e740811da..2bc8a7803365 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.rst @@ -1,8 +1,12 @@ +================ +Kconfig Language +================ + Introduction ------------ The configuration database is a collection of configuration options -organized in a tree structure: +organized in a tree structure:: +- Code maturity level options | +- Prompt for development and/or incomplete code/drivers @@ -25,9 +29,9 @@ Menu entries ------------ Most entries define a config option; all other entries help to organize -them. A single configuration option is defined like this: +them. A single configuration option is defined like this:: -config MODVERSIONS + config MODVERSIONS bool "Set version information on all module symbols" depends on MODULES help @@ -52,10 +56,12 @@ applicable everywhere (see syntax). Every config option must have a type. There are only two basic types: tristate and string; the other types are based on these two. The type definition optionally accepts an input prompt, so these two examples - are equivalent: + are equivalent:: bool "Networking support" - and + + and:: + bool prompt "Networking support" @@ -98,8 +104,10 @@ applicable everywhere (see syntax). d) Hardware or infrastructure that everybody expects, such as CONFIG_NET or CONFIG_BLOCK. These are rare exceptions. -- type definition + default value: +- type definition + default value:: + "def_bool"/"def_tristate" <expr> ["if" <expr>] + This is a shorthand notation for a type definition plus a value. Optionally dependencies for this default value can be added with "if". @@ -107,11 +115,13 @@ applicable everywhere (see syntax). This defines a dependency for this menu entry. If multiple dependencies are defined, they are connected with '&&'. 
Dependencies are applied to all other options within this menu entry (which also - accept an "if" expression), so these two examples are equivalent: + accept an "if" expression), so these two examples are equivalent:: bool "foo" if BAR default y if BAR - and + + and:: + depends on BAR bool "foo" default y @@ -124,6 +134,7 @@ applicable everywhere (see syntax). times, the limit is set to the largest selection. Reverse dependencies can only be used with boolean or tristate symbols. + Note: select should be used with care. select will force a symbol to a value without visiting the dependencies. @@ -139,24 +150,26 @@ applicable everywhere (see syntax). symbol except that the "implied" symbol's value may still be set to n from a direct dependency or with a visible prompt. - Given the following example: + Given the following example:: - config FOO + config FOO tristate imply BAZ - config BAZ + config BAZ tristate depends on BAR The following values are possible: + === === ============= ============== FOO BAR BAZ's default choice for BAZ - --- --- ------------- -------------- + === === ============= ============== n y n N/m/y m y m M/y/n y y y Y/n y n * N + === === ============= ============== This is useful e.g. with multiple drivers that want to indicate their ability to hook into a secondary subsystem while allowing the user to @@ -208,9 +221,9 @@ Menu dependencies Dependencies define the visibility of a menu entry and can also reduce the input range of tristate symbols. The tristate logic used in the expressions uses one more state than normal boolean logic to express the -module state. Dependency expressions have the following syntax: +module state. Dependency expressions have the following syntax:: -<expr> ::= <symbol> (1) + <expr> ::= <symbol> (1) <symbol> '=' <symbol> (2) <symbol> '!=' <symbol> (3) <symbol1> '<' <symbol2> (4) @@ -222,7 +235,7 @@ module state. 
Dependency expressions have the following syntax: <expr> '&&' <expr> (7) <expr> '||' <expr> (8) -Expressions are listed in decreasing order of precedence. +Expressions are listed in decreasing order of precedence. (1) Convert the symbol into an expression. Boolean and tristate symbols are simply converted into the respective expression values. All @@ -255,15 +268,15 @@ Menu structure -------------- The position of a menu entry in the tree is determined in two ways. First -it can be specified explicitly: +it can be specified explicitly:: -menu "Network device support" + menu "Network device support" depends on NET -config NETDEVICES + config NETDEVICES ... -endmenu + endmenu All entries within the "menu" ... "endmenu" block become a submenu of "Network device support". All subentries inherit the dependencies from @@ -275,17 +288,18 @@ dependencies. If a menu entry somehow depends on the previous entry, it can be made a submenu of it. First, the previous (parent) symbol must be part of the dependency list and then one of these two conditions must be true: + - the child entry must become invisible, if the parent is set to 'n' -- the child entry must only be visible, if the parent is visible +- the child entry must only be visible, if the parent is visible:: -config MODULES + config MODULES bool "Enable loadable module support" -config MODVERSIONS + config MODVERSIONS bool "Set version information on all module symbols" depends on MODULES -comment "module support disabled" + comment "module support disabled" depends on !MODULES MODVERSIONS directly depends on MODULES, this means it's only visible if @@ -299,6 +313,7 @@ Kconfig syntax The configuration file describes a series of menu entries, where every line starts with a keyword (except help texts). The following keywords end a menu entry: + - config - menuconfig - choice/endchoice @@ -306,17 +321,17 @@ end a menu entry: - menu/endmenu - if/endif - source -The first five also start the definition of a menu entry. 
-config: +The first five also start the definition of a menu entry. +config:: "config" <symbol> <config options> This defines a config symbol <symbol> and accepts any of above attributes as options. -menuconfig: +menuconfig:: "menuconfig" <symbol> <config options> @@ -325,43 +340,43 @@ hint to front ends, that all suboptions should be displayed as a separate list of options. To make sure all the suboptions will really show up under the menuconfig entry and not outside of it, every item from the <config options> list must depend on the menuconfig symbol. -In practice, this is achieved by using one of the next two constructs: - -(1): -menuconfig M -if M - config C1 - config C2 -endif - -(2): -menuconfig M -config C1 - depends on M -config C2 - depends on M +In practice, this is achieved by using one of the next two constructs:: + + (1): + menuconfig M + if M + config C1 + config C2 + endif + + (2): + menuconfig M + config C1 + depends on M + config C2 + depends on M In the following examples (3) and (4), C1 and C2 still have the M dependency, but will not appear under menuconfig M anymore, because -of C0, which doesn't depend on M: - -(3): -menuconfig M - config C0 -if M - config C1 - config C2 -endif - -(4): -menuconfig M -config C0 -config C1 - depends on M -config C2 - depends on M - -choices: +of C0, which doesn't depend on M:: + + (3): + menuconfig M + config C0 + if M + config C1 + config C2 + endif + + (4): + menuconfig M + config C0 + config C1 + depends on M + config C2 + depends on M + +choices:: "choice" [symbol] <choice options> @@ -387,7 +402,7 @@ definitions of that choice. If a [symbol] is associated to the choice, then you may define the same choice (i.e. with the same entries) in another place. -comment: +comment:: "comment" <prompt> <comment options> @@ -396,7 +411,7 @@ This defines a comment which is displayed to the user during the configuration process and is also echoed to the output files. The only possible options are dependencies. 
-menu: +menu:: "menu" <prompt> <menu options> @@ -407,7 +422,7 @@ This defines a menu block, see "Menu structure" above for more information. The only possible options are dependencies and "visible" attributes. -if: +if:: "if" <expr> <if block> @@ -416,13 +431,13 @@ if: This defines an if block. The dependency expression <expr> is appended to all enclosed menu entries. -source: +source:: "source" <prompt> This reads the specified configuration file. This file is always parsed. -mainmenu: +mainmenu:: "mainmenu" <prompt> @@ -452,20 +467,21 @@ that is defined in a common Kconfig file and selected by the relevant architectures. An example is the generic IOMAP functionality. -We would in lib/Kconfig see: +We would in lib/Kconfig see:: -# Generic IOMAP is used to ... -config HAVE_GENERIC_IOMAP + # Generic IOMAP is used to ... + config HAVE_GENERIC_IOMAP -config GENERIC_IOMAP + config GENERIC_IOMAP depends on HAVE_GENERIC_IOMAP && FOO -And in lib/Makefile we would see: -obj-$(CONFIG_GENERIC_IOMAP) += iomap.o +And in lib/Makefile we would see:: -For each architecture using the generic IOMAP functionality we would see: + obj-$(CONFIG_GENERIC_IOMAP) += iomap.o -config X86 +For each architecture using the generic IOMAP functionality we would see:: + + config X86 select ... select HAVE_GENERIC_IOMAP select ... @@ -484,25 +500,25 @@ Adding features that need compiler support There are several features that need compiler support. The recommended way to describe the dependency on the compiler feature is to use "depends on" -followed by a test macro. +followed by a test macro:: -config STACKPROTECTOR + config STACKPROTECTOR bool "Stack Protector buffer overflow detection" depends on $(cc-option,-fstack-protector) ... If you need to expose a compiler capability to makefiles and/or C source files, -CC_HAS_ is the recommended prefix for the config option. 
+`CC_HAS_` is the recommended prefix for the config option:: -config CC_HAS_STACKPROTECTOR_NONE + config CC_HAS_STACKPROTECTOR_NONE def_bool $(cc-option,-fno-stack-protector) Build as module only ~~~~~~~~~~~~~~~~~~~~ To restrict a component build to module-only, qualify its config symbol -with "depends on m". E.g.: +with "depends on m". E.g.:: -config FOO + config FOO depends on BAR && m limits FOO to module (=m) or disabled (=n). @@ -529,18 +545,18 @@ Simple Kconfig recursive issue Read: Documentation/kbuild/Kconfig.recursion-issue-01 -Test with: +Test with:: -make KBUILD_KCONFIG=Documentation/kbuild/Kconfig.recursion-issue-01 allnoconfig + make KBUILD_KCONFIG=Documentation/kbuild/Kconfig.recursion-issue-01 allnoconfig Cumulative Kconfig recursive issue ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Read: Documentation/kbuild/Kconfig.recursion-issue-02 -Test with: +Test with:: -make KBUILD_KCONFIG=Documentation/kbuild/Kconfig.recursion-issue-02 allnoconfig + make KBUILD_KCONFIG=Documentation/kbuild/Kconfig.recursion-issue-02 allnoconfig Practical solutions to kconfig recursive issue ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -551,7 +567,9 @@ historical issues resolved through these different solutions. a) Remove any superfluous "select FOO" or "depends on FOO" b) Match dependency semantics: + b1) Swap all "select FOO" to "depends on FOO" or, + b2) Swap all "depends on FOO" to "select FOO" The resolution to a) can be tested with the sample Kconfig file @@ -566,8 +584,9 @@ Documentation/kbuild/Kconfig.recursion-issue-02. Below is a list of examples of prior fixes for these types of recursive issues; all errors appear to involve one or more select's and one or more "depends on". 
+============ =================================== commit fix -====== === +============ =================================== 06b718c01208 select A -> depends on A c22eacfe82f9 depends on A -> depends on B 6a91e854442c select A -> depends on A @@ -590,6 +609,7 @@ d9f9ab51e55e select A -> depends on A 0c51a4d8abd6 depends on A -> select A (3) e98062ed6dc4 select A -> depends on A (3) 91e5d284a7f1 select A -> (null) +============ =================================== (1) Partial (or no) quote of error. (2) That seems to be the gist of that fix. @@ -616,11 +636,11 @@ Semantics of Kconfig ~~~~~~~~~~~~~~~~~~~~ The use of Kconfig is broad, Linux is now only one of Kconfig's users: -one study has completed a broad analysis of Kconfig use in 12 projects [0]. +one study has completed a broad analysis of Kconfig use in 12 projects [0]_. Despite its widespread use, and although this document does a reasonable job in documenting basic Kconfig syntax a more precise definition of Kconfig semantics is welcomed. One project deduced Kconfig semantics through -the use of the xconfig configurator [1]. Work should be done to confirm if +the use of the xconfig configurator [1]_. Work should be done to confirm if the deduced semantics matches our intended Kconfig design goals. Having well defined semantics can be useful for tools for practical @@ -628,42 +648,42 @@ evaluation of dependencies, for instance one such use known case was work to express in boolean abstraction of the inferred semantics of Kconfig to translate Kconfig logic into boolean formulas and run a SAT solver on this to find dead code / features (always inactive), 114 dead features were found in -Linux using this methodology [1] (Section 8: Threats to validity). +Linux using this methodology [1]_ (Section 8: Threats to validity). Confirming this could prove useful as Kconfig stands as one of the leading -industrial variability modeling languages [1] [2]. 
Its study would help +industrial variability modeling languages [1]_ [2]_. Its study would help evaluate practical uses of such languages, their use was only theoretical and real world requirements were not well understood. As it stands though only reverse engineering techniques have been used to deduce semantics from -variability modeling languages such as Kconfig [3]. +variability modeling languages such as Kconfig [3]_. -[0] http://www.eng.uwaterloo.ca/~shshe/kconfig_semantics.pdf -[1] http://gsd.uwaterloo.ca/sites/default/files/vm-2013-berger.pdf -[2] http://gsd.uwaterloo.ca/sites/default/files/ase241-berger_0.pdf -[3] http://gsd.uwaterloo.ca/sites/default/files/icse2011.pdf +.. [0] http://www.eng.uwaterloo.ca/~shshe/kconfig_semantics.pdf +.. [1] http://gsd.uwaterloo.ca/sites/default/files/vm-2013-berger.pdf +.. [2] http://gsd.uwaterloo.ca/sites/default/files/ase241-berger_0.pdf +.. [3] http://gsd.uwaterloo.ca/sites/default/files/icse2011.pdf Full SAT solver for Kconfig ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Although SAT solvers [0] haven't yet been used by Kconfig directly, as noted in -the previous subsection, work has been done however to express in boolean +Although SAT solvers [4]_ haven't yet been used by Kconfig directly, as noted +in the previous subsection, work has been done however to express in boolean abstraction the inferred semantics of Kconfig to translate Kconfig logic into -boolean formulas and run a SAT solver on it [1]. Another known related project -is CADOS [2] (former VAMOS [3]) and the tools, mainly undertaker [4], which has -been introduced first with [5]. The basic concept of undertaker is to exract -variability models from Kconfig, and put them together with a propositional -formula extracted from CPP #ifdefs and build-rules into a SAT solver in order -to find dead code, dead files, and dead symbols. If using a SAT solver is -desirable on Kconfig one approach would be to evaluate repurposing such efforts -somehow on Kconfig. 
There is enough interest from mentors of existing projects -to not only help advise how to integrate this work upstream but also help -maintain it long term. Interested developers should visit: +boolean formulas and run a SAT solver on it [5]_. Another known related project +is CADOS [6]_ (former VAMOS [7]_) and the tools, mainly undertaker [8]_, which +has been introduced first with [9]_. The basic concept of undertaker is to +extract variability models from Kconfig, and put them together with a +propositional formula extracted from CPP #ifdefs and build-rules into a SAT +solver in order to find dead code, dead files, and dead symbols. If using a SAT +solver is desirable on Kconfig one approach would be to evaluate repurposing +such efforts somehow on Kconfig. There is enough interest from mentors of +existing projects to not only help advise how to integrate this work upstream +but also help maintain it long term. Interested developers should visit: http://kernelnewbies.org/KernelProjects/kconfig-sat -[0] http://www.cs.cornell.edu/~sabhar/chapters/SATSolvers-KR-Handbook.pdf -[1] http://gsd.uwaterloo.ca/sites/default/files/vm-2013-berger.pdf -[2] https://cados.cs.fau.de -[3] https://vamos.cs.fau.de -[4] https://undertaker.cs.fau.de -[5] https://www4.cs.fau.de/Publications/2011/tartler_11_eurosys.pdf +.. [4] http://www.cs.cornell.edu/~sabhar/chapters/SATSolvers-KR-Handbook.pdf +.. [5] http://gsd.uwaterloo.ca/sites/default/files/vm-2013-berger.pdf +.. [6] https://cados.cs.fau.de +.. [7] https://vamos.cs.fau.de +.. [8] https://undertaker.cs.fau.de +.. 
[9] https://www4.cs.fau.de/Publications/2011/tartler_11_eurosys.pdf diff --git a/Documentation/kbuild/kconfig-macro-language.txt b/Documentation/kbuild/kconfig-macro-language.rst index 07da2ea68dce..35b3263b7e40 100644 --- a/Documentation/kbuild/kconfig-macro-language.txt +++ b/Documentation/kbuild/kconfig-macro-language.rst @@ -1,3 +1,7 @@ +====================== +Kconfig macro language +====================== + Concept ------- @@ -7,7 +11,7 @@ targets and prerequisites. The other is a macro language for performing textual substitution. There is clear distinction between the two language stages. For example, you -can write a makefile like follows: +can write a makefile like follows:: APP := foo SRC := foo.c @@ -17,7 +21,7 @@ can write a makefile like follows: $(CC) -o $(APP) $(SRC) The macro language replaces the variable references with their expanded form, -and handles as if the source file were input like follows: +and handles as if the source file were input like follows:: foo: foo.c gcc -o foo foo.c @@ -26,7 +30,7 @@ Then, Make analyzes the dependency graph and determines the targets to be updated. The idea is quite similar in Kconfig - it is possible to describe a Kconfig -file like this: +file like this:: CC := gcc @@ -34,7 +38,7 @@ file like this: def_bool $(shell, $(srctree)/scripts/gcc-check-foo.sh $(CC)) The macro language in Kconfig processes the source file into the following -intermediate: +intermediate:: config CC_HAS_FOO def_bool y @@ -69,7 +73,7 @@ variable. The righthand side of += is expanded immediately if the lefthand side was originally defined as a simple variable. Otherwise, its evaluation is deferred. -The variable reference can take parameters, in the following form: +The variable reference can take parameters, in the following form:: $(name,arg1,arg2,arg3) @@ -141,7 +145,7 @@ Make vs Kconfig Kconfig adopts Make-like macro language, but the function call syntax is slightly different. 
-A function call in Make looks like this: +A function call in Make looks like this:: $(func-name arg1,arg2,arg3) @@ -149,14 +153,14 @@ The function name and the first argument are separated by at least one whitespace. Then, leading whitespaces are trimmed from the first argument, while whitespaces in the other arguments are kept. You need to use a kind of trick to start the first parameter with spaces. For example, if you want -to make "info" function print " hello", you can write like follows: +to make "info" function print " hello", you can write like follows:: empty := space := $(empty) $(empty) $(info $(space)$(space)hello) Kconfig uses only commas for delimiters, and keeps all whitespaces in the -function call. Some people prefer putting a space after each comma delimiter: +function call. Some people prefer putting a space after each comma delimiter:: $(func-name, arg1, arg2, arg3) @@ -166,7 +170,7 @@ Make - for example, $(subst .c, .o, $(sources)) is a typical mistake; it replaces ".c" with " .o". In Make, a user-defined function is referenced by using a built-in function, -'call', like this: +'call', like this:: $(call my-func,arg1,arg2,arg3) @@ -179,12 +183,12 @@ Likewise, $(info hello, world) prints "hello, world" to stdout. You could say this is _useful_ inconsistency. In Kconfig, for simpler implementation and grammatical consistency, commas that -appear in the $( ) context are always delimiters. It means +appear in the $( ) context are always delimiters. It means:: $(shell, echo hello, world) is an error because it is passing two parameters where the 'shell' function -accepts only one. To pass commas in arguments, you can use the following trick: +accepts only one. To pass commas in arguments, you can use the following trick:: comma := , $(shell, echo hello$(comma) world) @@ -195,7 +199,7 @@ Caveats A variable (or function) cannot be expanded across tokens. So, you cannot use a variable as a shorthand for an expression that consists of multiple tokens. 
-The following works: +The following works:: RANGE_MIN := 1 RANGE_MAX := 3 @@ -204,7 +208,7 @@ The following works: int "foo" range $(RANGE_MIN) $(RANGE_MAX) -But, the following does not work: +But, the following does not work:: RANGES := 1 3 @@ -213,7 +217,7 @@ But, the following does not work: range $(RANGES) A variable cannot be expanded to any keyword in Kconfig. The following does -not work: +not work:: MY_TYPE := tristate @@ -223,7 +227,8 @@ not work: Obviously from the design, $(shell command) is expanded in the textual substitution phase. You cannot pass symbols to the 'shell' function. -The following does not work as expected. + +The following does not work as expected:: config ENDIAN_FLAG string @@ -234,7 +239,7 @@ The following does not work as expected. def_bool $(shell $(srctree)/scripts/gcc-check-flag ENDIAN_FLAG) Instead, you can do like follows so that any function call is statically -expanded. +expanded:: config CC_HAS_ENDIAN_FLAG bool diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.rst index 68c82914c0f3..88129af7e539 100644 --- a/Documentation/kbuild/kconfig.txt +++ b/Documentation/kbuild/kconfig.rst @@ -1,4 +1,8 @@ -This file contains some assistance for using "make *config". +=================== +Kconfig make config +=================== + +This file contains some assistance for using `make *config`. Use "make help" to list all of the possible configuration targets. @@ -6,9 +10,8 @@ The xconfig ('qconf'), menuconfig ('mconf'), and nconfig ('nconf') programs also have embedded help text. Be sure to check that for navigation, search, and other general help text. -====================================================================== General --------------------------------------------------- +------- New kernel releases often introduce new config symbols. Often more important, new kernel releases may rename config symbols. 
When @@ -17,51 +20,55 @@ this happens, using a previously working .config file and running for you, so you may find that you need to see what NEW kernel symbols have been introduced. -To see a list of new config symbols, use +To see a list of new config symbols, use:: cp user/some/old.config .config make listnewconfig and the config program will list any new symbols, one per line. -Alternatively, you can use the brute force method: +Alternatively, you can use the brute force method:: make oldconfig scripts/diffconfig .config.old .config | less -______________________________________________________________________ -Environment variables for '*config' +---------------------------------------------------------------------- + +Environment variables for `*config` KCONFIG_CONFIG --------------------------------------------------- +-------------- This environment variable can be used to specify a default kernel config file name to override the default name of ".config". KCONFIG_OVERWRITECONFIG --------------------------------------------------- +----------------------- If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not break symlinks when .config is a symlink to somewhere else. -CONFIG_ --------------------------------------------------- -If you set CONFIG_ in the environment, Kconfig will prefix all symbols +`CONFIG_` +--------- +If you set `CONFIG_` in the environment, Kconfig will prefix all symbols with its value when saving the configuration, instead of using the default, -"CONFIG_". +`CONFIG_`. 
+ +---------------------------------------------------------------------- -______________________________________________________________________ Environment variables for '{allyes/allmod/allno/rand}config' KCONFIG_ALLCONFIG --------------------------------------------------- +----------------- (partially based on lkml email from/by Rob Landley, re: miniconfig) + -------------------------------------------------- + The allyesconfig/allmodconfig/allnoconfig/randconfig variants can also use the environment variable KCONFIG_ALLCONFIG as a flag or a filename that contains config symbols that the user requires to be set to a specific value. If KCONFIG_ALLCONFIG is used without a filename where -KCONFIG_ALLCONFIG == "" or KCONFIG_ALLCONFIG == "1", "make *config" +KCONFIG_ALLCONFIG == "" or KCONFIG_ALLCONFIG == "1", `make *config` checks for a file named "all{yes/mod/no/def/random}.config" -(corresponding to the *config command that was used) for symbol values +(corresponding to the `*config` command that was used) for symbol values that are to be forced. If this file is not found, it checks for a file named "all.config" to contain forced values. @@ -74,43 +81,55 @@ This 'KCONFIG_ALLCONFIG' file is a config file which contains (usually a subset of all) preset config symbols. These variable settings are still subject to normal dependency checks. -Examples: +Examples:: + KCONFIG_ALLCONFIG=custom-notebook.config make allnoconfig -or + +or:: + KCONFIG_ALLCONFIG=mini.config make allnoconfig -or + +or:: + make KCONFIG_ALLCONFIG=mini.config allnoconfig These examples will disable most options (allnoconfig) but enable or disable the options that are explicitly listed in the specified mini-config files. 
-______________________________________________________________________ +---------------------------------------------------------------------- + Environment variables for 'randconfig' KCONFIG_SEED --------------------------------------------------- +------------ You can set this to the integer value used to seed the RNG, if you want to somehow debug the behaviour of the kconfig parser/frontends. If not set, the current time will be used. KCONFIG_PROBABILITY --------------------------------------------------- +------------------- This variable can be used to skew the probabilities. This variable can be unset or empty, or set to three different formats: + + ======================= ================== ===================== KCONFIG_PROBABILITY y:n split y:m:n split - ----------------------------------------------------------------- + ======================= ================== ===================== unset or empty 50 : 50 33 : 33 : 34 N N : 100-N N/2 : N/2 : 100-N [1] N:M N+M : 100-(N+M) N : M : 100-(N+M) [2] N:M:L N : 100-N M : L : 100-(M+L) + ======================= ================== ===================== where N, M and L are integers (in base 10) in the range [0,100], and so that: + [1] N+M is in the range [0,100] + [2] M+L is in the range [0,100] -Examples: +Examples:: + KCONFIG_PROBABILITY=10 10% of booleans will be set to 'y', 90% to 'n' 5% of tristates will be set to 'y', 5% to 'm', 90% to 'n' @@ -121,34 +140,36 @@ Examples: 10% of booleans will be set to 'y', 90% to 'n' 15% of tristates will be set to 'y', 15% to 'm', 70% to 'n' -______________________________________________________________________ +---------------------------------------------------------------------- + Environment variables for 'syncconfig' KCONFIG_NOSILENTUPDATE --------------------------------------------------- +---------------------- If this variable has a non-blank value, it prevents silent kernel config updates (requires explicit updates). 
KCONFIG_AUTOCONFIG --------------------------------------------------- +------------------ This environment variable can be set to specify the path & name of the "auto.conf" file. Its default value is "include/config/auto.conf". KCONFIG_TRISTATE --------------------------------------------------- +---------------- This environment variable can be set to specify the path & name of the "tristate.conf" file. Its default value is "include/config/tristate.conf". KCONFIG_AUTOHEADER --------------------------------------------------- +------------------ This environment variable can be set to specify the path & name of the "autoconf.h" (header) file. Its default value is "include/generated/autoconf.h". -====================================================================== +---------------------------------------------------------------------- + menuconfig --------------------------------------------------- +---------- SEARCHING for CONFIG symbols @@ -158,7 +179,8 @@ Searching in menuconfig: names, so you have to know something close to what you are looking for. - Example: + Example:: + /hotplug This lists all config symbols that contain "hotplug", e.g., HOTPLUG_CPU, MEMORY_HOTPLUG. @@ -166,48 +188,55 @@ Searching in menuconfig: For search help, enter / followed by TAB-TAB (to highlight <Help>) and Enter. This will tell you that you can also use regular expressions (regexes) in the search string, so if you - are not interested in MEMORY_HOTPLUG, you could try + are not interested in MEMORY_HOTPLUG, you could try:: /^hotplug When searching, symbols are sorted thus: + - first, exact matches, sorted alphabetically (an exact match is when the search matches the complete symbol name); - then, other matches, sorted alphabetically. + For example: ^ATH.K matches: + ATH5K ATH9K ATH5K_AHB ATH5K_DEBUG [...] ATH6KL ATH6KL_DEBUG [...] ATH9K_AHB ATH9K_BTCOEX_SUPPORT ATH9K_COMMON [...] 
+ of which only ATH5K and ATH9K match exactly and so are sorted first (and in alphabetical order), then come all other symbols, sorted in alphabetical order. -______________________________________________________________________ +---------------------------------------------------------------------- + User interface options for 'menuconfig' MENUCONFIG_COLOR --------------------------------------------------- +---------------- It is possible to select different color themes using the variable -MENUCONFIG_COLOR. To select a theme use: +MENUCONFIG_COLOR. To select a theme use:: make MENUCONFIG_COLOR=<theme> menuconfig -Available themes are: - mono => selects colors suitable for monochrome displays - blackbg => selects a color scheme with black background - classic => theme with blue background. The classic look - bluetitle => a LCD friendly version of classic. (default) +Available themes are:: + + - mono => selects colors suitable for monochrome displays + - blackbg => selects a color scheme with black background + - classic => theme with blue background. The classic look + - bluetitle => a LCD friendly version of classic. (default) MENUCONFIG_MODE --------------------------------------------------- +--------------- This mode shows all sub-menus in one large tree. -Example: +Example:: + make MENUCONFIG_MODE=single_menu menuconfig +---------------------------------------------------------------------- -====================================================================== nconfig --------------------------------------------------- +------- nconfig is an alternate text-based configurator. It lists function keys across the bottom of the terminal (window) that execute commands. @@ -231,16 +260,16 @@ Searching in nconfig: given string or regular expression (regex). NCONFIG_MODE --------------------------------------------------- +------------ This mode shows all sub-menus in one large tree. 
-Example: +Example:: make NCONFIG_MODE=single_menu nconfig +---------------------------------------------------------------------- -====================================================================== xconfig --------------------------------------------------- +------- Searching in xconfig: @@ -260,13 +289,12 @@ Searching in xconfig: to return to the main menu. -====================================================================== +---------------------------------------------------------------------- + gconfig --------------------------------------------------- +------- Searching in gconfig: There is no search command in gconfig. However, gconfig does have several different viewing choices, modes, and options. - -### diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.rst index d65ad5746f94..9274cdcc9bd2 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.rst @@ -1,8 +1,10 @@ +====================== Linux Kernel Makefiles +====================== This document describes the Linux kernel Makefiles. -=== Table of Contents +.. Table of Contents === 1 Overview === 2 Who does what @@ -54,9 +56,10 @@ This document describes the Linux kernel Makefiles. === 10 Credits === 11 TODO -=== 1 Overview +1 Overview +========== -The Makefiles have five parts: +The Makefiles have five parts:: Makefile the top Makefile. .config the kernel configuration file. @@ -85,7 +88,8 @@ scripts/Makefile.* contains all the definitions/rules etc. that are used to build the kernel based on the kbuild makefiles. -=== 2 Who does what +2 Who does what +=============== People have four different relationships with the kernel Makefiles. @@ -110,7 +114,8 @@ These people need to know about all aspects of the kernel Makefiles. This document is aimed towards normal developers and arch developers. 
-=== 3 The kbuild files +3 The kbuild files +================== Most Makefiles within the kernel are kbuild Makefiles that use the kbuild infrastructure. This chapter introduces the syntax used in the @@ -122,7 +127,8 @@ file will be used. Section 3.1 "Goal definitions" is a quick intro, further chapters provide more details, with real examples. ---- 3.1 Goal definitions +3.1 Goal definitions +-------------------- Goal definitions are the main part (heart) of the kbuild Makefile. These lines define the files to be built, any special compilation @@ -130,7 +136,8 @@ more details, with real examples. The most simple kbuild makefile contains one line: - Example: + Example:: + obj-y += foo.o This tells kbuild that there is one object in that directory, named @@ -139,14 +146,16 @@ more details, with real examples. If foo.o shall be built as a module, the variable obj-m is used. Therefore the following pattern is often used: - Example: + Example:: + obj-$(CONFIG_FOO) += foo.o $(CONFIG_FOO) evaluates to either y (for built-in) or m (for module). If CONFIG_FOO is neither y nor m, then the file will not be compiled nor linked. ---- 3.2 Built-in object goals - obj-y +3.2 Built-in object goals - obj-y +--------------------------------- The kbuild Makefile specifies object files for vmlinux in the $(obj-y) lists. These lists depend on the kernel @@ -167,14 +176,16 @@ more details, with real examples. order may e.g. change the order in which your SCSI controllers are detected, and thus your disks are renumbered. - Example: + Example:: + #drivers/isdn/i4l/Makefile # Makefile for the kernel ISDN subsystem and device drivers. # Each configuration option enables a list of files. obj-$(CONFIG_ISDN_I4L) += isdn.o obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o ---- 3.3 Loadable module goals - obj-m +3.3 Loadable module goals - obj-m +--------------------------------- $(obj-m) specifies object files which are built as loadable kernel modules. 
@@ -183,7 +194,8 @@ more details, with real examples. files. In the case of one source file, the kbuild makefile simply adds the file to $(obj-m). - Example: + Example:: + #drivers/isdn/i4l/Makefile obj-$(CONFIG_ISDN_PPP_BSDCOMP) += isdn_bsdcomp.o @@ -195,7 +207,8 @@ more details, with real examples. module from, so you have to tell it by setting a $(<module_name>-y) variable. - Example: + Example:: + #drivers/isdn/i4l/Makefile obj-$(CONFIG_ISDN_I4L) += isdn.o isdn-y := isdn_net_lib.o isdn_v110.o isdn_common.o @@ -205,10 +218,11 @@ more details, with real examples. "$(LD) -r" on the list of these files to generate isdn.o. Due to kbuild recognizing $(<module_name>-y) for composite objects, - you can use the value of a CONFIG_ symbol to optionally include an + you can use the value of a `CONFIG_` symbol to optionally include an object file as part of a composite object. - Example: + Example:: + #fs/ext2/Makefile obj-$(CONFIG_EXT2_FS) += ext2.o ext2-y := balloc.o dir.o file.o ialloc.o inode.o ioctl.o \ @@ -225,12 +239,14 @@ more details, with real examples. kbuild will build an ext2.o file for you out of the individual parts and then link this into built-in.a, as you would expect. ---- 3.4 Objects which export symbols +3.4 Objects which export symbols +-------------------------------- No special notation is required in the makefiles for modules exporting symbols. ---- 3.5 Library file goals - lib-y +3.5 Library file goals - lib-y +------------------------------ Objects listed with obj-* are used for modules, or combined in a built-in.a for that specific directory. @@ -247,18 +263,21 @@ more details, with real examples. and to be part of a library. Therefore the same directory may contain both a built-in.a and a lib.a file. - Example: + Example:: + #arch/x86/lib/Makefile lib-y := delay.o This will create a library lib.a based on delay.o. For kbuild to actually recognize that there is a lib.a being built, the directory shall be listed in libs-y. 
+ See also "6.4 List directories to visit when descending". - Use of lib-y is normally restricted to lib/ and arch/*/lib. + Use of lib-y is normally restricted to `lib/` and `arch/*/lib`. ---- 3.6 Descending down in directories +3.6 Descending down in directories +---------------------------------- A Makefile is only responsible for building objects in its own directory. Files in subdirectories should be taken care of by @@ -270,7 +289,8 @@ more details, with real examples. ext2 lives in a separate directory, and the Makefile present in fs/ tells kbuild to descend down using the following assignment. - Example: + Example:: + #fs/Makefile obj-$(CONFIG_EXT2_FS) += ext2/ @@ -281,11 +301,12 @@ more details, with real examples. the directory, it is the Makefile in the subdirectory that specifies what is modular and what is built-in. - It is good practice to use a CONFIG_ variable when assigning directory + It is good practice to use a `CONFIG_` variable when assigning directory names. This allows kbuild to totally skip the directory if the - corresponding CONFIG_ option is neither 'y' nor 'm'. + corresponding `CONFIG_` option is neither 'y' nor 'm'. ---- 3.7 Compilation flags +3.7 Compilation flags +--------------------- ccflags-y, asflags-y and ldflags-y These three flags apply only to the kbuild makefile in which they @@ -297,7 +318,8 @@ more details, with real examples. ccflags-y specifies options for compiling with $(CC). - Example: + Example:: + # drivers/acpi/acpica/Makefile ccflags-y := -Os -D_LINUX -DBUILDING_ACPICA ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT @@ -308,13 +330,15 @@ more details, with real examples. asflags-y specifies options for assembling with $(AS). - Example: + Example:: + #arch/sparc/kernel/Makefile asflags-y := -ansi ldflags-y specifies options for linking with $(LD). 
- Example: + Example:: + #arch/cris/boot/compressed/Makefile ldflags-y += -T $(srctree)/$(src)/decompress_$(arch-y).lds @@ -325,18 +349,19 @@ more details, with real examples. Options specified using subdir-* are added to the commandline before the options specified using the non-subdir variants. - Example: + Example:: + subdir-ccflags-y := -Werror CFLAGS_$@, AFLAGS_$@ - CFLAGS_$@ and AFLAGS_$@ only apply to commands in current kbuild makefile. $(CFLAGS_$@) specifies per-file options for $(CC). The $@ part has a literal value which specifies the file that it is for. - Example: + Example:: + # drivers/scsi/Makefile CFLAGS_aha152x.o = -DAHA152X_STAT -DAUTOCONF CFLAGS_gdth.o = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ \ @@ -347,24 +372,27 @@ more details, with real examples. $(AFLAGS_$@) is a similar feature for source files in assembly languages. - Example: + Example:: + # arch/arm/kernel/Makefile AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) AFLAGS_crunch-bits.o := -Wa,-mcpu=ep9312 AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt ---- 3.9 Dependency tracking +3.9 Dependency tracking +----------------------- Kbuild tracks dependencies on the following: - 1) All prerequisite files (both *.c and *.h) - 2) CONFIG_ options used in all prerequisite files + 1) All prerequisite files (both `*.c` and `*.h`) + 2) `CONFIG_` options used in all prerequisite files 3) Command-line used to compile target Thus, if you change an option to $(CC) all affected files will be re-compiled. ---- 3.10 Special Rules +3.10 Special Rules +------------------ Special rules are used when the kbuild infrastructure does not provide the required support. A typical example is @@ -379,43 +407,47 @@ more details, with real examples. Two variables are used when defining special rules: - $(src) - $(src) is a relative path which points to the directory - where the Makefile is located. Always use $(src) when - referring to files located in the src tree. 
+ $(src) + $(src) is a relative path which points to the directory + where the Makefile is located. Always use $(src) when + referring to files located in the src tree. + + $(obj) + $(obj) is a relative path which points to the directory + where the target is saved. Always use $(obj) when + referring to generated files. - $(obj) - $(obj) is a relative path which points to the directory - where the target is saved. Always use $(obj) when - referring to generated files. + Example:: - Example: #drivers/scsi/Makefile $(obj)/53c8xx_d.h: $(src)/53c7,8xx.scr $(src)/script_asm.pl $(CPP) -DCHIP=810 - < $< | ... $(src)/script_asm.pl - This is a special rule, following the normal syntax - required by make. - The target file depends on two prerequisite files. References - to the target file are prefixed with $(obj), references - to prerequisites are referenced with $(src) (because they are not - generated files). - - $(kecho) - echoing information to user in a rule is often a good practice - but when execution "make -s" one does not expect to see any output - except for warnings/errors. - To support this kbuild defines $(kecho) which will echo out the - text following $(kecho) to stdout except if "make -s" is used. - - Example: + This is a special rule, following the normal syntax + required by make. + + The target file depends on two prerequisite files. References + to the target file are prefixed with $(obj), references + to prerequisites are referenced with $(src) (because they are not + generated files). + + $(kecho) + echoing information to user in a rule is often a good practice + but when executing "make -s" one does not expect to see any output + except for warnings/errors. + To support this kbuild defines $(kecho) which will echo out the + text following $(kecho) to stdout except if "make -s" is used.
+ + Example:: + #arch/blackfin/boot/Makefile $(obj)/vmImage: $(obj)/vmlinux.gz $(call if_changed,uimage) @$(kecho) 'Kernel: $@ is ready' ---- 3.11 $(CC) support functions +3.11 $(CC) support functions +---------------------------- The kernel may be built with several different versions of $(CC), each supporting a unique set of features and options. @@ -425,10 +457,11 @@ more details, with real examples. as-option as-option is used to check if $(CC) -- when used to compile - assembler (*.S) files -- supports the given option. An optional + assembler (`*.S`) files -- supports the given option. An optional second option may be specified if the first option is not supported. - Example: + Example:: + #arch/sh/Makefile cflags-y += $(call as-option,-Wa$(comma)-isa=$(isa-y),) @@ -437,6 +470,21 @@ more details, with real examples. The second argument is optional, and if supplied will be used if first argument is not supported. + cc-ldoption + cc-ldoption is used to check if $(CC) when used to link object files + supports the given option. An optional second option may be + specified if the first option is not supported. + + Example:: + + #arch/x86/kernel/Makefile + vsyscall-flags += $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) + + In the above example, vsyscall-flags will be assigned the option + -Wl$(comma)--hash-style=sysv if it is supported by $(CC). + The second argument is optional, and if supplied will be used + if first argument is not supported. + as-instr as-instr checks if the assembler reports a specific instruction and then outputs either option1 or option2 @@ -447,7 +495,8 @@ more details, with real examples. cc-option is used to check if $(CC) supports a given option, and if not supported to use an optional second option. - Example: + Example:: + #arch/x86/Makefile cflags-y += $(call cc-option,-march=pentium-mmx,-march=i586) @@ -461,7 +510,8 @@ more details, with real examples.
cc-option-yn is used to check if gcc supports a given option and return 'y' if supported, otherwise 'n'. - Example: + Example:: + #arch/ppc/Makefile biarch := $(call cc-option-yn, -m32) aflags-$(biarch) += -a32 @@ -479,7 +529,8 @@ more details, with real examples. because gcc 4.4 and later accept any unknown -Wno-* option and only warn about it if there is another warning in the source file. - Example: + Example:: + KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) In the above example, -Wno-unused-but-set-variable will be added to @@ -490,7 +541,8 @@ more details, with real examples. if version expression is true, or the fifth (if given) if the version expression is false. - Example: + Example:: + #fs/reiserfs/Makefile ccflags-y := $(call cc-ifversion, -lt, 0402, -O1) @@ -515,7 +567,8 @@ more details, with real examples. build (host arch is different from target arch). And if CROSS_COMPILE is already set then leave it with the old value. - Example: + Example:: + #arch/m68k/Makefile ifneq ($(SUBARCH),$(ARCH)) ifeq ($(CROSS_COMPILE),) @@ -523,7 +576,8 @@ more details, with real examples. endif endif ---- 3.12 $(LD) support functions +3.12 $(LD) support functions +---------------------------- ld-option ld-option is used to check if $(LD) supports the supplied option. @@ -531,12 +585,14 @@ more details, with real examples. The second argument is an optional option that can be used if the first option is not supported by $(LD). - Example: + Example:: + #Makefile LDFLAGS_vmlinux += $(call ld-option, -X) -=== 4 Host Program support +4 Host Program support +====================== Kbuild supports building executables on the host for use during the compilation stage. @@ -550,21 +606,24 @@ This can be done in two ways. Either add the dependency in a rule, or utilise the variable $(always). Both possibilities are described in the following. 
---- 4.1 Simple Host Program +4.1 Simple Host Program +----------------------- In some cases there is a need to compile and run a program on the computer where the build is running. The following line tells kbuild that the program bin2hex shall be built on the build host. - Example: + Example:: + hostprogs-y := bin2hex Kbuild assumes in the above example that bin2hex is made from a single c-source file named bin2hex.c located in the same directory as the Makefile. ---- 4.2 Composite Host Programs +4.2 Composite Host Programs +--------------------------- Host programs can be made up based on composite objects. The syntax used to define composite objects for host programs is @@ -572,7 +631,8 @@ Both possibilities are described in the following. $(<executable>-objs) lists all objects used to link the final executable. - Example: + Example:: + #scripts/lxdialog/Makefile hostprogs-y := lxdialog lxdialog-objs := checklist.o lxdialog.o @@ -580,16 +640,19 @@ Both possibilities are described in the following. Objects with extension .o are compiled from the corresponding .c files. In the above example, checklist.c is compiled to checklist.o and lxdialog.c is compiled to lxdialog.o. + Finally, the two .o files are linked to the executable, lxdialog. Note: The syntax <executable>-y is not permitted for host-programs. ---- 4.3 Using C++ for host programs +4.3 Using C++ for host programs +------------------------------- kbuild offers support for host programs written in C++. This was introduced solely to support kconfig, and is not recommended for general use. - Example: + Example:: + #scripts/kconfig/Makefile hostprogs-y := qconf qconf-cxxobjs := qconf.o @@ -600,13 +663,15 @@ Both possibilities are described in the following. If qconf is composed of a mixture of .c and .cc files, then an additional line can be used to identify this. 
- Example: + Example:: + #scripts/kconfig/Makefile hostprogs-y := qconf qconf-cxxobjs := qconf.o qconf-objs := check.o ---- 4.4 Controlling compiler options for host programs +4.4 Controlling compiler options for host programs +-------------------------------------------------- When compiling host programs, it is possible to set specific flags. The programs will always be compiled utilising $(HOSTCC) passed @@ -614,27 +679,31 @@ Both possibilities are described in the following. To set flags that will take effect for all host programs created in that Makefile, use the variable HOST_EXTRACFLAGS. - Example: + Example:: + #scripts/lxdialog/Makefile HOST_EXTRACFLAGS += -I/usr/include/ncurses To set specific flags for a single file the following construction is used: - Example: + Example:: + #arch/ppc64/boot/Makefile HOSTCFLAGS_piggyback.o := -DKERNELBASE=$(KERNELBASE) It is also possible to specify additional options to the linker. - Example: + Example:: + #scripts/kconfig/Makefile HOSTLDLIBS_qconf := -L$(QTDIR)/lib When linking qconf, it will be passed the extra option "-L$(QTDIR)/lib". ---- 4.5 When host programs are actually built +4.5 When host programs are actually built +----------------------------------------- Kbuild will only build host-programs when they are referenced as a prerequisite. @@ -642,7 +711,8 @@ Both possibilities are described in the following. (1) List the prerequisite explicitly in a special rule. - Example: + Example:: + #drivers/pci/Makefile hostprogs-y := gen-devlist $(obj)/devlist.h: $(src)/pci.ids $(obj)/gen-devlist @@ -653,11 +723,13 @@ Both possibilities are described in the following. the host programs in special rules must be prefixed with $(obj). (2) Use $(always) + When there is no suitable special rule, and the host program shall be built when a makefile is entered, the $(always) variable shall be used. 
- Example: + Example:: + #scripts/lxdialog/Makefile hostprogs-y := lxdialog always := $(hostprogs-y) @@ -665,11 +737,13 @@ Both possibilities are described in the following. This will tell kbuild to build lxdialog even if not referenced in any rule. ---- 4.6 Using hostprogs-$(CONFIG_FOO) +4.6 Using hostprogs-$(CONFIG_FOO) +--------------------------------- A typical pattern in a Kbuild file looks like this: - Example: + Example:: + #scripts/Makefile hostprogs-$(CONFIG_KALLSYMS) += kallsyms @@ -679,7 +753,8 @@ Both possibilities are described in the following. like hostprogs-y. But only hostprogs-y is recommended to be used when no CONFIG symbols are involved. -=== 5 Kbuild clean infrastructure +5 Kbuild clean infrastructure +============================= "make clean" deletes most generated files in the obj tree where the kernel is compiled. This includes generated files such as host programs. @@ -691,7 +766,8 @@ generated by kbuild are deleted all over the kernel src tree when Additional files can be specified in kbuild makefiles by use of $(clean-files). - Example: + Example:: + #lib/Makefile clean-files := crc32table.h @@ -701,7 +777,8 @@ Makefile, except if prefixed with $(objtree). To delete a directory hierarchy use: - Example: + Example:: + #scripts/package/Makefile clean-dirs := $(objtree)/debian/ @@ -711,7 +788,8 @@ subdirectories. To exclude certain files from make clean, use the $(no-clean-files) variable. This is only a special case used in the top level Kbuild file: - Example: + Example:: + #Kbuild no-clean-files := $(bounds-file) $(offsets-file) @@ -719,7 +797,8 @@ Usually kbuild descends down in subdirectories due to "obj-* := dir/", but in the architecture makefiles where the kbuild infrastructure is not sufficient this sometimes needs to be explicit. - Example: + Example:: + #arch/x86/boot/Makefile subdir- := compressed/ @@ -729,7 +808,8 @@ directory compressed/ when "make clean" is executed. 
To support the clean infrastructure in the Makefiles that build the final bootimage there is an optional target named archclean: - Example: + Example:: + #arch/x86/Makefile archclean: $(Q)$(MAKE) $(clean)=arch/x86/boot @@ -745,7 +825,8 @@ is not operational at that point. Note 2: All directories listed in core-y, libs-y, drivers-y and net-y will be visited during "make clean". -=== 6 Architecture Makefiles +6 Architecture Makefiles +======================== The top level Makefile sets up the environment and does the preparation, before starting to descend down in the individual directories. @@ -756,6 +837,7 @@ To do so, arch/$(ARCH)/Makefile sets up a number of variables and defines a few targets. When kbuild executes, the following steps are followed (roughly): + 1) Configuration of the kernel => produce .config 2) Store kernel version in include/linux/version.h 3) Updating all other prerequisites to the target prepare: @@ -773,37 +855,45 @@ When kbuild executes, the following steps are followed (roughly): - Preparing initrd images and the like ---- 6.1 Set variables to tweak the build to the architecture +6.1 Set variables to tweak the build to the architecture +-------------------------------------------------------- - LDFLAGS Generic $(LD) options + LDFLAGS + Generic $(LD) options Flags used for all invocations of the linker. Often specifying the emulation is sufficient. - Example: + Example:: + #arch/s390/Makefile LDFLAGS := -m elf_s390 + Note: ldflags-y can be used to further customise the flags used. See chapter 3.7. - LDFLAGS_vmlinux Options for $(LD) when linking vmlinux + LDFLAGS_vmlinux + Options for $(LD) when linking vmlinux LDFLAGS_vmlinux is used to specify additional flags to pass to the linker when linking the final vmlinux image. LDFLAGS_vmlinux uses the LDFLAGS_$@ support. 
- Example: + Example:: + #arch/x86/Makefile LDFLAGS_vmlinux := -e stext - OBJCOPYFLAGS objcopy flags + OBJCOPYFLAGS + objcopy flags When $(call if_changed,objcopy) is used to translate a .o file, the flags specified in OBJCOPYFLAGS will be used. $(call if_changed,objcopy) is often used to generate raw binaries on vmlinux. - Example: + Example:: + #arch/s390/Makefile OBJCOPYFLAGS := -O binary @@ -814,30 +904,34 @@ When kbuild executes, the following steps are followed (roughly): In this example, the binary $(obj)/image is a binary version of vmlinux. The usage of $(call if_changed,xxx) will be described later. - KBUILD_AFLAGS $(AS) assembler flags + KBUILD_AFLAGS + $(AS) assembler flags Default value - see top level Makefile Append or modify as required per architecture. - Example: + Example:: + #arch/sparc64/Makefile KBUILD_AFLAGS += -m64 -mcpu=ultrasparc - KBUILD_CFLAGS $(CC) compiler flags + KBUILD_CFLAGS + $(CC) compiler flags Default value - see top level Makefile Append or modify as required per architecture. Often, the KBUILD_CFLAGS variable depends on the configuration. - Example: + Example:: + #arch/x86/boot/compressed/Makefile cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small KBUILD_CFLAGS += $(cflags-y) Many arch Makefiles dynamically run the target C compiler to - probe supported options: + probe supported options:: #arch/x86/Makefile @@ -853,32 +947,39 @@ When kbuild executes, the following steps are followed (roughly): The first example utilises the trick that a config option expands to 'y' when selected. - KBUILD_AFLAGS_KERNEL $(AS) options specific for built-in + KBUILD_AFLAGS_KERNEL + $(AS) options specific for built-in $(KBUILD_AFLAGS_KERNEL) contains extra C compiler flags used to compile resident kernel code. 
- KBUILD_AFLAGS_MODULE Options for $(AS) when building modules + KBUILD_AFLAGS_MODULE + Options for $(AS) when building modules $(KBUILD_AFLAGS_MODULE) is used to add arch-specific options that are used for $(AS). + From commandline AFLAGS_MODULE shall be used (see kbuild.txt). - KBUILD_CFLAGS_KERNEL $(CC) options specific for built-in + KBUILD_CFLAGS_KERNEL + $(CC) options specific for built-in $(KBUILD_CFLAGS_KERNEL) contains extra C compiler flags used to compile resident kernel code. - KBUILD_CFLAGS_MODULE Options for $(CC) when building modules + KBUILD_CFLAGS_MODULE + Options for $(CC) when building modules $(KBUILD_CFLAGS_MODULE) is used to add arch-specific options that are used for $(CC). From commandline CFLAGS_MODULE shall be used (see kbuild.txt). - KBUILD_LDFLAGS_MODULE Options for $(LD) when linking modules + KBUILD_LDFLAGS_MODULE + Options for $(LD) when linking modules $(KBUILD_LDFLAGS_MODULE) is used to add arch-specific options used when linking modules. This is often a linker script. + From commandline LDFLAGS_MODULE shall be used (see kbuild.txt). KBUILD_ARFLAGS Options for $(AR) when creating archives @@ -894,7 +995,8 @@ When kbuild executes, the following steps are followed (roughly): means for an architecture to override the defaults. ---- 6.2 Add prerequisites to archheaders: +6.2 Add prerequisites to archheaders +------------------------------------ The archheaders: rule is used to generate header files that may be installed into user space by "make header_install" or @@ -907,13 +1009,15 @@ When kbuild executes, the following steps are followed (roughly): architecture itself. ---- 6.3 Add prerequisites to archprepare: +6.3 Add prerequisites to archprepare +------------------------------------ The archprepare: rule is used to list prerequisites that need to be built before starting to descend down in the subdirectories. This is usually used for header files containing assembler constants. 
- Example: + Example:: + #arch/arm/Makefile archprepare: maketools @@ -923,7 +1027,8 @@ When kbuild executes, the following steps are followed (roughly): generating offset header files. ---- 6.4 List directories to visit when descending +6.4 List directories to visit when descending +--------------------------------------------- An arch Makefile cooperates with the top Makefile to define variables which specify how to build the vmlinux file. Note that there is no @@ -931,28 +1036,34 @@ When kbuild executes, the following steps are followed (roughly): machinery is all architecture-independent. - head-y, init-y, core-y, libs-y, drivers-y, net-y + head-y, init-y, core-y, libs-y, drivers-y, net-y + $(head-y) lists objects to be linked first in vmlinux. + + $(libs-y) lists directories where a lib.a archive can be located. + + The rest list directories where a built-in.a object file can be + located. - $(head-y) lists objects to be linked first in vmlinux. - $(libs-y) lists directories where a lib.a archive can be located. - The rest list directories where a built-in.a object file can be - located. + $(init-y) objects will be located after $(head-y). - $(init-y) objects will be located after $(head-y). - Then the rest follows in this order: - $(core-y), $(libs-y), $(drivers-y) and $(net-y). + Then the rest follows in this order: - The top level Makefile defines values for all generic directories, - and arch/$(ARCH)/Makefile only adds architecture-specific directories. + $(core-y), $(libs-y), $(drivers-y) and $(net-y). + + The top level Makefile defines values for all generic directories, + and arch/$(ARCH)/Makefile only adds architecture-specific + directories. 
+ + Example:: - Example: #arch/sparc64/Makefile core-y += arch/sparc64/kernel/ libs-y += arch/sparc64/prom/ arch/sparc64/lib/ drivers-$(CONFIG_OPROFILE) += arch/sparc64/oprofile/ ---- 6.5 Architecture-specific boot images +6.5 Architecture-specific boot images +------------------------------------- An arch Makefile specifies goals that take the vmlinux file, compress it, wrap it in bootstrapping code, and copy the resulting files @@ -970,7 +1081,8 @@ When kbuild executes, the following steps are followed (roughly): arch/$(ARCH)/Makefile, and use the full path when calling down into the arch/$(ARCH)/boot/Makefile. - Example: + Example:: + #arch/x86/Makefile boot := arch/x86/boot bzImage: vmlinux @@ -983,7 +1095,8 @@ When kbuild executes, the following steps are followed (roughly): but executing "make help" will list all relevant targets. To support this, $(archhelp) must be defined. - Example: + Example:: + #arch/x86/Makefile define archhelp echo '* bzImage - Image (arch/$(ARCH)/boot/bzImage)' @@ -997,25 +1110,30 @@ When kbuild executes, the following steps are followed (roughly): Add a new prerequisite to all: to select a default goal different from vmlinux. - Example: + Example:: + #arch/x86/Makefile all: bzImage When "make" is executed without arguments, bzImage will be built. ---- 6.6 Building non-kbuild targets +6.6 Building non-kbuild targets +------------------------------- extra-y - extra-y specifies additional targets created in the current - directory, in addition to any targets specified by obj-*. + directory, in addition to any targets specified by `obj-*`. 
Listing all targets in extra-y is required for two purposes: + 1) Enable kbuild to check changes in command lines + - When $(call if_changed,xxx) is used + 2) kbuild knows what files to delete during "make clean" - Example: + Example:: + #arch/x86/kernel/Makefile extra-y := head.o init_task.o @@ -1023,16 +1141,17 @@ When kbuild executes, the following steps are followed (roughly): shall be built, but shall not be linked as part of built-in.a. ---- 6.7 Commands useful for building a boot image +6.7 Commands useful for building a boot image +--------------------------------------------- - Kbuild provides a few macros that are useful when building a - boot image. + Kbuild provides a few macros that are useful when building a + boot image. if_changed - if_changed is the infrastructure used for the following commands. - Usage: + Usage:: + target: source(s) FORCE $(call if_changed,ld/objcopy/gzip/...) @@ -1050,12 +1169,16 @@ When kbuild executes, the following steps are followed (roughly): Note: It is a typical mistake to forget the FORCE prerequisite. Another common pitfall is that whitespace is sometimes significant; for instance, the below will fail (note the extra space - after the comma): + after the comma):: + target: source(s) FORCE - #WRONG!# $(call if_changed, ld/objcopy/gzip/...) - Note: if_changed should not be used more than once per target. + **WRONG!** $(call if_changed, ld/objcopy/gzip/...) + + Note: + if_changed should not be used more than once per target. It stores the executed command in a corresponding .cmd + file and multiple calls would result in overwrites and unwanted results when the target is up to date and only the tests on changed commands trigger execution of commands. @@ -1063,7 +1186,8 @@ When kbuild executes, the following steps are followed (roughly): ld Link target. Often, LDFLAGS_$@ is used to set specific options to ld. 
- Example: + Example:: + #arch/x86/boot/Makefile LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext @@ -1077,12 +1201,15 @@ When kbuild executes, the following steps are followed (roughly): LDFLAGS_$@ syntax - one for each potential target. $(targets) are assigned all potential targets, by which kbuild knows the targets and will: + 1) check for commandline changes 2) delete target during make clean The ": %: %.o" part of the prerequisite is a shorthand that frees us from listing the setup.o and bootsect.o files. - Note: It is a common mistake to forget the "targets :=" assignment, + + Note: + It is a common mistake to forget the "targets :=" assignment, resulting in the target file being recompiled for no obvious reason. @@ -1094,7 +1221,8 @@ When kbuild executes, the following steps are followed (roughly): gzip Compress target. Use maximum compression to compress target. - Example: + Example:: + #arch/x86/boot/compressed/Makefile $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) @@ -1105,26 +1233,30 @@ When kbuild executes, the following steps are followed (roughly): in an init section in the image. Platform code *must* copy the blob to non-init memory prior to calling unflatten_device_tree(). - To use this command, simply add *.dtb into obj-y or targets, or make - some other target depend on %.dtb + To use this command, simply add `*.dtb` into obj-y or targets, or make + some other target depend on `%.dtb` - A central rule exists to create $(obj)/%.dtb from $(src)/%.dts; + A central rule exists to create `$(obj)/%.dtb` from `$(src)/%.dts`; architecture Makefiles do not need to explicitly write out that rule. - Example: + Example:: + targets += $(dtb-y) DTC_FLAGS ?= -p 1024 ---- 6.8 Custom kbuild commands +6.8 Custom kbuild commands +-------------------------- When kbuild is executing with KBUILD_VERBOSE=0, then only a shorthand of a command is normally displayed.
To enable this behaviour for custom commands kbuild requires - two variables to be set: - quiet_cmd_<command> - what shall be echoed - cmd_<command> - the command to execute + two variables to be set:: + + quiet_cmd_<command> - what shall be echoed + cmd_<command> - the command to execute + + Example:: - Example: # quiet_cmd_image = BUILD $@ cmd_image = $(obj)/tools/build $(BUILDFLAGS) \ @@ -1135,9 +1267,9 @@ When kbuild executes, the following steps are followed (roughly): $(call if_changed,image) @echo 'Kernel: $@ is ready' - When updating the $(obj)/bzImage target, the line + When updating the $(obj)/bzImage target, the line: - BUILD arch/x86/boot/bzImage + BUILD arch/x86/boot/bzImage will be displayed with "make KBUILD_VERBOSE=0". @@ -1148,9 +1280,10 @@ When kbuild executes, the following steps are followed (roughly): arch/$(ARCH)/kernel/vmlinux.lds is used. The script is a preprocessed variant of the file vmlinux.lds.S located in the same directory. - kbuild knows .lds files and includes a rule *lds.S -> *lds. + kbuild knows .lds files and includes a rule `*lds.S` -> `*lds`. + + Example:: - Example: #arch/x86/kernel/Makefile always := vmlinux.lds @@ -1162,17 +1295,19 @@ When kbuild executes, the following steps are followed (roughly): The assignment to $(CPPFLAGS_vmlinux.lds) tells kbuild to use the specified options when building the target vmlinux.lds. - When building the *.lds target, kbuild uses the variables: - KBUILD_CPPFLAGS : Set in top-level Makefile - cppflags-y : May be set in the kbuild makefile - CPPFLAGS_$(@F) : Target-specific flags. - Note that the full filename is used in this - assignment. + When building the `*.lds` target, kbuild uses the variables:: + + KBUILD_CPPFLAGS : Set in top-level Makefile + cppflags-y : May be set in the kbuild makefile + CPPFLAGS_$(@F) : Target-specific flags. + Note that the full filename is used in this + assignment. 
- The kbuild infrastructure for *lds files is used in several + The kbuild infrastructure for `*lds` files is used in several architecture-specific files. ---- 6.10 Generic header files +6.10 Generic header files +------------------------- The directory include/asm-generic contains the header files that may be shared between individual architectures. @@ -1180,7 +1315,8 @@ When kbuild executes, the following steps are followed (roughly): to list the file in the Kbuild file. See "7.2 generic-y" for further info on syntax etc. ---- 6.11 Post-link pass +6.11 Post-link pass +------------------- If the file arch/xxx/Makefile.postlink exists, this makefile will be invoked for post-link objects (vmlinux and modules.ko) @@ -1195,15 +1331,17 @@ When kbuild executes, the following steps are followed (roughly): For example, powerpc uses this to check relocation sanity of the linked vmlinux file. -=== 7 Kbuild syntax for exported headers +7 Kbuild syntax for exported headers +==================================== The kernel includes a set of headers that is exported to userspace. Many headers can be exported as-is but other headers require a minimal pre-processing before they are ready for user-space. The pre-processing does: + - drop kernel-specific annotations - drop include of compiler.h -- drop all sections that are kernel internal (guarded by ifdef __KERNEL__) +- drop all sections that are kernel internal (guarded by `ifdef __KERNEL__`) All headers under include/uapi/, include/generated/uapi/, arch/<arch>/include/uapi/ and arch/<arch>/include/generated/uapi/ @@ -1213,40 +1351,45 @@ A Kbuild file may be defined under arch/<arch>/include/uapi/asm/ and arch/<arch>/include/asm/ to list asm files coming from asm-generic. See subsequent chapter for the syntax of the Kbuild file. ---- 7.1 no-export-headers +7.1 no-export-headers +--------------------- no-export-headers is essentially used by include/uapi/linux/Kbuild to avoid exporting specific headers (e.g.
kvm.h) on architectures that do not support it. It should be avoided as much as possible. ---- 7.2 generic-y +7.2 generic-y +------------- If an architecture uses a verbatim copy of a header from include/asm-generic then this is listed in the file arch/$(ARCH)/include/asm/Kbuild like this: - Example: + Example:: + #arch/x86/include/asm/Kbuild generic-y += termios.h generic-y += rtc.h During the prepare phase of the build a wrapper include - file is generated in the directory: + file is generated in the directory:: arch/$(ARCH)/include/generated/asm When a header is exported where the architecture uses the generic header a similar wrapper is generated as part - of the set of exported headers in the directory: + of the set of exported headers in the directory:: usr/include/asm The generated wrapper will in both cases look like the following: - Example: termios.h + Example: termios.h:: + #include <asm-generic/termios.h> ---- 7.3 generated-y +7.3 generated-y +--------------- If an architecture generates other header files alongside generic-y wrappers, generated-y specifies them. @@ -1254,11 +1397,13 @@ See subsequent chapter for the syntax of the Kbuild file. This prevents them being treated as stale asm-generic wrappers and removed. - Example: + Example:: + #arch/x86/include/asm/Kbuild generated-y += syscalls_32.h ---- 7.4 mandatory-y +7.4 mandatory-y +--------------- mandatory-y is essentially used by include/(uapi/)asm-generic/Kbuild to define the minimum set of ASM headers that all architectures must have. @@ -1270,12 +1415,12 @@ See subsequent chapter for the syntax of the Kbuild file. The convention is to list one subdir per line and preferably in alphabetic order. -=== 8 Kbuild Variables +8 Kbuild Variables +================== The top Makefile exports the following variables: VERSION, PATCHLEVEL, SUBLEVEL, EXTRAVERSION - These variables define the current kernel version. 
A few arch Makefiles actually use these values directly; they should use $(KERNELRELEASE) instead. @@ -1289,32 +1434,28 @@ The top Makefile exports the following variables: such as "-pre4", and is often blank. KERNELRELEASE - $(KERNELRELEASE) is a single string such as "2.4.0-pre4", suitable for constructing installation directory names or showing in version strings. Some arch Makefiles use it for this purpose. ARCH - This variable defines the target architecture, such as "i386", "arm", or "sparc". Some kbuild Makefiles test $(ARCH) to determine which files to compile. By default, the top Makefile sets $(ARCH) to be the same as the host system architecture. For a cross build, a user may - override the value of $(ARCH) on the command line: + override the value of $(ARCH) on the command line:: make ARCH=m68k ... INSTALL_PATH - This variable defines a place for the arch Makefiles to install the resident kernel image and System.map file. Use this for architecture-specific install targets. INSTALL_MOD_PATH, MODLIB - $(INSTALL_MOD_PATH) specifies a prefix to $(MODLIB) for module installation. This variable is not defined in the Makefile but may be passed in by the user if desired. @@ -1325,7 +1466,6 @@ The top Makefile exports the following variables: override this value on the command line if desired. INSTALL_MOD_STRIP - If this variable is specified, it will cause modules to be stripped after they are installed. If INSTALL_MOD_STRIP is '1', then the default option --strip-debug will be used. Otherwise, the @@ -1333,7 +1473,8 @@ The top Makefile exports the following variables: command. -=== 9 Makefile language +9 Makefile language +=================== The kernel Makefiles are designed to be run with GNU Make. The Makefiles use only the documented features of GNU Make, but they do use many @@ -1352,18 +1493,17 @@ time the left-hand side is used. There are some cases where "=" is appropriate. Usually, though, ":=" is the right choice. 
-=== 10 Credits +10 Credits +========== -Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> -Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> -Updates by Sam Ravnborg <sam@ravnborg.org> -Language QA by Jan Engelhardt <jengelh@gmx.de> +- Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> +- Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> +- Updates by Sam Ravnborg <sam@ravnborg.org> +- Language QA by Jan Engelhardt <jengelh@gmx.de> -=== 11 TODO +11 TODO +======= - Describe how kbuild supports shipped files with _shipped. - Generating offset header files. - Add more variables to section 7? - - - diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.rst index 80295c613e37..24e763482650 100644 --- a/Documentation/kbuild/modules.txt +++ b/Documentation/kbuild/modules.rst @@ -1,8 +1,10 @@ +========================= Building External Modules +========================= This document describes how to build an out-of-tree kernel module. -=== Table of Contents +.. Table of Contents === 1 Introduction === 2 How to Build External Modules @@ -31,7 +33,8 @@ This document describes how to build an out-of-tree kernel module. -=== 1. Introduction +1. Introduction +=============== "kbuild" is the build system used by the Linux kernel. Modules must use kbuild to stay compatible with changes in the build infrastructure and @@ -48,7 +51,8 @@ easily accomplished, and a complete example will be presented in section 3. -=== 2. How to Build External Modules +2. How to Build External Modules +================================ To build external modules, you must have a prebuilt kernel available that contains the configuration and header files used in the build. @@ -65,25 +69,27 @@ NOTE: "modules_prepare" will not build Module.symvers even if CONFIG_MODVERSIONS is set; therefore, a full kernel build needs to be executed to make module versioning work. 
---- 2.1 Command Syntax +2.1 Command Syntax +------------------ - The command to build an external module is: + The command to build an external module is:: $ make -C <path_to_kernel_src> M=$PWD The kbuild system knows that an external module is being built due to the "M=<dir>" option given in the command. - To build against the running kernel use: + To build against the running kernel use:: $ make -C /lib/modules/`uname -r`/build M=$PWD Then to install the module(s) just built, add the target - "modules_install" to the command: + "modules_install" to the command:: $ make -C /lib/modules/`uname -r`/build M=$PWD modules_install ---- 2.2 Options +2.2 Options +----------- ($KDIR refers to the path of the kernel source directory.) @@ -100,7 +106,8 @@ executed to make module versioning work. directory where the external module (kbuild file) is located. ---- 2.3 Targets +2.3 Targets +----------- When building an external module, only a subset of the "make" targets are available. @@ -130,26 +137,29 @@ executed to make module versioning work. help List the available targets for external modules. ---- 2.4 Building Separate Files +2.4 Building Separate Files +--------------------------- It is possible to build single files that are part of a module. This works equally well for the kernel, a module, and even for external modules. - Example (The module foo.ko, consist of bar.o and baz.o): + Example (The module foo.ko, consist of bar.o and baz.o):: + make -C $KDIR M=$PWD bar.lst make -C $KDIR M=$PWD baz.o make -C $KDIR M=$PWD foo.ko make -C $KDIR M=$PWD ./ -=== 3. Creating a Kbuild File for an External Module +3. Creating a Kbuild File for an External Module +================================================ In the last section we saw the command to build a module for the running kernel. The module is not actually built, however, because a build file is required. Contained in this file will be the name of the module(s) being built, along with the list of requisite source -files.
The file may be as simple as a single line: +files. The file may be as simple as a single line:: obj-m := <module_name>.o @@ -157,15 +167,15 @@ The kbuild system will build <module_name>.o from <module_name>.c, and, after linking, will result in the kernel module <module_name>.ko. The above line can be put in either a "Kbuild" file or a "Makefile." When the module is built from multiple sources, an additional line is -needed listing the files: +needed listing the files:: <module_name>-y := <src1>.o <src2>.o ... NOTE: Further documentation describing the syntax used by kbuild is -located in Documentation/kbuild/makefiles.txt. +located in Documentation/kbuild/makefiles.rst. The examples below demonstrate how to create a build file for the -module 8123.ko, which is built from the following files: +module 8123.ko, which is built from the following files:: 8123_if.c 8123_if.h @@ -181,7 +191,8 @@ module 8123.ko, which is built from the following files: but should be filtered out from kbuild due to possible name clashes. - Example 1: + Example 1:: + --> filename: Makefile ifneq ($(KERNELRELEASE),) # kbuild part of makefile @@ -209,14 +220,16 @@ module 8123.ko, which is built from the following files: line; the second pass is by the kbuild system, which is initiated by the parameterized "make" in the default target. ---- 3.2 Separate Kbuild File and Makefile +3.2 Separate Kbuild File and Makefile +------------------------------------- In newer versions of the kernel, kbuild will first look for a file named "Kbuild," and only if that is not found, will it then look for a makefile. Utilizing a "Kbuild" file allows us to split up the makefile from example 1 into two files: - Example 2: + Example 2:: + --> filename: Kbuild obj-m := 8123.o 8123-y := 8123_if.o 8123_pci.o 8123_bin.o @@ -238,7 +251,8 @@ module 8123.ko, which is built from the following files: The next example shows a backward compatible version. 
- Example 3: + Example 3:: + --> filename: Kbuild obj-m := 8123.o 8123-y := 8123_if.o 8123_pci.o 8123_bin.o @@ -266,7 +280,8 @@ module 8123.ko, which is built from the following files: makefiles, to be used when the "make" and kbuild parts are split into separate files. ---- 3.3 Binary Blobs +3.3 Binary Blobs +---------------- Some external modules need to include an object file as a blob. kbuild has support for this, but requires the blob file to be @@ -277,7 +292,7 @@ module 8123.ko, which is built from the following files: Throughout this section, 8123_bin.o_shipped has been used to build the kernel module 8123.ko; it has been included as - 8123_bin.o. + 8123_bin.o:: 8123-y := 8123_if.o 8123_pci.o 8123_bin.o @@ -285,11 +300,12 @@ module 8123.ko, which is built from the following files: files and the binary file, kbuild will pick up different rules when creating the object file for the module. ---- 3.4 Building Multiple Modules +3.4 Building Multiple Modules +----------------------------- kbuild supports building multiple modules with a single build file. For example, if you wanted to build two modules, foo.ko - and bar.ko, the kbuild lines would be: + and bar.ko, the kbuild lines would be:: obj-m := foo.o bar.o foo-y := <foo_srcs> @@ -298,7 +314,8 @@ module 8123.ko, which is built from the following files: It is that simple! -=== 4. Include Files +4. Include Files +================ Within the kernel, header files are kept in standard locations according to the following rule: @@ -310,22 +327,25 @@ according to the following rule: of the kernel that are located in different directories, then the file is placed in include/linux/. - NOTE: There are two notable exceptions to this rule: larger - subsystems have their own directory under include/, such as - include/scsi; and architecture specific headers are located - under arch/$(ARCH)/include/.
+ NOTE: + There are two notable exceptions to this rule: larger + subsystems have their own directory under include/, such as + include/scsi; and architecture specific headers are located + under arch/$(ARCH)/include/. ---- 4.1 Kernel Includes +4.1 Kernel Includes +------------------- To include a header file located under include/linux/, simply - use: + use:: #include <linux/module.h> kbuild will add options to "gcc" so the relevant directories are searched. ---- 4.2 Single Subdirectory +4.2 Single Subdirectory +----------------------- External modules tend to place header files in a separate include/ directory where their source is located, although this @@ -334,7 +354,7 @@ according to the following rule: Using the example from section 3, if we moved 8123_if.h to a subdirectory named include, the resulting kbuild file would - look like: + look like:: --> filename: Kbuild obj-m := 8123.o @@ -346,23 +366,24 @@ according to the following rule: the path. This is a limitation of kbuild: there must be no space present. ---- 4.3 Several Subdirectories +4.3 Several Subdirectories +-------------------------- kbuild can handle files that are spread over several directories. - Consider the following example: - - . - |__ src - | |__ complex_main.c - | |__ hal - | |__ hardwareif.c - | |__ include - | |__ hardwareif.h - |__ include - |__ complex.h + Consider the following example:: + + . + |__ src + | |__ complex_main.c + | |__ hal + | |__ hardwareif.c + | |__ include + | |__ hardwareif.h + |__ include + |__ complex.h To build the module complex.ko, we then need the following - kbuild file: + kbuild file:: --> filename: Kbuild obj-m := complex.o @@ -385,7 +406,8 @@ according to the following rule: file is located. -=== 5. Module Installation +5. 
Module Installation +====================== Modules which are included in the kernel are installed in the directory: @@ -396,11 +418,12 @@ And external modules are installed in: /lib/modules/$(KERNELRELEASE)/extra/ ---- 5.1 INSTALL_MOD_PATH +5.1 INSTALL_MOD_PATH +-------------------- Above are the default directories but as always some level of customization is possible. A prefix can be added to the - installation path using the variable INSTALL_MOD_PATH: + installation path using the variable INSTALL_MOD_PATH:: $ make INSTALL_MOD_PATH=/frodo modules_install => Install dir: /frodo/lib/modules/$(KERNELRELEASE)/kernel/ @@ -410,20 +433,22 @@ And external modules are installed in: calling "make." This has effect when installing both in-tree and out-of-tree modules. ---- 5.2 INSTALL_MOD_DIR +5.2 INSTALL_MOD_DIR +------------------- External modules are by default installed to a directory under /lib/modules/$(KERNELRELEASE)/extra/, but you may wish to locate modules for a specific functionality in a separate directory. For this purpose, use INSTALL_MOD_DIR to specify an - alternative name to "extra." + alternative name to "extra.":: $ make INSTALL_MOD_DIR=gandalf -C $KDIR \ M=$PWD modules_install => Install dir: /lib/modules/$(KERNELRELEASE)/gandalf/ -=== 6. Module Versioning +6. Module Versioning +==================== Module versioning is enabled by the CONFIG_MODVERSIONS tag, and is used as a simple ABI consistency check. A CRC value of the full prototype @@ -435,14 +460,16 @@ module. Module.symvers contains a list of all exported symbols from a kernel build. ---- 6.1 Symbols From the Kernel (vmlinux + modules) +6.1 Symbols From the Kernel (vmlinux + modules) +----------------------------------------------- During a kernel build, a file named Module.symvers will be generated. Module.symvers contains all exported symbols from the kernel and compiled modules. For each symbol, the corresponding CRC value is also stored. 
- The syntax of the Module.symvers file is: + The syntax of the Module.symvers file is:: + <CRC> <Symbol> <module> 0x2d036834 scsi_remove_host drivers/scsi/scsi_mod @@ -451,10 +478,12 @@ build. would read 0x00000000. Module.symvers serves two purposes: + 1) It lists all exported symbols from vmlinux and all modules. 2) It lists the CRC if CONFIG_MODVERSIONS is enabled. ---- 6.2 Symbols and External Modules +6.2 Symbols and External Modules +-------------------------------- When building an external module, the build system needs access to the symbols from the kernel to check if all external symbols @@ -481,17 +510,17 @@ build. foo.ko needs symbols from bar.ko, you can use a common top-level kbuild file so both modules are compiled in the same build. Consider the following - directory layout: + directory layout:: - ./foo/ <= contains foo.ko - ./bar/ <= contains bar.ko + ./foo/ <= contains foo.ko + ./bar/ <= contains bar.ko - The top-level kbuild file would then look like: + The top-level kbuild file would then look like:: - #./Kbuild (or ./Makefile): - obj-y := foo/ bar/ + #./Kbuild (or ./Makefile): + obj-y := foo/ bar/ - And executing + And executing:: $ make -C $KDIR M=$PWD @@ -518,14 +547,16 @@ build. initialization of its symbol tables. -=== 7. Tips & Tricks +7. Tips & Tricks +================ ---- 7.1 Testing for CONFIG_FOO_BAR +7.1 Testing for CONFIG_FOO_BAR +------------------------------ - Modules often need to check for certain CONFIG_ options to + Modules often need to check for certain `CONFIG_` options to decide if a specific feature is included in the module. In - kbuild this is done by referencing the CONFIG_ variable - directly. + kbuild this is done by referencing the `CONFIG_` variable + directly:: #fs/ext2/Makefile obj-$(CONFIG_EXT2_FS) += ext2.o @@ -534,8 +565,7 @@ build. ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o External modules have traditionally used "grep" to check for - specific CONFIG_ settings directly in .config. 
This usage is + specific `CONFIG_` settings directly in .config. This usage is broken. As introduced before, external modules should use kbuild for building and can therefore use the same methods as - in-tree modules when testing for CONFIG_ definitions. - + in-tree modules when testing for `CONFIG_` definitions. diff --git a/Documentation/kdump/index.rst b/Documentation/kdump/index.rst new file mode 100644 index 000000000000..2b17fcf6867a --- /dev/null +++ b/Documentation/kdump/index.rst @@ -0,0 +1,21 @@ +:orphan: + +================================================================ +Documentation for Kdump - The kexec-based Crash Dumping Solution +================================================================ + +This document includes overview, setup and installation, and analysis +information. + +.. toctree:: + :maxdepth: 1 + + kdump + vmcoreinfo + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.rst index 3162eeb8c262..ac7e131d2935 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.rst @@ -71,9 +71,8 @@ This is a symlink to the latest version. 
The latest kexec-tools git tree is available at: -git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git -and -http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git +- git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git +- http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git There is also a gitweb interface available at http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git @@ -81,25 +80,25 @@ http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git More information about kexec-tools can be found at http://horms.net/projects/kexec/ -3) Unpack the tarball with the tar command, as follows: +3) Unpack the tarball with the tar command, as follows:: - tar xvpzf kexec-tools.tar.gz + tar xvpzf kexec-tools.tar.gz -4) Change to the kexec-tools directory, as follows: +4) Change to the kexec-tools directory, as follows:: - cd kexec-tools-VERSION + cd kexec-tools-VERSION -5) Configure the package, as follows: +5) Configure the package, as follows:: - ./configure + ./configure -6) Compile the package, as follows: +6) Compile the package, as follows:: - make + make -7) Install the package, as follows: +7) Install the package, as follows:: - make install + make install Build the system and dump-capture kernels @@ -126,25 +125,25 @@ dump-capture kernels for enabling kdump support. System kernel config options ---------------------------- -1) Enable "kexec system call" in "Processor type and features." +1) Enable "kexec system call" in "Processor type and features.":: - CONFIG_KEXEC=y + CONFIG_KEXEC=y 2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo - filesystems." This is usually enabled by default. + filesystems." This is usually enabled by default:: - CONFIG_SYSFS=y + CONFIG_SYSFS=y Note that "sysfs file system support" might not appear in the "Pseudo filesystems" menu if "Configure standard kernel features (for small systems)" is not enabled in "General Setup." 
In this case, check the - .config file itself to ensure that sysfs is turned on, as follows: + .config file itself to ensure that sysfs is turned on, as follows:: - grep 'CONFIG_SYSFS' .config + grep 'CONFIG_SYSFS' .config -3) Enable "Compile the kernel with debug info" in "Kernel hacking." +3) Enable "Compile the kernel with debug info" in "Kernel hacking.":: - CONFIG_DEBUG_INFO=Y + CONFIG_DEBUG_INFO=Y This causes the kernel to be built with debug symbols. The dump analysis tools require a vmlinux with debug symbols in order to read @@ -154,29 +153,32 @@ Dump-capture kernel config options (Arch Independent) ----------------------------------------------------- 1) Enable "kernel crash dumps" support under "Processor type and - features": + features":: - CONFIG_CRASH_DUMP=y + CONFIG_CRASH_DUMP=y -2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems". +2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems":: + + CONFIG_PROC_VMCORE=y - CONFIG_PROC_VMCORE=y (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.) 
Dump-capture kernel config options (Arch Dependent, i386 and x86_64) -------------------------------------------------------------------- 1) On i386, enable high memory support under "Processor type and - features": + features":: + + CONFIG_HIGHMEM64G=y + + or:: - CONFIG_HIGHMEM64G=y - or - CONFIG_HIGHMEM4G + CONFIG_HIGHMEM4G 2) On i386 and x86_64, disable symmetric multi-processing support - under "Processor type and features": + under "Processor type and features":: - CONFIG_SMP=n + CONFIG_SMP=n (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line when loading the dump-capture kernel, see section "Load the Dump-capture @@ -184,9 +186,9 @@ Dump-capture kernel config options (Arch Dependent, i386 and x86_64) 3) If one wants to build and use a relocatable kernel, Enable "Build a relocatable kernel" support under "Processor type and - features" + features":: - CONFIG_RELOCATABLE=y + CONFIG_RELOCATABLE=y 4) Use a suitable value for "Physical address where the kernel is loaded" (under "Processor type and features"). This only appears when @@ -211,13 +213,13 @@ Dump-capture kernel config options (Arch Dependent, i386 and x86_64) Dump-capture kernel config options (Arch Dependent, ppc64) ---------------------------------------------------------- -1) Enable "Build a kdump crash kernel" support under "Kernel" options: +1) Enable "Build a kdump crash kernel" support under "Kernel" options:: - CONFIG_CRASH_DUMP=y + CONFIG_CRASH_DUMP=y -2) Enable "Build a relocatable kernel" support +2) Enable "Build a relocatable kernel" support:: - CONFIG_RELOCATABLE=y + CONFIG_RELOCATABLE=y Make and install the kernel and its modules. @@ -231,11 +233,13 @@ Dump-capture kernel config options (Arch Dependent, ia64) The crashkernel region can be automatically placed by the system kernel at run time. This is done by specifying the base address as 0, - or omitting it all together. 
+ or omitting it all together:: - crashkernel=256M@0 - or - crashkernel=256M + crashkernel=256M@0 + + or:: + + crashkernel=256M If the start address is specified, note that the start address of the kernel will be aligned to 64Mb, so if the start address is not then @@ -245,9 +249,9 @@ Dump-capture kernel config options (Arch Dependent, arm) ---------------------------------------------------------- - To use a relocatable kernel, - Enable "AUTO_ZRELADDR" support under "Boot" options: + Enable "AUTO_ZRELADDR" support under "Boot" options:: - AUTO_ZRELADDR=y + AUTO_ZRELADDR=y Dump-capture kernel config options (Arch Dependent, arm64) ---------------------------------------------------------- @@ -265,12 +269,12 @@ on the value of System RAM -- that's mostly for distributors that pre-setup the kernel command line to avoid a unbootable system after some memory has been removed from the machine. -The syntax is: +The syntax is:: crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset] range=start-[end] -For example: +For example:: crashkernel=512M-2G:64M,2G-:128M @@ -326,35 +330,46 @@ can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz of dump-capture kernel. Following is the summary. For i386 and x86_64: + - Use vmlinux if kernel is not relocatable. - Use bzImage/vmlinuz if kernel is relocatable. + For ppc64: + - Use vmlinux + For ia64: + - Use vmlinux or vmlinuz.gz + For s390x: + - Use image or bzImage + For arm: + - Use zImage + For arm64: + - Use vmlinux or Image If you are using an uncompressed vmlinux image then use following command -to load dump-capture kernel. +to load dump-capture kernel:: kexec -p <dump-capture-kernel-vmlinux-image> \ --initrd=<initrd-for-dump-capture-kernel> --args-linux \ --append="root=<root-dev> <arch-specific-options>" If you are using a compressed bzImage/vmlinuz, then use following command -to load dump-capture kernel. 
+to load dump-capture kernel:: kexec -p <dump-capture-kernel-bzImage> \ --initrd=<initrd-for-dump-capture-kernel> \ --append="root=<root-dev> <arch-specific-options>" If you are using a compressed zImage, then use following command -to load dump-capture kernel. +to load dump-capture kernel:: kexec --type zImage -p <dump-capture-kernel-bzImage> \ --initrd=<initrd-for-dump-capture-kernel> \ @@ -362,7 +377,7 @@ to load dump-capture kernel. --append="root=<root-dev> <arch-specific-options>" If you are using an uncompressed Image, then use following command -to load dump-capture kernel. +to load dump-capture kernel:: kexec -p <dump-capture-kernel-Image> \ --initrd=<initrd-for-dump-capture-kernel> \ @@ -376,18 +391,23 @@ Following are the arch specific command line options to be used while loading dump-capture kernel. For i386, x86_64 and ia64: + "1 irqpoll maxcpus=1 reset_devices" For ppc64: + "1 maxcpus=1 noirqdistrib reset_devices" For s390x: + "1 maxcpus=1 cgroup_disable=memory" For arm: + "1 maxcpus=1 reset_devices" For arm64: + "1 maxcpus=1 reset_devices" Notes on loading the dump-capture kernel: @@ -464,7 +484,7 @@ Write Out the Dump File ======================= After the dump-capture kernel is booted, write out the dump file with -the following command: +the following command:: cp /proc/vmcore <dump-file> @@ -476,7 +496,7 @@ Before analyzing the dump image, you should reboot into a stable kernel. You can do limited analysis using GDB on the dump file copied out of /proc/vmcore. Use the debug vmlinux built with -g and run the following -command: +command:: gdb vmlinux <dump-file> @@ -504,6 +524,11 @@ to achieve the same behaviour. Contact ======= -Vivek Goyal (vgoyal@redhat.com) -Maneesh Soni (maneesh@in.ibm.com) +- Vivek Goyal (vgoyal@redhat.com) +- Maneesh Soni (maneesh@in.ibm.com) + +GDB macros +========== +.. 
include:: gdbmacros.txt + :literal: diff --git a/Documentation/kdump/vmcoreinfo.txt b/Documentation/kdump/vmcoreinfo.rst index bb94a4bd597a..007a6b86e0ee 100644 --- a/Documentation/kdump/vmcoreinfo.txt +++ b/Documentation/kdump/vmcoreinfo.rst @@ -1,8 +1,7 @@ -================================================================ - VMCOREINFO -================================================================ +========== +VMCOREINFO +========== -=========== What is it? =========== @@ -12,7 +11,6 @@ values, field offsets, etc. These data are packed into an ELF note section and used by user-space tools like crash and makedumpfile to analyze a kernel's memory layout. -================ Common variables ================ @@ -49,7 +47,7 @@ in a system, one bit position per node number. Used to keep track of which nodes are in the system and online. swapper_pg_dir -------------- +-------------- The global page directory pointer of the kernel. Used to translate virtual to physical addresses. @@ -132,16 +130,14 @@ nodemask_t The size of a nodemask_t type. Used to compute the number of online nodes. -(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor| - compound_order|compound_head) -------------------------------------------------------------------- +(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) +------------------------------------------------------------------------------------------------- User-space tools compute their values based on the offset of these variables. The variables are used when excluding unnecessary pages. 
-(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_ - spanned_pages|node_id) -------------------------------------------------------------------- +(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_spanned_pages|node_id) +----------------------------------------------------------------------------------------- On NUMA machines, each NUMA node has a pg_data_t to describe its memory layout. On UMA machines there is a single pglist_data which describes the @@ -245,21 +241,25 @@ NR_FREE_PAGES On linux-2.6.21 or later, the number of free pages is in vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. -PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision -|PG_head_mask|PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy) -|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) ------------------------------------------------------------------ +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask +------------------------------------------------------------------------------ Page attributes. These flags are used to filter various unnecessary for dumping pages. +PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) +----------------------------------------------------------------------------- + +More page attributes. These flags are used to filter various unnecessary for +dumping pages. + + HUGETLB_PAGE_DTOR ----------------- The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile excludes these pages. -====== x86_64 ====== @@ -318,12 +318,12 @@ address. Currently, sme_mask stores the value of the C-bit position. If needed, additional SME-relevant info can be placed in that variable. -For example: -[ misc ][ enc bit ][ other misc SME info ] -0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000 -63 59 55 51 47 43 39 35 31 27 ... 
3 +For example:: + + [ misc ][ enc bit ][ other misc SME info ] + 0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000 + 63 59 55 51 47 43 39 35 31 27 ... 3 -====== x86_32 ====== @@ -335,7 +335,6 @@ of a higher page table lookup overhead, and also consumes more page table space per process. Used to check whether PAE was enabled in the crash kernel when converting virtual addresses to physical addresses. -==== ia64 ==== @@ -366,7 +365,6 @@ PGTABLE_3|PGTABLE_4 User-space tools need to know whether the crash kernel was in 3-level or 4-level paging mode. Used to distinguish the page table. -===== ARM64 ===== @@ -395,9 +393,8 @@ KERNELOFFSET The kernel randomization offset. Used to compute the page offset. If KASLR is disabled, this value is zero. -==== arm -==== +=== ARM_LPAE -------- @@ -405,12 +402,11 @@ ARM_LPAE It indicates whether the crash kernel supports large physical address extensions. Used to translate virtual to physical addresses. -==== s390 ==== lowcore_ptr ----------- +----------- An array with a pointer to the lowcore of every CPU. Used to print the psw and all registers information. @@ -425,7 +421,6 @@ Used to get the vmalloc_start address from the high_memory symbol. The maximum number of CPUs. -======= powerpc ======= @@ -460,9 +455,8 @@ Page size definitions, i.e. 4k, 64k, or 16M. Used to make vtop translations. -vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)| -(vmemmap_backing, virt_addr) ----------------------------------------------------------------- +vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|(vmemmap_backing, virt_addr) +-------------------------------------------------------------------------------------------- The vmemmap virtual address space management does not have a traditional page table to track which virtual struct pages are backed by a physical @@ -480,7 +474,6 @@ member. Used in vtop translations. 
-== sh == diff --git a/Documentation/kernel-hacking/hacking.rst b/Documentation/kernel-hacking/hacking.rst index d824e4feaff3..5891a701a159 100644 --- a/Documentation/kernel-hacking/hacking.rst +++ b/Documentation/kernel-hacking/hacking.rst @@ -718,7 +718,7 @@ make a neat patch, there's administrative work to be done: - Usually you want a configuration option for your kernel hack. Edit ``Kconfig`` in the appropriate directory. The Config language is simple to use by cut and paste, and there's complete documentation in - ``Documentation/kbuild/kconfig-language.txt``. + ``Documentation/kbuild/kconfig-language.rst``. In your description of the option, make sure you address both the expert user and the user who knows nothing about your feature. @@ -728,7 +728,7 @@ make a neat patch, there's administrative work to be done: - Edit the ``Makefile``: the CONFIG variables are exported here so you can usually just add a "obj-$(CONFIG_xxx) += xxx.o" line. The syntax - is documented in ``Documentation/kbuild/makefiles.txt``. + is documented in ``Documentation/kbuild/makefiles.rst``. - Put yourself in ``CREDITS`` if you've done something noteworthy, usually beyond a single file (your name should be at the top of the diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst index 519673df0e82..dc698ea456e0 100644 --- a/Documentation/kernel-hacking/locking.rst +++ b/Documentation/kernel-hacking/locking.rst @@ -451,7 +451,7 @@ to protect the cache and all the objects within it. 
Here's the code:: if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) return -ENOMEM; - strlcpy(obj->name, name, sizeof(obj->name)); + strscpy(obj->name, name, sizeof(obj->name)); obj->id = id; obj->popularity = 0; @@ -660,7 +660,7 @@ Here is the code:: } @@ -63,6 +94,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); + strscpy(obj->name, name, sizeof(obj->name)); obj->id = id; obj->popularity = 0; + obj->refcnt = 1; /* The cache holds a reference */ @@ -774,7 +774,7 @@ the lock is no longer used to protect the reference count itself. } @@ -94,7 +76,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); + strscpy(obj->name, name, sizeof(obj->name)); obj->id = id; obj->popularity = 0; - obj->refcnt = 1; /* The cache holds a reference */ diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index 23b0c8b20cd1..5623b9916411 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -348,7 +348,7 @@ To reduce its OS jitter, do at least one of the following: 2. Boot with "nosoftlockup=0", which will also prevent these kthreads from being created. Other related watchdog and softlockup boot parameters may be found in Documentation/admin-guide/kernel-parameters.rst - and Documentation/watchdog/watchdog-parameters.txt. + and Documentation/watchdog/watchdog-parameters.rst. 3. Echo a zero to /proc/sys/kernel/watchdog to disable the watchdog timer. 4. Echo a large number of /proc/sys/kernel/watchdog_thresh in diff --git a/Documentation/laptops/lg-laptop.rst b/Documentation/laptops/lg-laptop.rst index aa503ee9b3bc..f2c2ffe31101 100644 --- a/Documentation/laptops/lg-laptop.rst +++ b/Documentation/laptops/lg-laptop.rst @@ -1,5 +1,7 @@ .. 
SPDX-License-Identifier: GPL-2.0+ +:orphan: + LG Gram laptop extra features ============================= diff --git a/Documentation/maintainer/index.rst b/Documentation/maintainer/index.rst index 2a14916930cb..56e2c09dfa39 100644 --- a/Documentation/maintainer/index.rst +++ b/Documentation/maintainer/index.rst @@ -10,5 +10,6 @@ additions to this manual. :maxdepth: 2 configure-git + rebasing-and-merging pull-requests diff --git a/Documentation/maintainer/rebasing-and-merging.rst b/Documentation/maintainer/rebasing-and-merging.rst new file mode 100644 index 000000000000..09f988e7fa71 --- /dev/null +++ b/Documentation/maintainer/rebasing-and-merging.rst @@ -0,0 +1,226 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Rebasing and merging +==================== + +Maintaining a subsystem, as a general rule, requires a familiarity with the +Git source-code management system. Git is a powerful tool with a lot of +features; as is often the case with such tools, there are right and wrong +ways to use those features. This document looks in particular at the use +of rebasing and merging. Maintainers often get in trouble when they use +those tools incorrectly, but avoiding problems is not actually all that +hard. + +One thing to be aware of in general is that, unlike many other projects, +the kernel community is not scared by seeing merge commits in its +development history. Indeed, given the scale of the project, avoiding +merges would be nearly impossible. Some problems encountered by +maintainers result from a desire to avoid merges, while others come from +merging a little too often. + +Rebasing +======== + +"Rebasing" is the process of changing the history of a series of commits +within a repository. 
There are two different types of operations that are +referred to as rebasing since both are done with the ``git rebase`` +command, but there are significant differences between them: + + - Changing the parent (starting) commit upon which a series of patches is + built. For example, a rebase operation could take a patch set built on + the previous kernel release and base it, instead, on the current + release. We'll call this operation "reparenting" in the discussion + below. + + - Changing the history of a set of patches by fixing (or deleting) broken + commits, adding patches, adding tags to commit changelogs, or changing + the order in which commits are applied. In the following text, this + type of operation will be referred to as "history modification" + +The term "rebasing" will be used to refer to both of the above operations. +Used properly, rebasing can yield a cleaner and clearer development +history; used improperly, it can obscure that history and introduce bugs. + +There are a few rules of thumb that can help developers to avoid the worst +perils of rebasing: + + - History that has been exposed to the world beyond your private system + should usually not be changed. Others may have pulled a copy of your + tree and built on it; modifying your tree will create pain for them. If + work is in need of rebasing, that is usually a sign that it is not yet + ready to be committed to a public repository. + + That said, there are always exceptions. Some trees (linux-next being + a significant example) are frequently rebased by their nature, and + developers know not to base work on them. Developers will sometimes + expose an unstable branch for others to test with or for automated + testing services. If you do expose a branch that may be unstable in + this way, be sure that prospective users know not to base work on it. + + - Do not rebase a branch that contains history created by others. 
If you + have pulled changes from another developer's repository, you are now a + custodian of their history. You should not change it. With few + exceptions, for example, a broken commit in a tree like this should be + explicitly reverted rather than disappeared via history modification. + + - Do not reparent a tree without a good reason to do so. Just being on a + newer base or avoiding a merge with an upstream repository is not + generally a good reason. + + - If you must reparent a repository, do not pick some random kernel commit + as the new base. The kernel is often in a relatively unstable state + between release points; basing development on one of those points + increases the chances of running into surprising bugs. When a patch + series must move to a new base, pick a stable point (such as one of + the -rc releases) to move to. + + - Realize that reparenting a patch series (or making significant history + modifications) changes the environment in which it was developed and, + likely, invalidates much of the testing that was done. A reparented + patch series should, as a general rule, be treated like new code and + retested from the beginning. + +A frequent cause of merge-window trouble is when Linus is presented with a +patch series that has clearly been reparented, often to a random commit, +shortly before the pull request was sent. The chances of such a series +having been adequately tested are relatively low - as are the chances of +the pull request being acted upon. + +If, instead, rebasing is limited to private trees, commits are based on a +well-known starting point, and they are well tested, the potential for +trouble is low. + +Merging +======= + +Merging is a common operation in the kernel development process; the 5.1 +development cycle included 1,126 merge commits - nearly 9% of the total. 
+Kernel work is accumulated in over 100 different subsystem trees, each of +which may contain multiple topic branches; each branch is usually developed +independently of the others. So naturally, at least one merge will be +required before any given branch finds its way into an upstream repository. + +Many projects require that branches in pull requests be based on the +current trunk so that no merge commits appear in the history. The kernel +is not such a project; any rebasing of branches to avoid merges will, most +likely, lead to trouble. + +Subsystem maintainers find themselves having to do two types of merges: +from lower-level subsystem trees and from others, either sibling trees or +the mainline. The best practices to follow differ in those two situations. + +Merging from lower-level trees +------------------------------ + +Larger subsystems tend to have multiple levels of maintainers, with the +lower-level maintainers sending pull requests to the higher levels. Acting +on such a pull request will almost certainly generate a merge commit; that +is as it should be. In fact, subsystem maintainers may want to use +the --no-ff flag to force the addition of a merge commit in the rare cases +where one would not normally be created so that the reasons for the merge +can be recorded. The changelog for the merge should, for any kind of +merge, say *why* the merge is being done. For a lower-level tree, "why" is +usually a summary of the changes that will come with that pull. + +Maintainers at all levels should be using signed tags on their pull +requests, and upstream maintainers should verify the tags when pulling +branches. Failure to do so threatens the security of the development +process as a whole. + +As per the rules outlined above, once you have merged somebody else's +history into your tree, you cannot rebase that branch, even if you +otherwise would be able to. 
+ +Merging from sibling or upstream trees +-------------------------------------- + +While merges from downstream are common and unremarkable, merges from other +trees tend to be a red flag when it comes time to push a branch upstream. +Such merges need to be carefully thought about and well justified, or +there's a good chance that a subsequent pull request will be rejected. + +It is natural to want to merge the master branch into a repository; this +type of merge is often called a "back merge". Back merges can help to make +sure that there are no conflicts with parallel development and generally +gives a warm, fuzzy feeling of being up-to-date. But this temptation +should be avoided almost all of the time. + +Why is that? Back merges will muddy the development history of your own +branch. They will significantly increase your chances of encountering bugs +from elsewhere in the community and make it hard to ensure that the work +you are managing is stable and ready for upstream. Frequent merges can +also obscure problems with the development process in your tree; they can +hide interactions with other trees that should not be happening (often) in +a well-managed branch. + +That said, back merges are occasionally required; when that happens, be +sure to document *why* it was required in the commit message. As always, +merge to a well-known stable point, rather than to some random commit. +Even then, you should not back merge a tree above your immediate upstream +tree; if a higher-level back merge is really required, the upstream tree +should do it first. + +One of the most frequent causes of merge-related trouble is when a +maintainer merges with the upstream in order to resolve merge conflicts +before sending a pull request. Again, this temptation is easy enough to +understand, but it should absolutely be avoided. This is especially true +for the final pull request: Linus is adamant that he would much rather see +merge conflicts than unnecessary back merges. 
Seeing the conflicts lets +him know where potential problem areas are. He does a lot of merges (382 +in the 5.1 development cycle) and has gotten quite good at conflict +resolution - often better than the developers involved. + +So what should a maintainer do when there is a conflict between their +subsystem branch and the mainline? The most important step is to warn +Linus in the pull request that the conflict will happen; if nothing else, +that demonstrates an awareness of how your branch fits into the whole. For +especially difficult conflicts, create and push a *separate* branch to show +how you would resolve things. Mention that branch in your pull request, +but the pull request itself should be for the unmerged branch. + +Even in the absence of known conflicts, doing a test merge before sending a +pull request is a good idea. It may alert you to problems that you somehow +didn't see from linux-next and helps to understand exactly what you are +asking upstream to do. + +Another reason for doing merges of upstream or another subsystem tree is to +resolve dependencies. These dependency issues do happen at times, and +sometimes a cross-merge with another tree is the best way to resolve them; +as always, in such situations, the merge commit should explain why the +merge has been done. Take a moment to do it right; people will read those +changelogs. + +Often, though, dependency issues indicate that a change of approach is +needed. Merging another subsystem tree to resolve a dependency risks +bringing in other bugs and should almost never be done. If that subsystem +tree fails to be pulled upstream, whatever problems it had will block the +merging of your tree as well. Preferable alternatives include agreeing +with the maintainer to carry both sets of changes in one of the trees or +creating a topic branch dedicated to the prerequisite commits that can be +merged into both trees. 
If the dependency is related to major +infrastructural changes, the right solution might be to hold the dependent +commits for one development cycle so that those changes have time to +stabilize in the mainline. + +Finally +======= + +It is relatively common to merge with the mainline toward the beginning of +the development cycle in order to pick up changes and fixes done elsewhere +in the tree. As always, such a merge should pick a well-known release +point rather than some random spot. If your upstream-bound branch has +emptied entirely into the mainline during the merge window, you can pull it +forward with a command like:: + + git merge v5.2-rc1^0 + +The "^0" will cause Git to do a fast-forward merge (which should be +possible in this situation), thus avoiding the addition of a spurious merge +commit. + +The guidelines laid out above are just that: guidelines. There will always +be situations that call out for a different solution, and these guidelines +should not prevent developers from doing the right thing when the need +arises. But one should always think about whether the need has truly +arisen and be prepared to explain why something abnormal needs to be done. 
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index e4e07c8ab89e..045bb8148fe9 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -548,7 +548,7 @@ There are certain things that the Linux kernel memory barriers do not guarantee: [*] For information on bus mastering DMA and coherency please read: - Documentation/PCI/pci.txt + Documentation/PCI/pci.rst Documentation/DMA-API-HOWTO.txt Documentation/DMA-API.txt diff --git a/Documentation/mic/index.rst b/Documentation/mic/index.rst new file mode 100644 index 000000000000..082fa8f6a260 --- /dev/null +++ b/Documentation/mic/index.rst @@ -0,0 +1,18 @@ +:orphan: + +============================================= +Intel Many Integrated Core (MIC) architecture +============================================= + +.. toctree:: + :maxdepth: 1 + + mic_overview + scif_overview + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/mic/mic_overview.txt b/Documentation/mic/mic_overview.rst index 074adbdf83a4..17d956bdaf7c 100644 --- a/Documentation/mic/mic_overview.txt +++ b/Documentation/mic/mic_overview.rst @@ -1,3 +1,7 @@ +====================================================== +Intel Many Integrated Core (MIC) architecture overview +====================================================== + An Intel MIC X100 device is a PCIe form factor add-in coprocessor card based on the Intel Many Integrated Core (MIC) architecture that runs a Linux OS. It is a PCIe endpoint in a platform and therefore @@ -45,7 +49,7 @@ Here is a block diagram of the various components described above. The virtio backends are situated on the host rather than the card given better single threaded performance for the host compared to MIC, the ability of the host to initiate DMA's to/from the card using the MIC DMA engine and -the fact that the virtio block storage backend can only be on the host. 
+the fact that the virtio block storage backend can only be on the host:: +----------+ | +----------+ | Card OS | | | Host OS | diff --git a/Documentation/mic/scif_overview.txt b/Documentation/mic/scif_overview.rst index 0a280d986731..4c8ad9e43706 100644 --- a/Documentation/mic/scif_overview.txt +++ b/Documentation/mic/scif_overview.rst @@ -1,3 +1,7 @@ +======================================== +Symmetric Communication Interface (SCIF) +======================================== + The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a low level communications API across PCIe currently implemented for MIC. Currently SCIF provides inter-node communication within a single host platform, where a @@ -8,8 +12,11 @@ is to deliver the maximum possible performance given the communication abilities of the hardware. SCIF has been used to implement an offload compiler runtime and OFED support for MPI implementations for MIC coprocessors. -==== SCIF API Components ==== +SCIF API Components +=================== + The SCIF API has the following parts: + 1. Connection establishment using a client server model 2. Byte stream messaging intended for short messages 3. Node enumeration to determine online nodes @@ -28,9 +35,12 @@ can also register local memory which is followed by data transfer using either DMA, CPU copies or remote memory mapping via mmap. SCIF supports both user and kernel mode clients which are functionally equivalent. -==== SCIF Performance for MIC ==== +SCIF Performance for MIC +======================== + DMA bandwidth comparison between the TCP (over ethernet over PCIe) stack versus -SCIF shows the performance advantages of SCIF for HPC applications and runtimes. +SCIF shows the performance advantages of SCIF for HPC applications and +runtimes:: Comparison of TCP and SCIF based BW @@ -66,33 +76,33 @@ space API similar to the kernel API in scif.h. 
The SCIF user space library is distributed @ https://software.intel.com/en-us/mic-developer Here is some pseudo code for an example of how two applications on two PCIe -nodes would typically use the SCIF API: +nodes would typically use the SCIF API:: -Process A (on node A) Process B (on node B) + Process A (on node A) Process B (on node B) -/* get online node information */ -scif_get_node_ids(..) scif_get_node_ids(..) -scif_open(..) scif_open(..) -scif_bind(..) scif_bind(..) -scif_listen(..) -scif_accept(..) scif_connect(..) -/* SCIF connection established */ + /* get online node information */ + scif_get_node_ids(..) scif_get_node_ids(..) + scif_open(..) scif_open(..) + scif_bind(..) scif_bind(..) + scif_listen(..) + scif_accept(..) scif_connect(..) + /* SCIF connection established */ -/* Send and receive short messages */ -scif_send(..)/scif_recv(..) scif_send(..)/scif_recv(..) + /* Send and receive short messages */ + scif_send(..)/scif_recv(..) scif_send(..)/scif_recv(..) -/* Register memory */ -scif_register(..) scif_register(..) + /* Register memory */ + scif_register(..) scif_register(..) -/* RDMA */ -scif_readfrom(..)/scif_writeto(..) scif_readfrom(..)/scif_writeto(..) + /* RDMA */ + scif_readfrom(..)/scif_writeto(..) scif_readfrom(..)/scif_writeto(..) -/* Fence DMAs */ -scif_fence_signal(..) scif_fence_signal(..) + /* Fence DMAs */ + scif_fence_signal(..) scif_fence_signal(..) -mmap(..) mmap(..) + mmap(..) mmap(..) -/* Access remote registered memory */ + /* Access remote registered memory */ -/* Close the endpoints */ -scif_close(..) scif_close(..) + /* Close the endpoints */ + scif_close(..) scif_close(..) 
diff --git a/Documentation/netlabel/cipso_ipv4.txt b/Documentation/netlabel/cipso_ipv4.rst index a6075481fd60..cbd3f3231221 100644 --- a/Documentation/netlabel/cipso_ipv4.txt +++ b/Documentation/netlabel/cipso_ipv4.rst @@ -1,10 +1,13 @@ +=================================== NetLabel CIPSO/IPv4 Protocol Engine -============================================================================== +=================================== + Paul Moore, paul.moore@hp.com May 17, 2006 - * Overview +Overview +======== The NetLabel CIPSO/IPv4 protocol engine is based on the IETF Commercial IP Security Option (CIPSO) draft from July 16, 1992. A copy of this @@ -13,7 +16,8 @@ draft can be found in this directory it to an RFC standard it has become a de-facto standard for labeled networking and is used in many trusted operating systems. - * Outbound Packet Processing +Outbound Packet Processing +========================== The CIPSO/IPv4 protocol engine applies the CIPSO IP option to packets by adding the CIPSO label to the socket. This causes all packets leaving the @@ -24,7 +28,8 @@ label by using the NetLabel security module API; if the NetLabel "domain" is configured to use CIPSO for packet labeling then a CIPSO IP option will be generated and attached to the socket. - * Inbound Packet Processing +Inbound Packet Processing +========================= The CIPSO/IPv4 protocol engine validates every CIPSO IP option it finds at the IP layer without any special handling required by the LSM. However, in order @@ -33,7 +38,8 @@ NetLabel security module API to extract the security attributes of the packet. This is typically done at the socket layer using the 'socket_sock_rcv_skb()' LSM hook. 
- * Label Translation +Label Translation +================= The CIPSO/IPv4 protocol engine contains a mechanism to translate CIPSO security attributes such as sensitivity level and category to values which are @@ -42,7 +48,8 @@ Domain Of Interpretation (DOI) definition and are configured through the NetLabel user space communication layer. Each DOI definition can have a different security attribute mapping table. - * Label Translation Cache +Label Translation Cache +======================= The NetLabel system provides a framework for caching security attribute mappings from the network labels to the corresponding LSM identifiers. The diff --git a/Documentation/netlabel/draft_ietf.rst b/Documentation/netlabel/draft_ietf.rst new file mode 100644 index 000000000000..5ed39ab8234b --- /dev/null +++ b/Documentation/netlabel/draft_ietf.rst @@ -0,0 +1,5 @@ +Draft IETF CIPSO IP Security +---------------------------- + + .. include:: draft-ietf-cipso-ipsecurity-01.txt + :literal: diff --git a/Documentation/netlabel/index.rst b/Documentation/netlabel/index.rst new file mode 100644 index 000000000000..47f1e0e5acd1 --- /dev/null +++ b/Documentation/netlabel/index.rst @@ -0,0 +1,21 @@ +:orphan: + +======== +NetLabel +======== + +.. toctree:: + :maxdepth: 1 + + introduction + cipso_ipv4 + lsm_interface + + draft_ietf + +.. 
only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/netlabel/introduction.txt b/Documentation/netlabel/introduction.rst index 3caf77bcff0f..9333bbb0adc1 100644 --- a/Documentation/netlabel/introduction.txt +++ b/Documentation/netlabel/introduction.rst @@ -1,10 +1,13 @@ +===================== NetLabel Introduction -============================================================================== +===================== + Paul Moore, paul.moore@hp.com August 2, 2006 - * Overview +Overview +======== NetLabel is a mechanism which can be used by kernel security modules to attach security attributes to outgoing network packets generated from user space @@ -12,7 +15,8 @@ applications and read security attributes from incoming network packets. It is composed of three main components, the protocol engines, the communication layer, and the kernel security module API. - * Protocol Engines +Protocol Engines +================ The protocol engines are responsible for both applying and retrieving the network packet's security attributes. If any translation between the network @@ -24,7 +28,8 @@ the NetLabel kernel security module API described below. Detailed information about each NetLabel protocol engine can be found in this directory. - * Communication Layer +Communication Layer +=================== The communication layer exists to allow NetLabel configuration and monitoring from user space. The NetLabel communication layer uses a message based @@ -33,7 +38,8 @@ formatting of these NetLabel messages as well as the Generic NETLINK family names can be found in the 'net/netlabel/' directory as comments in the header files as well as in 'include/net/netlabel.h'. - * Security Module API +Security Module API +=================== The purpose of the NetLabel security module API is to provide a protocol independent interface to the underlying NetLabel protocol engines. 
In addition diff --git a/Documentation/netlabel/lsm_interface.txt b/Documentation/netlabel/lsm_interface.rst index 638c74f7de7f..026fc267f798 100644 --- a/Documentation/netlabel/lsm_interface.txt +++ b/Documentation/netlabel/lsm_interface.rst @@ -1,10 +1,13 @@ +======================================== NetLabel Linux Security Module Interface -============================================================================== +======================================== + Paul Moore, paul.moore@hp.com May 17, 2006 - * Overview +Overview +======== NetLabel is a mechanism which can set and retrieve security attributes from network packets. It is intended to be used by LSM developers who want to make @@ -12,7 +15,8 @@ use of a common code base for several different packet labeling protocols. The NetLabel security module API is defined in 'include/net/netlabel.h' but a brief overview is given below. - * NetLabel Security Attributes +NetLabel Security Attributes +============================ Since NetLabel supports multiple different packet labeling protocols and LSMs it uses the concept of security attributes to refer to the packet's security @@ -24,7 +28,8 @@ configuration. It is up to the LSM developer to translate the NetLabel security attributes into whatever security identifiers are in use for their particular LSM. - * NetLabel LSM Protocol Operations +NetLabel LSM Protocol Operations +================================ These are the functions which allow the LSM developer to manipulate the labels on outgoing packets as well as read the labels on incoming packets. Functions @@ -32,7 +37,8 @@ exist to operate both on sockets as well as the sk_buffs directly. These high level functions are translated into low level protocol operations based on how the administrator has configured the NetLabel subsystem. 
- * NetLabel Label Mapping Cache Operations +NetLabel Label Mapping Cache Operations +======================================= Depending on the exact configuration, translation between the network packet label and the internal LSM security identifier can be time consuming. The diff --git a/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst b/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst index 5045df990a4c..17dbee1ac53e 100644 --- a/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst +++ b/Documentation/networking/device_drivers/freescale/dpaa2/dpio-driver.rst @@ -39,8 +39,7 @@ The Linux DPIO driver consists of 3 primary components-- DPIO service-- provides APIs to other Linux drivers for services - QBman portal interface-- sends portal commands, gets responses -:: + QBman portal interface-- sends portal commands, gets responses:: fsl-mc other bus drivers @@ -60,6 +59,7 @@ The Linux DPIO driver consists of 3 primary components-- The diagram below shows how the DPIO driver components fit with the other DPAA2 Linux driver components:: + +------------+ | OS Network | | Stack | diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index ca87068b9ab9..563d56c6a25c 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -531,7 +531,7 @@ Bridge VLAN filtering a software implementation. .. note:: VLAN ID 0 corresponds to the port private database, which, in the context - of DSA, would be the its port-based VLAN, used by the associated bridge device. + of DSA, would be its port-based VLAN, used by the associated bridge device. - ``port_fdb_del``: bridge layer function invoked when the bridge wants to remove a Forwarding Database entry, the switch hardware should be programmed to delete @@ -554,7 +554,7 @@ Bridge VLAN filtering associated with this VLAN ID. .. 
note:: VLAN ID 0 corresponds to the port private database, which, in the context - of DSA, would be the its port-based VLAN, used by the associated bridge device. + of DSA, would be its port-based VLAN, used by the associated bridge device. - ``port_mdb_del``: bridge layer function invoked when the bridge wants to remove a multicast database entry, the switch hardware should be programmed to delete diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst index ea7bac438cfd..cb2858dece93 100644 --- a/Documentation/networking/dsa/sja1105.rst +++ b/Documentation/networking/dsa/sja1105.rst @@ -86,13 +86,13 @@ functionality. The following traffic modes are supported over the switch netdevices: +--------------------+------------+------------------+------------------+ -| | Standalone | Bridged with | Bridged with | -| | ports | vlan_filtering 0 | vlan_filtering 1 | +| | Standalone | Bridged with | Bridged with | +| | ports | vlan_filtering 0 | vlan_filtering 1 | +====================+============+==================+==================+ | Regular traffic | Yes | Yes | No (use master) | +--------------------+------------+------------------+------------------+ | Management traffic | Yes | Yes | Yes | -| (BPDU, PTP) | | | | +| (BPDU, PTP) | | | | +--------------------+------------+------------------+------------------+ Switching features diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index bbdaf8990031..8dd6333c3270 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -368,7 +368,7 @@ ts[1] used to hold hardware timestamps converted to system time. Instead, expose the hardware clock device on the NIC directly as a HW PTP clock source, to allow time conversion in userspace and optionally synchronize system time with a userspace PTP stack such -as linuxptp. For the PTP clock API, see Documentation/ptp/ptp.txt. +as linuxptp. 
For the PTP clock API, see Documentation/driver-api/ptp.rst. Note that if the SO_TIMESTAMP or SO_TIMESTAMPNS option is enabled together with SO_TIMESTAMPING using SOF_TIMESTAMPING_SOFTWARE, a false diff --git a/Documentation/nvdimm/nvdimm.txt b/Documentation/nvdimm/nvdimm.txt index e894de69915a..1669f626b037 100644 --- a/Documentation/nvdimm/nvdimm.txt +++ b/Documentation/nvdimm/nvdimm.txt @@ -284,8 +284,8 @@ A bus has a 1:1 relationship with an NFIT. The current expectation for ACPI based systems is that there is only ever one platform-global NFIT. That said, it is trivial to register multiple NFITs, the specification does not preclude it. The infrastructure supports multiple busses and -we we use this capability to test multiple NFIT configurations in the -unit test. +we use this capability to test multiple NFIT configurations in the unit +test. LIBNVDIMM: control class device in /sys/class diff --git a/Documentation/pcmcia/devicetable.txt b/Documentation/pcmcia/devicetable.rst index 5f3e00ab54c4..fd1d60d12ca1 100644 --- a/Documentation/pcmcia/devicetable.txt +++ b/Documentation/pcmcia/devicetable.rst @@ -1,3 +1,7 @@ +============ +Device table +============ + Matching of PCMCIA devices to drivers is done using one or more of the following criteria: diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.rst index 78355c4c268a..33fe9ebec049 100644 --- a/Documentation/pcmcia/driver-changes.txt +++ b/Documentation/pcmcia/driver-changes.rst @@ -1,15 +1,21 @@ +============== +Driver changes +============== + This file details changes in 2.6 which affect PCMCIA card driver authors: + * pcmcia_loop_config() and autoconfiguration (as of 2.6.36) - If struct pcmcia_device *p_dev->config_flags is set accordingly, + If `struct pcmcia_device *p_dev->config_flags` is set accordingly, pcmcia_loop_config() now sets up certain configuration values automatically, though the driver may still override the settings in the callback function. 
The following autoconfiguration options are provided at the moment: - CONF_AUTO_CHECK_VCC : check for matching Vcc - CONF_AUTO_SET_VPP : set Vpp - CONF_AUTO_AUDIO : auto-enable audio line, if required - CONF_AUTO_SET_IO : set ioport resources (->resource[0,1]) - CONF_AUTO_SET_IOMEM : set first iomem resource (->resource[2]) + + - CONF_AUTO_CHECK_VCC : check for matching Vcc + - CONF_AUTO_SET_VPP : set Vpp + - CONF_AUTO_AUDIO : auto-enable audio line, if required + - CONF_AUTO_SET_IO : set ioport resources (->resource[0,1]) + - CONF_AUTO_SET_IOMEM : set first iomem resource (->resource[2]) * pcmcia_request_configuration -> pcmcia_enable_device (as of 2.6.36) pcmcia_request_configuration() got renamed to pcmcia_enable_device(), @@ -19,14 +25,14 @@ This file details changes in 2.6 which affect PCMCIA card driver authors: * pcmcia_request_window changes (as of 2.6.36) Instead of win_req_t, drivers are now requested to fill out - struct pcmcia_device *p_dev->resource[2,3,4,5] for up to four ioport + `struct pcmcia_device *p_dev->resource[2,3,4,5]` for up to four ioport ranges. After a call to pcmcia_request_window(), the regions found there are reserved and may be used immediately -- until pcmcia_release_window() is called. * pcmcia_request_io changes (as of 2.6.36) Instead of io_req_t, drivers are now requested to fill out - struct pcmcia_device *p_dev->resource[0,1] for up to two ioport + `struct pcmcia_device *p_dev->resource[0,1]` for up to two ioport ranges. After a call to pcmcia_request_io(), the ports found there are reserved, after calling pcmcia_request_configuration(), they may be used. @@ -42,7 +48,8 @@ This file details changes in 2.6 which affect PCMCIA card driver authors: * New IRQ request rules (as of 2.6.35) Instead of the old pcmcia_request_irq() interface, drivers may now choose between: - - calling request_irq/free_irq directly. Use the IRQ from *p_dev->irq. + + - calling request_irq/free_irq directly. Use the IRQ from `*p_dev->irq`. 
- use pcmcia_request_irq(p_dev, handler_t); the PCMCIA core will clean up automatically on calls to pcmcia_disable_device() or device ejection. @@ -72,13 +79,16 @@ This file details changes in 2.6 which affect PCMCIA card driver authors: exports for them were removed. * Unify detach and REMOVAL event code, as well as attach and INSERTION - code (as of 2.6.16) + code (as of 2.6.16):: + void (*remove) (struct pcmcia_device *dev); int (*probe) (struct pcmcia_device *dev); -* Move suspend, resume and reset out of event handler (as of 2.6.16) +* Move suspend, resume and reset out of event handler (as of 2.6.16):: + int (*suspend) (struct pcmcia_device *dev); int (*resume) (struct pcmcia_device *dev); + should be initialized in struct pcmcia_driver, and handle (SUSPEND == RESET_PHYSICAL) and (RESUME == CARD_RESET) events @@ -117,7 +127,8 @@ This file details changes in 2.6 which affect PCMCIA card driver authors: * core functions no longer available (as of 2.6.11) The following functions have been removed from the kernel source because they are unused by all in-kernel drivers, and no external - driver was reported to rely on them: + driver was reported to rely on them:: + pcmcia_get_first_region() pcmcia_get_next_region() pcmcia_modify_window() diff --git a/Documentation/pcmcia/driver.txt b/Documentation/pcmcia/driver.rst index 0ac167920778..5c4fe84d51c1 100644 --- a/Documentation/pcmcia/driver.txt +++ b/Documentation/pcmcia/driver.rst @@ -1,16 +1,16 @@ +============= PCMCIA Driver -------------- - +============= sysfs ----- New PCMCIA IDs may be added to a device driver pcmcia_device_id table at -runtime as shown below: +runtime as shown below:: -echo "match_flags manf_id card_id func_id function device_no \ -prod_id_hash[0] prod_id_hash[1] prod_id_hash[2] prod_id_hash[3]" > \ -/sys/bus/pcmcia/drivers/{driver}/new_id + echo "match_flags manf_id card_id func_id function device_no \ + prod_id_hash[0] prod_id_hash[1] prod_id_hash[2] prod_id_hash[3]" > \ + 
/sys/bus/pcmcia/drivers/{driver}/new_id All fields are passed in as hexadecimal values (no leading 0x). The meaning is described in the PCMCIA specification, the match_flags is @@ -22,9 +22,9 @@ PCMCIA device listed in its (newly updated) pcmcia_device_id list. A common use-case is to add a new device according to the manufacturer ID and the card ID (form the manf_id and card_id file in the device tree). -For this, just use: +For this, just use:: -echo "0x3 manf_id card_id 0 0 0 0 0 0 0" > \ - /sys/bus/pcmcia/drivers/{driver}/new_id + echo "0x3 manf_id card_id 0 0 0 0 0 0 0" > \ + /sys/bus/pcmcia/drivers/{driver}/new_id after loading the driver. diff --git a/Documentation/pcmcia/index.rst b/Documentation/pcmcia/index.rst new file mode 100644 index 000000000000..779c8527109e --- /dev/null +++ b/Documentation/pcmcia/index.rst @@ -0,0 +1,20 @@ +:orphan: + +====== +pcmcia +====== + +.. toctree:: + :maxdepth: 1 + + driver + devicetable + locking + driver-changes + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/pcmcia/locking.txt b/Documentation/pcmcia/locking.rst index b2c9b478906b..e35257139c89 100644 --- a/Documentation/pcmcia/locking.txt +++ b/Documentation/pcmcia/locking.rst @@ -1,3 +1,7 @@ +======= +Locking +======= + This file explains the locking and exclusion scheme used in the PCCARD and PCMCIA subsystems. @@ -5,16 +9,21 @@ and PCMCIA subsystems. 
A) Overview, Locking Hierarchy: =============================== -pcmcia_socket_list_rwsem - protects only the list of sockets -- skt_mutex - serializes card insert / ejection - - ops_mutex - serializes socket operation +pcmcia_socket_list_rwsem + - protects only the list of sockets + +- skt_mutex + - serializes card insert / ejection + + - ops_mutex + - serializes socket operation B) Exclusion ============ The following functions and callbacks to struct pcmcia_socket must -be called with "skt_mutex" held: +be called with "skt_mutex" held:: socket_detect_change() send_event() @@ -31,7 +40,7 @@ be called with "skt_mutex" held: struct pcmcia_callback *callback The following functions and callbacks to struct pcmcia_socket must -be called with "ops_mutex" held: +be called with "ops_mutex" held:: socket_reset() socket_setup() @@ -39,7 +48,7 @@ be called with "ops_mutex" held: struct pccard_operations *ops struct pccard_resource_ops *resource_ops; -Note that send_event() and struct pcmcia_callback *callback must not be +Note that send_event() and `struct pcmcia_callback *callback` must not be called with "ops_mutex" held. @@ -60,19 +69,23 @@ The resource_ops and their data are protected by ops_mutex. 
The "main" struct pcmcia_socket is protected as follows (read-only fields or single-use fields not mentioned): -- by pcmcia_socket_list_rwsem: +- by pcmcia_socket_list_rwsem:: + struct list_head socket_list; -- by thread_lock: +- by thread_lock:: + unsigned int thread_events; -- by skt_mutex: +- by skt_mutex:: + u_int suspended_state; void (*tune_bridge); struct pcmcia_callback *callback; int resume_status; -- by ops_mutex: +- by ops_mutex:: + socket_state_t socket; u_int state; u_short lock_count; @@ -100,7 +113,8 @@ The "main" struct pcmcia_device is protected as follows (read-only fields or single-use fields not mentioned): -- by pcmcia_socket->ops_mutex: +- by pcmcia_socket->ops_mutex:: + struct list_head socket_device_list; struct config_t *function_config; u16 _irq:1; @@ -111,7 +125,8 @@ or single-use fields not mentioned): u16 suspended:1; u16 _removed:1; -- by the PCMCIA driver: +- by the PCMCIA driver:: + io_req_t io; irq_req_t irq; config_req_t conf; diff --git a/Documentation/platform/x86-laptop-drivers.txt b/Documentation/platform/x86-laptop-drivers.txt deleted file mode 100644 index 01facd2590bb..000000000000 --- a/Documentation/platform/x86-laptop-drivers.txt +++ /dev/null @@ -1,18 +0,0 @@ -compal-laptop -============= -List of supported hardware: - -by Compal: - Compal FL90/IFL90 - Compal FL91/IFL91 - Compal FL92/JFL92 - Compal FT00/IFT00 - -by Dell: - Dell Vostro 1200 - Dell Mini 9 (Inspiron 910) - Dell Mini 10 (Inspiron 1010) - Dell Mini 10v (Inspiron 1011) - Dell Mini 1012 (Inspiron 1012) - Dell Inspiron 11z (Inspiron 1110) - Dell Mini 12 (Inspiron 1210) diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 18c5feef2577..0c41d6d463f3 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -59,7 +59,7 @@ as follows: the default calculated size. 
Use this option if default boot memory size is not sufficient for second kernel to boot successfully. For syntax of crashkernel= parameter, - refer to Documentation/kdump/kdump.txt. If any offset is + refer to Documentation/kdump/kdump.rst. If any offset is provided in crashkernel= parameter, it will be ignored as fadump uses a predefined offset to reserve memory for boot memory dump preservation in case of a crash. diff --git a/Documentation/powerpc/isa-versions.rst b/Documentation/powerpc/isa-versions.rst index 812e20cc898c..66c24140ebf1 100644 --- a/Documentation/powerpc/isa-versions.rst +++ b/Documentation/powerpc/isa-versions.rst @@ -1,3 +1,5 @@ +:orphan: + CPU to ISA Version Mapping ========================== diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index 4b7a5ab3cec1..13dd893c9f88 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -298,7 +298,7 @@ enabled, a configurable percentage of memory allocations will be made to fail; these failures can be restricted to a specific range of code. Running with fault injection enabled allows the programmer to see how the code responds when things go badly. See -Documentation/fault-injection/fault-injection.txt for more information on +Documentation/fault-injection/fault-injection.rst for more information on how to use this facility. Other kinds of errors can be found with the "sparse" static analysis tool. diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index fa864a51e6ea..f4a2198187f9 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -686,7 +686,7 @@ filesystems) should advertise this prominently in their prompt string:: ... For full documentation on the configuration files, see the file -Documentation/kbuild/kconfig-language.txt. +Documentation/kbuild/kconfig-language.rst. 
11) Data structures diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst index 4bab7464ff8c..17db11b7ed48 100644 --- a/Documentation/process/maintainer-pgp-guide.rst +++ b/Documentation/process/maintainer-pgp-guide.rst @@ -238,7 +238,10 @@ your new subkey:: work. If for some reason you prefer to stay with RSA subkeys, just replace - "ed25519" with "rsa2048" in the above command. + "ed25519" with "rsa2048" in the above command. Additionally, if you + plan to use a hardware device that does not support ED25519 ECC + keys, like Nitrokey Pro or a Yubikey, then you should use + "nistp256" instead of "ed25519." Back up your master key for disaster recovery @@ -432,23 +435,23 @@ Available smartcard devices Unless all your laptops and workstations have smartcard readers, the easiest is to get a specialized USB device that implements smartcard -functionality. There are several options available: +functionality. There are several options available: - `Nitrokey Start`_: Open hardware and Free Software, based on FSI - Japan's `Gnuk`_. Offers support for ECC keys, but fewest security - features (such as resistance to tampering or some side-channel - attacks). -- `Nitrokey Pro`_: Similar to the Nitrokey Start, but more - tamper-resistant and offers more security features, but no ECC - support. -- `Yubikey 4`_: proprietary hardware and software, but cheaper than + Japan's `Gnuk`_. One of the few available commercial devices that + support ED25519 ECC keys, but offer fewest security features (such as + resistance to tampering or some side-channel attacks). +- `Nitrokey Pro 2`_: Similar to the Nitrokey Start, but more + tamper-resistant and offers more security features. Pro 2 supports ECC + cryptography (NISTP). +- `Yubikey 5`_: proprietary hardware and software, but cheaper than Nitrokey Pro and comes available in the USB-C form that is more useful with newer laptops.
Offers additional security features such as FIDO - U2F, but no ECC. + U2F, among others, and now finally supports ECC keys (NISTP). `LWN has a good review`_ of some of the above models, as well as several -others. If you want to use ECC keys, your best bet among commercially -available devices is the Nitrokey Start. +others. Your choice will depend on cost, shipping availability in your +geographical region, and open/proprietary hardware considerations. .. note:: @@ -457,8 +460,8 @@ available devices is the Nitrokey Start. Foundation. .. _`Nitrokey Start`: https://shop.nitrokey.com/shop/product/nitrokey-start-6 -.. _`Nitrokey Pro`: https://shop.nitrokey.com/shop/product/nitrokey-pro-3 -.. _`Yubikey 4`: https://www.yubico.com/product/yubikey-4-series/ +.. _`Nitrokey Pro 2`: https://shop.nitrokey.com/shop/product/nitrokey-pro-2-3 +.. _`Yubikey 5`: https://www.yubico.com/products/yubikey-5-overview/ .. _Gnuk: http://www.fsij.org/doc-gnuk/ .. _`LWN has a good review`: https://lwn.net/Articles/736231/ .. _`qualify for a free Nitrokey Start`: https://www.kernel.org/nitrokey-digital-tokens-for-kernel-developers.html diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index c88867b173d9..365efc9e4aa8 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -39,7 +39,7 @@ and elsewhere regarding submitting Linux kernel patches. 6) Any new or modified ``CONFIG`` options do not muck up the config menu and default to off unless they meet the exception criteria documented in - ``Documentation/kbuild/kconfig-language.txt`` Menu attributes: default value. + ``Documentation/kbuild/kconfig-language.rst`` Menu attributes: default value. 7) All new ``Kconfig`` options have help text. 
diff --git a/Documentation/riscv/index.rst b/Documentation/riscv/index.rst new file mode 100644 index 000000000000..c4b906d9b5a7 --- /dev/null +++ b/Documentation/riscv/index.rst @@ -0,0 +1,17 @@ +:orphan: + +=================== +RISC-V architecture +=================== + +.. toctree:: + :maxdepth: 1 + + pmu + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/riscv/pmu.txt b/Documentation/riscv/pmu.rst index b29f03a6d82f..acb216b99c26 100644 --- a/Documentation/riscv/pmu.txt +++ b/Documentation/riscv/pmu.rst @@ -1,5 +1,7 @@ +=================================== Supporting PMUs on RISC-V platforms -========================================== +=================================== + Alan Kao <alankao@andestech.com>, Mar 2018 Introduction @@ -77,13 +79,13 @@ Note that some features can be done in this stage as well: (2) privilege level setting (user space only, kernel space only, both); (3) destructor setting. Normally it is sufficient to apply *riscv_destroy_event*; (4) tweaks for non-sampling events, which will be utilized by functions such as -*perf_adjust_period*, usually something like the follows: + *perf_adjust_period*, usually something like the following:: -if (!is_sampling_event(event)) { - hwc->sample_period = x86_pmu.max_period; - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); -} + if (!is_sampling_event(event)) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } In the case of *riscv_base_pmu*, only (3) is provided for now. @@ -94,10 +96,10 @@ In the case of *riscv_base_pmu*, only (3) is provided for now. 3.1. Interrupt Initialization This often occurs at the beginning of the *event_init* method.
In common -practice, this should be a code segment like +practice, this should be a code segment like:: -int x86_reserve_hardware(void) -{ + int x86_reserve_hardware(void) + { int err = 0; if (!atomic_inc_not_zero(&pmc_refcount)) { @@ -114,7 +116,7 @@ int x86_reserve_hardware(void) } return err; -} + } And the magic is in *reserve_pmc_hardware*, which usually does atomic operations to make implemented IRQ accessible from some global function pointer. @@ -128,28 +130,28 @@ which will be introduced in the next section.) 3.2. IRQ Structure -Basically, a IRQ runs the following pseudo code: +Basically, an IRQ runs the following pseudo code:: -for each hardware counter that triggered this overflow + for each hardware counter that triggered this overflow - get the event of this counter + get the event of this counter - // following two steps are defined as *read()*, - // check the section Reading/Writing Counters for details. - count the delta value since previous interrupt - update the event->count (# event occurs) by adding delta, and - event->hw.period_left by subtracting delta + // following two steps are defined as *read()*, + // check the section Reading/Writing Counters for details. + count the delta value since previous interrupt + update the event->count (# event occurs) by adding delta, and + event->hw.period_left by subtracting delta - if the event overflows - sample data - set the counter appropriately for the next overflow + if the event overflows + sample data + set the counter appropriately for the next overflow - if the event overflows again - too frequently, throttle this event - fi - fi + if the event overflows again + too frequently, throttle this event + fi + fi -end for + end for However as of this writing, none of the RISC-V implementations have designed an interrupt for perf, so the details are to be completed in the future.
@@ -195,23 +197,26 @@ A normal flow of these state transitions are as follows: At this stage, a general event is bound to a physical counter, if any. The state changes to PERF_HES_STOPPED and PERF_HES_UPTODATE, because it is now stopped, and the (software) event count does not need updating. -** *start* is then called, and the counter is enabled. - With flag PERF_EF_RELOAD, it writes an appropriate value to the counter (check - previous section for detail). - Nothing is written if the flag does not contain PERF_EF_RELOAD. - The state now is reset to none, because it is neither stopped nor updated - (the counting already started) + + - *start* is then called, and the counter is enabled. + With flag PERF_EF_RELOAD, it writes an appropriate value to the counter (check + previous section for detail). + Nothing is written if the flag does not contain PERF_EF_RELOAD. + The state now is reset to none, because it is neither stopped nor updated + (the counting already started) + * When being context-switched out, *del* is called. It then checks out all the events in the PMU and calls *stop* to update their counts. -** *stop* is called by *del* - and the perf core with flag PERF_EF_UPDATE, and it often shares the same - subroutine as *read* with the same logic. - The state changes to PERF_HES_STOPPED and PERF_HES_UPTODATE, again. -** Life cycle of these two pairs: *add* and *del* are called repeatedly as - tasks switch in-and-out; *start* and *stop* is also called when the perf core - needs a quick stop-and-start, for instance, when the interrupt period is being - adjusted. + - *stop* is called by *del* + and the perf core with flag PERF_EF_UPDATE, and it often shares the same + subroutine as *read* with the same logic. + The state changes to PERF_HES_STOPPED and PERF_HES_UPTODATE, again. 
+ + - Life cycle of these two pairs: *add* and *del* are called repeatedly as + tasks switch in-and-out; *start* and *stop* is also called when the perf core + needs a quick stop-and-start, for instance, when the interrupt period is being + adjusted. Current implementation is sufficient for now and can be easily extended to features in the future. @@ -225,25 +230,26 @@ A. Related Structures Both structures are designed to be read-only. *struct pmu* defines some function pointer interfaces, and most of them take -*struct perf_event* as a main argument, dealing with perf events according to -perf's internal state machine (check kernel/events/core.c for details). + *struct perf_event* as a main argument, dealing with perf events according to + perf's internal state machine (check kernel/events/core.c for details). *struct riscv_pmu* defines PMU-specific parameters. The naming follows the -convention of all other architectures. + convention of all other architectures. * struct perf_event: include/linux/perf_event.h * struct hw_perf_event The generic structure that represents perf events, and the hardware-related -details. + details. * struct riscv_hw_events: arch/riscv/include/asm/perf_event.h The structure that holds the status of events, has two fixed members: -the number of events and the array of the events. + the number of events and the array of the events. References ---------- [1] https://github.com/riscv/riscv-linux/pull/124 + [2] https://groups.google.com/a/groups.riscv.org/forum/#!topic/sw-dev/f19TmCNP6yA diff --git a/Documentation/scheduler/completion.txt b/Documentation/scheduler/completion.rst index e5b9df4d8078..9f039b4f4b09 100644 --- a/Documentation/scheduler/completion.txt +++ b/Documentation/scheduler/completion.rst @@ -1,3 +1,4 @@ +================================================ Completions - "wait for completion" barrier APIs ================================================ @@ -46,7 +47,7 @@ it has to wait for it. 
To use completions you need to #include <linux/completion.h> and create a static or dynamic variable of type 'struct completion', -which has only two fields: +which has only two fields:: struct completion { unsigned int done; @@ -57,7 +58,7 @@ This provides the ->wait waitqueue to place tasks on for waiting (if any), and the ->done completion flag for indicating whether it's completed or not. Completions should be named to refer to the event that is being synchronized on. -A good example is: +A good example is:: wait_for_completion(&early_console_added); @@ -81,7 +82,7 @@ have taken place, even if these wait functions return prematurely due to a timeo or a signal triggering. Initializing of dynamically allocated completion objects is done via a call to -init_completion(): +init_completion():: init_completion(&dynamic_object->done); @@ -100,7 +101,8 @@ but be aware of other races. For static declaration and initialization, macros are available. -For static (or global) declarations in file scope you can use DECLARE_COMPLETION(): +For static (or global) declarations in file scope you can use +DECLARE_COMPLETION():: static DECLARE_COMPLETION(setup_done); DECLARE_COMPLETION(setup_done); @@ -111,7 +113,7 @@ initialized to 'not done' and doesn't require an init_completion() call. 
When a completion is declared as a local variable within a function, then the initialization should always use DECLARE_COMPLETION_ONSTACK() explicitly, not just to make lockdep happy, but also to make it clear -that limited scope had been considered and is intentional: +that limited scope had been considered and is intentional:: DECLARE_COMPLETION_ONSTACK(setup_done) @@ -140,11 +142,11 @@ Waiting for completions: ------------------------ For a thread to wait for some concurrent activity to finish, it -calls wait_for_completion() on the initialized completion structure: +calls wait_for_completion() on the initialized completion structure:: void wait_for_completion(struct completion *done) -A typical usage scenario is: +A typical usage scenario is:: CPU#1 CPU#2 @@ -192,17 +194,17 @@ A common problem that occurs is to have unclean assignment of return types, so take care to assign return-values to variables of the proper type. Checking for the specific meaning of return values also has been found -to be quite inaccurate, e.g. constructs like: +to be quite inaccurate, e.g. constructs like:: if (!wait_for_completion_interruptible_timeout(...)) ... would execute the same code path for successful completion and for the -interrupted case - which is probably not what you want. +interrupted case - which is probably not what you want:: int wait_for_completion_interruptible(struct completion *done) This function marks the task TASK_INTERRUPTIBLE while it is waiting. -If a signal was received while waiting it will return -ERESTARTSYS; 0 otherwise. +If a signal was received while waiting it will return -ERESTARTSYS; 0 otherwise:: unsigned long wait_for_completion_timeout(struct completion *done, unsigned long timeout) @@ -214,7 +216,7 @@ Timeouts are preferably calculated with msecs_to_jiffies() or usecs_to_jiffies() to make the code largely HZ-invariant. If the returned timeout value is deliberately ignored a comment should probably explain -why (e.g. 
see drivers/mfd/wm8350-core.c wm8350_read_auxadc()). +why (e.g. see drivers/mfd/wm8350-core.c wm8350_read_auxadc()):: long wait_for_completion_interruptible_timeout(struct completion *done, unsigned long timeout) @@ -225,14 +227,14 @@ jiffies if completion occurred. Further variants include _killable which uses TASK_KILLABLE as the designated tasks state and will return -ERESTARTSYS if it is interrupted, -or 0 if completion was achieved. There is a _timeout variant as well: +or 0 if completion was achieved. There is a _timeout variant as well:: long wait_for_completion_killable(struct completion *done) long wait_for_completion_killable_timeout(struct completion *done, unsigned long timeout) The _io variants wait_for_completion_io() behave the same as the non-_io variants, except for accounting waiting time as 'waiting on IO', which has -an impact on how the task is accounted in scheduling/IO stats: +an impact on how the task is accounted in scheduling/IO stats:: void wait_for_completion_io(struct completion *done) unsigned long wait_for_completion_io_timeout(struct completion *done, unsigned long timeout) @@ -243,11 +245,11 @@ Signaling completions: A thread that wants to signal that the conditions for continuation have been achieved calls complete() to signal exactly one of the waiters that it can -continue: +continue:: void complete(struct completion *done) -... or calls complete_all() to signal all current and future waiters: +... or calls complete_all() to signal all current and future waiters:: void complete_all(struct completion *done) @@ -268,7 +270,7 @@ probably are a design bug. Signaling completion from IRQ context is fine as it will appropriately lock with spin_lock_irqsave()/spin_unlock_irqrestore() and it will never -sleep. +sleep. 
try_wait_for_completion()/completion_done(): @@ -276,14 +278,14 @@ try_wait_for_completion()/completion_done(): The try_wait_for_completion() function will not put the thread on the wait queue but rather returns false if it would need to enqueue (block) the thread, -else it consumes one posted completion and returns true. +else it consumes one posted completion and returns true:: bool try_wait_for_completion(struct completion *done) Finally, to check the state of a completion without changing it in any way, call completion_done(), which returns false if there are no posted completions that were not yet consumed by waiters (implying that there are -waiters) and true otherwise; +waiters) and true otherwise:: bool completion_done(struct completion *done) diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst new file mode 100644 index 000000000000..058be77a4c34 --- /dev/null +++ b/Documentation/scheduler/index.rst @@ -0,0 +1,29 @@ +:orphan: + +=============== +Linux Scheduler +=============== + +.. toctree:: + :maxdepth: 1 + + + completion + sched-arch + sched-bwc + sched-deadline + sched-design-CFS + sched-domains + sched-energy + sched-nice-design + sched-rt-group + sched-stats + + text_files + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.rst index a2f27bbf2cba..0eaec669790a 100644 --- a/Documentation/scheduler/sched-arch.txt +++ b/Documentation/scheduler/sched-arch.rst @@ -1,4 +1,6 @@ - CPU Scheduler implementation hints for architecture specific code +================================================================= +CPU Scheduler implementation hints for architecture specific code +================================================================= Nick Piggin, 2005 @@ -35,9 +37,10 @@ Your cpu_idle routines need to obey the following rules: 4. 
The only time interrupts need to be disabled when checking need_resched is if we are about to sleep the processor until the next interrupt (this doesn't provide any protection of - need_resched, it prevents losing an interrupt). + need_resched, it prevents losing an interrupt): + + 4a. Common problem with this type of sleep appears to be:: - 4a. Common problem with this type of sleep appears to be: local_irq_disable(); if (!need_resched()) { local_irq_enable(); @@ -51,10 +54,10 @@ Your cpu_idle routines need to obey the following rules: although it may be reasonable to do some background work or enter a low CPU priority. - 5a. If TIF_POLLING_NRFLAG is set, and we do decide to enter - an interrupt sleep, it needs to be cleared then a memory - barrier issued (followed by a test of need_resched with - interrupts disabled, as explained in 3). + - 5a. If TIF_POLLING_NRFLAG is set, and we do decide to enter + an interrupt sleep, it needs to be cleared then a memory + barrier issued (followed by a test of need_resched with + interrupts disabled, as explained in 3). arch/x86/kernel/process.c has examples of both polling and sleeping idle functions. @@ -71,4 +74,3 @@ sh64 - Is sleeping racy vs interrupts? (See #4a) sparc - IRQs on at this point(?), change local_irq_save to _disable. - TODO: needs secondary CPUs to disable preempt (See #1) - diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.rst index f6b1873f68ab..3a9064219656 100644 --- a/Documentation/scheduler/sched-bwc.txt +++ b/Documentation/scheduler/sched-bwc.rst @@ -1,8 +1,9 @@ +===================== CFS Bandwidth Control ===================== [ This document only discusses CPU bandwidth control for SCHED_NORMAL. 
- The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ] + The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.rst ] CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the specification of the maximum CPU bandwidth available to a group or hierarchy. @@ -27,7 +28,8 @@ cpu.cfs_quota_us: the total available run-time within a period (in microseconds) cpu.cfs_period_us: the length of a period (in microseconds) cpu.stat: exports throttling statistics [explained further below] -The default values are: +The default values are:: + cpu.cfs_period_us=100ms cpu.cfs_quota=-1 @@ -55,7 +57,8 @@ For efficiency run-time is transferred between the global pool and CPU local on large systems. The amount transferred each time such an update is required is described as the "slice". -This is tunable via procfs: +This is tunable via procfs:: + /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms) Larger slice values will reduce transfer overheads, while smaller values allow @@ -66,6 +69,7 @@ Statistics A group's bandwidth statistics are exported via 3 fields in cpu.stat. cpu.stat: + - nr_periods: Number of enforcement intervals that have elapsed. - nr_throttled: Number of times the group has been throttled/limited. - throttled_time: The total time duration (in nanoseconds) for which entities @@ -78,12 +82,15 @@ Hierarchical considerations The interface enforces that an individual entity's bandwidth is always attainable, that is: max(c_i) <= C. However, over-subscription in the aggregate case is explicitly allowed to enable work-conserving semantics -within a hierarchy. +within a hierarchy: + e.g. \Sum (c_i) may exceed C + [ Where C is the parent's bandwidth, and c_i its children ] There are two ways in which a group may become throttled: + a. it fully consumes its own quota within a period b. a parent's quota is fully consumed within its period @@ -92,7 +99,7 @@ be allowed to until the parent's runtime is refreshed. 
Examples -------- -1. Limit a group to 1 CPU worth of runtime. +1. Limit a group to 1 CPU worth of runtime:: If period is 250ms and quota is also 250ms, the group will get 1 CPU worth of runtime every 250ms. @@ -100,10 +107,10 @@ Examples # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */ # echo 250000 > cpu.cfs_period_us /* period = 250ms */ -2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine. +2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine - With 500ms period and 1000ms quota, the group can get 2 CPUs worth of - runtime every 500ms. + With 500ms period and 1000ms quota, the group can get 2 CPUs worth of + runtime every 500ms:: # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */ # echo 500000 > cpu.cfs_period_us /* period = 500ms */ @@ -112,11 +119,10 @@ Examples 3. Limit a group to 20% of 1 CPU. - With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU. + With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU:: # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */ # echo 50000 > cpu.cfs_period_us /* period = 50ms */ - By using a small period here we are ensuring a consistent latency - response at the expense of burst capacity. - + By using a small period here we are ensuring a consistent latency + response at the expense of burst capacity. diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.rst index a7514343b660..3391e86d810c 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.rst @@ -1,29 +1,29 @@ - Deadline Task Scheduling - ------------------------ - -CONTENTS -======== - - 0. WARNING - 1. Overview - 2. Scheduling algorithm - 2.1 Main algorithm - 2.2 Bandwidth reclaiming - 3. Scheduling Real-Time Tasks - 3.1 Definitions - 3.2 Schedulability Analysis for Uniprocessor Systems - 3.3 Schedulability Analysis for Multiprocessor Systems - 3.4 Relationship with SCHED_DEADLINE Parameters - 4. 
Bandwidth management - 4.1 System-wide settings - 4.2 Task interface - 4.3 Default behavior - 4.4 Behavior of sched_yield() - 5. Tasks CPU affinity - 5.1 SCHED_DEADLINE and cpusets HOWTO - 6. Future plans - A. Test suite - B. Minimal main() +======================== +Deadline Task Scheduling +======================== + +.. CONTENTS + + 0. WARNING + 1. Overview + 2. Scheduling algorithm + 2.1 Main algorithm + 2.2 Bandwidth reclaiming + 3. Scheduling Real-Time Tasks + 3.1 Definitions + 3.2 Schedulability Analysis for Uniprocessor Systems + 3.3 Schedulability Analysis for Multiprocessor Systems + 3.4 Relationship with SCHED_DEADLINE Parameters + 4. Bandwidth management + 4.1 System-wide settings + 4.2 Task interface + 4.3 Default behavior + 4.4 Behavior of sched_yield() + 5. Tasks CPU affinity + 5.1 SCHED_DEADLINE and cpusets HOWTO + 6. Future plans + A. Test suite + B. Minimal main() 0. WARNING @@ -44,7 +44,7 @@ CONTENTS 2. Scheduling algorithm -================== +======================= 2.1 Main algorithm ------------------ @@ -80,7 +80,7 @@ CONTENTS a "remaining runtime". These two parameters are initially set to 0; - When a SCHED_DEADLINE task wakes up (becomes ready for execution), - the scheduler checks if + the scheduler checks if:: remaining runtime runtime ---------------------------------- > --------- @@ -97,7 +97,7 @@ CONTENTS left unchanged; - When a SCHED_DEADLINE task executes for an amount of time t, its - remaining runtime is decreased as + remaining runtime is decreased as:: remaining runtime = remaining runtime - t @@ -112,7 +112,7 @@ CONTENTS - When the current time is equal to the replenishment time of a throttled task, the scheduling deadline and the remaining runtime are - updated as + updated as:: scheduling deadline = scheduling deadline + period remaining runtime = remaining runtime + runtime @@ -129,7 +129,7 @@ CONTENTS Reclamation of Unused Bandwidth) algorithm [15, 16, 17] and it is enabled when flag SCHED_FLAG_RECLAIM is set. 
- The following diagram illustrates the state names for tasks handled by GRUB: + The following diagram illustrates the state names for tasks handled by GRUB:: ------------ (d) | Active | @@ -168,7 +168,7 @@ CONTENTS breaking the real-time guarantees. The 0-lag time for a task entering the ActiveNonContending state is - computed as + computed as:: (runtime * dl_period) deadline - --------------------- @@ -183,7 +183,7 @@ CONTENTS the task's utilization must be removed from the previous runqueue's active utilization and must be added to the new runqueue's active utilization. In order to avoid races between a task waking up on a runqueue while the - "inactive timer" is running on a different CPU, the "dl_non_contending" + "inactive timer" is running on a different CPU, the "dl_non_contending" flag is used to indicate that a task is not on a runqueue but is active (so, the flag is set when the task blocks and is cleared when the "inactive timer" fires or when the task wakes up). @@ -222,36 +222,36 @@ CONTENTS Let's now see a trivial example of two deadline tasks with runtime equal - to 4 and period equal to 8 (i.e., bandwidth equal to 0.5): - - A Task T1 - | - | | - | | - |-------- |---- - | | V - |---|---|---|---|---|---|---|---|--------->t - 0 1 2 3 4 5 6 7 8 - - - A Task T2 - | - | | - | | - | ------------------------| - | | V - |---|---|---|---|---|---|---|---|--------->t - 0 1 2 3 4 5 6 7 8 - - - A running_bw - | - 1 ----------------- ------ - | | | - 0.5- ----------------- - | | - |---|---|---|---|---|---|---|---|--------->t - 0 1 2 3 4 5 6 7 8 + to 4 and period equal to 8 (i.e., bandwidth equal to 0.5):: + + A Task T1 + | + | | + | | + |-------- |---- + | | V + |---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 + + + A Task T2 + | + | | + | | + | ------------------------| + | | V + |---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 + + + A running_bw + | + 1 ----------------- ------ + | | | + 0.5- ----------------- + | | + 
|---|---|---|---|---|---|---|---|--------->t + 0 1 2 3 4 5 6 7 8 - Time t = 0: @@ -284,7 +284,7 @@ CONTENTS 2.3 Energy-aware scheduling ------------------------- +--------------------------- When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum @@ -299,15 +299,20 @@ CONTENTS 3. Scheduling Real-Time Tasks ============================= - * BIG FAT WARNING ****************************************************** - * - * This section contains a (not-thorough) summary on classical deadline - * scheduling theory, and how it applies to SCHED_DEADLINE. - * The reader can "safely" skip to Section 4 if only interested in seeing - * how the scheduling policy can be used. Anyway, we strongly recommend - * to come back here and continue reading (once the urge for testing is - * satisfied :P) to be sure of fully understanding all technical details. - ************************************************************************ + + + .. BIG FAT WARNING ****************************************************** + + .. warning:: + + This section contains a (not-thorough) summary on classical deadline + scheduling theory, and how it applies to SCHED_DEADLINE. + The reader can "safely" skip to Section 4 if only interested in seeing + how the scheduling policy can be used. Anyway, we strongly recommend + to come back here and continue reading (once the urge for testing is + satisfied :P) to be sure of fully understanding all technical details. + + .. ************************************************************************ There are no limitations on what kind of task can exploit this new scheduling discipline, even if it must be said that it is particularly @@ -329,6 +334,7 @@ CONTENTS sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally, d_j = r_j + D, where D is the task's relative deadline. 
Summing up, a real-time task can be described as + Task = (WCET, D, P) The utilization of a real-time task is defined as the ratio between its @@ -352,13 +358,15 @@ CONTENTS between the finishing time of a job and its absolute deadline). More precisely, it can be proven that using a global EDF scheduler the maximum tardiness of each task is smaller or equal than + ((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max + where WCET_max = max{WCET_i} is the maximum WCET, WCET_min=min{WCET_i} is the minimum WCET, and U_max = max{WCET_i/P_i} is the maximum utilization[12]. 3.2 Schedulability Analysis for Uniprocessor Systems ------------------------- +---------------------------------------------------- If M=1 (uniprocessor system), or in case of partitioned scheduling (each real-time task is statically assigned to one and only one CPU), it is @@ -370,7 +378,9 @@ CONTENTS a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines of all the tasks running on a CPU if the sum of the densities of the tasks running on such a CPU is smaller or equal than 1: + sum(WCET_i / min{D_i, P_i}) <= 1 + It is important to notice that this condition is only sufficient, and not necessary: there are task sets that are schedulable, but do not respect the condition. For example, consider the task set {Task_1,Task_2} composed by @@ -379,7 +389,9 @@ CONTENTS (Task_1 is scheduled as soon as it is released, and finishes just in time to respect its deadline; Task_2 is scheduled immediately after Task_1, hence its response time cannot be larger than 50ms + 10ms = 60ms) even if + 50 / min{50,100} + 10 / min{100, 100} = 50 / 50 + 10 / 100 = 1.1 + Of course it is possible to test the exact schedulability of tasks with D_i != P_i (checking a condition that is both sufficient and necessary), but this cannot be done by comparing the total utilization or density with @@ -399,7 +411,7 @@ CONTENTS 4 Linux uses an admission test based on the tasks' utilizations. 
3.3 Schedulability Analysis for Multiprocessor Systems ------------------------- +------------------------------------------------------ On multiprocessor systems with global EDF scheduling (non partitioned systems), a sufficient test for schedulability can not be based on the @@ -428,7 +440,9 @@ CONTENTS between total utilization (or density) and a fixed constant. If all tasks have D_i = P_i, a sufficient schedulability condition can be expressed in a simple way: + sum(WCET_i / P_i) <= M - (M - 1) · U_max + where U_max = max{WCET_i / P_i}[10]. Notice that for U_max = 1, M - (M - 1) · U_max becomes M - M + 1 = 1 and this schedulability condition just confirms the Dhall's effect. A more complete survey of the literature @@ -447,7 +461,7 @@ CONTENTS the tasks are limited. 3.4 Relationship with SCHED_DEADLINE Parameters ------------------------- +----------------------------------------------- Finally, it is important to understand the relationship between the SCHED_DEADLINE scheduling parameters described in Section 2 (runtime, @@ -473,6 +487,7 @@ CONTENTS this task, as it is not possible to respect its temporal constraints. References: + 1 - C. L. Liu and J. W. Layland. Scheduling algorithms for multiprogram- ming in a hard-real-time environment. Journal of the Association for Computing Machinery, 20(1), 1973. @@ -550,7 +565,7 @@ CONTENTS The interface used to control the CPU bandwidth that can be allocated to -deadline tasks is similar to the one already used for -rt tasks with real-time group scheduling (a.k.a. RT-throttling - see - Documentation/scheduler/sched-rt-group.txt), and is based on readable/ + Documentation/scheduler/sched-rt-group.rst), and is based on readable/ writable control files located in procfs (for system wide settings). 
Notice that per-group settings (controlled through cgroupfs) are still not defined for -deadline tasks, because more discussion is needed in order to @@ -596,11 +611,13 @@ CONTENTS Specifying a periodic/sporadic task that executes for a given amount of runtime at each instance, and that is scheduled according to the urgency of its own timing constraints needs, in general, a way of declaring: + - a (maximum/typical) instance execution time, - a minimum interval between consecutive instances, - a time constraint by which each instance must be completed. Therefore: + * a new struct sched_attr, containing all the necessary fields is provided; * the new scheduling related syscalls that manipulate it, i.e., @@ -658,21 +675,21 @@ CONTENTS ------------------------------------ An example of a simple configuration (pin a -deadline task to CPU0) - follows (rt-app is used to create a -deadline task). - - mkdir /dev/cpuset - mount -t cgroup -o cpuset cpuset /dev/cpuset - cd /dev/cpuset - mkdir cpu0 - echo 0 > cpu0/cpuset.cpus - echo 0 > cpu0/cpuset.mems - echo 1 > cpuset.cpu_exclusive - echo 0 > cpuset.sched_load_balance - echo 1 > cpu0/cpuset.cpu_exclusive - echo 1 > cpu0/cpuset.mem_exclusive - echo $$ > cpu0/tasks - rt-app -t 100000:10000:d:0 -D5 (it is now actually superfluous to specify - task affinity) + follows (rt-app is used to create a -deadline task):: + + mkdir /dev/cpuset + mount -t cgroup -o cpuset cpuset /dev/cpuset + cd /dev/cpuset + mkdir cpu0 + echo 0 > cpu0/cpuset.cpus + echo 0 > cpu0/cpuset.mems + echo 1 > cpuset.cpu_exclusive + echo 0 > cpuset.sched_load_balance + echo 1 > cpu0/cpuset.cpu_exclusive + echo 1 > cpu0/cpuset.mem_exclusive + echo $$ > cpu0/tasks + rt-app -t 100000:10000:d:0 -D5 # it is now actually superfluous to specify + # task affinity 6. Future plans =============== @@ -711,7 +728,7 @@ Appendix A. Test suite rt-app is available at: https://github.com/scheduler-tools/rt-app. 
Thread parameters can be specified from the command line, with something like - this: + this:: # rt-app -t 100000:10000:d -t 150000:20000:f:10 -D5 @@ -721,27 +738,27 @@ Appendix A. Test suite of 5 seconds. More interestingly, configurations can be described with a json file that - can be passed as input to rt-app with something like this: + can be passed as input to rt-app with something like this:: # rt-app my_config.json The parameters that can be specified with the second method are a superset of the command line options. Please refer to rt-app documentation for more - details (<rt-app-sources>/doc/*.json). + details (`<rt-app-sources>/doc/*.json`). The second testing application is a modification of schedtool, called schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a certain pid/application. schedtool-dl is available at: https://github.com/scheduler-tools/schedtool-dl.git. - The usage is straightforward: + The usage is straightforward:: # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation of 10ms every 100ms (note that parameters are expressed in microseconds). You can also use schedtool to create a reservation for an already running - application, given that you know its pid: + application, given that you know its pid:: # schedtool -E -t 10000000:100000000 my_app_pid @@ -750,43 +767,43 @@ Appendix B. Minimal main() We provide in what follows a simple (ugly) self-contained code snippet showing how SCHED_DEADLINE reservations can be created by a real-time - application developer. 
- - #define _GNU_SOURCE - #include <unistd.h> - #include <stdio.h> - #include <stdlib.h> - #include <string.h> - #include <time.h> - #include <linux/unistd.h> - #include <linux/kernel.h> - #include <linux/types.h> - #include <sys/syscall.h> - #include <pthread.h> - - #define gettid() syscall(__NR_gettid) - - #define SCHED_DEADLINE 6 - - /* XXX use the proper syscall numbers */ - #ifdef __x86_64__ - #define __NR_sched_setattr 314 - #define __NR_sched_getattr 315 - #endif - - #ifdef __i386__ - #define __NR_sched_setattr 351 - #define __NR_sched_getattr 352 - #endif - - #ifdef __arm__ - #define __NR_sched_setattr 380 - #define __NR_sched_getattr 381 - #endif - - static volatile int done; - - struct sched_attr { + application developer:: + + #define _GNU_SOURCE + #include <unistd.h> + #include <stdio.h> + #include <stdlib.h> + #include <string.h> + #include <time.h> + #include <linux/unistd.h> + #include <linux/kernel.h> + #include <linux/types.h> + #include <sys/syscall.h> + #include <pthread.h> + + #define gettid() syscall(__NR_gettid) + + #define SCHED_DEADLINE 6 + + /* XXX use the proper syscall numbers */ + #ifdef __x86_64__ + #define __NR_sched_setattr 314 + #define __NR_sched_getattr 315 + #endif + + #ifdef __i386__ + #define __NR_sched_setattr 351 + #define __NR_sched_getattr 352 + #endif + + #ifdef __arm__ + #define __NR_sched_setattr 380 + #define __NR_sched_getattr 381 + #endif + + static volatile int done; + + struct sched_attr { __u32 size; __u32 sched_policy; @@ -802,25 +819,25 @@ Appendix B. 
Minimal main() __u64 sched_runtime; __u64 sched_deadline; __u64 sched_period; - }; + }; - int sched_setattr(pid_t pid, + int sched_setattr(pid_t pid, const struct sched_attr *attr, unsigned int flags) - { + { return syscall(__NR_sched_setattr, pid, attr, flags); - } + } - int sched_getattr(pid_t pid, + int sched_getattr(pid_t pid, struct sched_attr *attr, unsigned int size, unsigned int flags) - { + { return syscall(__NR_sched_getattr, pid, attr, size, flags); - } + } - void *run_deadline(void *data) - { + void *run_deadline(void *data) + { struct sched_attr attr; int x = 0; int ret; @@ -851,10 +868,10 @@ Appendix B. Minimal main() printf("deadline thread dies [%ld]\n", gettid()); return NULL; - } + } - int main (int argc, char **argv) - { + int main (int argc, char **argv) + { pthread_t thread; printf("main thread [%ld]\n", gettid()); @@ -868,4 +885,4 @@ Appendix B. Minimal main() printf("main dies [%ld]\n", gettid()); return 0; - } + } diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.rst index d1328890ef28..53b30d1967cf 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.rst @@ -1,9 +1,10 @@ - ============= - CFS Scheduler - ============= +============= +CFS Scheduler +============= 1. OVERVIEW +============ CFS stands for "Completely Fair Scheduler," and is the new "desktop" process scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the @@ -27,6 +28,7 @@ is its actual runtime normalized to the total number of running tasks. 2. FEW IMPLEMENTATION DETAILS +============================== In CFS the virtual runtime is expressed and tracked via the per-task p->se.vruntime (nanosec-unit) value. This way, it's possible to accurately @@ -49,6 +51,7 @@ algorithm variants to recognize sleepers. 3. 
THE RBTREE +============== CFS's design is quite radical: it does not use the old data structures for the runqueues, but it uses a time-ordered rbtree to build a "timeline" of future @@ -84,6 +87,7 @@ picked and the current task is preempted. 4. SOME FEATURES OF CFS +======================== CFS uses nanosecond granularity accounting and does not rely on any jiffies or other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the @@ -113,6 +117,7 @@ result. 5. Scheduling policies +====================== CFS implements three scheduling policies: @@ -137,6 +142,7 @@ SCHED_IDLE. 6. SCHEDULING CLASSES +====================== The new CFS scheduler has been designed in such a way to introduce "Scheduling Classes," an extensible hierarchy of scheduler modules. These modules @@ -197,6 +203,7 @@ This is the (partial) list of the hooks: 7. GROUP SCHEDULER EXTENSIONS TO CFS +===================================== Normally, the scheduler operates on individual tasks and strives to provide fair CPU time to each task. Sometimes, it may be desirable to group tasks and @@ -219,7 +226,7 @@ SCHED_BATCH) tasks. When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each group created using the pseudo filesystem. See example steps below to create -task groups and modify their CPU share using the "cgroups" pseudo filesystem. +task groups and modify their CPU share using the "cgroups" pseudo filesystem:: # mount -t tmpfs cgroup_root /sys/fs/cgroup # mkdir /sys/fs/cgroup/cpu diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.rst index 4af80b1c05aa..f7504226f445 100644 --- a/Documentation/scheduler/sched-domains.txt +++ b/Documentation/scheduler/sched-domains.rst @@ -1,3 +1,7 @@ +================= +Scheduler Domains +================= + Each CPU has a "base" scheduling domain (struct sched_domain). The domain hierarchy is built from these base domains via the ->parent pointer. 
->parent MUST be NULL terminated, and domain structures should be per-CPU as they are @@ -46,7 +50,9 @@ CPU's runqueue and the newly found busiest one and starts moving tasks from it to our runqueue. The exact number of tasks amounts to an imbalance previously computed while iterating over this sched domain's groups. -*** Implementing sched domains *** +Implementing sched domains +========================== + The "base" domain will "span" the first level of the hierarchy. In the case of SMT, you'll span all siblings of the physical CPU, with each group being a single virtual CPU. diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.rst index 197d81f4b836..fce5858c9082 100644 --- a/Documentation/scheduler/sched-energy.txt +++ b/Documentation/scheduler/sched-energy.rst @@ -1,6 +1,6 @@ - ======================= - Energy Aware Scheduling - ======================= +======================= +Energy Aware Scheduling +======================= 1. Introduction --------------- @@ -12,7 +12,7 @@ with a minimal impact on throughput. This document aims at providing an introduction on how EAS works, what are the main design decisions behind it, and details what is needed to get it to run. -Before going any further, please note that at the time of writing: +Before going any further, please note that at the time of writing:: /!\ EAS does not support platforms with symmetric CPU topologies /!\ @@ -33,13 +33,13 @@ To make it clear from the start: - power = energy/time = [joule/second] = [watt] The goal of EAS is to minimize energy, while still getting the job done. That -is, we want to maximize: +is, we want to maximize:: performance [inst/s] -------------------- power [W] -which is equivalent to minimizing: +which is equivalent to minimizing:: energy [J] ----------- @@ -97,7 +97,7 @@ domains can contain duplicate elements. Example 1. 
Let us consider a platform with 12 CPUs, split in 3 performance domains - (pd0, pd4 and pd8), organized as follows: + (pd0, pd4 and pd8), organized as follows:: CPUs: 0 1 2 3 4 5 6 7 8 9 10 11 PDs: |--pd0--|--pd4--|---pd8---| @@ -108,6 +108,7 @@ Example 1. containing 6 CPUs. The two root domains are denoted rd1 and rd2 in the above figure. Since pd4 intersects with both rd1 and rd2, it will be present in the linked list '->pd' attached to each of them: + * rd1->pd: pd0 -> pd4 * rd2->pd: pd4 -> pd8 @@ -159,9 +160,9 @@ Example 2. Each performance domain has three Operating Performance Points (OPPs). The CPU capacity and power cost associated with each OPP is listed in the Energy Model table. The util_avg of P is shown on the figures - below as 'PP'. + below as 'PP':: - CPU util. + CPU util. 1024 - - - - - - - Energy Model +-----------+-------------+ | Little | Big | @@ -188,8 +189,7 @@ Example 2. (which is coherent with the behaviour of the schedutil CPUFreq governor, see Section 6. for more details on this topic). - Case 1. P is migrated to CPU1 - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + **Case 1. P is migrated to CPU1**:: 1024 - - - - - - - @@ -207,8 +207,7 @@ Example 2. CPU0 CPU1 CPU2 CPU3 - Case 2. P is migrated to CPU3 - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + **Case 2. P is migrated to CPU3**:: 1024 - - - - - - - @@ -226,8 +225,7 @@ Example 2. CPU0 CPU1 CPU2 CPU3 - Case 3. P stays on prev_cpu / CPU 0 - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + **Case 3. P stays on prev_cpu / CPU 0**:: 1024 - - - - - - - @@ -324,7 +322,9 @@ hardware properties and on other features of the kernel being enabled. This section lists these dependencies and provides hints as to how they can be met. - 6.1 - Asymmetric CPU topology +6.1 - Asymmetric CPU topology +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + As mentioned in the introduction, EAS is only supported on platforms with asymmetric CPU topologies for now. 
This requirement is checked at run-time by @@ -347,7 +347,8 @@ significant savings on SMP platforms have been observed yet. This restriction could be amended in the future if proven otherwise. - 6.2 - Energy Model presence +6.2 - Energy Model presence +^^^^^^^^^^^^^^^^^^^^^^^^^^^ EAS uses the EM of a platform to estimate the impact of scheduling decisions on energy. So, your platform must provide power cost tables to the EM framework in @@ -358,7 +359,8 @@ Please also note that the scheduling domains need to be re-built after the EM has been registered in order to start EAS. - 6.3 - Energy Model complexity +6.3 - Energy Model complexity +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The task wake-up path is very latency-sensitive. When the EM of a platform is too complex (too many CPUs, too many performance domains, too many performance @@ -388,7 +390,8 @@ two possible options: hence enabling it to cope with larger EMs in reasonable time. - 6.4 - Schedutil governor +6.4 - Schedutil governor +^^^^^^^^^^^^^^^^^^^^^^^^ EAS tries to predict at which OPP will the CPUs be running in the close future in order to estimate their energy consumption. To do so, it is assumed that OPPs @@ -405,7 +408,8 @@ frequency requests and energy predictions. Using EAS with any other governor than schedutil is not supported. - 6.5 Scale-invariant utilization signals +6.5 Scale-invariant utilization signals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In order to make accurate prediction across CPUs and for all performance states, EAS needs frequency-invariant and CPU-invariant PELT signals. These can @@ -416,7 +420,8 @@ Using EAS on a platform that doesn't implement these two callbacks is not supported. - 6.6 Multithreading (SMT) +6.6 Multithreading (SMT) +^^^^^^^^^^^^^^^^^^^^^^^^ EAS in its current form is SMT unaware and is not able to leverage multithreaded hardware to save energy. 
EAS considers threads as independent diff --git a/Documentation/scheduler/sched-nice-design.txt b/Documentation/scheduler/sched-nice-design.rst index 3ac1e46d5365..0571f1b47e64 100644 --- a/Documentation/scheduler/sched-nice-design.txt +++ b/Documentation/scheduler/sched-nice-design.rst @@ -1,3 +1,7 @@ +===================== +Scheduler Nice Design +===================== + This document explains the thinking about the revamped and streamlined nice-levels implementation in the new Linux scheduler. @@ -14,7 +18,7 @@ much stronger than they were before in 2.4 (and people were happy about that change), and we also intentionally calibrated the linear timeslice rule so that nice +19 level would be _exactly_ 1 jiffy. To better understand it, the timeslice graph went like this (cheesy ASCII art -alert!): +alert!):: A diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.rst index c09f7a3fee66..d27d3f3712fd 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.rst @@ -1,18 +1,18 @@ - Real-Time group scheduling - -------------------------- +========================== +Real-Time group scheduling +========================== -CONTENTS -======== +.. CONTENTS -0. WARNING -1. Overview - 1.1 The problem - 1.2 The solution -2. The interface - 2.1 System-wide settings - 2.2 Default behaviour - 2.3 Basis for grouping tasks -3. Future plans + 0. WARNING + 1. Overview + 1.1 The problem + 1.2 The solution + 2. The interface + 2.1 System-wide settings + 2.2 Default behaviour + 2.3 Basis for grouping tasks + 3. Future plans 0. WARNING @@ -159,9 +159,11 @@ Consider two sibling groups A and B; both have 50% bandwidth, but A's period is twice the length of B's. * group A: period=100000us, runtime=50000us + - this runs for 0.05s once every 0.1s * group B: period= 50000us, runtime=25000us + - this runs for 0.025s twice every 0.1s (or once every 0.05 sec). 
This means that currently a while (1) loop in A will run for the full period of diff --git a/Documentation/scheduler/sched-stats.txt b/Documentation/scheduler/sched-stats.rst index 8259b34a66ae..0cb0aa714545 100644 --- a/Documentation/scheduler/sched-stats.txt +++ b/Documentation/scheduler/sched-stats.rst @@ -1,3 +1,7 @@ +==================== +Scheduler Statistics +==================== + Version 15 of schedstats dropped counters for some sched_yield: yld_exp_empty, yld_act_empty and yld_both_empty. Otherwise, it is identical to version 14. @@ -35,19 +39,23 @@ CPU statistics cpu<N> 1 2 3 4 5 6 7 8 9 First field is a sched_yield() statistic: + 1) # of times sched_yield() was called Next three are schedule() statistics: + 2) This field is a legacy array expiration count field used in the O(1) scheduler. We kept it for ABI compatibility, but it is always set to zero. 3) # of times schedule() was called 4) # of times schedule() left the processor idle Next two are try_to_wake_up() statistics: + 5) # of times try_to_wake_up() was called 6) # of times try_to_wake_up() was called to wake up the local cpu Next three are statistics describing scheduling latency: + 7) sum of all time spent running by tasks on this processor (in jiffies) 8) sum of all time spent waiting to run by tasks on this processor (in jiffies) @@ -67,24 +75,23 @@ The first field is a bit mask indicating what cpus this domain operates over. 
The next 24 are a variety of load_balance() statistics in grouped into types of idleness (idle, busy, and newly idle): - 1) # of times in this domain load_balance() was called when the + 1) # of times in this domain load_balance() was called when the cpu was idle - 2) # of times in this domain load_balance() checked but found + 2) # of times in this domain load_balance() checked but found the load did not require balancing when the cpu was idle - 3) # of times in this domain load_balance() tried to move one or + 3) # of times in this domain load_balance() tried to move one or more tasks and failed, when the cpu was idle - 4) sum of imbalances discovered (if any) with each call to + 4) sum of imbalances discovered (if any) with each call to load_balance() in this domain when the cpu was idle - 5) # of times in this domain pull_task() was called when the cpu + 5) # of times in this domain pull_task() was called when the cpu was idle - 6) # of times in this domain pull_task() was called even though + 6) # of times in this domain pull_task() was called even though the target task was cache-hot when idle - 7) # of times in this domain load_balance() was called but did + 7) # of times in this domain load_balance() was called but did not find a busier queue while the cpu was idle - 8) # of times in this domain a busier queue was found while the + 8) # of times in this domain a busier queue was found while the cpu was idle but no busier group was found - - 9) # of times in this domain load_balance() was called when the + 9) # of times in this domain load_balance() was called when the cpu was busy 10) # of times in this domain load_balance() checked but found the load did not require balancing when busy @@ -117,21 +124,25 @@ of idleness (idle, busy, and newly idle): was just becoming idle but no busier group was found Next three are active_load_balance() statistics: + 25) # of times active_load_balance() was called 26) # of times active_load_balance() tried to move a task 
and failed 27) # of times active_load_balance() successfully moved a task Next three are sched_balance_exec() statistics: + 28) sbe_cnt is not used 29) sbe_balanced is not used 30) sbe_pushed is not used Next three are sched_balance_fork() statistics: + 31) sbf_cnt is not used 32) sbf_balanced is not used 33) sbf_pushed is not used Next three are try_to_wake_up() statistics: + 34) # of times in this domain try_to_wake_up() awoke a task that last ran on a different cpu in this domain 35) # of times in this domain try_to_wake_up() moved a task to the @@ -139,10 +150,11 @@ of idleness (idle, busy, and newly idle): 36) # of times in this domain try_to_wake_up() started passive balancing /proc/<pid>/schedstat ----------------- +--------------------- schedstats also adds a new /proc/<pid>/schedstat file to include some of the same information on a per-process level. There are three fields in this file correlating for that process to: + 1) time spent on the cpu 2) time spent waiting on a runqueue 3) # of timeslices run on this cpu @@ -151,4 +163,5 @@ A program could be easily written to make use of these extra fields to report on how well a particular process or set of processes is faring under the scheduler's policies. A simple version of such a program is available at + http://eaglet.rain.com/rick/linux/schedstat/v12/latency.c diff --git a/Documentation/scheduler/text_files.rst b/Documentation/scheduler/text_files.rst new file mode 100644 index 000000000000..0bc50307b241 --- /dev/null +++ b/Documentation/scheduler/text_files.rst @@ -0,0 +1,5 @@ +Scheduler pelt c program +------------------------ + +.. 
literalinclude:: sched-pelt.c + :language: c diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst index 1b3c907980ad..bc561ca95c86 100644 --- a/Documentation/security/keys/core.rst +++ b/Documentation/security/keys/core.rst @@ -1687,10 +1687,12 @@ The structure has a number of fields, some of which are mandatory: attempted key link operation. If there is no match, -EINVAL is returned. - * ``int (*asym_eds_op)(struct kernel_pkey_params *params, - const void *in, void *out);`` - ``int (*asym_verify_signature)(struct kernel_pkey_params *params, - const void *in, const void *in2);`` + * ``asym_eds_op`` and ``asym_verify_signature``:: + + int (*asym_eds_op)(struct kernel_pkey_params *params, + const void *in, void *out); + int (*asym_verify_signature)(struct kernel_pkey_params *params, + const void *in, const void *in2); These methods are optional. If provided the first allows a key to be used to encrypt, decrypt or sign a blob of data, and the second allows a @@ -1755,8 +1757,10 @@ The structure has a number of fields, some of which are mandatory: required crypto isn't available. - * ``int (*asym_query)(const struct kernel_pkey_params *params, - struct kernel_pkey_query *info);`` + * ``asym_query``:: + + int (*asym_query)(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info); This method is optional. If provided it allows information about the public or asymmetric key held in the key to be determined. diff --git a/Documentation/security/keys/trusted-encrypted.rst b/Documentation/security/keys/trusted-encrypted.rst index 7b35fcb58933..50ac8bcd6970 100644 --- a/Documentation/security/keys/trusted-encrypted.rst +++ b/Documentation/security/keys/trusted-encrypted.rst @@ -107,12 +107,14 @@ Where:: Examples of trusted and encrypted key usage: -Create and save a trusted key named "kmk" of length 32 bytes:: +Create and save a trusted key named "kmk" of length 32 bytes. 
Note: When using a TPM 2.0 with a persistent key with handle 0x81000001, append 'keyhandle=0x81000001' to statements between quotes, such as "new 32 keyhandle=0x81000001". +:: + $ keyctl add trusted kmk "new 32" @u 440502848 diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py new file mode 100644 index 000000000000..77e89c1956d7 --- /dev/null +++ b/Documentation/sphinx/automarkup.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright 2019 Jonathan Corbet <corbet@lwn.net> +# +# Apply kernel-specific tweaks after the initial document processing +# has been done. +# +from docutils import nodes +from sphinx import addnodes +from sphinx.environment import NoUri +import re + +# +# Regex nastiness. Of course. +# Try to identify "function()" that's not already marked up some +# other way. Sphinx doesn't like a lot of stuff right after a +# :c:func: block (i.e. ":c:func:`mmap()`s" flakes out), so the last +# bit tries to restrict matches to things that won't create trouble. +# +RE_function = re.compile(r'([\w_][\w\d_]+\(\))') + +# +# Many places in the docs refer to common system calls. It is +# pointless to try to cross-reference them and, as has been known +# to happen, somebody defining a function by these names can lead +# to the creation of incorrect and confusing cross references. So +# just don't even try with these names. +# +Skipfuncs = [ 'open', 'close', 'read', 'write', 'fcntl', 'mmap', + 'select', 'poll', 'fork', 'execve', 'clone', 'ioctl'] + +# +# Find all occurrences of function() and try to replace them with +# appropriate cross references. +# +def markup_funcs(docname, app, node): + cdom = app.env.domains['c'] + t = node.astext() + done = 0 + repl = [ ] + for m in RE_function.finditer(t): + # + # Include any text prior to function() as a normal text node.
+ # + if m.start() > done: + repl.append(nodes.Text(t[done:m.start()])) + # + # Go through the dance of getting an xref out of the C domain + # + target = m.group(1)[:-2] + target_text = nodes.Text(target + '()') + xref = None + if target not in Skipfuncs: + lit_text = nodes.literal(classes=['xref', 'c', 'c-func']) + lit_text += target_text + pxref = addnodes.pending_xref('', refdomain = 'c', + reftype = 'function', + reftarget = target, modname = None, + classname = None) + # + # XXX The Latex builder will throw NoUri exceptions here, + # work around that by ignoring them. + # + try: + xref = cdom.resolve_xref(app.env, docname, app.builder, + 'function', target, pxref, lit_text) + except NoUri: + xref = None + # + # Toss the xref into the list if we got it; otherwise just put + # the function text. + # + if xref: + repl.append(xref) + else: + repl.append(target_text) + done = m.end() + if done < len(t): + repl.append(nodes.Text(t[done:])) + return repl + +def auto_markup(app, doctree, name): + # + # This loop could eventually be improved on. Someday maybe we + # want a proper tree traversal with a lot of awareness of which + # kinds of nodes to prune. But this works well for now. + # + # The nodes.literal test catches ``literal text``, its purpose is to + # avoid adding cross-references to functions that have been explicitly + # marked with :c:func:.
+ # + for para in doctree.traverse(nodes.paragraph): + for node in para.traverse(nodes.Text): + if not isinstance(node.parent, nodes.literal): + node.parent.replace(node, markup_funcs(name, app, node)) + +def setup(app): + app.connect('doctree-resolved', auto_markup) + return { + 'parallel_read_safe': True, + 'parallel_write_safe': True, + } diff --git a/Documentation/sphinx/cdomain.py b/Documentation/sphinx/cdomain.py index cf13ff3a656c..cbac8e608dc4 100644 --- a/Documentation/sphinx/cdomain.py +++ b/Documentation/sphinx/cdomain.py @@ -48,7 +48,10 @@ major, minor, patch = sphinx.version_info[:3] def setup(app): - app.override_domain(CDomain) + if (major == 1 and minor < 8): + app.override_domain(CDomain) + else: + app.add_domain(CDomain, override=True) return dict( version = __version__, diff --git a/Documentation/sphinx/requirements.txt b/Documentation/sphinx/requirements.txt index 742be3e12619..14e29a0ae480 100644 --- a/Documentation/sphinx/requirements.txt +++ b/Documentation/sphinx/requirements.txt @@ -1,3 +1,3 @@ -docutils==0.12 -Sphinx==1.4.9 +docutils +Sphinx==1.7.9 sphinx_rtd_theme diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 5af8b131ccbc..1b2fe17cd2fa 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -154,7 +154,7 @@ is 0x15 and the full version number is 0x234, this file will contain the value 340 = 0x154. See the type_of_loader and ext_loader_type fields in -Documentation/x86/boot.txt for additional information. +Documentation/x86/boot.rst for additional information. ============================================================== @@ -166,7 +166,7 @@ The complete bootloader version number. In the example above, this file will contain the value 564 = 0x234. See the type_of_loader and ext_loader_ver fields in -Documentation/x86/boot.txt for additional information. +Documentation/x86/boot.rst for additional information. 
============================================================== diff --git a/Documentation/target/index.rst b/Documentation/target/index.rst new file mode 100644 index 000000000000..b68f48982392 --- /dev/null +++ b/Documentation/target/index.rst @@ -0,0 +1,19 @@ +:orphan: + +================== +TCM Virtual Device +================== + +.. toctree:: + :maxdepth: 1 + + tcmu-design + tcm_mod_builder + scripts + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/target/scripts.rst b/Documentation/target/scripts.rst new file mode 100644 index 000000000000..172d42b522e4 --- /dev/null +++ b/Documentation/target/scripts.rst @@ -0,0 +1,11 @@ +TCM mod builder script +---------------------- + +.. literalinclude:: tcm_mod_builder.py + :language: python + +Target export device script +--------------------------- + +.. literalinclude:: target-export-device + :language: shell diff --git a/Documentation/target/tcm_mod_builder.rst b/Documentation/target/tcm_mod_builder.rst new file mode 100644 index 000000000000..9bfc9822e2bd --- /dev/null +++ b/Documentation/target/tcm_mod_builder.rst @@ -0,0 +1,149 @@ +========================================= +The TCM v4 fabric module script generator +========================================= + +Greetings all, + +This document is intended to be a mini-HOWTO for using the tcm_mod_builder.py +script to generate a brand new functional TCM v4 fabric .ko module of your very own, +that once built can immediately be loaded to start accessing the new TCM/ConfigFS +fabric skeleton, by simply using:: + + modprobe $TCM_NEW_MOD + mkdir -p /sys/kernel/config/target/$TCM_NEW_MOD + +This script will create a new drivers/target/$TCM_NEW_MOD/, and will do the following + + 1) Generate new API callers for drivers/target/target_core_fabric_configs.c logic + ->make_tpg(), ->drop_tpg(), ->make_wwn(), ->drop_wwn().
These are created + into $TCM_NEW_MOD/$TCM_NEW_MOD_configfs.c + 2) Generate basic infrastructure for loading/unloading LKMs and TCM/ConfigFS fabric module + using a skeleton struct target_core_fabric_ops API template. + 3) Based on user defined T10 Proto_Ident for the new fabric module being built, + the TransportID / Initiator and Target WWPN related handlers for + SPC-3 persistent reservation are automatically generated in $TCM_NEW_MOD/$TCM_NEW_MOD_fabric.c + using drivers/target/target_core_fabric_lib.c logic. + 4) NOP API calls for all other Data I/O path and fabric dependent attribute logic + in $TCM_NEW_MOD/$TCM_NEW_MOD_fabric.c + +tcm_mod_builder.py depends upon the mandatory '-p $PROTO_IDENT' and '-m +$FABRIC_MOD_name' parameters, and actually running the script looks like:: + + target:/mnt/sdb/lio-core-2.6.git/Documentation/target# python tcm_mod_builder.py -p iSCSI -m tcm_nab5000 + tcm_dir: /mnt/sdb/lio-core-2.6.git/Documentation/target/../../ + Set fabric_mod_name: tcm_nab5000 + Set fabric_mod_dir: + /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000 + Using proto_ident: iSCSI + Creating fabric_mod_dir: + /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000 + Writing file: + /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_base.h + Using tcm_mod_scan_fabric_ops: + /mnt/sdb/lio-core-2.6.git/Documentation/target/../../include/target/target_core_fabric_ops.h + Writing file: + /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_fabric.c + Writing file: + /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_fabric.h + Writing file: + /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_configfs.c + Writing file: + /mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/Kbuild + Writing file: + 
/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/Kconfig + Would you like to add tcm_nab5000to drivers/target/Kbuild..? [yes,no]: yes + Would you like to add tcm_nab5000to drivers/target/Kconfig..? [yes,no]: yes + +At the end of tcm_mod_builder.py. the script will ask to add the following +line to drivers/target/Kbuild:: + + obj-$(CONFIG_TCM_NAB5000) += tcm_nab5000/ + +and the same for drivers/target/Kconfig:: + + source "drivers/target/tcm_nab5000/Kconfig" + +#) Run 'make menuconfig' and select the new CONFIG_TCM_NAB5000 item:: + + <M> TCM_NAB5000 fabric module + +#) Build using 'make modules', once completed you will have:: + + target:/mnt/sdb/lio-core-2.6.git# ls -la drivers/target/tcm_nab5000/ + total 1348 + drwxr-xr-x 2 root root 4096 2010-10-05 03:23 . + drwxr-xr-x 9 root root 4096 2010-10-05 03:22 .. + -rw-r--r-- 1 root root 282 2010-10-05 03:22 Kbuild + -rw-r--r-- 1 root root 171 2010-10-05 03:22 Kconfig + -rw-r--r-- 1 root root 49 2010-10-05 03:23 modules.order + -rw-r--r-- 1 root root 738 2010-10-05 03:22 tcm_nab5000_base.h + -rw-r--r-- 1 root root 9096 2010-10-05 03:22 tcm_nab5000_configfs.c + -rw-r--r-- 1 root root 191200 2010-10-05 03:23 tcm_nab5000_configfs.o + -rw-r--r-- 1 root root 40504 2010-10-05 03:23 .tcm_nab5000_configfs.o.cmd + -rw-r--r-- 1 root root 5414 2010-10-05 03:22 tcm_nab5000_fabric.c + -rw-r--r-- 1 root root 2016 2010-10-05 03:22 tcm_nab5000_fabric.h + -rw-r--r-- 1 root root 190932 2010-10-05 03:23 tcm_nab5000_fabric.o + -rw-r--r-- 1 root root 40713 2010-10-05 03:23 .tcm_nab5000_fabric.o.cmd + -rw-r--r-- 1 root root 401861 2010-10-05 03:23 tcm_nab5000.ko + -rw-r--r-- 1 root root 265 2010-10-05 03:23 .tcm_nab5000.ko.cmd + -rw-r--r-- 1 root root 459 2010-10-05 03:23 tcm_nab5000.mod.c + -rw-r--r-- 1 root root 23896 2010-10-05 03:23 tcm_nab5000.mod.o + -rw-r--r-- 1 root root 22655 2010-10-05 03:23 .tcm_nab5000.mod.o.cmd + -rw-r--r-- 1 root root 379022 2010-10-05 03:23 tcm_nab5000.o + -rw-r--r-- 1 root root 
211 2010-10-05 03:23 .tcm_nab5000.o.cmd + +#) Load the new module, create a lun_0 configfs group, and add new TCM Core + IBLOCK backstore symlink to port:: + + target:/mnt/sdb/lio-core-2.6.git# insmod drivers/target/tcm_nab5000.ko + target:/mnt/sdb/lio-core-2.6.git# mkdir -p /sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0 + target:/mnt/sdb/lio-core-2.6.git# cd /sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0/ + target:/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0# ln -s /sys/kernel/config/target/core/iblock_0/lvm_test0 nab5000_port + + target:/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0# cd - + target:/mnt/sdb/lio-core-2.6.git# tree /sys/kernel/config/target/nab5000/ + /sys/kernel/config/target/nab5000/ + |-- discovery_auth + |-- iqn.foo + | `-- tpgt_1 + | |-- acls + | |-- attrib + | |-- lun + | | `-- lun_0 + | | |-- alua_tg_pt_gp + | | |-- alua_tg_pt_offline + | | |-- alua_tg_pt_status + | | |-- alua_tg_pt_write_md + | | `-- nab5000_port -> ../../../../../../target/core/iblock_0/lvm_test0 + | |-- np + | `-- param + `-- version + + target:/mnt/sdb/lio-core-2.6.git# lsmod + Module Size Used by + tcm_nab5000 3935 4 + iscsi_target_mod 193211 0 + target_core_stgt 8090 0 + target_core_pscsi 11122 1 + target_core_file 9172 2 + target_core_iblock 9280 1 + target_core_mod 228575 31 + tcm_nab5000,iscsi_target_mod,target_core_stgt,target_core_pscsi,target_core_file,target_core_iblock + libfc 73681 0 + scsi_debug 56265 0 + scsi_tgt 8666 1 target_core_stgt + configfs 20644 2 target_core_mod + +---------------------------------------------------------------------- + +Future TODO items +================= + + 1) Add more T10 proto_idents + 2) Make tcm_mod_dump_fabric_ops() smarter and generate function pointer + defs directly from include/target/target_core_fabric_ops.h:struct target_core_fabric_ops + structure members. + +October 5th, 2010 + +Nicholas A. 
Bellinger <nab@linux-iscsi.org> diff --git a/Documentation/target/tcm_mod_builder.txt b/Documentation/target/tcm_mod_builder.txt deleted file mode 100644 index ae22f7005540..000000000000 --- a/Documentation/target/tcm_mod_builder.txt +++ /dev/null @@ -1,145 +0,0 @@ ->>>>>>>>>> The TCM v4 fabric module script generator <<<<<<<<<< - -Greetings all, - -This document is intended to be a mini-HOWTO for using the tcm_mod_builder.py -script to generate a brand new functional TCM v4 fabric .ko module of your very own, -that once built can be immediately be loaded to start access the new TCM/ConfigFS -fabric skeleton, by simply using: - - modprobe $TCM_NEW_MOD - mkdir -p /sys/kernel/config/target/$TCM_NEW_MOD - -This script will create a new drivers/target/$TCM_NEW_MOD/, and will do the following - - *) Generate new API callers for drivers/target/target_core_fabric_configs.c logic - ->make_tpg(), ->drop_tpg(), ->make_wwn(), ->drop_wwn(). These are created - into $TCM_NEW_MOD/$TCM_NEW_MOD_configfs.c - *) Generate basic infrastructure for loading/unloading LKMs and TCM/ConfigFS fabric module - using a skeleton struct target_core_fabric_ops API template. - *) Based on user defined T10 Proto_Ident for the new fabric module being built, - the TransportID / Initiator and Target WWPN related handlers for - SPC-3 persistent reservation are automatically generated in $TCM_NEW_MOD/$TCM_NEW_MOD_fabric.c - using drivers/target/target_core_fabric_lib.c logic. 
- *) NOP API calls for all other Data I/O path and fabric dependent attribute logic - in $TCM_NEW_MOD/$TCM_NEW_MOD_fabric.c - -tcm_mod_builder.py depends upon the mandatory '-p $PROTO_IDENT' and '-m -$FABRIC_MOD_name' parameters, and actually running the script looks like: - -target:/mnt/sdb/lio-core-2.6.git/Documentation/target# python tcm_mod_builder.py -p iSCSI -m tcm_nab5000 -tcm_dir: /mnt/sdb/lio-core-2.6.git/Documentation/target/../../ -Set fabric_mod_name: tcm_nab5000 -Set fabric_mod_dir: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000 -Using proto_ident: iSCSI -Creating fabric_mod_dir: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000 -Writing file: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_base.h -Using tcm_mod_scan_fabric_ops: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../include/target/target_core_fabric_ops.h -Writing file: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_fabric.c -Writing file: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_fabric.h -Writing file: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/tcm_nab5000_configfs.c -Writing file: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/Kbuild -Writing file: -/mnt/sdb/lio-core-2.6.git/Documentation/target/../../drivers/target/tcm_nab5000/Kconfig -Would you like to add tcm_nab5000to drivers/target/Kbuild..? [yes,no]: yes -Would you like to add tcm_nab5000to drivers/target/Kconfig..? [yes,no]: yes - -At the end of tcm_mod_builder.py. 
the script will ask to add the following -line to drivers/target/Kbuild: - - obj-$(CONFIG_TCM_NAB5000) += tcm_nab5000/ - -and the same for drivers/target/Kconfig: - - source "drivers/target/tcm_nab5000/Kconfig" - -*) Run 'make menuconfig' and select the new CONFIG_TCM_NAB5000 item: - - <M> TCM_NAB5000 fabric module - -*) Build using 'make modules', once completed you will have: - -target:/mnt/sdb/lio-core-2.6.git# ls -la drivers/target/tcm_nab5000/ -total 1348 -drwxr-xr-x 2 root root 4096 2010-10-05 03:23 . -drwxr-xr-x 9 root root 4096 2010-10-05 03:22 .. --rw-r--r-- 1 root root 282 2010-10-05 03:22 Kbuild --rw-r--r-- 1 root root 171 2010-10-05 03:22 Kconfig --rw-r--r-- 1 root root 49 2010-10-05 03:23 modules.order --rw-r--r-- 1 root root 738 2010-10-05 03:22 tcm_nab5000_base.h --rw-r--r-- 1 root root 9096 2010-10-05 03:22 tcm_nab5000_configfs.c --rw-r--r-- 1 root root 191200 2010-10-05 03:23 tcm_nab5000_configfs.o --rw-r--r-- 1 root root 40504 2010-10-05 03:23 .tcm_nab5000_configfs.o.cmd --rw-r--r-- 1 root root 5414 2010-10-05 03:22 tcm_nab5000_fabric.c --rw-r--r-- 1 root root 2016 2010-10-05 03:22 tcm_nab5000_fabric.h --rw-r--r-- 1 root root 190932 2010-10-05 03:23 tcm_nab5000_fabric.o --rw-r--r-- 1 root root 40713 2010-10-05 03:23 .tcm_nab5000_fabric.o.cmd --rw-r--r-- 1 root root 401861 2010-10-05 03:23 tcm_nab5000.ko --rw-r--r-- 1 root root 265 2010-10-05 03:23 .tcm_nab5000.ko.cmd --rw-r--r-- 1 root root 459 2010-10-05 03:23 tcm_nab5000.mod.c --rw-r--r-- 1 root root 23896 2010-10-05 03:23 tcm_nab5000.mod.o --rw-r--r-- 1 root root 22655 2010-10-05 03:23 .tcm_nab5000.mod.o.cmd --rw-r--r-- 1 root root 379022 2010-10-05 03:23 tcm_nab5000.o --rw-r--r-- 1 root root 211 2010-10-05 03:23 .tcm_nab5000.o.cmd - -*) Load the new module, create a lun_0 configfs group, and add new TCM Core - IBLOCK backstore symlink to port: - -target:/mnt/sdb/lio-core-2.6.git# insmod drivers/target/tcm_nab5000.ko -target:/mnt/sdb/lio-core-2.6.git# mkdir -p 
/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0 -target:/mnt/sdb/lio-core-2.6.git# cd /sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0/ -target:/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0# ln -s /sys/kernel/config/target/core/iblock_0/lvm_test0 nab5000_port - -target:/sys/kernel/config/target/nab5000/iqn.foo/tpgt_1/lun/lun_0# cd - -target:/mnt/sdb/lio-core-2.6.git# tree /sys/kernel/config/target/nab5000/ -/sys/kernel/config/target/nab5000/ -|-- discovery_auth -|-- iqn.foo -| `-- tpgt_1 -| |-- acls -| |-- attrib -| |-- lun -| | `-- lun_0 -| | |-- alua_tg_pt_gp -| | |-- alua_tg_pt_offline -| | |-- alua_tg_pt_status -| | |-- alua_tg_pt_write_md -| | `-- nab5000_port -> ../../../../../../target/core/iblock_0/lvm_test0 -| |-- np -| `-- param -`-- version - -target:/mnt/sdb/lio-core-2.6.git# lsmod -Module Size Used by -tcm_nab5000 3935 4 -iscsi_target_mod 193211 0 -target_core_stgt 8090 0 -target_core_pscsi 11122 1 -target_core_file 9172 2 -target_core_iblock 9280 1 -target_core_mod 228575 31 -tcm_nab5000,iscsi_target_mod,target_core_stgt,target_core_pscsi,target_core_file,target_core_iblock -libfc 73681 0 -scsi_debug 56265 0 -scsi_tgt 8666 1 target_core_stgt -configfs 20644 2 target_core_mod - ----------------------------------------------------------------------- - -Future TODO items: - - *) Add more T10 proto_idents - *) Make tcm_mod_dump_fabric_ops() smarter and generate function pointer - defs directly from include/target/target_core_fabric_ops.h:struct target_core_fabric_ops - structure members. - -October 5th, 2010 -Nicholas A. Bellinger <nab@linux-iscsi.org> diff --git a/Documentation/target/tcmu-design.txt b/Documentation/target/tcmu-design.rst index 4cebc1ebf99a..a7b426707bf6 100644 --- a/Documentation/target/tcmu-design.txt +++ b/Documentation/target/tcmu-design.rst @@ -1,25 +1,30 @@ -Contents: - -1) TCM Userspace Design - a) Background - b) Benefits - c) Design constraints - d) Implementation overview - i. Mailbox - ii. 
Command ring - iii. Data Area - e) Device discovery - f) Device events - g) Other contingencies -2) Writing a user pass-through handler - a) Discovering and configuring TCMU uio devices - b) Waiting for events on the device(s) - c) Managing the command ring -3) A final note +==================== +TCM Userspace Design +==================== + + +.. Contents: + + 1) TCM Userspace Design + a) Background + b) Benefits + c) Design constraints + d) Implementation overview + i. Mailbox + ii. Command ring + iii. Data Area + e) Device discovery + f) Device events + g) Other contingencies + 2) Writing a user pass-through handler + a) Discovering and configuring TCMU uio devices + b) Waiting for events on the device(s) + c) Managing the command ring + 3) A final note TCM Userspace Design --------------------- +==================== TCM is another name for LIO, an in-kernel iSCSI target (server). Existing TCM targets run in the kernel. TCMU (TCM in Userspace) @@ -32,7 +37,8 @@ modules for file, block device, RAM or using another SCSI device as storage. These are called "backstores" or "storage engines". These built-in modules are implemented entirely as kernel code. -Background: +Background +---------- In addition to modularizing the transport protocol used for carrying SCSI commands ("fabrics"), the Linux kernel target, LIO, also modularizes @@ -60,7 +66,8 @@ kernel, another approach is to create a userspace pass-through backstore for LIO, "TCMU". -Benefits: +Benefits +-------- In addition to allowing relatively easy support for RBD and GLFS, TCMU will also allow easier development of new backstores. TCMU combines @@ -72,21 +79,25 @@ The disadvantage is there are more distinct components to configure, and potentially to malfunction. This is unavoidable, but hopefully not fatal if we're careful to keep things as simple as possible. 
-Design constraints: +Design constraints +------------------ - Good performance: high throughput, low latency - Cleanly handle if userspace: + 1) never attaches 2) hangs 3) dies 4) misbehaves + - Allow future flexibility in user & kernel implementations - Be reasonably memory-efficient - Simple to configure & run - Simple to write a userspace backend -Implementation overview: +Implementation overview +----------------------- The core of the TCMU interface is a memory region that is shared between kernel and userspace. Within this region is: a control area @@ -108,7 +119,8 @@ the region mapped at a different virtual address. See target_core_user.h for the struct definitions. -The Mailbox: +The Mailbox +----------- The mailbox is always at the start of the shared memory region, and contains a version, details about the starting offset and size of the @@ -117,19 +129,27 @@ userspace (respectively) to put commands on the ring, and indicate when the commands are completed. version - 1 (userspace should abort if otherwise) + flags: -- TCMU_MAILBOX_FLAG_CAP_OOOC: indicates out-of-order completion is - supported. See "The Command Ring" for details. -cmdr_off - The offset of the start of the command ring from the start -of the memory region, to account for the mailbox size. -cmdr_size - The size of the command ring. This does *not* need to be a -power of two. -cmd_head - Modified by the kernel to indicate when a command has been -placed on the ring. -cmd_tail - Modified by userspace to indicate when it has completed -processing of a command. - -The Command Ring: + - TCMU_MAILBOX_FLAG_CAP_OOOC: + indicates out-of-order completion is supported. + See "The Command Ring" for details. + +cmdr_off + The offset of the start of the command ring from the start + of the memory region, to account for the mailbox size. +cmdr_size + The size of the command ring. This does *not* need to be a + power of two. 
+cmd_head + Modified by the kernel to indicate when a command has been + placed on the ring. +cmd_tail + Modified by userspace to indicate when it has completed + processing of a command. + +The Command Ring +---------------- Commands are placed on the ring by the kernel incrementing mailbox.cmd_head by the size of the command, modulo cmdr_size, and @@ -180,29 +200,31 @@ opcode it does not handle, it must set UNKNOWN_OP bit (bit 0) in hdr.uflags, update cmd_tail, and proceed with processing additional commands, if any. -The Data Area: +The Data Area +------------- This is shared-memory space after the command ring. The organization of this area is not defined in the TCMU interface, and userspace should access only the parts referenced by pending iovs. -Device Discovery: +Device Discovery +---------------- Other devices may be using UIO besides TCMU. Unrelated user processes may also be handling different sets of TCMU devices. TCMU userspace processes must find their devices by scanning sysfs class/uio/uio*/name. For TCMU devices, these names will be of the -format: +format:: -tcm-user/<hba_num>/<device_name>/<subtype>/<path> + tcm-user/<hba_num>/<device_name>/<subtype>/<path> where "tcm-user" is common for all TCMU-backed UIO devices. <hba_num> and <device_name> allow userspace to find the device's path in the kernel target's configfs tree. Assuming the usual mount point, it is -found at: +found at:: -/sys/kernel/config/target/core/user_<hba_num>/<device_name> + /sys/kernel/config/target/core/user_<hba_num>/<device_name> This location contains attributes such as "hw_block_size", that userspace needs to know for correct operation. @@ -214,15 +236,16 @@ configure the device, if needed. The name cannot contain ':', due to LIO limitations. 
For all devices so discovered, the user handler opens /dev/uioX and -calls mmap(): +calls mmap():: -mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0) + mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0) where size must be equal to the value read from /sys/class/uio/uioX/maps/map0/size. -Device Events: +Device Events +------------- If a new device is added or removed, a notification will be broadcast over netlink, using a generic netlink family name of "TCM-USER" and a @@ -233,7 +256,8 @@ the LIO device, so that after determining the device is supported (based on subtype) it can take the appropriate action. -Other contingencies: +Other contingencies +------------------- Userspace handler process never attaches: @@ -258,7 +282,7 @@ Userspace handler process is malicious: Writing a user pass-through handler (with example code) -------------------------------------------------------- +======================================================= A user process handling a TCMU device must support the following: @@ -277,103 +301,103 @@ TCMU is designed so that multiple unrelated processes can manage TCMU devices separately. All handlers should make sure to only open their devices, based upon a known subtype string.
-a) Discovering and configuring TCMU UIO devices: +a) Discovering and configuring TCMU UIO devices:: -(error checking omitted for brevity) + /* error checking omitted for brevity */ -int fd, dev_fd; -char buf[256]; -unsigned long long map_len; -void *map; + int fd, dev_fd; + char buf[256]; + unsigned long long map_len; + void *map; -fd = open("/sys/class/uio/uio0/name", O_RDONLY); -ret = read(fd, buf, sizeof(buf)); -close(fd); -buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ + fd = open("/sys/class/uio/uio0/name", O_RDONLY); + ret = read(fd, buf, sizeof(buf)); + close(fd); + buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ -/* we only want uio devices whose name is a format we expect */ -if (strncmp(buf, "tcm-user", 8)) + /* we only want uio devices whose name is a format we expect */ + if (strncmp(buf, "tcm-user", 8)) exit(-1); -/* Further checking for subtype also needed here */ - -fd = open(/sys/class/uio/%s/maps/map0/size, O_RDONLY); -ret = read(fd, buf, sizeof(buf)); -close(fd); -str_buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ + /* Further checking for subtype also needed here */ -map_len = strtoull(buf, NULL, 0); + fd = open(/sys/class/uio/%s/maps/map0/size, O_RDONLY); + ret = read(fd, buf, sizeof(buf)); + close(fd); + str_buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ -dev_fd = open("/dev/uio0", O_RDWR); -map = mmap(NULL, map_len, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0); + map_len = strtoull(buf, NULL, 0); + dev_fd = open("/dev/uio0", O_RDWR); + map = mmap(NULL, map_len, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0); -b) Waiting for events on the device(s) - -while (1) { - char buf[4]; - int ret = read(dev_fd, buf, 4); /* will block */ + b) Waiting for events on the device(s) - handle_device_events(dev_fd, map); -} + while (1) { + char buf[4]; + int ret = read(dev_fd, buf, 4); /* will block */ -c) Managing the command ring - -#include <linux/target_core_user.h> - -int handle_device_events(int fd, void 
*map) -{ - struct tcmu_mailbox *mb = map; - struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; - int did_some_work = 0; - - /* Process events from cmd ring until we catch up with cmd_head */ - while (ent != (void *)mb + mb->cmdr_off + mb->cmd_head) { - - if (tcmu_hdr_get_op(ent->hdr.len_op) == TCMU_OP_CMD) { - uint8_t *cdb = (void *)mb + ent->req.cdb_off; - bool success = true; - - /* Handle command here. */ - printf("SCSI opcode: 0x%x\n", cdb[0]); - - /* Set response fields */ - if (success) - ent->rsp.scsi_status = SCSI_NO_SENSE; - else { - /* Also fill in rsp->sense_buffer here */ - ent->rsp.scsi_status = SCSI_CHECK_CONDITION; + handle_device_events(dev_fd, map); } - } - else if (tcmu_hdr_get_op(ent->hdr.len_op) != TCMU_OP_PAD) { - /* Tell the kernel we didn't handle unknown opcodes */ - ent->hdr.uflags |= TCMU_UFLAG_UNKNOWN_OP; - } - else { - /* Do nothing for PAD entries except update cmd_tail */ - } - - /* update cmd_tail */ - mb->cmd_tail = (mb->cmd_tail + tcmu_hdr_get_len(&ent->hdr)) % mb->cmdr_size; - ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; - did_some_work = 1; - } - /* Notify the kernel that work has been finished */ - if (did_some_work) { - uint32_t buf = 0; - write(fd, &buf, 4); - } - - return 0; -} +c) Managing the command ring:: + + #include <linux/target_core_user.h> + + int handle_device_events(int fd, void *map) + { + struct tcmu_mailbox *mb = map; + struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; + int did_some_work = 0; + + /* Process events from cmd ring until we catch up with cmd_head */ + while (ent != (void *)mb + mb->cmdr_off + mb->cmd_head) { + + if (tcmu_hdr_get_op(ent->hdr.len_op) == TCMU_OP_CMD) { + uint8_t *cdb = (void *)mb + ent->req.cdb_off; + bool success = true; + + /* Handle command here. 
*/ + printf("SCSI opcode: 0x%x\n", cdb[0]); + + /* Set response fields */ + if (success) + ent->rsp.scsi_status = SCSI_NO_SENSE; + else { + /* Also fill in rsp->sense_buffer here */ + ent->rsp.scsi_status = SCSI_CHECK_CONDITION; + } + } + else if (tcmu_hdr_get_op(ent->hdr.len_op) != TCMU_OP_PAD) { + /* Tell the kernel we didn't handle unknown opcodes */ + ent->hdr.uflags |= TCMU_UFLAG_UNKNOWN_OP; + } + else { + /* Do nothing for PAD entries except update cmd_tail */ + } + + /* update cmd_tail */ + mb->cmd_tail = (mb->cmd_tail + tcmu_hdr_get_len(&ent->hdr)) % mb->cmdr_size; + ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; + did_some_work = 1; + } + + /* Notify the kernel that work has been finished */ + if (did_some_work) { + uint32_t buf = 0; + + write(fd, &buf, 4); + } + + return 0; + } A final note ------------- +============ Please be careful to return codes as defined by the SCSI specifications. These are different than some values defined in the diff --git a/Documentation/tee.txt b/Documentation/tee.txt index 56ea85ffebf2..afacdf2fd1de 100644 --- a/Documentation/tee.txt +++ b/Documentation/tee.txt @@ -32,7 +32,7 @@ User space (the client) connects to the driver by opening /dev/tee[0-9]* or memory. - TEE_IOC_VERSION lets user space know which TEE this driver handles and - the its capabilities. + its capabilities. - TEE_IOC_OPEN_SESSION opens a new session to a Trusted Application. diff --git a/Documentation/timers/highres.txt b/Documentation/timers/highres.rst index 8f9741592123..bde5eb7e5c9e 100644 --- a/Documentation/timers/highres.txt +++ b/Documentation/timers/highres.rst @@ -1,5 +1,6 @@ +===================================================== High resolution timers and dynamic ticks design notes ------------------------------------------------------ +===================================================== Further information can be found in the paper of the OLS 2006 talk "hrtimers and beyond". 
The paper is part of the OLS 2006 Proceedings Volume 1, which can @@ -30,11 +31,12 @@ hrtimer base infrastructure --------------------------- The hrtimer base infrastructure was merged into the 2.6.16 kernel. Details of -the base implementation are covered in Documentation/timers/hrtimers.txt. See +the base implementation are covered in Documentation/timers/hrtimers.rst. See also figure #2 (OLS slides p. 15) The main differences to the timer wheel, which holds the armed timer_list type timers are: + - time ordered enqueueing into a rb-tree - independent of ticks (the processing is based on nanoseconds) @@ -55,7 +57,8 @@ merged into the 2.6.18 kernel. Further information about the Generic Time Of Day framework is available in the OLS 2005 Proceedings Volume 1: -http://www.linuxsymposium.org/2005/linuxsymposium_procv1.pdf + + http://www.linuxsymposium.org/2005/linuxsymposium_procv1.pdf The paper "We Are Not Getting Any Younger: A New Approach to Time and Timers" was written by J. Stultz, D.V. Hart, & N. Aravamudan. @@ -100,6 +103,7 @@ accounting, profiling, and high resolution timers. The management layer assigns one or more of the following functions to a clock event device: + - system global periodic tick (jiffies update) - cpu local update_process_times - cpu local profiling @@ -244,6 +248,3 @@ extended to x86_64 and ARM already. Initial (work in progress) support is also available for MIPS and PowerPC. Thomas, Ingo - - - diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.rst index 895345ec513b..c9d05d3caaca 100644 --- a/Documentation/timers/hpet.txt +++ b/Documentation/timers/hpet.rst @@ -1,4 +1,6 @@ - High Precision Event Timer Driver for Linux +=========================================== +High Precision Event Timer Driver for Linux +=========================================== The High Precision Event Timer (HPET) hardware follows a specification by Intel and Microsoft, revision 1. 
diff --git a/Documentation/timers/hrtimers.txt b/Documentation/timers/hrtimers.rst index 588d85724f10..c1c20a693e8f 100644 --- a/Documentation/timers/hrtimers.txt +++ b/Documentation/timers/hrtimers.rst @@ -1,6 +1,6 @@ - +====================================================== hrtimers - subsystem for high-resolution kernel timers ----------------------------------------------------- +====================================================== This patch introduces a new subsystem for high-resolution kernel timers. @@ -146,7 +146,7 @@ the clock_getres() interface. This will return whatever real resolution a given clock has - be it low-res, high-res, or artificially-low-res. hrtimers - testing and verification ----------------------------------- +----------------------------------- We used the high-resolution clock subsystem ontop of hrtimers to verify the hrtimer implementation details in praxis, and we also ran the posix diff --git a/Documentation/timers/index.rst b/Documentation/timers/index.rst new file mode 100644 index 000000000000..91f6f8263c48 --- /dev/null +++ b/Documentation/timers/index.rst @@ -0,0 +1,22 @@ +:orphan: + +====== +timers +====== + +.. toctree:: + :maxdepth: 1 + + highres + hpet + hrtimers + no_hz + timekeeping + timers-howto + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/no_hz.rst index 9591092da5e0..065db217cb04 100644 --- a/Documentation/timers/NO_HZ.txt +++ b/Documentation/timers/no_hz.rst @@ -1,4 +1,6 @@ - NO_HZ: Reducing Scheduling-Clock Ticks +====================================== +NO_HZ: Reducing Scheduling-Clock Ticks +====================================== This document describes Kconfig options and boot parameters that can @@ -28,7 +30,8 @@ by a third section on RCU-specific considerations, a fourth section discussing testing, and a fifth and final section listing known issues. 
-NEVER OMIT SCHEDULING-CLOCK TICKS +Never Omit Scheduling-Clock Ticks +================================= Very old versions of Linux from the 1990s and the very early 2000s are incapable of omitting scheduling-clock ticks. It turns out that @@ -59,7 +62,8 @@ degrade your applications performance. If this describes your workload, you should read the following two sections. -OMIT SCHEDULING-CLOCK TICKS FOR IDLE CPUs +Omit Scheduling-Clock Ticks For Idle CPUs +========================================= If a CPU is idle, there is little point in sending it a scheduling-clock interrupt. After all, the primary purpose of a scheduling-clock interrupt @@ -97,7 +101,8 @@ By default, CONFIG_NO_HZ_IDLE=y kernels boot with "nohz=on", enabling dyntick-idle mode. -OMIT SCHEDULING-CLOCK TICKS FOR CPUs WITH ONLY ONE RUNNABLE TASK +Omit Scheduling-Clock Ticks For CPUs With Only One Runnable Task +================================================================ If a CPU has only one runnable task, there is little point in sending it a scheduling-clock interrupt because there is no other task to switch to. @@ -174,7 +179,8 @@ However, the drawbacks listed above mean that adaptive ticks should not (yet) be enabled by default. -RCU IMPLICATIONS +RCU Implications +================ There are situations in which idle CPUs cannot be permitted to enter either dyntick-idle mode or adaptive-tick mode, the most @@ -199,7 +205,8 @@ scheduler will decide where to run them, which might or might not be where you want them to run. -TESTING +Testing +======= So you enable all the OS-jitter features described in this document, but do not see any change in your workload's behavior. Is this because @@ -222,9 +229,10 @@ We do not currently have a good way to remove OS jitter from single-CPU systems. -KNOWN ISSUES +Known Issues +============ -o Dyntick-idle slows transitions to and from idle slightly. +* Dyntick-idle slows transitions to and from idle slightly. 
In practice, this has not been a problem except for the most aggressive real-time workloads, which have the option of disabling dyntick-idle mode, an option that most of them take. However, @@ -248,13 +256,13 @@ o Dyntick-idle slows transitions to and from idle slightly. this parameter effectively disables Turbo Mode on Intel CPUs, which can significantly reduce maximum performance. -o Adaptive-ticks slows user/kernel transitions slightly. +* Adaptive-ticks slows user/kernel transitions slightly. This is not expected to be a problem for computationally intensive workloads, which have few such transitions. Careful benchmarking will be required to determine whether or not other workloads are significantly affected by this effect. -o Adaptive-ticks does not do anything unless there is only one +* Adaptive-ticks does not do anything unless there is only one runnable task for a given CPU, even though there are a number of other situations where the scheduling-clock tick is not needed. To give but one example, consider a CPU that has one @@ -275,7 +283,7 @@ o Adaptive-ticks does not do anything unless there is only one Better handling of these sorts of situations is future work. -o A reboot is required to reconfigure both adaptive idle and RCU +* A reboot is required to reconfigure both adaptive idle and RCU callback offloading. Runtime reconfiguration could be provided if needed, however, due to the complexity of reconfiguring RCU at runtime, there would need to be an earthshakingly good reason. @@ -283,12 +291,12 @@ o A reboot is required to reconfigure both adaptive idle and RCU simply offloading RCU callbacks from all CPUs and pinning them where you want them whenever you want them pinned. -o Additional configuration is required to deal with other sources +* Additional configuration is required to deal with other sources of OS jitter, including interrupts and system-utility tasks and processes. 
This configuration normally involves binding interrupts and tasks to particular CPUs. -o Some sources of OS jitter can currently be eliminated only by +* Some sources of OS jitter can currently be eliminated only by constraining the workload. For example, the only way to eliminate OS jitter due to global TLB shootdowns is to avoid the unmapping operations (such as kernel module unload operations) that @@ -299,17 +307,17 @@ o Some sources of OS jitter can currently be eliminated only by helpful, especially when combined with the mlock() and mlockall() system calls. -o Unless all CPUs are idle, at least one CPU must keep the +* Unless all CPUs are idle, at least one CPU must keep the scheduling-clock interrupt going in order to support accurate timekeeping. -o If there might potentially be some adaptive-ticks CPUs, there +* If there might potentially be some adaptive-ticks CPUs, there will be at least one CPU keeping the scheduling-clock interrupt going, even if all CPUs are otherwise idle. Better handling of this situation is ongoing work. -o Some process-handling operations still require the occasional +* Some process-handling operations still require the occasional scheduling-clock tick. These operations include calculating CPU load, maintaining sched average, computing CFS entity vruntime, computing avenrun, and carrying out load balancing. They are diff --git a/Documentation/timers/timekeeping.txt b/Documentation/timers/timekeeping.rst index 2d1732b0a868..f83e98852e2c 100644 --- a/Documentation/timers/timekeeping.txt +++ b/Documentation/timers/timekeeping.rst @@ -1,5 +1,6 @@ +=========================================================== Clock sources, Clock events, sched_clock() and delay timers ------------------------------------------------------------ +=========================================================== This document tries to briefly explain some basic kernel timekeeping abstractions. 
It partly pertains to the drivers usually found in diff --git a/Documentation/timers/timers-howto.txt b/Documentation/timers/timers-howto.rst index 038f8c77a076..7e3167bec2b1 100644 --- a/Documentation/timers/timers-howto.txt +++ b/Documentation/timers/timers-howto.rst @@ -1,5 +1,6 @@ +=================================================================== delays - Information on the various kernel delay / sleep mechanisms -------------------------------------------------------------------- +=================================================================== This document seeks to answer the common question: "What is the RightWay (TM) to insert a delay?" @@ -17,7 +18,7 @@ code in an atomic context?" This should be followed closely by "Does it really need to delay in atomic context?" If so... ATOMIC CONTEXT: - You must use the *delay family of functions. These + You must use the `*delay` family of functions. These functions use the jiffie estimation of clock speed and will busy wait for enough loop cycles to achieve the desired delay: @@ -35,21 +36,26 @@ ATOMIC CONTEXT: be refactored to allow for the use of msleep. NON-ATOMIC CONTEXT: - You should use the *sleep[_range] family of functions. + You should use the `*sleep[_range]` family of functions. There are a few more options here, while any of them may work correctly, using the "right" sleep function will help the scheduler, power management, and just make your driver better :) -- Backed by busy-wait loop: + udelay(unsigned long usecs) + -- Backed by hrtimers: + usleep_range(unsigned long min, unsigned long max) + -- Backed by jiffies / legacy_timers + msleep(unsigned long msecs) msleep_interruptible(unsigned long msecs) - Unlike the *delay family, the underlying mechanism + Unlike the `*delay` family, the underlying mechanism driving each of these calls varies, thus there are quirks you should be aware of. @@ -70,6 +76,7 @@ NON-ATOMIC CONTEXT: - Why not msleep for (1ms - 20ms)? 
Explained originally here: http://lkml.org/lkml/2007/8/3/250 + msleep(1~20) may not do what the caller intends, and will often sleep longer (~20 ms actual sleep for any value given in the 1~20ms range). In many cases this diff --git a/Documentation/trace/coresight.txt b/Documentation/trace/coresight.txt index efbc832146e7..b027d61b27a6 100644 --- a/Documentation/trace/coresight.txt +++ b/Documentation/trace/coresight.txt @@ -188,6 +188,49 @@ specific to that component only. "Implementation defined" customisations are expected to be accessed and controlled using those entries. +Device Naming scheme +------------------------ +The devices that appear on the "coresight" bus were named the same as their +parent devices, i.e., the real devices that appear on the AMBA bus or the platform bus. +Thus the names were based on the Linux Open Firmware layer naming convention, +which follows the base physical address of the device followed by the device +type. e.g: + +root:~# ls /sys/bus/coresight/devices/ + 20010000.etf 20040000.funnel 20100000.stm 22040000.etm + 22140000.etm 230c0000.funnel 23240000.etm 20030000.tpiu + 20070000.etr 20120000.replicator 220c0000.funnel + 23040000.etm 23140000.etm 23340000.etm + +However, with the introduction of ACPI support, the names of the real +devices are a bit cryptic and non-obvious. Thus, a new naming scheme was +introduced to use more generic names based on the type of the device. The +following rules apply: + + 1) Devices that are bound to CPUs, are named based on the CPU logical + number. + + e.g, ETM bound to CPU0 is named "etm0" + + 2) All other devices follow a pattern, "<device_type_prefix>N", where : + + <device_type_prefix> - A prefix specific to the type of the device + N - a sequential number assigned based on the order + of probing.
+ + e.g, tmc_etf0, tmc_etr0, funnel0, funnel1 + +Thus, with the new scheme the devices could appear as : + +root:~# ls /sys/bus/coresight/devices/ + etm0 etm1 etm2 etm3 etm4 etm5 funnel0 + funnel1 funnel2 replicator0 stm0 tmc_etf0 tmc_etr0 tpiu0 + +Some of the examples below might refer to old naming scheme and some +to the newer scheme, to give a confirmation that what you see on your +system is not unexpected. One must use the "names" as they appear on +the system under specified locations. + How to use the tracer modules ----------------------------- @@ -326,16 +369,25 @@ amount of processor cores), the "cs_etm" PMU will be listed only once. A Coresight PMU works the same way as any other PMU, i.e the name of the PMU is listed along with configuration options within forward slashes '/'. Since a Coresight system will typically have more than one sink, the name of the sink to -work with needs to be specified as an event option. Names for sink to choose -from are listed in sysFS under ($SYSFS)/bus/coresight/devices: +work with needs to be specified as an event option. 
+On newer kernels the available sinks are listed in sysFS under: +($SYSFS)/bus/event_source/devices/cs_etm/sinks/ + + root@localhost:/sys/bus/event_source/devices/cs_etm/sinks# ls + tmc_etf0 tmc_etr0 tpiu0 + +On older kernels, this may need to be found from the list of coresight devices, +available under ($SYSFS)/bus/coresight/devices/: + + root:~# ls /sys/bus/coresight/devices/ + etm0 etm1 etm2 etm3 etm4 etm5 funnel0 + funnel1 funnel2 replicator0 stm0 tmc_etf0 tmc_etr0 tpiu0 - root@linaro-nano:~# ls /sys/bus/coresight/devices/ - 20010000.etf 20040000.funnel 20100000.stm 22040000.etm - 22140000.etm 230c0000.funnel 23240000.etm 20030000.tpiu - 20070000.etr 20120000.replicator 220c0000.funnel - 23040000.etm 23140000.etm 23340000.etm + root@linaro-nano:~# perf record -e cs_etm/@tmc_etr0/u --per-thread program - root@linaro-nano:~# perf record -e cs_etm/@20070000.etr/u --per-thread program +As mentioned above in section "Device Naming scheme", the names of the devices could +look different from what is used in the example above. One must use the device names +as they appear under sysFS. The syntax within the forward slashes '/' is important. The '@' character tells the parser that a sink is about to be specified and that this is the sink @@ -352,7 +404,7 @@ perf can be used to record and analyze trace of programs. Execution can be recorded using 'perf record' with the cs_etm event, specifying the name of the sink to record to, e.g: - perf record -e cs_etm/@20070000.etr/u --per-thread + perf record -e cs_etm/@tmc_etr0/u --per-thread The 'perf report' and 'perf script' commands can be used to analyze execution, synthesizing instruction and branch events from the instruction trace.
@@ -381,7 +433,7 @@ sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tuto Bubble sorting array of 30000 elements 5910 ms - $ perf record -e cs_etm/@20070000.etr/u --per-thread taskset -c 2 ./sort + $ perf record -e cs_etm/@tmc_etr0/u --per-thread taskset -c 2 ./sort Bubble sorting array of 30000 elements 12543 ms [ perf record: Woken up 35 times to write data ] @@ -405,7 +457,7 @@ than the program flow through the code. As with any other CoreSight component, specifics about the STM tracer can be found in sysfs with more information on each entry being found in [1]: -root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm +root@genericarmv8:~# ls /sys/bus/coresight/devices/stm0 enable_source hwevent_select port_enable subsystem uevent hwevent_enable mgmt port_select traceid root@genericarmv8:~# @@ -413,14 +465,14 @@ root@genericarmv8:~# Like any other source a sink needs to be identified and the STM enabled before being used: -root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink -root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source +root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/tmc_etf0/enable_sink +root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/stm0/enable_source From there user space applications can request and use channels using the devfs interface provided for that purpose by the generic STM API: -root@genericarmv8:~# ls -l /dev/20100000.stm -crw------- 1 root root 10, 61 Jan 3 18:11 /dev/20100000.stm +root@genericarmv8:~# ls -l /dev/stm0 +crw------- 1 root root 10, 61 Jan 3 18:11 /dev/stm0 root@genericarmv8:~# Details on how to use the generic STM API can be found here [2]. 
diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index fb621a1c2638..8408670d0328 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -1010,7 +1010,7 @@ Extended error information For example, suppose we wanted to take a look at the relative weights in terms of skb length for each callpath that leads to a - netif_receieve_skb event when downloading a decent-sized file using + netif_receive_skb event when downloading a decent-sized file using wget. First we set up an initially paused stacktrace trigger on the @@ -1843,7 +1843,7 @@ practice, not every handler.action combination is currently supported; if a given handler.action combination isn't supported, the hist trigger will fail with -EINVAL; -The default 'handler.action' if none is explicity specified is as it +The default 'handler.action' if none is explicitly specified is as it always has been, to simply update the set of values associated with an entry. Some applications, however, may want to perform additional actions at that point, such as generate another event, or compare and @@ -2088,7 +2088,7 @@ The following commonly-used handler.action pairs are available: and the saved values corresponding to the max are displayed following the rest of the fields. - If a snaphot was taken, there is also a message indicating that, + If a snapshot was taken, there is also a message indicating that, along with the value and event that triggered the global maximum: # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist @@ -2176,7 +2176,7 @@ The following commonly-used handler.action pairs are available: hist trigger entry. Note that in this case the changed value is a global variable - associated withe current trace instance. The key of the specific + associated with the current trace instance.
The key of the specific trace event that caused the value to change and the global value itself are displayed, along with a message stating that a snapshot has been taken and where to find it. The user can use the key @@ -2203,7 +2203,7 @@ The following commonly-used handler.action pairs are available: and the saved values corresponding to that value are displayed following the rest of the fields. - If a snaphot was taken, there is also a message indicating that, + If a snapshot was taken, there is also a message indicating that, along with the value and event that triggered the snapshot:: # cat /sys/kernel/debug/tracing/events/tcp/tcp_probe/hist diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 235ce2ab131a..7d2b0178d3f3 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -189,6 +189,13 @@ events, you need to enable it. echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable +Use the following command to start tracing in an interval. +:: + + # echo 1 > tracing_on + Open something... + # echo 0 > tracing_on + And you can see the traced information via /sys/kernel/debug/tracing/trace. :: diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst index 4346e23e3ae7..0b21305fabdc 100644 --- a/Documentation/trace/uprobetracer.rst +++ b/Documentation/trace/uprobetracer.rst @@ -152,10 +152,15 @@ events, you need to enable it by:: # echo 1 > events/uprobes/enable -Lets disable the event after sleeping for some time. +Let's start tracing, sleep for some time and stop tracing. :: + # echo 1 > tracing_on # sleep 20 + # echo 0 > tracing_on + +Also, you can disable the event by:: + # echo 0 > events/uprobes/enable And you can see the traced information via /sys/kernel/debug/tracing/trace.
diff --git a/Documentation/translations/it_IT/admin-guide/kernel-parameters.rst b/Documentation/translations/it_IT/admin-guide/kernel-parameters.rst new file mode 100644 index 000000000000..0e36d82a92be --- /dev/null +++ b/Documentation/translations/it_IT/admin-guide/kernel-parameters.rst @@ -0,0 +1,12 @@ +.. include:: ../disclaimer-ita.rst + +:Original: :ref:`Documentation/admin-guide/kernel-parameters.rst <kernelparameters>` + +.. _it_kernelparameters: + +I parametri da linea di comando del kernel +========================================== + +.. warning:: + + TODO ancora da tradurre diff --git a/Documentation/translations/it_IT/doc-guide/sphinx.rst b/Documentation/translations/it_IT/doc-guide/sphinx.rst index 793b5cc33403..1739cba8863e 100644 --- a/Documentation/translations/it_IT/doc-guide/sphinx.rst +++ b/Documentation/translations/it_IT/doc-guide/sphinx.rst @@ -35,8 +35,7 @@ Installazione Sphinx ==================== I marcatori ReST utilizzati nei file in Documentation/ sono pensati per essere -processati da ``Sphinx`` nella versione 1.3 o superiore. Se desiderate produrre -un documento PDF è raccomandato l'utilizzo di una versione superiore alle 1.4.6. +processati da ``Sphinx`` nella versione 1.3 o superiore. Esiste uno script che verifica i requisiti Sphinx. Per ulteriori dettagli consultate :ref:`it_sphinx-pre-install`. @@ -68,13 +67,13 @@ pacchettizzato dalla vostra distribuzione. utilizzando LaTeX. Per una corretta interpretazione, è necessario aver installato texlive con i pacchetti amdfonts e amsmath. -Riassumendo, se volete installare la versione 1.4.9 di Sphinx dovete eseguire:: +Riassumendo, se volete installare la versione 1.7.9 di Sphinx dovete eseguire:: - $ virtualenv sphinx_1.4 - $ . sphinx_1.4/bin/activate - (sphinx_1.4) $ pip install -r Documentation/sphinx/requirements.txt + $ virtualenv sphinx_1.7.9 + $ . sphinx_1.7.9/bin/activate + (sphinx_1.7.9) $ pip install -r Documentation/sphinx/requirements.txt -Dopo aver eseguito ``. 
sphinx_1.4/bin/activate``, il prompt cambierà per +Dopo aver eseguito ``. sphinx_1.7.9/bin/activate``, il prompt cambierà per indicare che state usando il nuovo ambiente. Se aprite un nuova sessione, prima di generare la documentazione, dovrete rieseguire questo comando per rientrare nell'ambiente virtuale. @@ -120,8 +119,8 @@ l'installazione:: You should run: sudo dnf install -y texlive-luatex85 - /usr/bin/virtualenv sphinx_1.4 - . sphinx_1.4/bin/activate + /usr/bin/virtualenv sphinx_1.7.9 + . sphinx_1.7.9/bin/activate pip install -r Documentation/sphinx/requirements.txt Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468. diff --git a/Documentation/translations/it_IT/kernel-hacking/hacking.rst b/Documentation/translations/it_IT/kernel-hacking/hacking.rst index 7178e517af0a..24c592852bf1 100644 --- a/Documentation/translations/it_IT/kernel-hacking/hacking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/hacking.rst @@ -755,7 +755,7 @@ anche per avere patch pulite, c'è del lavoro amministrativo da fare: - Solitamente vorrete un'opzione di configurazione per la vostra modifica al kernel. Modificate ``Kconfig`` nella cartella giusta. Il linguaggio Config è facile con copia ed incolla, e c'è una completa documentazione - nel file ``Documentation/kbuild/kconfig-language.txt``. + nel file ``Documentation/kbuild/kconfig-language.rst``. Nella descrizione della vostra opzione, assicuratevi di parlare sia agli utenti esperti sia agli utente che non sanno nulla del vostro lavoro. @@ -767,7 +767,7 @@ anche per avere patch pulite, c'è del lavoro amministrativo da fare: - Modificate il file ``Makefile``: le variabili CONFIG sono esportate qui, quindi potete solitamente aggiungere una riga come la seguete "obj-$(CONFIG_xxx) += xxx.o". La sintassi è documentata nel file - ``Documentation/kbuild/makefiles.txt``. + ``Documentation/kbuild/makefiles.rst``. 
- Aggiungete voi stessi in ``CREDITS`` se avete fatto qualcosa di notevole, solitamente qualcosa che supera il singolo file (comunque il vostro nome diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst index 0ef31666663b..5fd8a1abd2be 100644 --- a/Documentation/translations/it_IT/kernel-hacking/locking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst @@ -468,7 +468,7 @@ e tutti gli oggetti che contiene. Ecco il codice:: if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL) return -ENOMEM; - strlcpy(obj->name, name, sizeof(obj->name)); + strscpy(obj->name, name, sizeof(obj->name)); obj->id = id; obj->popularity = 0; @@ -678,7 +678,7 @@ Ecco il codice:: } @@ -63,6 +94,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); + strscpy(obj->name, name, sizeof(obj->name)); obj->id = id; obj->popularity = 0; + obj->refcnt = 1; /* The cache holds a reference */ @@ -792,7 +792,7 @@ contatore stesso. } @@ -94,7 +76,7 @@ - strlcpy(obj->name, name, sizeof(obj->name)); + strscpy(obj->name, name, sizeof(obj->name)); obj->id = id; obj->popularity = 0; - obj->refcnt = 1; /* The cache holds a reference */ diff --git a/Documentation/translations/it_IT/process/4.Coding.rst b/Documentation/translations/it_IT/process/4.Coding.rst index c05b89e616dd..a5e36aa60448 100644 --- a/Documentation/translations/it_IT/process/4.Coding.rst +++ b/Documentation/translations/it_IT/process/4.Coding.rst @@ -314,7 +314,7 @@ di allocazione di memoria sarà destinata al fallimento; questi fallimenti possono essere ridotti ad uno specifico pezzo di codice. Procedere con l'inserimento dei fallimenti attivo permette al programmatore di verificare come il codice risponde quando le cose vanno male. Consultate: -Documentation/fault-injection/fault-injection.txt per avere maggiori +Documentation/fault-injection/fault-injection.rst per avere maggiori informazioni su come utilizzare questo strumento. 
Altre tipologie di errori possono essere riscontrati con lo strumento di diff --git a/Documentation/translations/it_IT/process/adding-syscalls.rst b/Documentation/translations/it_IT/process/adding-syscalls.rst index e0a64b0688a7..c3a3439595a6 100644 --- a/Documentation/translations/it_IT/process/adding-syscalls.rst +++ b/Documentation/translations/it_IT/process/adding-syscalls.rst @@ -39,7 +39,7 @@ vostra interfaccia. un qualche modo opaca. - Se dovete esporre solo delle informazioni sul sistema, un nuovo nodo in - sysfs (vedere ``Documentation/translations/it_IT/filesystems/sysfs.txt``) o + sysfs (vedere ``Documentation/filesystems/sysfs.txt``) o in procfs potrebbe essere sufficiente. Tuttavia, l'accesso a questi meccanismi richiede che il filesystem sia montato, il che potrebbe non essere sempre vero (per esempio, in ambienti come namespace/sandbox/chroot). diff --git a/Documentation/translations/it_IT/process/coding-style.rst b/Documentation/translations/it_IT/process/coding-style.rst index 5ef534c95e69..8995d2d19f20 100644 --- a/Documentation/translations/it_IT/process/coding-style.rst +++ b/Documentation/translations/it_IT/process/coding-style.rst @@ -696,7 +696,7 @@ nella stringa di titolo:: ... 
Per la documentazione completa sui file di configurazione, consultate -il documento Documentation/translations/it_IT/kbuild/kconfig-language.txt +il documento Documentation/kbuild/kconfig-language.rst 11) Strutture dati diff --git a/Documentation/translations/it_IT/process/howto.rst b/Documentation/translations/it_IT/process/howto.rst index 9903ac7c566b..44e6077730e8 100644 --- a/Documentation/translations/it_IT/process/howto.rst +++ b/Documentation/translations/it_IT/process/howto.rst @@ -131,7 +131,7 @@ Di seguito una lista di file che sono presenti nei sorgente del kernel e che "Linux kernel patch submission format" http://linux.yyz.us/patch-format.html - :ref:`Documentation/process/translations/it_IT/stable-api-nonsense.rst <it_stable_api_nonsense>` + :ref:`Documentation/translations/it_IT/process/stable-api-nonsense.rst <it_stable_api_nonsense>` Questo file descrive la motivazioni sottostanti la conscia decisione di non avere un API stabile all'interno del kernel, incluso cose come: diff --git a/Documentation/translations/it_IT/process/license-rules.rst b/Documentation/translations/it_IT/process/license-rules.rst index f058e06996dc..4cd87a3a7bf9 100644 --- a/Documentation/translations/it_IT/process/license-rules.rst +++ b/Documentation/translations/it_IT/process/license-rules.rst @@ -303,7 +303,7 @@ essere categorizzate in: LICENSES/dual I file in questa cartella contengono il testo completo della rispettiva - licenza e i suoi `Metatags`_. I nomi dei file sono identici agli + licenza e i suoi `Metatag`_. I nomi dei file sono identici agli identificatori di licenza SPDX che dovrebbero essere usati nei file sorgenti. @@ -326,19 +326,19 @@ essere categorizzate in: Esempio del formato del file:: - Valid-License-Identifier: MPL-1.1 - SPDX-URL: https://spdx.org/licenses/MPL-1.1.html - Usage-Guide: - Do NOT use. The MPL-1.1 is not GPL2 compatible. It may only be used for - dual-licensed files where the other license is GPL2 compatible. 
- If you end up using this it MUST be used together with a GPL2 compatible - license using "OR". - To use the Mozilla Public License version 1.1 put the following SPDX - tag/value pair into a comment according to the placement guidelines in - the licensing rules documentation: - SPDX-License-Identifier: MPL-1.1 - License-Text: - Full license text + Valid-License-Identifier: MPL-1.1 + SPDX-URL: https://spdx.org/licenses/MPL-1.1.html + Usage-Guide: + Do NOT use. The MPL-1.1 is not GPL2 compatible. It may only be used for + dual-licensed files where the other license is GPL2 compatible. + If you end up using this it MUST be used together with a GPL2 compatible + license using "OR". + To use the Mozilla Public License version 1.1 put the following SPDX + tag/value pair into a comment according to the placement guidelines in + the licensing rules documentation: + SPDX-License-Identifier: MPL-1.1 + License-Text: + Full license text | diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst index 5281d53e57ee..ed1121d0ba84 100644 --- a/Documentation/translations/it_IT/process/magic-number.rst +++ b/Documentation/translations/it_IT/process/magic-number.rst @@ -1,6 +1,6 @@ .. include:: ../disclaimer-ita.rst -:Original: :ref:`Documentation/process/magic-numbers.rst <magicnumbers>` +:Original: :ref:`Documentation/process/magic-number.rst <magicnumbers>` :Translator: Federico Vaga <federico.vaga@vaga.pv.it> .. 
_it_magicnumbers: diff --git a/Documentation/translations/it_IT/process/stable-kernel-rules.rst b/Documentation/translations/it_IT/process/stable-kernel-rules.rst index 48e88e5ad2c5..4f206cee31a7 100644 --- a/Documentation/translations/it_IT/process/stable-kernel-rules.rst +++ b/Documentation/translations/it_IT/process/stable-kernel-rules.rst @@ -33,7 +33,7 @@ Regole sul tipo di patch che vengono o non vengono accettate nei sorgenti - Non deve includere alcuna correzione "banale" (correzioni grammaticali, pulizia dagli spazi bianchi, eccetera). - Deve rispettare le regole scritte in - :ref:`Documentation/translation/it_IT/process/submitting-patches.rst <it_submittingpatches>` + :ref:`Documentation/translations/it_IT/process/submitting-patches.rst <it_submittingpatches>` - Questa patch o una equivalente deve esistere già nei sorgenti principali di Linux @@ -43,7 +43,7 @@ Procedura per sottomettere patch per i sorgenti -stable - Se la patch contiene modifiche a dei file nelle cartelle net/ o drivers/net, allora seguite le linee guida descritte in - :ref:`Documentation/translation/it_IT/networking/netdev-FAQ.rst <it_netdev-FAQ>`; + :ref:`Documentation/translations/it_IT/networking/netdev-FAQ.rst <it_netdev-FAQ>`; ma solo dopo aver verificato al seguente indirizzo che la patch non sia già in coda: https://patchwork.ozlabs.org/bundle/davem/stable/?series=&submitter=&state=*&q=&archive= diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst index 70e65a7b3620..ea74cae958d7 100644 --- a/Documentation/translations/it_IT/process/submit-checklist.rst +++ b/Documentation/translations/it_IT/process/submit-checklist.rst @@ -43,7 +43,7 @@ sottomissione delle patch, in particolare 6) Le opzioni ``CONFIG``, nuove o modificate, non scombussolano il menu di configurazione e sono preimpostate come disabilitate a meno che non - soddisfino i criteri descritti in 
``Documentation/kbuild/kconfig-language.txt`` + soddisfino i criteri descritti in ``Documentation/kbuild/kconfig-language.rst`` alla punto "Voci di menu: valori predefiniti". 7) Tutte le nuove opzioni ``Kconfig`` hanno un messaggio di aiuto. diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 5f3c74dcad43..a33c2a536542 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -569,7 +569,7 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE [*] 버스 마스터링 DMA 와 일관성에 대해서는 다음을 참고하시기 바랍니다: - Documentation/PCI/pci.txt + Documentation/PCI/pci.rst Documentation/DMA-API-HOWTO.txt Documentation/DMA-API.txt diff --git a/Documentation/translations/zh_CN/arm64/booting.txt b/Documentation/translations/zh_CN/arm64/booting.txt index c1dd968c5ee9..3bfbf66e5a5e 100644 --- a/Documentation/translations/zh_CN/arm64/booting.txt +++ b/Documentation/translations/zh_CN/arm64/booting.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/booting.txt +Chinese translated version of Documentation/arm64/booting.rst If you have any comment or update to the content, please contact the original document maintainer directly. 
However, if you have a problem @@ -10,7 +10,7 @@ M: Will Deacon <will.deacon@arm.com> zh_CN: Fu Wei <wefu@redhat.com> C: 55f058e7574c3615dea4615573a19bdb258696c6 --------------------------------------------------------------------- -Documentation/arm64/booting.txt 的中文翻译 +Documentation/arm64/booting.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/arm64/legacy_instructions.txt b/Documentation/translations/zh_CN/arm64/legacy_instructions.txt index 68362a1ab717..e295cf75f606 100644 --- a/Documentation/translations/zh_CN/arm64/legacy_instructions.txt +++ b/Documentation/translations/zh_CN/arm64/legacy_instructions.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/legacy_instructions.txt +Chinese translated version of Documentation/arm64/legacy_instructions.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -10,7 +10,7 @@ Maintainer: Punit Agrawal <punit.agrawal@arm.com> Suzuki K. Poulose <suzuki.poulose@arm.com> Chinese maintainer: Fu Wei <wefu@redhat.com> --------------------------------------------------------------------- -Documentation/arm64/legacy_instructions.txt 的中文翻译 +Documentation/arm64/legacy_instructions.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/arm64/memory.txt b/Documentation/translations/zh_CN/arm64/memory.txt index 19b3a52d5d94..be20f8228b91 100644 --- a/Documentation/translations/zh_CN/arm64/memory.txt +++ b/Documentation/translations/zh_CN/arm64/memory.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/memory.txt +Chinese translated version of Documentation/arm64/memory.rst If you have any comment or update to the content, please contact the original document maintainer directly. 
However, if you have a problem @@ -9,7 +9,7 @@ or if there is a problem with the translation. Maintainer: Catalin Marinas <catalin.marinas@arm.com> Chinese maintainer: Fu Wei <wefu@redhat.com> --------------------------------------------------------------------- -Documentation/arm64/memory.txt 的中文翻译 +Documentation/arm64/memory.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/arm64/silicon-errata.txt b/Documentation/translations/zh_CN/arm64/silicon-errata.txt index 39477c75c4a4..440c59ac7dce 100644 --- a/Documentation/translations/zh_CN/arm64/silicon-errata.txt +++ b/Documentation/translations/zh_CN/arm64/silicon-errata.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/silicon-errata.txt +Chinese translated version of Documentation/arm64/silicon-errata.rst If you have any comment or update to the content, please contact the original document maintainer directly. However, if you have a problem @@ -10,7 +10,7 @@ M: Will Deacon <will.deacon@arm.com> zh_CN: Fu Wei <wefu@redhat.com> C: 1926e54f115725a9248d0c4c65c22acaf94de4c4 --------------------------------------------------------------------- -Documentation/arm64/silicon-errata.txt 的中文翻译 +Documentation/arm64/silicon-errata.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/arm64/tagged-pointers.txt b/Documentation/translations/zh_CN/arm64/tagged-pointers.txt index 2664d1bd5a1c..77ac3548a16d 100644 --- a/Documentation/translations/zh_CN/arm64/tagged-pointers.txt +++ b/Documentation/translations/zh_CN/arm64/tagged-pointers.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/arm64/tagged-pointers.txt +Chinese translated version of Documentation/arm64/tagged-pointers.rst If you have any comment or update to the content, please contact the original document maintainer directly. 
However, if you have a problem @@ -9,7 +9,7 @@ or if there is a problem with the translation. Maintainer: Will Deacon <will.deacon@arm.com> Chinese maintainer: Fu Wei <wefu@redhat.com> --------------------------------------------------------------------- -Documentation/arm64/tagged-pointers.txt 的中文翻译 +Documentation/arm64/tagged-pointers.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 diff --git a/Documentation/translations/zh_CN/basic_profiling.txt b/Documentation/translations/zh_CN/basic_profiling.txt deleted file mode 100644 index 1e6bf0bdf8f5..000000000000 --- a/Documentation/translations/zh_CN/basic_profiling.txt +++ /dev/null @@ -1,71 +0,0 @@ -Chinese translated version of Documentation/basic_profiling - -If you have any comment or update to the content, please post to LKML directly. -However, if you have problem communicating in English you can also ask the -Chinese maintainer for help. Contact the Chinese maintainer, if this -translation is outdated or there is problem with translation. - -Chinese maintainer: Liang Xie <xieliang@xiaomi.com> ---------------------------------------------------------------------- -Documentation/basic_profiling的中文翻译 - -如果想评论或更新本文的内容,请直接发信到LKML。如果你使用英文交流有困难的话,也可 -以向中文版维护者求助。如果本翻译更新不及时或者翻译存在问题,请联系中文版维护者。 - -中文版维护者: 谢良 Liang Xie <xieliang007@gmail.com> -中文版翻译者: 谢良 Liang Xie <xieliang007@gmail.com> -中文版校译者: -以下为正文 ---------------------------------------------------------------------- - -下面这些说明指令都是非常基础的,如果你想进一步了解请阅读相关专业文档:) -请不要再在本文档增加新的内容,但可以修复文档中的错误:)(mbligh@aracnet.com) -感谢John Levon,Dave Hansen等在撰写时的帮助 - -<test> 用于表示要测量的目标 -请先确保您已经有正确的System.map / vmlinux配置! - -对于linux系统来说,配置vmlinuz最容易的方法可能就是使用“make install”,然后修改 -/sbin/installkernel将vmlinux拷贝到/boot目录,而System.map通常是默认安装好的 - -Readprofile ------------ -2.6系列内核需要版本相对较新的readprofile,比如util-linux 2.12a中包含的,可以从: - -http://www.kernel.org/pub/linux/utils/util-linux/ 下载 - -大部分linux发行版已经包含了. 
- -启用readprofile需要在kernel启动命令行增加”profile=2“ - -clear readprofile -r - <test> -dump output readprofile -m /boot/System.map > captured_profile - -Oprofile --------- - -从http://oprofile.sourceforge.net/获取源代码(请参考Changes以获取匹配的版本) -在kernel启动命令行增加“idle=poll” - -配置CONFIG_PROFILING=y和CONFIG_OPROFILE=y然后重启进入新kernel - -./configure --with-kernel-support -make install - -想得到好的测量结果,请确保启用了本地APIC特性。如果opreport显示有0Hz CPU, -说明APIC特性没有开启。另外注意idle=poll选项可能有损性能。 - -One time setup: - opcontrol --setup --vmlinux=/boot/vmlinux - -clear opcontrol --reset -start opcontrol --start - <test> -stop opcontrol --stop -dump output opreport > output_file - -如果只看kernel相关的报告结果,请运行命令 opreport -l /boot/vmlinux > output_file - -通过reset选项可以清理过期统计数据,相当于重启的效果。 - diff --git a/Documentation/translations/zh_CN/oops-tracing.txt b/Documentation/translations/zh_CN/oops-tracing.txt index 93fa061cf9e4..368ddd05b304 100644 --- a/Documentation/translations/zh_CN/oops-tracing.txt +++ b/Documentation/translations/zh_CN/oops-tracing.txt @@ -53,7 +53,7 @@ cat /proc/kmsg > file, 然而你必须介入中止传输, kmsg是一个“ (2)用串口终端启动(请参看Documentation/admin-guide/serial-console.rst),运行一个null modem到另一台机器并用你喜欢的通讯工具获取输出。Minicom工作地很好。 -(3)使用Kdump(请参看Documentation/kdump/kdump.txt), +(3)使用Kdump(请参看Documentation/kdump/kdump.rst), 使用在Documentation/kdump/gdbmacros.txt中定义的dmesg gdb宏,从旧的内存中提取内核 环形缓冲区。 diff --git a/Documentation/translations/zh_CN/process/4.Coding.rst b/Documentation/translations/zh_CN/process/4.Coding.rst index 5301e9d55255..b82b1dde3122 100644 --- a/Documentation/translations/zh_CN/process/4.Coding.rst +++ b/Documentation/translations/zh_CN/process/4.Coding.rst @@ -205,7 +205,7 @@ Linus对这个问题给出了最佳答案: 启用故障注入后,内存分配的可配置百分比将失败;这些失败可以限制在特定的代码 范围内。在启用了故障注入的情况下运行,程序员可以看到当情况恶化时代码如何响 应。有关如何使用此工具的详细信息,请参阅 -Documentation/fault-injection/fault-injection.txt。 +Documentation/fault-injection/fault-injection.rst。 使用“sparse”静态分析工具可以发现其他类型的错误。对于sparse,可以警告程序员 用户空间和内核空间地址之间的混淆、big endian和small endian数量的混合、在需 @@ -241,7 +241,7 @@ 
scripts/coccinelle目录下已经打包了相当多的内核“语义补丁” 任何添加新用户空间界面的代码(包括新的sysfs或/proc文件)都应该包含该界面的 文档,该文档使用户空间开发人员能够知道他们在使用什么。请参阅 -Documentation/abi/readme,了解如何格式化此文档以及需要提供哪些信息。 +Documentation/ABI/README,了解如何格式化此文档以及需要提供哪些信息。 文件 :ref:`Documentation/admin-guide/kernel-parameters.rst <kernelparameters>` 描述了内核的所有引导时间参数。任何添加新参数的补丁都应该向该文件添加适当的 diff --git a/Documentation/translations/zh_CN/process/coding-style.rst b/Documentation/translations/zh_CN/process/coding-style.rst index 5479c591c2f7..4f6237392e65 100644 --- a/Documentation/translations/zh_CN/process/coding-style.rst +++ b/Documentation/translations/zh_CN/process/coding-style.rst @@ -599,7 +599,7 @@ Documentation/doc-guide/ 和 scripts/kernel-doc 以获得详细信息。 depends on ADFS_FS ... -要查看配置文件的完整文档,请看 Documentation/kbuild/kconfig-language.txt。 +要查看配置文件的完整文档,请看 Documentation/kbuild/kconfig-language.rst。 11) 数据结构 diff --git a/Documentation/translations/zh_CN/process/management-style.rst b/Documentation/translations/zh_CN/process/management-style.rst index a181fa56d19e..c6a5bb285797 100644 --- a/Documentation/translations/zh_CN/process/management-style.rst +++ b/Documentation/translations/zh_CN/process/management-style.rst @@ -28,7 +28,7 @@ Linux内核管理风格 不管怎样,这里是: -.. _decisions: +.. 
_cn_decisions: 1)决策 ------- @@ -108,7 +108,7 @@ Linux内核管理风格 但是,为了做好作为内核管理者的准备,最好记住不要烧掉任何桥梁,不要轰炸任何 无辜的村民,也不要疏远太多的内核开发人员。事实证明,疏远人是相当容易的,而 亲近一个疏远的人是很难的。因此,“疏远”立即属于“不可逆”的范畴,并根据 -:ref:`decisions` 成为绝不可以做的事情。 +:ref:`cn_decisions` 成为绝不可以做的事情。 这里只有几个简单的规则: diff --git a/Documentation/translations/zh_CN/process/programming-language.rst b/Documentation/translations/zh_CN/process/programming-language.rst index 51fd4ef48ea1..2a47a1d2ec20 100644 --- a/Documentation/translations/zh_CN/process/programming-language.rst +++ b/Documentation/translations/zh_CN/process/programming-language.rst @@ -8,21 +8,21 @@ 程序设计语言 ============ -内核是用C语言 [c-language]_ 编写的。更准确地说,内核通常是用 ``gcc`` [gcc]_ -在 ``-std=gnu89`` [gcc-c-dialect-options]_ 下编译的:ISO C90的 GNU 方言( +内核是用C语言 :ref:`c-language <cn_c-language>` 编写的。更准确地说,内核通常是用 :ref:`gcc <cn_gcc>` +在 ``-std=gnu89`` :ref:`gcc-c-dialect-options <cn_gcc-c-dialect-options>` 下编译的:ISO C90的 GNU 方言( 包括一些C99特性) -这种方言包含对语言 [gnu-extensions]_ 的许多扩展,当然,它们许多都在内核中使用。 +这种方言包含对语言 :ref:`gnu-extensions <cn_gnu-extensions>` 的许多扩展,当然,它们许多都在内核中使用。 -对于一些体系结构,有一些使用 ``clang`` [clang]_ 和 ``icc`` [icc]_ 编译内核 +对于一些体系结构,有一些使用 :ref:`clang <cn_clang>` 和 :ref:`icc <cn_icc>` 编译内核 的支持,尽管在编写此文档时还没有完成,仍需要第三方补丁。 属性 ---- -在整个内核中使用的一个常见扩展是属性(attributes) [gcc-attribute-syntax]_ +在整个内核中使用的一个常见扩展是属性(attributes) :ref:`gcc-attribute-syntax <cn_gcc-attribute-syntax>` 属性允许将实现定义的语义引入语言实体(如变量、函数或类型),而无需对语言进行 -重大的语法更改(例如添加新关键字) [n2049]_ +重大的语法更改(例如添加新关键字) :ref:`n2049 <cn_n2049>` 在某些情况下,属性是可选的(即不支持这些属性的编译器仍然应该生成正确的代码, 即使其速度较慢或执行的编译时检查/诊断次数不够) @@ -31,11 +31,42 @@ ``__attribute__((__pure__))`` ),以检测可以使用哪些关键字和/或缩短代码, 具体 请参阅 ``include/linux/compiler_attributes.h`` -.. [c-language] http://www.open-std.org/jtc1/sc22/wg14/www/standards -.. [gcc] https://gcc.gnu.org -.. [clang] https://clang.llvm.org -.. [icc] https://software.intel.com/en-us/c-compilers -.. [gcc-c-dialect-options] https://gcc.gnu.org/onlinedocs/gcc/C-Dialect-Options.html -.. 
[gnu-extensions] https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html -.. [gcc-attribute-syntax] https://gcc.gnu.org/onlinedocs/gcc/Attribute-Syntax.html -.. [n2049] http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2049.pdf +.. _cn_c-language: + +c-language + http://www.open-std.org/jtc1/sc22/wg14/www/standards + +.. _cn_gcc: + +gcc + https://gcc.gnu.org + +.. _cn_clang: + +clang + https://clang.llvm.org + +.. _cn_icc: + +icc + https://software.intel.com/en-us/c-compilers + +.. _cn_gcc-c-dialect-options: + +gcc-c-dialect-options + https://gcc.gnu.org/onlinedocs/gcc/C-Dialect-Options.html + +.. _cn_gnu-extensions: + +gnu-extensions + https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html + +.. _cn_gcc-attribute-syntax: + +gcc-attribute-syntax + https://gcc.gnu.org/onlinedocs/gcc/Attribute-Syntax.html + +.. _cn_n2049: + +n2049 + http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2049.pdf diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst index 89061aa8fdbe..f4785d2b0491 100644 --- a/Documentation/translations/zh_CN/process/submit-checklist.rst +++ b/Documentation/translations/zh_CN/process/submit-checklist.rst @@ -38,7 +38,7 @@ Linux内核补丁提交清单 违规行为。 6) 任何新的或修改过的 ``CONFIG`` 选项都不会弄脏配置菜单,并默认为关闭,除非 - 它们符合 ``Documentation/kbuild/kconfig-language.txt`` 中记录的异常条件, + 它们符合 ``Documentation/kbuild/kconfig-language.rst`` 中记录的异常条件, 菜单属性:默认值.
7) 所有新的 ``kconfig`` 选项都有帮助文本。 diff --git a/Documentation/translations/zh_CN/process/submitting-drivers.rst b/Documentation/translations/zh_CN/process/submitting-drivers.rst index 72c6cd935821..72f4f45c98de 100644 --- a/Documentation/translations/zh_CN/process/submitting-drivers.rst +++ b/Documentation/translations/zh_CN/process/submitting-drivers.rst @@ -22,7 +22,7 @@ 兴趣的是显卡驱动程序,你也许应该访问 XFree86 项目(http://www.xfree86.org/) 和/或 X.org 项目 (http://x.org)。 -另请参阅 Documentation/Documentation/translations/zh_CN/process/submitting-patches.rst 文档。 +另请参阅 Documentation/translations/zh_CN/process/submitting-patches.rst 文档。 分配设备号 diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index 1129c7550a48..7ddd8f667459 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -49,6 +49,8 @@ If PR_SPEC_PRCTL is set, then the per-task control of the mitigation is available. If not set, prctl(PR_SET_SPECULATION_CTRL) for the speculation misfeature will fail. +.. _set_spec_ctrl: + PR_SET_SPECULATION_CTRL ----------------------- diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst index 659bbc093b52..d18c97b4e140 100644 --- a/Documentation/virtual/kvm/amd-memory-encryption.rst +++ b/Documentation/virtual/kvm/amd-memory-encryption.rst @@ -241,6 +241,9 @@ Returns: 0 on success, -negative on error References ========== + +See [white-paper]_, [api-spec]_, [amd-apm]_ and [kvm-forum]_ for more info. + .. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf .. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf .. 
[amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 2a4531bb06bd..383b292966fa 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2205,7 +2205,7 @@ max_vq. This is the maximum vector length available to the guest on this vcpu, and determines which register slices are visible through this ioctl interface. -(See Documentation/arm64/sve.txt for an explanation of the "vq" +(See Documentation/arm64/sve.rst for an explanation of the "vq" nomenclature.) KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt index 4f0c9fc40365..eeaa95b893a8 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic-its.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic-its.txt @@ -103,7 +103,7 @@ Groups: The following ordering must be followed when restoring the GIC and the ITS: a) restore all guest memory and create vcpus b) restore all redistributors -c) provide the its base address +c) provide the ITS base address (KVM_DEV_ARM_VGIC_GRP_ADDR) d) restore the ITS in the following order: 1. Restore GITS_CBASER diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst index 09bd24a92784..a5c884293dac 100644 --- a/Documentation/vm/hwpoison.rst +++ b/Documentation/vm/hwpoison.rst @@ -13,32 +13,32 @@ kill the processes associated with it and avoid using it in the future. This patchkit implements the necessary infrastructure in the VM. -To quote the overview comment: - - * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache - * failure. - * - * This focusses on pages detected as corrupted in the background. 
- * When the current CPU tries to consume corruption the currently - * running process can just be killed directly instead. This implies - * that if the error cannot be handled for some reason it's safe to - * just ignore it because no corruption has been consumed yet. Instead - * when that happens another machine check will happen. - * - * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * Some of the operations here are somewhat inefficient and have non - * linear algorithmic complexity, because the data structures have not - * been optimized for this case. This is in particular the case - * for the mapping from a vma to a process. Since this case is expected - * to be rare we hope we can get away with this. +To quote the overview comment:: + + High level machine check handler. Handles pages reported by the + hardware as being corrupted usually due to a 2bit ECC memory or cache + failure. + + This focusses on pages detected as corrupted in the background. + When the current CPU tries to consume corruption the currently + running process can just be killed directly instead. This implies + that if the error cannot be handled for some reason it's safe to + just ignore it because no corruption has been consumed yet. Instead + when that happens another machine check will happen. + + Handles page cache pages in various states. The tricky part + here is that we can access any page asynchronous to other VM + users, because memory failures could happen anytime and anywhere, + possibly violating some of their assumptions. 
This is why this code + has to be extremely careful. Generally it tries to use normal locking + rules, as in get the standard locks, even if that means the + error handling takes potentially a long time. + + Some of the operations here are somewhat inefficient and have non + linear algorithmic complexity, because the data structures have not + been optimized for this case. This is in particular the case + for the mapping from a vma to a process. Since this case is expected + to be rare we hope we can get away with this. The code consists of the high level handler in mm/memory-failure.c, a new page poison bit and various checks in the VM to handle poisoned diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst index 0d830edae8fe..130f3cfa1c19 100644 --- a/Documentation/vm/numa.rst +++ b/Documentation/vm/numa.rst @@ -99,7 +99,7 @@ Local allocation will tend to keep subsequent access to the allocated memory as long as the task on whose behalf the kernel allocated some memory does not later migrate away from that memory. The Linux scheduler is aware of the NUMA topology of the platform--embodied in the "scheduling domains" data -structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler +structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler attempts to minimize task migration to distant scheduling domains. However, the scheduler does not take a task's NUMA footprint into account directly.
Thus, under sufficient imbalance, tasks can migrate between nodes, remote diff --git a/Documentation/watchdog/convert_drivers_to_kernel_api.txt b/Documentation/watchdog/convert_drivers_to_kernel_api.rst index 9fffb2958d13..dd934cc08e40 100644 --- a/Documentation/watchdog/convert_drivers_to_kernel_api.txt +++ b/Documentation/watchdog/convert_drivers_to_kernel_api.rst @@ -1,7 +1,9 @@ +========================================================= Converting old watchdog drivers to the watchdog framework -by Wolfram Sang <w.sang@pengutronix.de> ========================================================= +by Wolfram Sang <w.sang@pengutronix.de> + Before the watchdog framework came into the kernel, every driver had to implement the API on its own. Now, as the framework factored out the common components, those drivers can be lightened making it a user of the framework. @@ -69,16 +71,16 @@ Here is a overview of the functions and probably needed actions: -ENOIOCTLCMD, the IOCTLs of the framework will be tried, too. Any other error is directly given to the user. -Example conversion: +Example conversion:: --static const struct file_operations s3c2410wdt_fops = { -- .owner = THIS_MODULE, -- .llseek = no_llseek, -- .write = s3c2410wdt_write, -- .unlocked_ioctl = s3c2410wdt_ioctl, -- .open = s3c2410wdt_open, -- .release = s3c2410wdt_release, --}; + -static const struct file_operations s3c2410wdt_fops = { + - .owner = THIS_MODULE, + - .llseek = no_llseek, + - .write = s3c2410wdt_write, + - .unlocked_ioctl = s3c2410wdt_ioctl, + - .open = s3c2410wdt_open, + - .release = s3c2410wdt_release, + -}; Check the functions for device-specific stuff and keep it for later refactoring. The rest can go. @@ -89,24 +91,24 @@ Remove the miscdevice Since the file_operations are gone now, you can also remove the 'struct miscdevice'. The framework will create it on watchdog_dev_register() called by -watchdog_register_device(). 
+watchdog_register_device():: --static struct miscdevice s3c2410wdt_miscdev = { -- .minor = WATCHDOG_MINOR, -- .name = "watchdog", -- .fops = &s3c2410wdt_fops, --}; + -static struct miscdevice s3c2410wdt_miscdev = { + - .minor = WATCHDOG_MINOR, + - .name = "watchdog", + - .fops = &s3c2410wdt_fops, + -}; Remove obsolete includes and defines ------------------------------------ Because of the simplifications, a few defines are probably unused now. Remove -them. Includes can be removed, too. For example: +them. Includes can be removed, too. For example:: -- #include <linux/fs.h> -- #include <linux/miscdevice.h> (if MODULE_ALIAS_MISCDEV is not used) -- #include <linux/uaccess.h> (if no custom IOCTLs are used) + - #include <linux/fs.h> + - #include <linux/miscdevice.h> (if MODULE_ALIAS_MISCDEV is not used) + - #include <linux/uaccess.h> (if no custom IOCTLs are used) Add the watchdog operations @@ -121,30 +123,30 @@ change the function header. Other changes are most likely not needed, because here simply happens the direct hardware access. If you have device-specific code left from the above steps, it should be refactored into these callbacks. -Here is a simple example: +Here is a simple example:: -+static struct watchdog_ops s3c2410wdt_ops = { -+ .owner = THIS_MODULE, -+ .start = s3c2410wdt_start, -+ .stop = s3c2410wdt_stop, -+ .ping = s3c2410wdt_keepalive, -+ .set_timeout = s3c2410wdt_set_heartbeat, -+}; + +static struct watchdog_ops s3c2410wdt_ops = { + + .owner = THIS_MODULE, + + .start = s3c2410wdt_start, + + .stop = s3c2410wdt_stop, + + .ping = s3c2410wdt_keepalive, + + .set_timeout = s3c2410wdt_set_heartbeat, + +}; -A typical function-header change looks like: +A typical function-header change looks like:: --static void s3c2410wdt_keepalive(void) -+static int s3c2410wdt_keepalive(struct watchdog_device *wdd) - { -... -+ -+ return 0; - } + -static void s3c2410wdt_keepalive(void) + +static int s3c2410wdt_keepalive(struct watchdog_device *wdd) + { + ... 
+ + + + return 0; + } -... + ... -- s3c2410wdt_keepalive(); -+ s3c2410wdt_keepalive(&s3c2410_wdd); + - s3c2410wdt_keepalive(); + + s3c2410wdt_keepalive(&s3c2410_wdd); Add the watchdog device @@ -159,12 +161,12 @@ static variables. Those have to be converted to use the members in watchdog_device. Note that the timeout values are unsigned int. Some drivers use signed int, so this has to be converted, too. -Here is a simple example for a watchdog device: +Here is a simple example for a watchdog device:: -+static struct watchdog_device s3c2410_wdd = { -+ .info = &s3c2410_wdt_ident, -+ .ops = &s3c2410wdt_ops, -+}; + +static struct watchdog_device s3c2410_wdd = { + + .info = &s3c2410_wdt_ident, + + .ops = &s3c2410wdt_ops, + +}; Handle the 'nowayout' feature @@ -173,12 +175,12 @@ Handle the 'nowayout' feature A few drivers use nowayout statically, i.e. there is no module parameter for it and only CONFIG_WATCHDOG_NOWAYOUT determines if the feature is going to be used. This needs to be converted by initializing the status variable of the -watchdog_device like this: +watchdog_device like this:: .status = WATCHDOG_NOWAYOUT_INIT_STATUS, Most drivers, however, also allow runtime configuration of nowayout, usually -by adding a module parameter. The conversion for this would be something like: +by adding a module parameter. The conversion for this would be something like:: watchdog_set_nowayout(&s3c2410_wdd, nowayout); @@ -191,15 +193,15 @@ Register the watchdog device Replace misc_register(&miscdev) with watchdog_register_device(&watchdog_dev). Make sure the return value gets checked and the error message, if present, -still fits. Also convert the unregister case. +still fits. Also convert the unregister case:: -- ret = misc_register(&s3c2410wdt_miscdev); -+ ret = watchdog_register_device(&s3c2410_wdd); + - ret = misc_register(&s3c2410wdt_miscdev); + + ret = watchdog_register_device(&s3c2410_wdd); -... + ... 
-- misc_deregister(&s3c2410wdt_miscdev); -+ watchdog_unregister_device(&s3c2410_wdd); + - misc_deregister(&s3c2410wdt_miscdev); + + watchdog_unregister_device(&s3c2410_wdd); Update the Kconfig-entry @@ -207,7 +209,7 @@ Update the Kconfig-entry The entry for the driver now needs to select WATCHDOG_CORE: -+ select WATCHDOG_CORE + + select WATCHDOG_CORE Create a patch and send it to upstream @@ -215,4 +217,3 @@ Create a patch and send it to upstream Make sure you understood Documentation/process/submitting-patches.rst and send your patch to linux-watchdog@vger.kernel.org. We are looking forward to it :) - diff --git a/Documentation/watchdog/hpwdt.txt b/Documentation/watchdog/hpwdt.rst index 55df692c5595..94a96371113e 100644 --- a/Documentation/watchdog/hpwdt.txt +++ b/Documentation/watchdog/hpwdt.rst @@ -1,7 +1,12 @@ +=========================== +HPE iLO NMI Watchdog Driver +=========================== + +for iLO based ProLiant Servers +============================== + Last reviewed: 08/20/2018 - HPE iLO NMI Watchdog Driver - for iLO based ProLiant Servers The HPE iLO NMI Watchdog driver is a kernel module that provides basic watchdog functionality and handler for the iLO "Generate NMI to System" @@ -20,23 +25,26 @@ Last reviewed: 08/20/2018 The hpwdt driver also has the following module parameters: - soft_margin - allows the user to set the watchdog timer value. + ============ ================================================================ + soft_margin allows the user to set the watchdog timer value. Default value is 30 seconds. - timeout - an alias of soft_margin. - pretimeout - allows the user to set the watchdog pretimeout value. + timeout an alias of soft_margin. + pretimeout allows the user to set the watchdog pretimeout value. This is the number of seconds before timeout when an NMI is delivered to the system. Setting the value to zero disables the pretimeout NMI. Default value is 9 seconds. 
- nowayout - basic watchdog parameter that does not allow the timer to + nowayout basic watchdog parameter that does not allow the timer to be restarted or an impending ASR to be escaped. Default value is set when compiling the kernel. If it is set to "Y", then there is no way of disabling the watchdog once it has been started. + ============ ================================================================ - NOTE: More information about watchdog drivers in general, including the ioctl + NOTE: + More information about watchdog drivers in general, including the ioctl interface to /dev/watchdog can be found in - Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt. + Documentation/watchdog/watchdog-api.rst and Documentation/IPMI.txt. Due to limitations in the iLO hardware, the NMI pretimeout if enabled, can only be set to 9 seconds. Attempts to set pretimeout to other @@ -51,7 +59,7 @@ Last reviewed: 08/20/2018 and loop forever. This is generally not what a watchdog user wants. For those wishing to learn more please see: - Documentation/kdump/kdump.txt + Documentation/kdump/kdump.rst Documentation/admin-guide/kernel-parameters.txt (panic=) Your Linux Distribution specific documentation. @@ -63,4 +71,3 @@ Last reviewed: 08/20/2018 The HPE iLO NMI Watchdog Driver and documentation were originally developed by Tom Mingarelli. - diff --git a/Documentation/watchdog/index.rst b/Documentation/watchdog/index.rst new file mode 100644 index 000000000000..33a0de631e84 --- /dev/null +++ b/Documentation/watchdog/index.rst @@ -0,0 +1,25 @@ +:orphan: + +====================== +Linux Watchdog Support +====================== + +.. toctree:: + :maxdepth: 1 + + hpwdt + mlx-wdt + pcwd-watchdog + watchdog-api + watchdog-kernel-api + watchdog-parameters + watchdog-pm + wdt + convert_drivers_to_kernel_api + +.. 
only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/watchdog/mlx-wdt.txt b/Documentation/watchdog/mlx-wdt.rst index 66eeb78505c3..bf5bafac47f0 100644 --- a/Documentation/watchdog/mlx-wdt.txt +++ b/Documentation/watchdog/mlx-wdt.rst @@ -1,5 +1,9 @@ - Mellanox watchdog drivers - for x86 based system switches +========================= +Mellanox watchdog drivers +========================= + +for x86 based system switches +============================= This driver provides watchdog functionality for various Mellanox Ethernet and Infiniband switch systems. @@ -9,16 +13,16 @@ Mellanox watchdog device is implemented in a programmable logic device. There are 2 types of HW watchdog implementations. Type 1: -Actual HW timeout can be defined as a power of 2 msec. -e.g. timeout 20 sec will be rounded up to 32768 msec. -The maximum timeout period is 32 sec (32768 msec.), -Get time-left isn't supported + Actual HW timeout can be defined as a power of 2 msec. + e.g. timeout 20 sec will be rounded up to 32768 msec. + The maximum timeout period is 32 sec (32768 msec.), + Get time-left isn't supported Type 2: -Actual HW timeout is defined in sec. and it's the same as -a user-defined timeout. -Maximum timeout is 255 sec. -Get time-left is supported. + Actual HW timeout is defined in sec. and it's the same as + a user-defined timeout. + Maximum timeout is 255 sec. + Get time-left is supported. Type 1 HW watchdog implementation exist in old systems and all new systems have type 2 HW watchdog. 
diff --git a/Documentation/watchdog/pcwd-watchdog.txt b/Documentation/watchdog/pcwd-watchdog.rst index b8e60a441a43..405e2a370082 100644 --- a/Documentation/watchdog/pcwd-watchdog.txt +++ b/Documentation/watchdog/pcwd-watchdog.rst @@ -1,8 +1,13 @@ +=================================== +Berkshire Products PC Watchdog Card +=================================== + Last reviewed: 10/05/2007 - Berkshire Products PC Watchdog Card - Support for ISA Cards Revision A and C - Documentation and Driver by Ken Hollis <kenji@bitgate.com> +Support for ISA Cards Revision A and C +======================================= + +Documentation and Driver by Ken Hollis <kenji@bitgate.com> The PC Watchdog is a card that offers the same type of functionality that the WDT card does, only it doesn't require an IRQ to run. Furthermore, @@ -33,6 +38,7 @@ Last reviewed: 10/05/2007 WDIOC_GETSUPPORT This returns the support of the card itself. This returns in structure "PCWDS" which returns: + options = WDIOS_TEMPPANIC (This card supports temperature) firmware_version = xxxx @@ -63,4 +69,3 @@ Last reviewed: 10/05/2007 -- Ken Hollis (kenji@bitgate.com) - diff --git a/Documentation/watchdog/watchdog-api.txt b/Documentation/watchdog/watchdog-api.rst index 0e62ba33b7fb..c6c1e9fa9f73 100644 --- a/Documentation/watchdog/watchdog-api.txt +++ b/Documentation/watchdog/watchdog-api.rst @@ -1,7 +1,10 @@ +============================= +The Linux Watchdog driver API +============================= + Last reviewed: 10/05/2007 -The Linux Watchdog driver API. Copyright 2002 Christer Weingel <wingel@nano-system.com> @@ -10,7 +13,8 @@ driver which is (c) Copyright 2000 Jakob Oestergaard <jakob@ostenfeld.dk> This document describes the state of the Linux 2.4.18 kernel. -Introduction: +Introduction +============ A Watchdog Timer (WDT) is a hardware circuit that can reset the computer system in case of a software fault. 
You probably knew that @@ -30,7 +34,8 @@ drivers implement different, and sometimes incompatible, parts of it. This file is an attempt to document the existing usage and allow future driver writers to use it as a reference. -The simplest API: +The simplest API +================ All drivers support the basic mode of operation, where the watchdog activates as soon as /dev/watchdog is opened and will reboot unless @@ -54,7 +59,8 @@ after the timeout has passed. Watchdog devices also usually support the nowayout module parameter so that this option can be controlled at runtime. -Magic Close feature: +Magic Close feature +=================== If a driver supports "Magic Close", the driver will not disable the watchdog unless a specific magic character 'V' has been sent to @@ -64,7 +70,8 @@ will assume that the daemon (and userspace in general) died, and will stop pinging the watchdog without disabling it first. This will then cause a reboot if the watchdog is not re-opened in sufficient time. -The ioctl API: +The ioctl API +============= All conforming drivers also support an ioctl API. @@ -73,7 +80,7 @@ Pinging the watchdog using an ioctl: All drivers that have an ioctl interface support at least one ioctl, KEEPALIVE. This ioctl does exactly the same thing as a write to the watchdog device, so the main loop in the above program could be -replaced with: +replaced with:: while (1) { ioctl(fd, WDIOC_KEEPALIVE, 0); @@ -82,14 +89,15 @@ replaced with: the argument to the ioctl is ignored. -Setting and getting the timeout: +Setting and getting the timeout +=============================== For some drivers it is possible to modify the watchdog timeout on the fly with the SETTIMEOUT ioctl, those drivers have the WDIOF_SETTIMEOUT flag set in their option field. The argument is an integer representing the timeout in seconds. The driver returns the real timeout used in the same variable, and this timeout might differ from -the requested one due to limitation of the hardware. 
+the requested one due to limitation of the hardware:: int timeout = 45; ioctl(fd, WDIOC_SETTIMEOUT, &timeout); @@ -99,18 +107,19 @@ This example might actually print "The timeout was set to 60 seconds" if the device has a granularity of minutes for its timeout. Starting with the Linux 2.4.18 kernel, it is possible to query the -current timeout using the GETTIMEOUT ioctl. +current timeout using the GETTIMEOUT ioctl:: ioctl(fd, WDIOC_GETTIMEOUT, &timeout); printf("The timeout was is %d seconds\n", timeout); -Pretimeouts: +Pretimeouts +=========== Some watchdog timers can be set to have a trigger go off before the actual time they will reset the system. This can be done with an NMI, interrupt, or other mechanism. This allows Linux to record useful information (like panic information and kernel coredumps) before it -resets. +resets:: pretimeout = 10; ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout); @@ -121,89 +130,113 @@ the pretimeout. So, for instance, if you set the timeout to 60 seconds and the pretimeout to 10 seconds, the pretimeout will go off in 50 seconds. Setting a pretimeout to zero disables it. -There is also a get function for getting the pretimeout: +There is also a get function for getting the pretimeout:: ioctl(fd, WDIOC_GETPRETIMEOUT, &timeout); printf("The pretimeout was is %d seconds\n", timeout); Not all watchdog drivers will support a pretimeout. -Get the number of seconds before reboot: +Get the number of seconds before reboot +======================================= Some watchdog drivers have the ability to report the remaining time before the system will reboot. The WDIOC_GETTIMELEFT is the ioctl -that returns the number of seconds before reboot. 
+that returns the number of seconds before reboot:: ioctl(fd, WDIOC_GETTIMELEFT, &timeleft); printf("The timeout was is %d seconds\n", timeleft); -Environmental monitoring: +Environmental monitoring +======================== All watchdog drivers are required to return more information about the system, some do temperature, fan and power level monitoring, some can tell you the reason for the last reboot of the system. The GETSUPPORT ioctl is -available to ask what the device can do: +available to ask what the device can do:: struct watchdog_info ident; ioctl(fd, WDIOC_GETSUPPORT, &ident); the fields returned in the ident struct are: + ================ ============================================= identity a string identifying the watchdog driver firmware_version the firmware version of the card if available options flags describing what the device supports + ================ ============================================= the options field can have the following bits set, and describes what kind of information that the GET_STATUS and GET_BOOT_STATUS ioctls can return. [FIXME -- Is this correct?] + ================ ========================= WDIOF_OVERHEAT Reset due to CPU overheat + ================ ========================= The machine was last rebooted by the watchdog because the thermal limit was -exceeded +exceeded: + ============== ========== WDIOF_FANFAULT Fan failed + ============== ========== A system fan monitored by the watchdog card has failed + ============= ================ WDIOF_EXTERN1 External relay 1 + ============= ================ External monitoring relay/source 1 was triggered. Controllers intended for real world applications include external monitoring pins that will trigger a reset.
+ ============= ================ WDIOF_EXTERN2 External relay 2 + ============= ================ External monitoring relay/source 2 was triggered + ================ ===================== WDIOF_POWERUNDER Power bad/power fault + ================ ===================== The machine is showing an undervoltage status + =============== ============================= WDIOF_CARDRESET Card previously reset the CPU + =============== ============================= The last reboot was caused by the watchdog card + ================ ===================== WDIOF_POWEROVER Power over voltage + ================ ===================== The machine is showing an overvoltage status. Note that if one level is under and one over both bits will be set - this may seem odd but makes sense. + =================== ===================== WDIOF_KEEPALIVEPING Keep alive ping reply + =================== ===================== The watchdog saw a keepalive ping since it was last queried. + ================ ======================= WDIOF_SETTIMEOUT Can set/get the timeout + ================ ======================= The watchdog can do pretimeouts. + ================ ================================ WDIOF_PRETIMEOUT Pretimeout (in seconds), get/set + ================ ================================ For those drivers that return any bits set in the option field, the GETSTATUS and GETBOOTSTATUS ioctls can be used to ask for the current -status, and the status at the last reboot, respectively. +status, and the status at the last reboot, respectively:: int flags; ioctl(fd, WDIOC_GETSTATUS, &flags); @@ -216,22 +249,23 @@ Note that not all devices support these two calls, and some only support the GETBOOTSTATUS call. Some drivers can measure the temperature using the GETTEMP ioctl. The -returned value is the temperature in degrees fahrenheit. 
+returned value is the temperature in degrees fahrenheit:: int temperature; ioctl(fd, WDIOC_GETTEMP, &temperature); Finally the SETOPTIONS ioctl can be used to control some aspects of -the cards operation. +the cards operation:: int options = 0; ioctl(fd, WDIOC_SETOPTIONS, &options); The following options are available: + ================= ================================ WDIOS_DISABLECARD Turn off the watchdog timer WDIOS_ENABLECARD Turn on the watchdog timer WDIOS_TEMPPANIC Kernel panic on temperature trip + ================= ================================ [FIXME -- better explanations] - diff --git a/Documentation/watchdog/watchdog-kernel-api.txt b/Documentation/watchdog/watchdog-kernel-api.rst index 3a91ef5af044..864edbe932c1 100644 --- a/Documentation/watchdog/watchdog-kernel-api.txt +++ b/Documentation/watchdog/watchdog-kernel-api.rst @@ -1,5 +1,7 @@ -The Linux WatchDog Timer Driver Core kernel API. =============================================== +The Linux WatchDog Timer Driver Core kernel API +=============================================== + Last reviewed: 12-Feb-2013 Wim Van Sebroeck <wim@iguana.be> @@ -9,7 +11,7 @@ Introduction This document does not describe what a WatchDog Timer (WDT) Driver or Device is. It also does not describe the API which can be used by user space to communicate with a WatchDog Timer. If you want to know this then please read the following -file: Documentation/watchdog/watchdog-api.txt . +file: Documentation/watchdog/watchdog-api.rst . So what does this document describe? It describes the API that can be used by WatchDog Timer Drivers that want to use the WatchDog Timer Driver Core @@ -23,10 +25,10 @@ The API Each watchdog timer driver that wants to use the WatchDog Timer Driver Core must #include <linux/watchdog.h> (you would have to do this anyway when writing a watchdog device driver). 
This include file contains following -register/unregister routines: +register/unregister routines:: -extern int watchdog_register_device(struct watchdog_device *); -extern void watchdog_unregister_device(struct watchdog_device *); + extern int watchdog_register_device(struct watchdog_device *); + extern void watchdog_unregister_device(struct watchdog_device *); The watchdog_register_device routine registers a watchdog timer device. The parameter of this routine is a pointer to a watchdog_device structure. @@ -40,9 +42,9 @@ The watchdog subsystem includes a registration deferral mechanism, which allows you to register a watchdog as early as you wish during the boot process. -The watchdog device structure looks like this: +The watchdog device structure looks like this:: -struct watchdog_device { + struct watchdog_device { int id; struct device *parent; const struct attribute_group **groups; @@ -62,9 +64,10 @@ struct watchdog_device { struct watchdog_core_data *wd_data; unsigned long status; struct list_head deferred; -}; + }; It contains following fields: + * id: set by watchdog_register_device, id 0 is special. It has both a /dev/watchdog0 cdev (dynamic major, minor 0) as well as the old /dev/watchdog miscdev. The id is set automatically when calling @@ -114,9 +117,9 @@ It contains following fields: * deferred: entry in wtd_deferred_reg_list which is used to register early initialized watchdogs. -The list of watchdog operations is defined as: +The list of watchdog operations is defined as:: -struct watchdog_ops { + struct watchdog_ops { struct module *owner; /* mandatory operations */ int (*start)(struct watchdog_device *); @@ -129,7 +132,7 @@ struct watchdog_ops { unsigned int (*get_timeleft)(struct watchdog_device *); int (*restart)(struct watchdog_device *); long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long); -}; + }; It is important that you first define the module owner of the watchdog timer driver's operations.
This module owner will be used to lock the module when @@ -138,6 +141,7 @@ module and /dev/watchdog is still open). Some operations are mandatory and some are optional. The mandatory operations are: + * start: this is a pointer to the routine that starts the watchdog timer device. The routine needs a pointer to the watchdog timer device structure as a @@ -146,51 +150,64 @@ are: Not all watchdog timer hardware supports the same functionality. That's why all other routines/operations are optional. They only need to be provided if they are supported. These optional routines/operations are: + * stop: with this routine the watchdog timer device is being stopped. + The routine needs a pointer to the watchdog timer device structure as a parameter. It returns zero on success or a negative errno code for failure. Some watchdog timer hardware can only be started and not be stopped. A driver supporting such hardware does not have to implement the stop routine. + If a driver has no stop function, the watchdog core will set WDOG_HW_RUNNING and start calling the driver's keepalive pings function after the watchdog device is closed. + If a watchdog driver does not implement the stop function, it must set max_hw_heartbeat_ms. * ping: this is the routine that sends a keepalive ping to the watchdog timer hardware. + The routine needs a pointer to the watchdog timer device structure as a parameter. It returns zero on success or a negative errno code for failure. + Most hardware that does not support this as a separate function uses the start function to restart the watchdog timer hardware. And that's also what the watchdog timer driver core does: to send a keepalive ping to the watchdog timer hardware it will either use the ping operation (when available) or the start operation (when the ping operation is not available). + (Note: the WDIOC_KEEPALIVE ioctl call will only be active when the WDIOF_KEEPALIVEPING bit has been set in the option field on the watchdog's info structure). 
* status: this routine checks the status of the watchdog timer device. The status of the device is reported with watchdog WDIOF_* status flags/bits. + WDIOF_MAGICCLOSE and WDIOF_KEEPALIVEPING are reported by the watchdog core; it is not necessary to report those bits from the driver. Also, if no status function is provided by the driver, the watchdog core reports the status bits provided in the bootstatus variable of struct watchdog_device. + * set_timeout: this routine checks and changes the timeout of the watchdog timer device. It returns 0 on success, -EINVAL for "parameter out of range" and -EIO for "could not write value to the watchdog". On success this routine should set the timeout value of the watchdog_device to the achieved timeout value (which may be different from the requested one because the watchdog does not necessarily have a 1 second resolution). + Drivers implementing max_hw_heartbeat_ms set the hardware watchdog heartbeat to the minimum of timeout and max_hw_heartbeat_ms. Those drivers set the timeout value of the watchdog_device either to the requested timeout value (if it is larger than max_hw_heartbeat_ms), or to the achieved timeout value. (Note: the WDIOF_SETTIMEOUT needs to be set in the options field of the watchdog's info structure). + If the watchdog driver does not have to perform any action but setting the watchdog_device.timeout, this callback can be omitted. + If set_timeout is not provided but, WDIOF_SETTIMEOUT is set, the watchdog infrastructure updates the timeout value of the watchdog_device internally to the requested value. + If the pretimeout feature is used (WDIOF_PRETIMEOUT), then set_timeout must also take care of checking if pretimeout is still valid and set up the timer accordingly. This can't be done in the core without races, so it is the @@ -201,13 +218,16 @@ they are supported. These optional routines/operations are: seconds before the actual timeout would happen. 
It returns 0 on success, -EINVAL for "parameter out of range" and -EIO for "could not write value to the watchdog". A value of 0 disables pretimeout notification. + (Note: the WDIOF_PRETIMEOUT needs to be set in the options field of the watchdog's info structure). + If the watchdog driver does not have to perform any action but setting the watchdog_device.pretimeout, this callback can be omitted. That means if set_pretimeout is not provided but WDIOF_PRETIMEOUT is set, the watchdog infrastructure updates the pretimeout value of the watchdog_device internally to the requested value. + * get_timeleft: this routines returns the time that's left before a reset. * restart: this routine restarts the machine. It returns 0 on success or a negative errno code for failure. @@ -218,6 +238,7 @@ they are supported. These optional routines/operations are: The status bits should (preferably) be set with the set_bit and clear_bit alike bit-operations. The status bits that are defined are: + * WDOG_ACTIVE: this status bit indicates whether or not a watchdog timer device is active or not from user perspective. User space is expected to send heartbeat requests to the driver while this flag is set. @@ -235,22 +256,30 @@ bit-operations. The status bits that are defined are: To set the WDOG_NO_WAY_OUT status bit (before registering your watchdog timer device) you can either: + * set it statically in your watchdog_device struct with + .status = WATCHDOG_NOWAYOUT_INIT_STATUS, + (this will set the value the same as CONFIG_WATCHDOG_NOWAYOUT) or - * use the following helper function: - static inline void watchdog_set_nowayout(struct watchdog_device *wdd, int nowayout) + * use the following helper function:: + + static inline void watchdog_set_nowayout(struct watchdog_device *wdd, + int nowayout) + +Note: + The WatchDog Timer Driver Core supports the magic close feature and + the nowayout feature. 
To use the magic close feature you must set the + WDIOF_MAGICCLOSE bit in the options field of the watchdog's info structure. -Note: The WatchDog Timer Driver Core supports the magic close feature and -the nowayout feature. To use the magic close feature you must set the -WDIOF_MAGICCLOSE bit in the options field of the watchdog's info structure. The nowayout feature will overrule the magic close feature. To get or set driver specific data the following two helper functions should be -used: +used:: -static inline void watchdog_set_drvdata(struct watchdog_device *wdd, void *data) -static inline void *watchdog_get_drvdata(struct watchdog_device *wdd) + static inline void watchdog_set_drvdata(struct watchdog_device *wdd, + void *data) + static inline void *watchdog_get_drvdata(struct watchdog_device *wdd) The watchdog_set_drvdata function allows you to add driver specific data. The arguments of this function are the watchdog device where you want to add the @@ -260,10 +289,11 @@ The watchdog_get_drvdata function allows you to retrieve driver specific data. The argument of this function is the watchdog device where you want to retrieve data from. The function returns the pointer to the driver specific data. -To initialize the timeout field, the following function can be used: +To initialize the timeout field, the following function can be used:: -extern int watchdog_init_timeout(struct watchdog_device *wdd, - unsigned int timeout_parm, struct device *dev); + extern int watchdog_init_timeout(struct watchdog_device *wdd, + unsigned int timeout_parm, + struct device *dev); The watchdog_init_timeout function allows you to initialize the timeout field using the module timeout parameter or by retrieving the timeout-sec property from @@ -272,30 +302,33 @@ to set the default timeout value as timeout value in the watchdog_device and then use this function to set the user "preferred" timeout value. This routine returns zero on success and a negative errno code for failure. 
-To disable the watchdog on reboot, the user must call the following helper: +To disable the watchdog on reboot, the user must call the following helper:: -static inline void watchdog_stop_on_reboot(struct watchdog_device *wdd); + static inline void watchdog_stop_on_reboot(struct watchdog_device *wdd); To disable the watchdog when unregistering the watchdog, the user must call the following helper. Note that this will only stop the watchdog if the nowayout flag is not set. -static inline void watchdog_stop_on_unregister(struct watchdog_device *wdd); +:: + + static inline void watchdog_stop_on_unregister(struct watchdog_device *wdd); To change the priority of the restart handler the following helper should be -used: +used:: -void watchdog_set_restart_priority(struct watchdog_device *wdd, int priority); + void watchdog_set_restart_priority(struct watchdog_device *wdd, int priority); User should follow the following guidelines for setting the priority: + * 0: should be called in last resort, has limited restart capabilities * 128: default restart handler, use if no other handler is expected to be available, and/or if restart is sufficient to restart the entire system * 255: highest priority, will preempt all other restart handlers -To raise a pretimeout notification, the following function should be used: +To raise a pretimeout notification, the following function should be used:: -void watchdog_notify_pretimeout(struct watchdog_device *wdd) + void watchdog_notify_pretimeout(struct watchdog_device *wdd) The function can be called in the interrupt context. 
If watchdog pretimeout governor framework (kbuild CONFIG_WATCHDOG_PRETIMEOUT_GOV symbol) is enabled, diff --git a/Documentation/watchdog/watchdog-parameters.rst b/Documentation/watchdog/watchdog-parameters.rst new file mode 100644 index 000000000000..b121caae7798 --- /dev/null +++ b/Documentation/watchdog/watchdog-parameters.rst @@ -0,0 +1,736 @@ +========================== +WatchDog Module Parameters +========================== + +This file provides information on the module parameters of many of +the Linux watchdog drivers. Watchdog driver parameter specs should +be listed here unless the driver has its own driver-specific information +file. + +See Documentation/admin-guide/kernel-parameters.rst for information on +providing kernel parameters for builtin drivers versus loadable +modules. + +------------------------------------------------- + +acquirewdt: + wdt_stop: + Acquire WDT 'stop' io port (default 0x43) + wdt_start: + Acquire WDT 'start' io port (default 0x443) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +advantechwdt: + wdt_stop: + Advantech WDT 'stop' io port (default 0x443) + wdt_start: + Advantech WDT 'start' io port (default 0x443) + timeout: + Watchdog timeout in seconds. 1<= timeout <=63, default=60. + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +alim1535_wdt: + timeout: + Watchdog timeout in seconds. (0 < timeout < 18000, default=60 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +alim7101_wdt: + timeout: + Watchdog timeout in seconds. (1<=timeout<=3600, default=30 + use_gpio: + Use the gpio watchdog (required by old cobalt boards). 
+ default=0/off/no + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +ar7_wdt: + margin: + Watchdog margin in seconds (default=60) + nowayout: + Disable watchdog shutdown on close + (default=kernel config parameter) + +------------------------------------------------- + +armada_37xx_wdt: + timeout: + Watchdog timeout in seconds. (default=120) + nowayout: + Disable watchdog shutdown on close + (default=kernel config parameter) + +------------------------------------------------- + +at91rm9200_wdt: + wdt_time: + Watchdog time in seconds. (default=5) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +at91sam9_wdt: + heartbeat: + Watchdog heartbeats in seconds. (default = 15) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +bcm47xx_wdt: + wdt_time: + Watchdog time in seconds. (default=30) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +coh901327_wdt: + margin: + Watchdog margin in seconds (default 60s) + +------------------------------------------------- + +cpu5wdt: + port: + base address of watchdog card, default is 0x91 + verbose: + be verbose, default is 0 (no) + ticks: + count down ticks, default is 10000 + +------------------------------------------------- + +cpwd: + wd0_timeout: + Default watchdog0 timeout in 1/10secs + wd1_timeout: + Default watchdog1 timeout in 1/10secs + wd2_timeout: + Default watchdog2 timeout in 1/10secs + +------------------------------------------------- + +da9052wdt: + timeout: + Watchdog timeout in seconds. 
2<= timeout <=131, default=2.048s + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +davinci_wdt: + heartbeat: + Watchdog heartbeat period in seconds from 1 to 600, default 60 + +------------------------------------------------- + +ebc-c384_wdt: + timeout: + Watchdog timeout in seconds. (1<=timeout<=15300, default=60) + nowayout: + Watchdog cannot be stopped once started + +------------------------------------------------- + +ep93xx_wdt: + nowayout: + Watchdog cannot be stopped once started + timeout: + Watchdog timeout in seconds. (1<=timeout<=3600, default=TBD) + +------------------------------------------------- + +eurotechwdt: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + io: + Eurotech WDT io port (default=0x3f0) + irq: + Eurotech WDT irq (default=10) + ev: + Eurotech WDT event type (default is `int`) + +------------------------------------------------- + +gef_wdt: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +geodewdt: + timeout: + Watchdog timeout in seconds. 1<= timeout <=131, default=60. + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +i6300esb: + heartbeat: + Watchdog heartbeat in seconds. (1<heartbeat<2046, default=30) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +iTCO_wdt: + heartbeat: + Watchdog heartbeat in seconds. 
+ (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +iTCO_vendor_support: + vendorsupport: + iTCO vendor specific support mode, default=0 (none), + 1=SuperMicro Pent3, 2=SuperMicro Pent4+, 911=Broken SMI BIOS + +------------------------------------------------- + +ib700wdt: + timeout: + Watchdog timeout in seconds. 0<= timeout <=30, default=30. + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +ibmasr: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +imx2_wdt: + timeout: + Watchdog timeout in seconds (default 60 s) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +indydog: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +iop_wdt: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +it8712f_wdt: + margin: + Watchdog margin in seconds (default 60) + nowayout: + Disable watchdog shutdown on close + (default=kernel config parameter) + +------------------------------------------------- + +it87_wdt: + nogameport: + Forbid the activation of game port, default=0 + nocir: + Forbid the use of CIR (workaround for some buggy setups); set to 1 if +system resets despite watchdog daemon running, default=0 + exclusive: + Watchdog exclusive device open, default=1 + timeout: + Watchdog timeout in seconds, default=60 + testmode: + Watchdog test mode (1 = no reboot), default=0 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + 
+------------------------------------------------- + +ixp4xx_wdt: + heartbeat: + Watchdog heartbeat in seconds (default 60s) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +ks8695_wdt: + wdt_time: + Watchdog time in seconds. (default=5) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +machzwd: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + action: + after watchdog resets, generate: + 0 = RESET(*) 1 = SMI 2 = NMI 3 = SCI + +------------------------------------------------- + +max63xx_wdt: + heartbeat: + Watchdog heartbeat period in seconds from 1 to 60, default 60 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + nodelay: + Force selection of a timeout setting without initial delay + (max6373/74 only, default=0) + +------------------------------------------------- + +mixcomwd: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +mpc8xxx_wdt: + timeout: + Watchdog timeout in ticks. (0<timeout<65536, default=65535) + reset: + Watchdog Interrupt/Reset Mode. 
0 = interrupt, 1 = reset + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +mv64x60_wdt: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +ni903x_wdt: + timeout: + Initial watchdog timeout in seconds (0<timeout<516, default=60) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +nic7018_wdt: + timeout: + Initial watchdog timeout in seconds (0<timeout<464, default=80) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +nuc900_wdt: + heartbeat: + Watchdog heartbeats in seconds. + (default = 15) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +omap_wdt: + timer_margin: + initial watchdog timeout (in seconds) + early_enable: + Watchdog is started on module insertion (default=0) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +orion_wdt: + heartbeat: + Initial watchdog heartbeat in seconds + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +pc87413_wdt: + io: + pc87413 WDT I/O port (default: io). + timeout: + Watchdog timeout in minutes (default=timeout). + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +pika_wdt: + heartbeat: + Watchdog heartbeats in seconds.
(default = 15) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +pnx4008_wdt: + heartbeat: + Watchdog heartbeat period in seconds from 1 to 60, default 19 + nowayout: + Set to 1 to keep watchdog running after device release + +------------------------------------------------- + +pnx833x_wdt: + timeout: + Watchdog timeout in Mhz. (68Mhz clock), default=2040000000 (30 seconds) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + start_enabled: + Watchdog is started on module insertion (default=1) + +------------------------------------------------- + +rc32434_wdt: + timeout: + Watchdog timeout value, in seconds (default=20) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +riowd: + riowd_timeout: + Watchdog timeout in minutes (default=1) + +------------------------------------------------- + +s3c2410_wdt: + tmr_margin: + Watchdog tmr_margin in seconds. (default=15) + tmr_atboot: + Watchdog is started at boot time if set to 1, default=0 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + soft_noboot: + Watchdog action, set to 1 to ignore reboots, 0 to reboot + debug: + Watchdog debug, set to >1 for debug, (default 0) + +------------------------------------------------- + +sa1100_wdt: + margin: + Watchdog margin in seconds (default 60s) + +------------------------------------------------- + +sb_wdog: + timeout: + Watchdog timeout in microseconds (max/default 8388607 or 8.3ish secs) + +------------------------------------------------- + +sbc60xxwdt: + wdt_stop: + SBC60xx WDT 'stop' io port (default 0x45) + wdt_start: + SBC60xx WDT 'start' io port (default 0x443) + timeout: + Watchdog timeout in seconds. 
(1<=timeout<=3600, default=30) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +sbc7240_wdt: + timeout: + Watchdog timeout in seconds. (1<=timeout<=255, default=30) + nowayout: + Disable watchdog when closing device file + +------------------------------------------------- + +sbc8360: + timeout: + Index into timeout table (0-63) (default=27 (60s)) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +sbc_epx_c3: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +sbc_fitpc2_wdt: + margin: + Watchdog margin in seconds (default 60s) + nowayout: + Watchdog cannot be stopped once started + +------------------------------------------------- + +sbsa_gwdt: + timeout: + Watchdog timeout in seconds. (default 10s) + action: + Watchdog action at the first stage timeout, + set to 0 to ignore, 1 to panic. (default=0) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +sc1200wdt: + isapnp: + When set to 0 driver ISA PnP support will be disabled (default=1) + io: + io port + timeout: + range is 0-255 minutes, default is 1 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +sc520_wdt: + timeout: + Watchdog timeout in seconds. (1 <= timeout <= 3600, default=30) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +sch311x_wdt: + force_id: + Override the detected device ID + therm_trip: + Should a ThermTrip trigger the reset generator + timeout: + Watchdog timeout in seconds. 
1<= timeout <=15300, default=60 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +scx200_wdt: + margin: + Watchdog margin in seconds + nowayout: + Disable watchdog shutdown on close + +------------------------------------------------- + +shwdt: + clock_division_ratio: + Clock division ratio. Valid ranges are from 0x5 (1.31ms) + to 0x7 (5.25ms). (default=7) + heartbeat: + Watchdog heartbeat in seconds. (1 <= heartbeat <= 3600, default=30 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +smsc37b787_wdt: + timeout: + range is 1-255 units, default is 60 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +softdog: + soft_margin: + Watchdog soft_margin in seconds. + (0 < soft_margin < 65536, default=60) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + soft_noboot: + Softdog action, set to 1 to ignore reboots, 0 to reboot + (default=0) + +------------------------------------------------- + +stmp3xxx_wdt: + heartbeat: + Watchdog heartbeat period in seconds from 1 to 4194304, default 19 + +------------------------------------------------- + +tegra_wdt: + heartbeat: + Watchdog heartbeats in seconds. (default = 120) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +ts72xx_wdt: + timeout: + Watchdog timeout in seconds. (1 <= timeout <= 8, default=8) + nowayout: + Disable watchdog shutdown on close + +------------------------------------------------- + +twl4030_wdt: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +txx9wdt: + timeout: + Watchdog timeout in seconds. 
(0<timeout<N, default=60) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +uniphier_wdt: + timeout: + Watchdog timeout in power of two seconds. + (1 <= timeout <= 128, default=64) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +w83627hf_wdt: + wdt_io: + w83627hf/thf WDT io port (default 0x2E) + timeout: + Watchdog timeout in seconds. 1 <= timeout <= 255, default=60. + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +w83877f_wdt: + timeout: + Watchdog timeout in seconds. (1<=timeout<=3600, default=30) + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +w83977f_wdt: + timeout: + Watchdog timeout in seconds (15..7635, default=45) + testmode: + Watchdog testmode (1 = no reboot), default=0 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +wafer5823wdt: + timeout: + Watchdog timeout in seconds. 1 <= timeout <= 255, default=60.
+ nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +wdt285: + soft_margin: + Watchdog timeout in seconds (default=60) + +------------------------------------------------- + +wdt977: + timeout: + Watchdog timeout in seconds (60..15300, default=60) + testmode: + Watchdog testmode (1 = no reboot), default=0 + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +wm831x_wdt: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +wm8350_wdt: + nowayout: + Watchdog cannot be stopped once started + (default=kernel config parameter) + +------------------------------------------------- + +sun4v_wdt: + timeout_ms: + Watchdog timeout in milliseconds (1..180000, default=60000) + nowayout: + Watchdog cannot be stopped once started diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt deleted file mode 100644 index 0b88e333f9e1..000000000000 --- a/Documentation/watchdog/watchdog-parameters.txt +++ /dev/null @@ -1,410 +0,0 @@ -This file provides information on the module parameters of many of -the Linux watchdog drivers. Watchdog driver parameter specs should -be listed here unless the driver has its own driver-specific information -file. - - -See Documentation/admin-guide/kernel-parameters.rst for information on -providing kernel parameters for builtin drivers versus loadable -modules.
- - -------------------------------------------------- -acquirewdt: -wdt_stop: Acquire WDT 'stop' io port (default 0x43) -wdt_start: Acquire WDT 'start' io port (default 0x443) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -advantechwdt: -wdt_stop: Advantech WDT 'stop' io port (default 0x443) -wdt_start: Advantech WDT 'start' io port (default 0x443) -timeout: Watchdog timeout in seconds. 1<= timeout <=63, default=60. -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -alim1535_wdt: -timeout: Watchdog timeout in seconds. (0 < timeout < 18000, default=60 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -alim7101_wdt: -timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30 -use_gpio: Use the gpio watchdog (required by old cobalt boards). - default=0/off/no -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -ar7_wdt: -margin: Watchdog margin in seconds (default=60) -nowayout: Disable watchdog shutdown on close - (default=kernel config parameter) -------------------------------------------------- -armada_37xx_wdt: -timeout: Watchdog timeout in seconds. (default=120) -nowayout: Disable watchdog shutdown on close - (default=kernel config parameter) -------------------------------------------------- -at91rm9200_wdt: -wdt_time: Watchdog time in seconds. (default=5) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -at91sam9_wdt: -heartbeat: Watchdog heartbeats in seconds. 
(default = 15) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -bcm47xx_wdt: -wdt_time: Watchdog time in seconds. (default=30) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -coh901327_wdt: -margin: Watchdog margin in seconds (default 60s) -------------------------------------------------- -cpu5wdt: -port: base address of watchdog card, default is 0x91 -verbose: be verbose, default is 0 (no) -ticks: count down ticks, default is 10000 -------------------------------------------------- -cpwd: -wd0_timeout: Default watchdog0 timeout in 1/10secs -wd1_timeout: Default watchdog1 timeout in 1/10secs -wd2_timeout: Default watchdog2 timeout in 1/10secs -------------------------------------------------- -da9052wdt: -timeout: Watchdog timeout in seconds. 2<= timeout <=131, default=2.048s -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -davinci_wdt: -heartbeat: Watchdog heartbeat period in seconds from 1 to 600, default 60 -------------------------------------------------- -ebc-c384_wdt: -timeout: Watchdog timeout in seconds. (1<=timeout<=15300, default=60) -nowayout: Watchdog cannot be stopped once started -------------------------------------------------- -ep93xx_wdt: -nowayout: Watchdog cannot be stopped once started -timeout: Watchdog timeout in seconds. 
(1<=timeout<=3600, default=TBD) -------------------------------------------------- -eurotechwdt: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -io: Eurotech WDT io port (default=0x3f0) -irq: Eurotech WDT irq (default=10) -ev: Eurotech WDT event type (default is `int') -------------------------------------------------- -gef_wdt: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -geodewdt: -timeout: Watchdog timeout in seconds. 1<= timeout <=131, default=60. -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -i6300esb: -heartbeat: Watchdog heartbeat in seconds. (1<heartbeat<2046, default=30) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -iTCO_wdt: -heartbeat: Watchdog heartbeat in seconds. - (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -iTCO_vendor_support: -vendorsupport: iTCO vendor specific support mode, default=0 (none), - 1=SuperMicro Pent3, 2=SuperMicro Pent4+, 911=Broken SMI BIOS -------------------------------------------------- -ib700wdt: -timeout: Watchdog timeout in seconds. 0<= timeout <=30, default=30. 
-nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -ibmasr: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -imx2_wdt: -timeout: Watchdog timeout in seconds (default 60 s) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -indydog: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -iop_wdt: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -it8712f_wdt: -margin: Watchdog margin in seconds (default 60) -nowayout: Disable watchdog shutdown on close - (default=kernel config parameter) -------------------------------------------------- -it87_wdt: -nogameport: Forbid the activation of game port, default=0 -nocir: Forbid the use of CIR (workaround for some buggy setups); set to 1 if -system resets despite watchdog daemon running, default=0 -exclusive: Watchdog exclusive device open, default=1 -timeout: Watchdog timeout in seconds, default=60 -testmode: Watchdog test mode (1 = no reboot), default=0 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -ixp4xx_wdt: -heartbeat: Watchdog heartbeat in seconds (default 60s) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -ks8695_wdt: -wdt_time: Watchdog time in seconds. 
(default=5) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -machzwd: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -action: after watchdog resets, generate: - 0 = RESET(*) 1 = SMI 2 = NMI 3 = SCI -------------------------------------------------- -max63xx_wdt: -heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 60 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -nodelay: Force selection of a timeout setting without initial delay - (max6373/74 only, default=0) -------------------------------------------------- -mixcomwd: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -mpc8xxx_wdt: -timeout: Watchdog timeout in ticks. (0<timeout<65536, default=65535) -reset: Watchdog Interrupt/Reset Mode. 0 = interrupt, 1 = reset -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -mv64x60_wdt: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -ni903x_wdt: -timeout: Initial watchdog timeout in seconds (0<timeout<516, default=60) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -nic7018_wdt: -timeout: Initial watchdog timeout in seconds (0<timeout<464, default=80) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -nuc900_wdt: -heartbeat: Watchdog heartbeats in seconds. 
- (default = 15) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -omap_wdt: -timer_margin: initial watchdog timeout (in seconds) -early_enable: Watchdog is started on module insertion (default=0 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -orion_wdt: -heartbeat: Initial watchdog heartbeat in seconds -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -pc87413_wdt: -io: pc87413 WDT I/O port (default: io). -timeout: Watchdog timeout in minutes (default=timeout). -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -pika_wdt: -heartbeat: Watchdog heartbeats in seconds. (default = 15) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -pnx4008_wdt: -heartbeat: Watchdog heartbeat period in seconds from 1 to 60, default 19 -nowayout: Set to 1 to keep watchdog running after device release -------------------------------------------------- -pnx833x_wdt: -timeout: Watchdog timeout in Mhz. (68Mhz clock), default=2040000000 (30 seconds) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -start_enabled: Watchdog is started on module insertion (default=1) -------------------------------------------------- -rc32434_wdt: -timeout: Watchdog timeout value, in seconds (default=20) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -riowd: -riowd_timeout: Watchdog timeout in minutes (default=1) -------------------------------------------------- -s3c2410_wdt: -tmr_margin: Watchdog tmr_margin in seconds. 
(default=15) -tmr_atboot: Watchdog is started at boot time if set to 1, default=0 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -soft_noboot: Watchdog action, set to 1 to ignore reboots, 0 to reboot -debug: Watchdog debug, set to >1 for debug, (default 0) -------------------------------------------------- -sa1100_wdt: -margin: Watchdog margin in seconds (default 60s) -------------------------------------------------- -sb_wdog: -timeout: Watchdog timeout in microseconds (max/default 8388607 or 8.3ish secs) -------------------------------------------------- -sbc60xxwdt: -wdt_stop: SBC60xx WDT 'stop' io port (default 0x45) -wdt_start: SBC60xx WDT 'start' io port (default 0x443) -timeout: Watchdog timeout in seconds. (1<=timeout<=3600, default=30) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -sbc7240_wdt: -timeout: Watchdog timeout in seconds. (1<=timeout<=255, default=30) -nowayout: Disable watchdog when closing device file -------------------------------------------------- -sbc8360: -timeout: Index into timeout table (0-63) (default=27 (60s)) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -sbc_epx_c3: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -sbc_fitpc2_wdt: -margin: Watchdog margin in seconds (default 60s) -nowayout: Watchdog cannot be stopped once started -------------------------------------------------- -sbsa_gwdt: -timeout: Watchdog timeout in seconds. (default 10s) -action: Watchdog action at the first stage timeout, - set to 0 to ignore, 1 to panic. 
(default=0) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -sc1200wdt: -isapnp: When set to 0 driver ISA PnP support will be disabled (default=1) -io: io port -timeout: range is 0-255 minutes, default is 1 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -sc520_wdt: -timeout: Watchdog timeout in seconds. (1 <= timeout <= 3600, default=30) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -sch311x_wdt: -force_id: Override the detected device ID -therm_trip: Should a ThermTrip trigger the reset generator -timeout: Watchdog timeout in seconds. 1<= timeout <=15300, default=60 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -scx200_wdt: -margin: Watchdog margin in seconds -nowayout: Disable watchdog shutdown on close -------------------------------------------------- -shwdt: -clock_division_ratio: Clock division ratio. Valid ranges are from 0x5 (1.31ms) - to 0x7 (5.25ms). (default=7) -heartbeat: Watchdog heartbeat in seconds. (1 <= heartbeat <= 3600, default=30 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -smsc37b787_wdt: -timeout: range is 1-255 units, default is 60 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -softdog: -soft_margin: Watchdog soft_margin in seconds. 
- (0 < soft_margin < 65536, default=60) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -soft_noboot: Softdog action, set to 1 to ignore reboots, 0 to reboot - (default=0) -------------------------------------------------- -stmp3xxx_wdt: -heartbeat: Watchdog heartbeat period in seconds from 1 to 4194304, default 19 -------------------------------------------------- -tegra_wdt: -heartbeat: Watchdog heartbeats in seconds. (default = 120) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -ts72xx_wdt: -timeout: Watchdog timeout in seconds. (1 <= timeout <= 8, default=8) -nowayout: Disable watchdog shutdown on close -------------------------------------------------- -twl4030_wdt: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -txx9wdt: -timeout: Watchdog timeout in seconds. (0<timeout<N, default=60) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -uniphier_wdt: -timeout: Watchdog timeout in power of two seconds. - (1 <= timeout <= 128, default=64) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -w83627hf_wdt: -wdt_io: w83627hf/thf WDT io port (default 0x2E) -timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60. -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -w83877f_wdt: -timeout: Watchdog timeout in seconds. 
(1<=timeout<=3600, default=30) -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -w83977f_wdt: -timeout: Watchdog timeout in seconds (15..7635), default=45) -testmode: Watchdog testmode (1 = no reboot), default=0 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -wafer5823wdt: -timeout: Watchdog timeout in seconds. 1 <= timeout <= 255, default=60. -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -wdt285: -soft_margin: Watchdog timeout in seconds (default=60) -------------------------------------------------- -wdt977: -timeout: Watchdog timeout in seconds (60..15300, default=60) -testmode: Watchdog testmode (1 = no reboot), default=0 -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -wm831x_wdt: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -wm8350_wdt: -nowayout: Watchdog cannot be stopped once started - (default=kernel config parameter) -------------------------------------------------- -sun4v_wdt: -timeout_ms: Watchdog timeout in milliseconds 1..180000, default=60000) -nowayout: Watchdog cannot be stopped once started -------------------------------------------------- diff --git a/Documentation/watchdog/watchdog-pm.txt b/Documentation/watchdog/watchdog-pm.rst index 7a4dd46e0d24..646e1f28f31f 100644 --- a/Documentation/watchdog/watchdog-pm.txt +++ b/Documentation/watchdog/watchdog-pm.rst @@ -1,5 +1,7 @@ +=============================================== The Linux WatchDog Timer Power Management Guide =============================================== + Last reviewed: 17-Dec-2018 Wolfram Sang <wsa+renesas@sang-engineering.com> @@ 
-16,4 +18,5 @@ On resume, a watchdog timer shall be reset to its selected value to give userspace enough time to resume. [1] [2] [1] https://patchwork.kernel.org/patch/10252209/ + [2] https://patchwork.kernel.org/patch/10711625/ diff --git a/Documentation/watchdog/wdt.txt b/Documentation/watchdog/wdt.rst index ed2f0b860869..d97b0361535b 100644 --- a/Documentation/watchdog/wdt.txt +++ b/Documentation/watchdog/wdt.rst @@ -1,11 +1,14 @@ +============================================================ +WDT Watchdog Timer Interfaces For The Linux Operating System +============================================================ + Last Reviewed: 10/05/2007 - WDT Watchdog Timer Interfaces For The Linux Operating System - Alan Cox <alan@lxorguk.ukuu.org.uk> +Alan Cox <alan@lxorguk.ukuu.org.uk> - ICS WDT501-P - ICS WDT501-P (no fan tachometer) - ICS WDT500-P + - ICS WDT501-P + - ICS WDT501-P (no fan tachometer) + - ICS WDT500-P All the interfaces provide /dev/watchdog, which when open must be written to within a timeout or the machine will reboot. Each write delays the reboot @@ -21,19 +24,26 @@ degrees Fahrenheit. Each read returns a single byte giving the temperature. The third interface logs kernel messages on additional alert events. The ICS ISA-bus wdt card cannot be safely probed for. Instead you need to -pass IO address and IRQ boot parameters. E.g.: +pass IO address and IRQ boot parameters. 
E.g.:: + wdt.io=0x240 wdt.irq=11 Other "wdt" driver parameters are: + + =========== ====================================================== heartbeat Watchdog heartbeat in seconds (default 60) nowayout Watchdog cannot be stopped once started (kernel - build parameter) + build parameter) tachometer WDT501-P Fan Tachometer support (0=disable, default=0) type WDT501-P Card type (500 or 501, default=500) + =========== ====================================================== Features -------- - WDT501P WDT500P + +================ ======= ======= + WDT501P WDT500P +================ ======= ======= Reboot Timer X X External Reboot X X I/O Port Monitor o o @@ -42,9 +52,12 @@ Fan Speed X o Power Under X o Power Over X o Overheat X o +================ ======= ======= The external event interfaces on the WDT boards are not currently supported. Minor numbers are however allocated for it. -Example Watchdog Driver: see samples/watchdog/watchdog-simple.c +Example Watchdog Driver: + + see samples/watchdog/watchdog-simple.c diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index ae36fc5fc649..f2de1b2d3ac7 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -19,7 +19,6 @@ x86-specific Documentation tlb mtrr pat - protection-keys intel_mpx amd-memory-encryption pti diff --git a/Documentation/x86/resctrl_ui.rst b/Documentation/x86/resctrl_ui.rst index 225cfd4daaee..5368cedfb530 100644 --- a/Documentation/x86/resctrl_ui.rst +++ b/Documentation/x86/resctrl_ui.rst @@ -40,7 +40,7 @@ mount options are: Enable the MBA Software Controller(mba_sc) to specify MBA bandwidth in MBps -L2 and L3 CDP are controlled seperately. +L2 and L3 CDP are controlled separately. RDT features are orthogonal. A particular system may support only monitoring, only control, or both monitoring and control. Cache @@ -118,7 +118,7 @@ related to allocation: Corresponding region is pseudo-locked. No sharing allowed. 
-Memory bandwitdh(MB) subdirectory contains the following files +Memory bandwidth(MB) subdirectory contains the following files with respect to allocation: "min_bandwidth": @@ -209,7 +209,7 @@ All groups contain the following files: CPUs to/from this group. As with the tasks file a hierarchy is maintained where MON groups may only include CPUs owned by the parent CTRL_MON group. - When the resouce group is in pseudo-locked mode this file will + When the resource group is in pseudo-locked mode this file will only be readable, reflecting the CPUs associated with the pseudo-locked region. @@ -342,7 +342,7 @@ For cache resources we describe the portion of the cache that is available for allocation using a bitmask. The maximum value of the mask is defined by each cpu model (and may be different for different cache levels). It is found using CPUID, but is also provided in the "info" directory of -the resctrl file system in "info/{resource}/cbm_mask". X86 hardware +the resctrl file system in "info/{resource}/cbm_mask". Intel hardware requires that these masks have all the '1' bits in a contiguous block. So 0x3, 0x6 and 0xC are legal 4-bit masks with two bits set, but 0x5, 0x9 and 0xA are not. On a system with a 20-bit mask each bit represents 5% @@ -380,7 +380,7 @@ where L2 external is 10GBps (hence aggregate L2 external bandwidth is 240GBps) and L3 external bandwidth is 100GBps. Now a workload with '20 threads, having 50% bandwidth, each consuming 5GBps' consumes the max L3 bandwidth of 100GBps although the percentage value specified is only 50% -<< 100%. Hence increasing the bandwidth percentage will not yeild any +<< 100%. Hence increasing the bandwidth percentage will not yield any more bandwidth. This is because although the L2 external bandwidth still has capacity, the L3 external bandwidth is fully used. Also note that this would be dependent on number of cores the benchmark is run on. 
@@ -398,7 +398,7 @@ In order to mitigate this and make the interface more user friendly, resctrl added support for specifying the bandwidth in MBps as well. The kernel underneath would use a software feedback mechanism or a "Software Controller(mba_sc)" which reads the actual bandwidth using MBM counters -and adjust the memowy bandwidth percentages to ensure:: +and adjust the memory bandwidth percentages to ensure:: "actual bandwidth < user specified bandwidth". @@ -418,16 +418,22 @@ L3 schemata file details (CDP enabled via mount option to resctrl) When CDP is enabled L3 control is split into two separate resources so you can specify independent masks for code and data like this:: - L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... - L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... + L3DATA:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... + L3CODE:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... L2 schemata file details ------------------------ -L2 cache does not support code and data prioritization, so the -schemata format is always:: +CDP is supported at L2 using the 'cdpl2' mount option. The schemata +format is either:: L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... +or + + L2DATA:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... + L2CODE:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... + + Memory bandwidth Allocation (default mode) ------------------------------------------ @@ -671,8 +677,8 @@ allocations can overlap or not. The allocations specifies the maximum b/w that the group may be able to use and the system admin can configure the b/w accordingly. -If the MBA is specified in MB(megabytes) then user can enter the max b/w in MB -rather than the percentage values. +If resctrl is using the software controller (mba_sc) then user can enter the +max b/w in MB rather than the percentage values. 
:: # echo "L3:0=3;1=c\nMB:0=1024;1=500" > /sys/fs/resctrl/p0/schemata diff --git a/Documentation/x86/x86_64/5level-paging.rst b/Documentation/x86/x86_64/5level-paging.rst index ab88a4514163..44856417e6a5 100644 --- a/Documentation/x86/x86_64/5level-paging.rst +++ b/Documentation/x86/x86_64/5level-paging.rst @@ -20,7 +20,7 @@ physical address space. This "ought to be enough for anybody" ©. QEMU 2.9 and later support 5-level paging. Virtual memory layout for 5-level paging is described in -Documentation/x86/x86_64/mm.txt +Documentation/x86/x86_64/mm.rst Enabling 5-level paging diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst index 2f69836b8445..6a4285a3c7a4 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/x86/x86_64/boot-options.rst @@ -9,7 +9,7 @@ only the AMD64 specific ones are listed here. Machine check ============= -Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables. +Please see Documentation/x86/x86_64/machinecheck.rst for sysfs runtime tunables. mce=off Disable machine check @@ -89,7 +89,7 @@ APICs Don't use the local APIC (alias for i386 compatibility) pirq=... - See Documentation/x86/i386/IO-APIC.txt + See Documentation/x86/i386/IO-APIC.rst noapictimer Don't set up the APIC timer diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst index a6926cd40f70..30108684ae87 100644 --- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst +++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst @@ -18,7 +18,7 @@ For more information on the features of cpusets, see Documentation/cgroup-v1/cpusets.rst. There are a number of different configurations you can use for your needs. For more information on the numa=fake command line option and its various ways of -configuring fake nodes, see Documentation/x86/x86_64/boot-options.txt. +configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst. 
For the purposes of this introduction, we'll assume a very primitive NUMA emulation setup of "numa=fake=4*512,". This will split our system memory into diff --git a/Documentation/xilinx/eemi.txt b/Documentation/xilinx/eemi.rst index 5f39b4ffdcd4..9dcbc6f18d75 100644 --- a/Documentation/xilinx/eemi.txt +++ b/Documentation/xilinx/eemi.rst @@ -1,6 +1,6 @@ ---------------------------------------------------------------------- +==================================== Xilinx Zynq MPSoC EEMI Documentation ---------------------------------------------------------------------- +==================================== Xilinx Zynq MPSoC Firmware Interface ------------------------------------- @@ -21,7 +21,7 @@ The zynqmp-firmware driver maintain all EEMI APIs in zynqmp_eemi_ops structure. Any driver who want to communicate with PMC using EEMI APIs can call zynqmp_pm_get_eemi_ops(). -Example of EEMI ops: +Example of EEMI ops:: /* zynqmp-firmware driver maintain all EEMI APIs */ struct zynqmp_eemi_ops { @@ -34,7 +34,7 @@ Example of EEMI ops: .query_data = zynqmp_pm_query_data, }; -Example of EEMI ops usage: +Example of EEMI ops usage:: static const struct zynqmp_eemi_ops *eemi_ops; u32 ret_payload[PAYLOAD_ARG_CNT]; diff --git a/Documentation/xilinx/index.rst b/Documentation/xilinx/index.rst new file mode 100644 index 000000000000..01cc1a0714df --- /dev/null +++ b/Documentation/xilinx/index.rst @@ -0,0 +1,17 @@ +:orphan: + +=========== +Xilinx FPGA +=========== + +.. toctree:: + :maxdepth: 1 + + eemi + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` |