From d14d0c1aea8fe0eb61b7788a3b546475e183b079 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:56:08 +0200 Subject: scripts: sphinx-pre-install: improve distro detection check The Arch-linux detection is hit by catting /etc/issue, whose contents is (nowadays): Arch Linux \r (\l) It sounds a little ackward to print such string, so, instead, let's use the /etc/os-release file, with exists on lots of distributions and should provide a more reliable result. We'll keep the old tests before it, in order to avoid possible regressions with the other distros, although the new way should probably work on all the currently supported distributions. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/472924557afdf2b5492ae2a48c5ecfae216d54e2.1586883286.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index fa3fb05cd54b..ea941c6e0647 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -780,6 +780,24 @@ $system_release = catcheck("/etc/system-release") if !$system_release; $system_release = catcheck("/etc/redhat-release") if !$system_release; $system_release = catcheck("/etc/lsb-release") if !$system_release; $system_release = catcheck("/etc/gentoo-release") if !$system_release; + +# This seems more common than LSB these days +if (!$system_release) { + my %os_var; + if (open IN, "cat /etc/os-release|") { + while () { + if (m/^([\w\d\_]+)=\"?([^\"]*)\"?\n/) { + $os_var{$1}=$2; + } + } + $system_release = $os_var{"NAME"}; + if (defined($os_var{"VERSION_ID"})) { + $system_release .= " " . $os_var{"VERSION_ID"} if (defined($os_var{"VERSION_ID"})); + } else { + $system_release .= " " . 
$os_var{"VERSION"}; + } + } +} $system_release = catcheck("/etc/issue") if !$system_release; $system_release =~ s/\s+$//; -- cgit v1.2.3 From b3df6223bdea763ed666b06a4e7431f40b33da67 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:56:09 +0200 Subject: scripts: sphinx-pre-install: improve openSuse Tumbleweed check Currently, with openSUSE Tumbleweed 20200303, it keeps recommending this forever: sudo zypper install --no-recommends rsvg-view This dependency will never be fulfilled there, as the package now is named as on other distros: rsvg-convert. So, improve the detection to avoid such issue. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/c3774f72ac36c5e5b5f446ae5db5b795d1f274f4.1586883286.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index ea941c6e0647..6d6d2849fbb1 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -446,9 +446,11 @@ sub give_opensuse_hints() "convert" => "ImageMagick", "Pod::Usage" => "perl-Pod-Usage", "xelatex" => "texlive-xetex-bin", - "rsvg-convert" => "rsvg-view", ); + # On Tumbleweed, this package is also named rsvg-convert + $map{"rsvg-convert"} = "rsvg-view" if (!($system_release =~ /Tumbleweed/)); + my @suse_tex_pkgs = ( "texlive-babel-english", "texlive-caption", -- cgit v1.2.3 From bfc7f4281066154e6b420a87335d6387c9b0835a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:56:10 +0200 Subject: scripts: sphinx-pre-install: fix a dependency hint with Ubuntu 16.04 Avoid the scripts to keep asking to install fonts-noto-cjk on Ubuntu 16.04. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/912b664a8ca54e8c5c5767c3fe9171973eeddd6b.1586883286.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 6d6d2849fbb1..ba750f7b9127 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -349,7 +349,8 @@ sub give_debian_hints() "fonts-dejavu", 2); check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/opentype/noto/NotoSerifCJK-Regular.ttc"], + "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/opentype/noto/NotoSerifCJK-Regular.ttc"], "fonts-noto-cjk", 2); } -- cgit v1.2.3 From e45a631742fadd7c9feb5a0049382102e5d43fe7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:56:11 +0200 Subject: scripts: sphinx-pre-install: address some issues with Gentoo There are some small misdetections with Gentoo. While they don't cause too much trouble, it keeps recomending to install things that are already there. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/7f631edce102b02ccbdbfb18be1376a86b41373d.1586883286.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index ba750f7b9127..86e14b9fd537 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -560,7 +560,8 @@ sub give_gentoo_hints() "media-fonts/dejavu", 2) if ($pdf); if ($pdf) { - check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJKsc-Regular.otf"], + check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJKsc-Regular.otf", + "/usr/share/fonts/noto-cjk/NotoSerifCJK-Regular.ttc"], "media-fonts/noto-cjk", 2); } @@ -575,10 +576,10 @@ sub give_gentoo_hints() my $portage_imagemagick = "/etc/portage/package.use/imagemagick"; my $portage_cairo = "/etc/portage/package.use/graphviz"; - if (qx(cat $portage_imagemagick) ne "$imagemagick\n") { + if (qx(grep imagemagick $portage_imagemagick 2>/dev/null) eq "") { printf("\tsudo su -c 'echo \"$imagemagick\" > $portage_imagemagick'\n") } - if (qx(cat $portage_cairo) ne "$cairo\n") { + if (qx(grep graphviz $portage_cairo 2>/dev/null) eq "") { printf("\tsudo su -c 'echo \"$cairo\" > $portage_cairo'\n"); } -- cgit v1.2.3 From d6ebf1890c8ba6c9340681cb24bdbdb0ea7c71b4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:56:12 +0200 Subject: scripts: sphinx-pre-install: add support for OpenMandriva It seems that Mageia and OpenMandriva will reunite on a single distribution. In any case, both came from Mandriva. So, it is close enough to use the same logic. So, add support for it. Tested with OpenMandriva 4.1 and with Mageia 7.1. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/692809729c6818a0b0f75513da15970c53d5565c.1586883286.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 86e14b9fd537..89b033285caf 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -494,7 +494,7 @@ sub give_mageia_hints() "convert" => "ImageMagick", "Pod::Usage" => "perl-Pod-Usage", "xelatex" => "texlive", - "rsvg-convert" => "librsvg2-tools", + "rsvg-convert" => "librsvg2", ); my @tex_pkgs = ( @@ -503,16 +503,29 @@ sub give_mageia_hints() $map{"latexmk"} = "texlive-collection-basic"; + my $packager_cmd; + my $noto_sans; + if ($system_release =~ /OpenMandriva/) { + $packager_cmd = "dnf install"; + $noto_sans = "noto-sans-cjk-fonts"; + @tex_pkgs = ( "texlive-collection-fontsextra" ); + } else { + $packager_cmd = "urpmi"; + $noto_sans = "google-noto-sans-cjk-ttc-fonts"; + } + + if ($pdf) { - check_missing_file(["/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc"], - "google-noto-sans-cjk-ttc-fonts", 2); + check_missing_file(["/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/TTF/NotoSans-Regular.ttf"], + $noto_sans, 2); } check_rpm_missing(\@tex_pkgs, 2) if ($pdf); check_missing(\%map); return if (!$need && !$optional); - printf("You should run:\n\n\tsudo urpmi $install\n"); + printf("You should run:\n\n\tsudo $packager_cmd $install\n"); } sub give_arch_linux_hints() @@ -626,6 +639,10 @@ sub check_distros() give_mageia_hints; return; } + if ($system_release =~ /OpenMandriva/) { + give_mageia_hints; + return; + } if ($system_release =~ /Arch Linux/) { give_arch_linux_hints; return; -- cgit v1.2.3 From 2f9c502552cd3e791db91ee80f7766e122b3b026 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:56:13 +0200 Subject: scripts: 
sphinx-pre-install: add support for python -m venv Since python 3.3, the recommended way to setup a virtual env is via "python -m venv". Set this as a default, if python version is compatible with such feature. While here, add more comments to it, as the script is getting more complex. So, better to add more things, to avoid accidentally breaking it while improving it. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/252cc849c79527ad496247e4c481961478adf41c.1586883286.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 113 ++++++++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 38 deletions(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 89b033285caf..d4dfe1e59989 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0-or-later use strict; -# Copyright (c) 2017-2019 Mauro Carvalho Chehab +# Copyright (c) 2017-2020 Mauro Carvalho Chehab # my $prefix = "./"; @@ -22,9 +22,12 @@ my $need = 0; my $optional = 0; my $need_symlink = 0; my $need_sphinx = 0; +my $need_venv = 0; +my $need_virtualenv = 0; my $rec_sphinx_upgrade = 0; my $install = ""; my $virtenv_dir = ""; +my $python_cmd = ""; my $min_version; # @@ -147,7 +150,7 @@ sub check_program($$) my $prog = shift; my $is_optional = shift; - return if findprog($prog); + return $prog if findprog($prog); add_package($prog, $is_optional); } @@ -168,9 +171,9 @@ sub check_python_module($$) my $prog = shift; my $is_optional = shift; - my $err = system("python3 -c 'import $prog' 2>/dev/null /dev/null"); - return if ($err == 0); - my $err = system("python -c 'import $prog' 2>/dev/null /dev/null"); + return if (!$python_cmd); + + my $err = system("$python_cmd -c 'import $prog' 2>/dev/null /dev/null"); return if ($err == 0); add_package($prog, $is_optional); @@ -225,16 +228,6 @@ sub get_sphinx_fname() return $fname; } - if ($virtualenv) { - my $prog 
= findprog("virtualenv-3"); - $prog = findprog("virtualenv-3.5") if (!$prog); - - check_program("virtualenv", 0) if (!$prog); - $need_sphinx = 1; - } else { - add_package("python-sphinx", 0); - } - return ""; } @@ -268,7 +261,10 @@ sub check_sphinx() $virtenv_dir = $virtenv_prefix . $rec_version; my $sphinx = get_sphinx_fname(); - return if ($sphinx eq ""); + if ($sphinx eq "") { + $need_sphinx = 1; + return; + } open IN, "$sphinx --version 2>&1 |" or die "$sphinx returned an error"; while () { @@ -336,6 +332,7 @@ sub give_debian_hints() my %map = ( "python-sphinx" => "python3-sphinx", "sphinx_rtd_theme" => "python3-sphinx-rtd-theme", + "ensurepip" => "python3-venv", "virtualenv" => "virtualenv", "dot" => "graphviz", "convert" => "imagemagick", @@ -672,13 +669,13 @@ sub check_distros() sub deactivate_help() { - printf "\tIf you want to exit the virtualenv, you can use:\n"; + printf "\nIf you want to exit the virtualenv, you can use:\n"; printf "\tdeactivate\n"; } sub check_needs() { - # Check for needed programs/tools + # Check if Sphinx is already accessible from current environment check_sphinx(); if ($system_release) { @@ -689,6 +686,43 @@ sub check_needs() print "To upgrade Sphinx, use:\n\n" if ($rec_sphinx_upgrade); + # Check python command line, trying first python3 + $python_cmd = findprog("python3"); + $python_cmd = check_program("python", 0) if (!$python_cmd); + + # Check the type of virtual env, depending on Python version + if ($python_cmd) { + if ($virtualenv) { + my $tmp = qx($python_cmd --version 2>&1); + if ($tmp =~ m/(\d+\.)(\d+\.)/) { + if ($1 >= 3 && $2 >= 3) { + $need_venv = 1; # python 3.3 or upper + } else { + $need_virtualenv = 1; + } + if ($1 < 3) { + # Complain if it finds python2 (or worse) + printf "Warning: python$1 support is deprecated. 
Use it with caution!\n"; + } + } else { + die "Warning: couldn't identify $python_cmd version!"; + } + } else { + add_package("python-sphinx", 0); + } + } + + # Set virtualenv command line, if python < 3.3 + my $virtualenv_cmd; + if ($need_virtualenv) { + $virtualenv_cmd = findprog("virtualenv-3"); + $virtualenv_cmd = findprog("virtualenv-3.5") if (!$virtualenv_cmd); + if (!$virtualenv_cmd) { + check_program("virtualenv", 0); + $virtualenv_cmd = "virtualenv"; + } + } + # Check for needed programs/tools check_perl_module("Pod::Usage", 0); check_program("make", 0); @@ -702,12 +736,30 @@ sub check_needs() check_program("rsvg-convert", 2) if ($pdf); check_program("latexmk", 2) if ($pdf); + if ($need_sphinx || $rec_sphinx_upgrade) { + check_python_module("ensurepip", 0) if ($need_venv); + } + + # Do distro-specific checks and output distro-install commands check_distros(); + if (!$python_cmd) { + if ($need == 1) { + die "Can't build as $need mandatory dependency is missing"; + } elsif ($need) { + die "Can't build as $need mandatory dependencies are missing"; + } + } + + # Check if sphinx-build is called sphinx-build-3 if ($need_symlink) { printf "\tsudo ln -sf %s /usr/bin/sphinx-build\n\n", which("sphinx-build-3"); } + + # NOTE: if the system has a too old Sphinx version installed, + # it will recommend installing a newer version using virtualenv + if ($need_sphinx || $rec_sphinx_upgrade) { my $min_activate = "$ENV{'PWD'}/${virtenv_prefix}${min_version}/bin/activate"; my @activates = glob "$ENV{'PWD'}/${virtenv_prefix}*/bin/activate"; @@ -721,27 +773,12 @@ sub check_needs() exit (1); } else { my $rec_activate = "$virtenv_dir/bin/activate"; - my $virtualenv = findprog("virtualenv-3"); - my $rec_python3 = ""; - $virtualenv = findprog("virtualenv-3.5") if (!$virtualenv); - $virtualenv = findprog("virtualenv") if (!$virtualenv); - $virtualenv = "virtualenv" if (!$virtualenv); - - my $rel = ""; - if (index($system_release, "Ubuntu") != -1) { - $rel = $1 if ($system_release 
=~ /Ubuntu\s+(\d+)[.]/); - if ($rel && $rel >= 16) { - $rec_python3 = " -p python3"; - } - } - if (index($system_release, "Debian") != -1) { - $rel = $1 if ($system_release =~ /Debian\s+(\d+)/); - if ($rel && $rel >= 7) { - $rec_python3 = " -p python3"; - } - } - printf "\t$virtualenv$rec_python3 $virtenv_dir\n"; + if ($need_venv) { + printf "\t$python_cmd -m venv $virtenv_dir\n"; + } else { + printf "\t$virtualenv_cmd $virtenv_dir\n"; + } printf "\t. $rec_activate\n"; printf "\tpip install -r $requirement_file\n"; deactivate_help(); -- cgit v1.2.3 From 346282db9c6bc7ce2dad8ae640494f3f88d23889 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:27 +0200 Subject: scripts: kernel-doc: proper handle @foo->bar() The pattern @foo->bar() is valid, as it can be used by a function pointer inside a struct passed as a parameter. Right now, it causes a warning: ./drivers/firewire/core-transaction.c:606: WARNING: Inline strong start-string without end-string. In this specific case, the kernel-doc markup is: /** * fw_core_remove_address_handler() - unregister an address handler * @handler: callback * * To be called in process context. * * When fw_core_remove_address_handler() returns, @handler->callback() is * guaranteed to not run on any CPU anymore. */ With seems valid on my eyes. So, instead of trying to hack the kernel-doc markup, let's teach it about how to handle such things. This should likely remove lots of other similar warnings as well. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/48b46426d7bf6ff7529f20e5718fbf4e9758e62c.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/kernel-doc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index f746ca8fa403..a9f6d4cea866 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -214,6 +214,7 @@ my $type_constant2 = '\%([-_\w]+)'; my $type_func = '(\w+)\(\)'; my $type_param = '\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)'; my $type_fp_param = '\@(\w+)\(\)'; # Special RST handling for func ptr params +my $type_fp_param2 = '\@(\w+->\S+)\(\)'; # Special RST handling for structs with func ptr params my $type_env = '(\$\w+)'; my $type_enum = '\&(enum\s*([_\w]+))'; my $type_struct = '\&(struct\s*([_\w]+))'; @@ -249,6 +250,7 @@ my @highlights_rst = ( [$type_member_func, "\\:c\\:type\\:`\$1\$2\$3\\\\(\\\\) <\$1>`"], [$type_member, "\\:c\\:type\\:`\$1\$2\$3 <\$1>`"], [$type_fp_param, "**\$1\\\\(\\\\)**"], + [$type_fp_param2, "**\$1\\\\(\\\\)**"], [$type_func, "\$1()"], [$type_enum, "\\:c\\:type\\:`\$1 <\$2>`"], [$type_struct, "\\:c\\:type\\:`\$1 <\$2>`"], -- cgit v1.2.3 From ee2aa7590398ecc0877552191e38d8a282f21cb8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:28 +0200 Subject: scripts: kernel-doc: accept negation like !@var On a few places, it sometimes need to indicate a negation of a parameter, like: !@fshared This pattern happens, for example, at: kernel/futex.c and it is perfectly valid. However, kernel-doc currently transforms it into: !**fshared** This won't do what it would be expected. Fortunately, fixing the script is a simple matter of storing the "!" 
before "@" and adding it after the bold markup, like: **!fshared** Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/0314b47f8c3e1f9db00d5375a73dc3cddd8a21f2.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/kernel-doc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index a9f6d4cea866..83b7260e44dc 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -213,6 +213,7 @@ my $type_constant = '\b``([^\`]+)``\b'; my $type_constant2 = '\%([-_\w]+)'; my $type_func = '(\w+)\(\)'; my $type_param = '\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)'; +my $type_param_ref = '([\!]?)\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)'; my $type_fp_param = '\@(\w+)\(\)'; # Special RST handling for func ptr params my $type_fp_param2 = '\@(\w+->\S+)\(\)'; # Special RST handling for structs with func ptr params my $type_env = '(\$\w+)'; @@ -237,6 +238,7 @@ my @highlights_man = ( [$type_typedef, "\\\\fI\$1\\\\fP"], [$type_union, "\\\\fI\$1\\\\fP"], [$type_param, "\\\\fI\$1\\\\fP"], + [$type_param_ref, "\\\\fI\$1\$2\\\\fP"], [$type_member, "\\\\fI\$1\$2\$3\\\\fP"], [$type_fallback, "\\\\fI\$1\\\\fP"] ); @@ -258,7 +260,7 @@ my @highlights_rst = ( [$type_union, "\\:c\\:type\\:`\$1 <\$2>`"], # in rst this can refer to any type [$type_fallback, "\\:c\\:type\\:`\$1`"], - [$type_param, "**\$1**"] + [$type_param_ref, "**\$1\$2**"] ); my $blankline_rst = "\n"; -- cgit v1.2.3 From 0d55d48b19ff3a31b295836da23223a52de7ef06 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:29 +0200 Subject: scripts: kernel-doc: accept blank lines on parameter description Sphinx is very pedantic with respect to blank lines. Sometimes, in order to make it to properly handle something, we need to add a blank line. However, currently, any blank line inside a kernel-doc comment like: /* * @foo: bar * * foobar * * some description will be considered as if "foobar" was part of the description. 
This patch changes kernel-doc behavior. After it, foobar will be considered as part of the parameter text. The description will only be considered as such if it starts with: zero spaces after asterisk: *foo one space after asterisk: * foo or have a explicit Description section: * Description: Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/c07d2862792d75a2691d69c9eceb7b89a0164cc0.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/kernel-doc | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 83b7260e44dc..f68d76dd97ba 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -331,13 +331,14 @@ my $lineprefix=""; # Parser states use constant { - STATE_NORMAL => 0, # normal code - STATE_NAME => 1, # looking for function name - STATE_BODY_MAYBE => 2, # body - or maybe more description - STATE_BODY => 3, # the body of the comment - STATE_PROTO => 4, # scanning prototype - STATE_DOCBLOCK => 5, # documentation block - STATE_INLINE => 6, # gathering documentation outside main block + STATE_NORMAL => 0, # normal code + STATE_NAME => 1, # looking for function name + STATE_BODY_MAYBE => 2, # body - or maybe more description + STATE_BODY => 3, # the body of the comment + STATE_BODY_WITH_BLANK_LINE => 4, # the body, which has a blank line + STATE_PROTO => 5, # scanning prototype + STATE_DOCBLOCK => 6, # documentation block + STATE_INLINE => 7, # gathering doc outside main block }; my $state; my $in_doc_sect; @@ -1957,6 +1958,12 @@ sub process_body($$) { } } + if ($state == STATE_BODY_WITH_BLANK_LINE && /^\s*\*\s?\S/) { + dump_section($file, $section, $contents); + $section = $section_default; + $contents = ""; + } + if (/$doc_sect/i) { # case insensitive for supported section names $newsection = $1; $newcontents = $2; @@ -2010,18 +2017,21 @@ sub process_body($$) { $state = STATE_PROTO; $brcount = 0; } elsif (/$doc_content/) { 
- # miguel-style comment kludge, look for blank lines after - # @parameter line to signify start of description if ($1 eq "") { - if ($section =~ m/^@/ || $section eq $section_context) { + if ($section eq $section_context) { dump_section($file, $section, $contents); $section = $section_default; $contents = ""; $new_start_line = $.; + $state = STATE_BODY; } else { + if ($section ne $section_default) { + $state = STATE_BODY_WITH_BLANK_LINE; + } else { + $state = STATE_BODY; + } $contents .= "\n"; } - $state = STATE_BODY; } elsif ($state == STATE_BODY_MAYBE) { # Continued declaration purpose chomp($declaration_purpose); @@ -2173,7 +2183,8 @@ sub process_file($) { process_normal(); } elsif ($state == STATE_NAME) { process_name($file, $_); - } elsif ($state == STATE_BODY || $state == STATE_BODY_MAYBE) { + } elsif ($state == STATE_BODY || $state == STATE_BODY_MAYBE || + $state == STATE_BODY_WITH_BLANK_LINE) { process_body($file, $_); } elsif ($state == STATE_INLINE) { # scanning for inline parameters process_inline($file, $_); -- cgit v1.2.3 From d5afc9640a6d4596e57a2c4906f903ab1c83ada5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:30 +0200 Subject: docs: update recommended Sphinx version to 2.4.4 There are some docs that have nested tables. While this was always part of the spec, only Sphinx version 2.4.x can translate it to LaTeX. In other words, if someone is using a Sphinx version < 2.4, the LaTeX and PDF output won't work for some of the docs. So, it seems that it is time to raise the bar again for the recommented version. The Sphinx check script is already smart enough to keep working, with older versions, warning the users that an upgrade is recommended (and explaining how): Sphinx version 1.7.9 Warning: It is recommended at least Sphinx version 2.4.4. Detected OS: Fedora release 31 (Thirty One). To upgrade Sphinx, use: /usr/bin/virtualenv sphinx_2.4.4 . 
sphinx_2.4.4/bin/activate pip install -r ./Documentation/sphinx/requirements.txt Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/498f701c618f7d0cf5f0a37e5889ee926f7c8bf4.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/sphinx/requirements.txt b/Documentation/sphinx/requirements.txt index 14e29a0ae480..489f6626de67 100644 --- a/Documentation/sphinx/requirements.txt +++ b/Documentation/sphinx/requirements.txt @@ -1,3 +1,3 @@ docutils -Sphinx==1.7.9 +Sphinx==2.4.4 sphinx_rtd_theme -- cgit v1.2.3 From 25813cae1eebbd77240b19ba45a1854020fa2cf6 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:31 +0200 Subject: docs: LaTeX/PDF: drop list of documents The building system can auto-generate a list of documents since commit: 9d42afbe6bd4 ("docs: pdf: add all Documentation/*/index.rst to PDF output"). The added logic there allows keeping the existing list, but there's not real reason to keep it. Now, the media document has gone (it was split into tree). So, it sounds about time to get rid of the manual entries, and let the script to generate it automatically instead. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9345dba7164497dbf28578f6ec271e479379610c.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/conf.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/Documentation/conf.py b/Documentation/conf.py index 9ae8e9abf846..f6a1bc07c410 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -388,44 +388,6 @@ if major == 1 and minor < 6: # author, documentclass [howto, manual, or own class]). 
# Sorted in alphabetical order latex_documents = [ - ('admin-guide/index', 'linux-user.tex', 'Linux Kernel User Documentation', - 'The kernel development community', 'manual'), - ('core-api/index', 'core-api.tex', 'The kernel core API manual', - 'The kernel development community', 'manual'), - ('crypto/index', 'crypto-api.tex', 'Linux Kernel Crypto API manual', - 'The kernel development community', 'manual'), - ('dev-tools/index', 'dev-tools.tex', 'Development tools for the Kernel', - 'The kernel development community', 'manual'), - ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide', - 'The kernel development community', 'manual'), - ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual', - 'The kernel development community', 'manual'), - ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', - 'The kernel development community', 'manual'), - ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', - 'ext4 Community', 'manual'), - ('filesystems/ext4/index', 'ext4-data-structures.tex', - 'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'), - ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', - 'The kernel development community', 'manual'), - ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', - 'The kernel development community', 'manual'), - ('kernel-hacking/index', 'kernel-hacking.tex', 'Unreliable Guide To Hacking The Linux Kernel', - 'The kernel development community', 'manual'), - ('media/index', 'media.tex', 'Linux Media Subsystem Documentation', - 'The kernel development community', 'manual'), - ('networking/index', 'networking.tex', 'Linux Networking Documentation', - 'The kernel development community', 'manual'), - ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation', - 'The kernel development community', 'manual'), - ('security/index', 'security.tex', 'The kernel security subsystem manual', - 'The kernel 
development community', 'manual'), - ('sh/index', 'sh.tex', 'SuperH architecture implementation manual', - 'The kernel development community', 'manual'), - ('sound/index', 'sound.tex', 'Linux Sound Subsystem Documentation', - 'The kernel development community', 'manual'), - ('userspace-api/index', 'userspace-api.tex', 'The Linux kernel user-space API guide', - 'The kernel development community', 'manual'), ] # Add all other index files from Documentation/ subdirectories -- cgit v1.2.3 From 101e330fd3f275ea553224bece4cf11ba20c6def Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:32 +0200 Subject: MAINTAINERS: dt: update display/allwinner file entry Changeset f5a98bfe7b37 ("dt-bindings: display: Convert Allwinner display pipeline to schemas") split Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt into several files. Yet, it kept the old place at MAINTAINERS. Update it to point to the new place. Fixes: f5a98bfe7b37 ("dt-bindings: display: Convert Allwinner display pipeline to schemas") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/1be758765272ba4c2acbc3904bdf71c863a90186.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..86f98c3e6cfc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5552,7 +5552,7 @@ M: Chen-Yu Tsai L: dri-devel@lists.freedesktop.org S: Supported T: git git://anongit.freedesktop.org/drm/drm-misc -F: Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt +F: Documentation/devicetree/bindings/display/allwinner* F: drivers/gpu/drm/sun4i/ DRM DRIVERS FOR AMLOGIC SOCS -- cgit v1.2.3 From 0855a36e22483b8e56eea84112eb0d5c48a2fa3a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:33 +0200 Subject: MAINTAINERS: dt: fix pointers for ARM Integrator, Versatile and RealView There's a conversion from a plain text 
binding file into 4 yaml ones. The old file got removed, causing this new warning: Warning: MAINTAINERS references a file that doesn't exist: Documentation/devicetree/bindings/arm/arm-boards Address it by replacing the old reference by the new ones Fixes: 2d483550b6d2 ("dt-bindings: arm: Drop the non-YAML bindings") Fixes: 33fbfb3eaf4e ("dt-bindings: arm: Add Integrator YAML schema") Fixes: 4b900070d50d ("dt-bindings: arm: Add Versatile YAML schema") Fixes: 7db625b9fa75 ("dt-bindings: arm: Add RealView YAML schema") Fixes: 4fb00d9066c1 ("dt-bindings: arm: Add Versatile Express and Juno YAML schema") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/eae3440fb70c1b1666973e34fd3fd6b8ab4a3bc7.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- MAINTAINERS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 86f98c3e6cfc..82e4b0a0c921 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1323,7 +1323,10 @@ ARM INTEGRATOR, VERSATILE AND REALVIEW SUPPORT M: Linus Walleij L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained -F: Documentation/devicetree/bindings/arm/arm-boards +F: Documentation/devicetree/bindings/arm/arm,integrator.yaml +F: Documentation/devicetree/bindings/arm/arm,realview.yaml +F: Documentation/devicetree/bindings/arm/arm,versatile.yaml +F: Documentation/devicetree/bindings/arm/arm,vexpress-juno.yaml F: Documentation/devicetree/bindings/auxdisplay/arm-charlcd.txt F: Documentation/devicetree/bindings/clock/arm,syscon-icst.yaml F: Documentation/devicetree/bindings/i2c/i2c-versatile.txt -- cgit v1.2.3 From f9faa90899a2fa6e830bb785d473ebbedebcf80b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:34 +0200 Subject: docs: dt: fix broken reference to phy-cadence-torrent.yaml This file was removed, and another file was added instead of it, on two separate commits. 
Splitting a single logical change (doc conversion) on two patches is a bad thing, as it makes harder to discover what crap happened. Anyway, this patch fixes the broken reference, making it pointing to the new location of the file. Fixes: 922003733d42 ("dt-bindings: phy: Remove Cadence MHDP PHY dt binding") Fixes: c6d8eef38b7f ("dt-bindings: phy: Add Cadence MHDP PHY bindings in YAML format.") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/3f1cf6d74e392f3ee9f894d82cb7ee29d04c1b6d.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/devicetree/bindings/phy/ti,phy-j721e-wiz.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/phy/ti,phy-j721e-wiz.yaml b/Documentation/devicetree/bindings/phy/ti,phy-j721e-wiz.yaml index fd1982c56104..3f913d6d1c3d 100644 --- a/Documentation/devicetree/bindings/phy/ti,phy-j721e-wiz.yaml +++ b/Documentation/devicetree/bindings/phy/ti,phy-j721e-wiz.yaml @@ -146,7 +146,7 @@ patternProperties: bindings specified in Documentation/devicetree/bindings/phy/phy-cadence-sierra.txt Torrent SERDES should follow the bindings specified in - Documentation/devicetree/bindings/phy/phy-cadence-dp.txt + Documentation/devicetree/bindings/phy/phy-cadence-torrent.yaml required: - compatible -- cgit v1.2.3 From 72ef5e52b3f74c0be47b20f5c434b7ecc830cf40 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:35 +0200 Subject: docs: fix broken references to text files Several references got broken due to txt to ReST conversion. Several of them can be automatically fixed with: scripts/documentation-file-ref-check --fix Reviewed-by: Mathieu Poirier # hwtracing/coresight/Kconfig Reviewed-by: Paul E. 
McKenney # memory-barrier.txt Acked-by: Alex Shi # translations/zh_CN Acked-by: Federico Vaga # translations/it_IT Acked-by: Marc Zyngier # kvm/arm64 Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6f919ddb83a33b5f2a63b6b5f0575737bb2b36aa.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/memory-barriers.txt | 2 +- Documentation/process/submit-checklist.rst | 2 +- .../translations/it_IT/process/submit-checklist.rst | 2 +- Documentation/translations/ko_KR/memory-barriers.txt | 2 +- Documentation/translations/zh_CN/filesystems/sysfs.txt | 2 +- .../translations/zh_CN/process/submit-checklist.rst | 2 +- Documentation/virt/kvm/arm/pvtime.rst | 2 +- Documentation/virt/kvm/devices/vcpu.rst | 2 +- Documentation/virt/kvm/hypercalls.rst | 4 ++-- arch/powerpc/include/uapi/asm/kvm_para.h | 2 +- drivers/gpu/drm/Kconfig | 2 +- drivers/gpu/drm/drm_ioctl.c | 2 +- drivers/hwtracing/coresight/Kconfig | 2 +- fs/fat/Kconfig | 8 ++++---- fs/fuse/Kconfig | 2 +- fs/fuse/dev.c | 2 +- fs/overlayfs/Kconfig | 6 +++--- include/linux/mm.h | 4 ++-- include/uapi/linux/ethtool_netlink.h | 2 +- include/uapi/rdma/rdma_user_ioctl_cmds.h | 2 +- mm/gup.c | 12 ++++++------ virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 4 ++-- 23 files changed, 36 insertions(+), 36 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index e1c355e84edd..eaabc3134294 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -620,7 +620,7 @@ because the CPUs that the Linux kernel supports don't do writes until they are certain (1) that the write will actually happen, (2) of the location of the write, and (3) of the value to be written. 
But please carefully read the "CONTROL DEPENDENCIES" section and the -Documentation/RCU/rcu_dereference.txt file: The compiler can and does +Documentation/RCU/rcu_dereference.rst file: The compiler can and does break dependencies in a great many highly creative ways. CPU 1 CPU 2 diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index 8e56337d422d..3f8e9d5d95c2 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -107,7 +107,7 @@ and elsewhere regarding submitting Linux kernel patches. and why. 26) If any ioctl's are added by the patch, then also update - ``Documentation/ioctl/ioctl-number.rst``. + ``Documentation/userspace-api/ioctl/ioctl-number.rst``. 27) If your modified source code depends on or uses any of the kernel APIs or features that are related to the following ``Kconfig`` symbols, diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst index 995ee69fab11..3e575502690f 100644 --- a/Documentation/translations/it_IT/process/submit-checklist.rst +++ b/Documentation/translations/it_IT/process/submit-checklist.rst @@ -117,7 +117,7 @@ sottomissione delle patch, in particolare sorgenti che ne spieghi la logica: cosa fanno e perché. 25) Se la patch aggiunge nuove chiamate ioctl, allora aggiornate - ``Documentation/ioctl/ioctl-number.rst``. + ``Documentation/userspace-api/ioctl/ioctl-number.rst``. 
26) Se il codice che avete modificato dipende o usa una qualsiasi interfaccia o funzionalità del kernel che è associata a uno dei seguenti simboli diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 2e831ece6e26..e50fe6541335 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -641,7 +641,7 @@ P 는 짝수 번호 캐시 라인에 저장되어 있고, 변수 B 는 홀수 리눅스 커널이 지원하는 CPU 들은 (1) 쓰기가 정말로 일어날지, (2) 쓰기가 어디에 이루어질지, 그리고 (3) 쓰여질 값을 확실히 알기 전까지는 쓰기를 수행하지 않기 때문입니다. 하지만 "컨트롤 의존성" 섹션과 -Documentation/RCU/rcu_dereference.txt 파일을 주의 깊게 읽어 주시기 바랍니다: +Documentation/RCU/rcu_dereference.rst 파일을 주의 깊게 읽어 주시기 바랍니다: 컴파일러는 매우 창의적인 많은 방법으로 종속성을 깰 수 있습니다. CPU 1 CPU 2 diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index ee1f37da5b23..a15c3ebdfa82 100644 --- a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -281,7 +281,7 @@ drivers/ 包含了每个已为特定总线上的设备而挂载的驱动程序 假定驱动没有跨越多个总线类型)。 fs/ 包含了一个为文件系统设立的目录。现在每个想要导出属性的文件系统必须 -在 fs/ 下创建自己的层次结构(参见Documentation/filesystems/fuse.txt)。 +在 fs/ 下创建自己的层次结构(参见Documentation/filesystems/fuse.rst)。 dev/ 包含两个子目录: char/ 和 block/。在这两个子目录中,有以 : 格式命名的符号链接。这些符号链接指向 sysfs 目录 diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst index 8738c55e42a2..50386e0e42e7 100644 --- a/Documentation/translations/zh_CN/process/submit-checklist.rst +++ b/Documentation/translations/zh_CN/process/submit-checklist.rst @@ -97,7 +97,7 @@ Linux内核补丁提交清单 24) 所有内存屏障例如 ``barrier()``, ``rmb()``, ``wmb()`` 都需要源代码中的注 释来解释它们正在执行的操作及其原因的逻辑。 -25) 如果补丁添加了任何ioctl,那么也要更新 ``Documentation/ioctl/ioctl-number.rst`` +25) 如果补丁添加了任何ioctl,那么也要更新 ``Documentation/userspace-api/ioctl/ioctl-number.rst`` 26) 如果修改后的源代码依赖或使用与以下 ``Kconfig`` 符号相关的任何内核API或 功能,则在禁用相关 
``Kconfig`` 符号和/或 ``=m`` (如果该选项可用)的情况 diff --git a/Documentation/virt/kvm/arm/pvtime.rst b/Documentation/virt/kvm/arm/pvtime.rst index 2357dd2d8655..687b60d76ca9 100644 --- a/Documentation/virt/kvm/arm/pvtime.rst +++ b/Documentation/virt/kvm/arm/pvtime.rst @@ -76,5 +76,5 @@ It is advisable that one or more 64k pages are set aside for the purpose of these structures and not used for other purposes, this enables the guest to map the region using 64k pages and avoids conflicting attributes with other memory. -For the user space interface see Documentation/virt/kvm/devices/vcpu.txt +For the user space interface see Documentation/virt/kvm/devices/vcpu.rst section "3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL". diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 9963e680770a..ca374d3fe085 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -110,5 +110,5 @@ Returns: Specifies the base address of the stolen time structure for this VCPU. The base address must be 64 byte aligned and exist within a valid guest memory -region. See Documentation/virt/kvm/arm/pvtime.txt for more information +region. See Documentation/virt/kvm/arm/pvtime.rst for more information including the layout of the stolen time structure. diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst index dbaf207e560d..ed4fddd364ea 100644 --- a/Documentation/virt/kvm/hypercalls.rst +++ b/Documentation/virt/kvm/hypercalls.rst @@ -22,7 +22,7 @@ S390: number in R1. For further information on the S390 diagnose call as supported by KVM, - refer to Documentation/virt/kvm/s390-diag.txt. + refer to Documentation/virt/kvm/s390-diag.rst. PowerPC: It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. @@ -30,7 +30,7 @@ PowerPC: KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' property inside the device tree's /hypervisor node. 
- For more information refer to Documentation/virt/kvm/ppc-pv.txt + For more information refer to Documentation/virt/kvm/ppc-pv.rst MIPS: KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h index be48c2215fa2..a809b1b44ddf 100644 --- a/arch/powerpc/include/uapi/asm/kvm_para.h +++ b/arch/powerpc/include/uapi/asm/kvm_para.h @@ -31,7 +31,7 @@ * Struct fields are always 32 or 64 bit aligned, depending on them being 32 * or 64 bit wide respectively. * - * See Documentation/virt/kvm/ppc-pv.txt + * See Documentation/virt/kvm/ppc-pv.rst */ struct kvm_vcpu_arch_shared { __u64 scratch1; diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 43594978958e..fb92be7e8aa7 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -161,7 +161,7 @@ config DRM_LOAD_EDID_FIRMWARE monitor are unable to provide appropriate EDID data. Since this feature is provided as a workaround for broken hardware, the default case is N. Details and instructions how to build your own - EDID data are given in Documentation/driver-api/edid.rst. + EDID data are given in Documentation/admin-guide/edid.rst. config DRM_DP_CEC bool "Enable DisplayPort CEC-Tunneling-over-AUX HDMI support" diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 9e41972c4bbc..c2b8d2a953ae 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -741,7 +741,7 @@ static const struct drm_ioctl_desc drm_ioctls[] = { * }; * * Please make sure that you follow all the best practices from - * ``Documentation/ioctl/botching-up-ioctls.rst``. Note that drm_ioctl() + * ``Documentation/process/botching-up-ioctls.rst``. Note that drm_ioctl() * automatically zero-extends structures, hence make sure you can add more stuff * at the end, i.e. don't put a variable sized array there. 
* diff --git a/drivers/hwtracing/coresight/Kconfig b/drivers/hwtracing/coresight/Kconfig index 83e841be1081..02dbb5ca3bcf 100644 --- a/drivers/hwtracing/coresight/Kconfig +++ b/drivers/hwtracing/coresight/Kconfig @@ -107,7 +107,7 @@ config CORESIGHT_CPU_DEBUG can quickly get to know program counter (PC), secure state, exception level, etc. Before use debugging functionality, platform needs to ensure the clock domain and power domain are enabled - properly, please refer Documentation/trace/coresight-cpu-debug.rst + properly, please refer Documentation/trace/coresight/coresight-cpu-debug.rst for detailed description and the example for usage. config CORESIGHT_CTI diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 718163d0c621..ca31993dcb47 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -69,7 +69,7 @@ config VFAT_FS The VFAT support enlarges your kernel by about 10 KB and it only works if you said Y to the "DOS FAT fs support" above. Please read - the file for details. If + the file for details. If unsure, say Y. To compile this as a module, choose M here: the module will be called @@ -82,7 +82,7 @@ config FAT_DEFAULT_CODEPAGE help This option should be set to the codepage of your FAT filesystems. It can be overridden with the "codepage" mount option. - See for more information. + See for more information. config FAT_DEFAULT_IOCHARSET string "Default iocharset for FAT" @@ -96,7 +96,7 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here - select the next option instead if you would like to use UTF-8 encoded file names by default. - See for more information. + See for more information. Enable any character sets you need in File Systems/Native Language Support. @@ -114,4 +114,4 @@ config FAT_DEFAULT_UTF8 Say Y if you use UTF-8 encoding for file names, N otherwise. - See for more information. + See for more information. 
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index eb2a585572dc..774b2618018a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -12,7 +12,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See for more information. + See for more information. See for needed library/utility version. If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..c7a65cf2bcca 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2081,7 +2081,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.txt). + * Documentation/filesystems/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 714c14c47ca5..dd188c7996b3 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -9,7 +9,7 @@ config OVERLAY_FS 'lower' filesystem is either hidden or, in the case of directories, merged with the 'upper' object. - For more information see Documentation/filesystems/overlayfs.txt + For more information see Documentation/filesystems/overlayfs.rst config OVERLAY_FS_REDIRECT_DIR bool "Overlayfs: turn on redirect directory feature by default" @@ -38,7 +38,7 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW If backward compatibility is not an issue, then it is safe and recommended to say N here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say Y. 
@@ -103,7 +103,7 @@ config OVERLAY_FS_XINO_AUTO If compatibility with applications that expect 32bit inodes is not an issue, then it is safe and recommended to say Y here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say N. diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a323422d783..1f2850465f59 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1219,7 +1219,7 @@ void unpin_user_pages(struct page **pages, unsigned long npages); * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS * scheme). * - * For more information, please see Documentation/vm/pin_user_pages.rst. + * For more information, please see Documentation/core-api/pin_user_pages.rst. * * @page: pointer to page to be queried. * @Return: True, if it is likely that the page has been "dma-pinned". @@ -2834,7 +2834,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * releasing pages: get_user_pages*() pages must be released via put_page(), * while pin_user_pages*() pages must be released via unpin_user_page(). * - * Please see Documentation/vm/pin_user_pages.rst for more information. + * Please see Documentation/core-api/pin_user_pages.rst for more information. */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7fde76366ba4..1711e57f7848 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -2,7 +2,7 @@ /* * include/uapi/linux/ethtool_netlink.h - netlink interface for ethtool * - * See Documentation/networking/ethtool-netlink.txt in kernel source tree for + * See Documentation/networking/ethtool-netlink.rst in kernel source tree for * doucumentation of the interface. 
*/ diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 7b1ec806f8f9..38ab7accb7be 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -36,7 +36,7 @@ #include #include -/* Documentation/ioctl/ioctl-number.rst */ +/* Documentation/userspace-api/ioctl/ioctl-number.rst */ #define RDMA_IOCTL_MAGIC 0x1b #define RDMA_VERBS_IOCTL \ _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) diff --git a/mm/gup.c b/mm/gup.c index 6076df8e04a4..81e4d0b377fd 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2843,9 +2843,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for further details. + * see Documentation/core-api/pin_user_pages.rst for further details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, @@ -2883,9 +2883,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -2919,9 +2919,9 @@ EXPORT_SYMBOL(pin_user_pages_remote); * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). 
Please - * see Documentation/vm/pin_user_pages.rst for details. + * see Documentation/core-api/pin_user_pages.rst for details. * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index e72dcc454247..859464fd413f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -301,7 +301,7 @@ static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, * pending state of interrupt is latched in pending_latch variable. * Userspace will save and restore pending state and line_level * separately. - * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst * for handling of ISPENDR and ICPENDR. */ for (i = 0; i < len * 8; i++) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 769e4802645e..64fcd7511110 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,7 +42,7 @@ VGIC_AFFINITY_LEVEL(val, 3)) /* - * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst, * below macros are defined for CPUREG encoding. */ #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 @@ -63,7 +63,7 @@ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) /* - * As per Documentation/virt/kvm/devices/arm-vgic-its.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-its.rst, * below macros are defined for ITS table entry encoding. 
*/ #define KVM_ITS_CTE_VALID_SHIFT 63 -- cgit v1.2.3 From 3ecad8c2c1ff333e204c26e2f0dddfa623153f87 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:36 +0200 Subject: docs: fix broken references for ReST files that moved around Some broken references happened due to shifting files around and ReST renames. Those can't be auto-fixed by the script, so let's fix them manually. Signed-off-by: Mauro Carvalho Chehab Acked-by: Corentin Labbe Link: https://lore.kernel.org/r/64773a12b4410aaf3e3be89e3ec7e34de2484eea.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/doc-guide/maintainer-profile.rst | 2 +- Documentation/virt/kvm/mmu.rst | 2 +- Documentation/virt/kvm/review-checklist.rst | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c | 2 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c | 2 +- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c | 2 +- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c | 2 +- drivers/media/v4l2-core/v4l2-fwnode.c | 2 +- include/uapi/linux/kvm.h | 4 ++-- tools/include/uapi/linux/kvm.h | 4 ++-- 11 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/doc-guide/maintainer-profile.rst b/Documentation/doc-guide/maintainer-profile.rst index 5afc0ddba40a..755d39f0d407 100644 --- a/Documentation/doc-guide/maintainer-profile.rst +++ b/Documentation/doc-guide/maintainer-profile.rst @@ -6,7 +6,7 @@ Documentation subsystem maintainer entry profile The documentation "subsystem" is the central coordinating point for the kernel's documentation and associated infrastructure. It covers the hierarchy under Documentation/ (with the exception of -Documentation/device-tree), various utilities under scripts/ and, at least +Documentation/devicetree), various utilities under scripts/ and, at least some of the time, LICENSES/. 
It's worth noting, though, that the boundaries of this subsystem are rather diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 60981887d20b..46126ecc70f7 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -319,7 +319,7 @@ Handling a page fault is performed as follows: - If both P bit and R/W bit of error code are set, this could possibly be handled as a "fast page fault" (fixed without taking the MMU lock). See - the description in Documentation/virt/kvm/locking.txt. + the description in Documentation/virt/kvm/locking.rst. - if needed, walk the guest page tables to determine the guest translation (gva->gpa or ngpa->gpa) diff --git a/Documentation/virt/kvm/review-checklist.rst b/Documentation/virt/kvm/review-checklist.rst index 1f86a9d3f705..dc01aea4057b 100644 --- a/Documentation/virt/kvm/review-checklist.rst +++ b/Documentation/virt/kvm/review-checklist.rst @@ -10,7 +10,7 @@ Review checklist for kvm patches 2. Patches should be against kvm.git master branch. 3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/virt/kvm/api.txt + - the API must be documented in Documentation/virt/kvm/api.rst - the API must be discoverable using KVM_CHECK_EXTENSION 4. New state must include support for save/restore. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8071952e9cf2..fd59fee84631 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3586,7 +3586,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See - * Documentation/virt/kvm/locking.txt to get more detail. + * Documentation/virt/kvm/locking.rst to get more detail. 
*/ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c index a5fd8975f3d3..a6abb701bfc6 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c @@ -8,7 +8,7 @@ * This file add support for AES cipher with 128,192,256 bits keysize in * CBC and ECB mode. * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c index 3e4e4bbda34c..b957061424a1 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c @@ -7,7 +7,7 @@ * * Core file which registers crypto algorithms supported by the CryptoEngine. * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include #include diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c index 84d52fc3a2da..c89cb2ee2496 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-cipher.c @@ -8,7 +8,7 @@ * This file add support for AES cipher with 128,192,256 bits keysize in * CBC and ECB mode. 
* - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c index 6b301afffd11..8ba4f9c81dac 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c @@ -7,7 +7,7 @@ * * Core file which registers crypto algorithms supported by the SecuritySystem * - * You could find a link for the datasheet in Documentation/arm/sunxi/README + * You could find a link for the datasheet in Documentation/arm/sunxi.rst */ #include #include diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c index 97f0f8b23b5d..8a1e1b95b379 100644 --- a/drivers/media/v4l2-core/v4l2-fwnode.c +++ b/drivers/media/v4l2-core/v4l2-fwnode.c @@ -980,7 +980,7 @@ static int v4l2_fwnode_reference_parse(struct device *dev, * * THIS EXAMPLE EXISTS MERELY TO DOCUMENT THIS FUNCTION. DO NOT USE IT AS A * REFERENCE IN HOW ACPI TABLES SHOULD BE WRITTEN!! See documentation under - * Documentation/acpi/dsd instead and especially graph.txt, + * Documentation/firmware-guide/acpi/dsd/ instead and especially graph.txt, * data-node-references.txt and leds.txt . * * Scope (\_SB.PCI0.I2C2) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 428c7dde6b4b..fdd632c833b4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. 
- * For ARM: See Documentation/virt/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.rst */ union { __u32 irq; @@ -1107,7 +1107,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virt/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.rst. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 4b95f9a31a2f..e5f32fcec68f 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virt/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.rst */ union { __u32 irq; @@ -1100,7 +1100,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virt/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.rst. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) -- cgit v1.2.3 From 0c1bc6b84525b96aa9fb8f6fbe8c5cb26a5c0ead Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:37 +0200 Subject: docs: filesystems: fix renamed references Some filesystem references got broken by a previous patch series I submitted. Address those. 
Signed-off-by: Mauro Carvalho Chehab Acked-by: David Sterba # fs/affs/Kconfig Link: https://lore.kernel.org/r/57318c53008dbda7f6f4a5a9e5787f4d37e8565a.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/ABI/stable/sysfs-devices-node | 2 +- Documentation/ABI/testing/procfs-smaps_rollup | 2 +- Documentation/admin-guide/cpu-load.rst | 2 +- Documentation/admin-guide/nfs/nfsroot.rst | 2 +- Documentation/driver-api/driver-model/device.rst | 4 ++-- Documentation/driver-api/driver-model/overview.rst | 2 +- Documentation/filesystems/dax.txt | 2 +- Documentation/filesystems/dnotify.txt | 2 +- Documentation/filesystems/ramfs-rootfs-initramfs.rst | 2 +- Documentation/filesystems/sysfs.rst | 2 +- Documentation/powerpc/firmware-assisted-dump.rst | 2 +- Documentation/process/adding-syscalls.rst | 2 +- Documentation/translations/it_IT/process/adding-syscalls.rst | 2 +- Documentation/translations/zh_CN/filesystems/sysfs.txt | 6 +++--- drivers/base/core.c | 2 +- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h | 2 +- fs/Kconfig | 2 +- fs/Kconfig.binfmt | 2 +- fs/adfs/Kconfig | 2 +- fs/affs/Kconfig | 2 +- fs/afs/Kconfig | 6 +++--- fs/bfs/Kconfig | 2 +- fs/cramfs/Kconfig | 2 +- fs/ecryptfs/Kconfig | 2 +- fs/hfs/Kconfig | 2 +- fs/hpfs/Kconfig | 2 +- fs/isofs/Kconfig | 2 +- fs/namespace.c | 2 +- fs/notify/inotify/Kconfig | 2 +- fs/ntfs/Kconfig | 2 +- fs/ocfs2/Kconfig | 2 +- fs/proc/Kconfig | 4 ++-- fs/romfs/Kconfig | 2 +- fs/sysfs/dir.c | 2 +- fs/sysfs/file.c | 2 +- fs/sysfs/mount.c | 2 +- fs/sysfs/symlink.c | 2 +- fs/sysv/Kconfig | 2 +- fs/udf/Kconfig | 2 +- include/linux/kobject.h | 2 +- include/linux/kobject_ns.h | 2 +- include/linux/relay.h | 2 +- include/linux/sysfs.h | 2 +- kernel/relay.c | 2 +- lib/kobject.c | 4 ++-- 45 files changed, 52 insertions(+), 52 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index df8413cf1468..484fc04bcc25 100644 --- 
a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -54,7 +54,7 @@ Date: October 2002 Contact: Linux Memory Management list Description: Provides information about the node's distribution and memory - utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.txt + utilization. Similar to /proc/meminfo, see Documentation/filesystems/proc.rst What: /sys/devices/system/node/nodeX/numastat Date: October 2002 diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup index 274df44d8b1b..046978193368 100644 --- a/Documentation/ABI/testing/procfs-smaps_rollup +++ b/Documentation/ABI/testing/procfs-smaps_rollup @@ -11,7 +11,7 @@ Description: Additionally, the fields Pss_Anon, Pss_File and Pss_Shmem are not present in /proc/pid/smaps. These fields represent the sum of the Pss field of each type (anon, file, shmem). - For more details, see Documentation/filesystems/proc.txt + For more details, see Documentation/filesystems/proc.rst and the procfs man page. Typical output looks like this: diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst index 2d01ce43d2a2..ebdecf864080 100644 --- a/Documentation/admin-guide/cpu-load.rst +++ b/Documentation/admin-guide/cpu-load.rst @@ -105,7 +105,7 @@ References ---------- - http://lkml.org/lkml/2007/2/12/6 -- Documentation/filesystems/proc.txt (1.8) +- Documentation/filesystems/proc.rst (1.8) Thanks diff --git a/Documentation/admin-guide/nfs/nfsroot.rst b/Documentation/admin-guide/nfs/nfsroot.rst index 82a4fda057f9..c6772075c80c 100644 --- a/Documentation/admin-guide/nfs/nfsroot.rst +++ b/Documentation/admin-guide/nfs/nfsroot.rst @@ -18,7 +18,7 @@ Mounting the root filesystem via NFS (nfsroot) In order to use a diskless system, such as an X-terminal or printer server for example, it is necessary for the root filesystem to be present on a non-disk device. 
This may be an initramfs (see -Documentation/filesystems/ramfs-rootfs-initramfs.txt), a ramdisk (see +Documentation/filesystems/ramfs-rootfs-initramfs.rst), a ramdisk (see Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The following text describes on how to use NFS for the root filesystem. For the rest of this text 'client' means the diskless system, and 'server' means the NFS diff --git a/Documentation/driver-api/driver-model/device.rst b/Documentation/driver-api/driver-model/device.rst index 2b868d49d349..b9b022371e85 100644 --- a/Documentation/driver-api/driver-model/device.rst +++ b/Documentation/driver-api/driver-model/device.rst @@ -50,10 +50,10 @@ Attributes Attributes of devices can be exported by a device driver through sysfs. -Please see Documentation/filesystems/sysfs.txt for more information +Please see Documentation/filesystems/sysfs.rst for more information on how sysfs works. -As explained in Documentation/kobject.txt, device attributes must be +As explained in Documentation/core-api/kobject.rst, device attributes must be created before the KOBJ_ADD uevent is generated. The only way to realize that is by defining an attribute group. diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst index d4d1e9b40e0c..e98d0ab4a9b6 100644 --- a/Documentation/driver-api/driver-model/overview.rst +++ b/Documentation/driver-api/driver-model/overview.rst @@ -121,4 +121,4 @@ device-specific data or tunable interfaces. More information about the sysfs directory layout can be found in the other documents in this directory and in the file -Documentation/filesystems/sysfs.txt. +Documentation/filesystems/sysfs.rst. 
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 679729442fd2..735f3859b19f 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -74,7 +74,7 @@ are zeroed out and converted to written extents before being returned to avoid exposure of uninitialized data through mmap. These filesystems may be used for inspiration: -- ext2: see Documentation/filesystems/ext2.txt +- ext2: see Documentation/filesystems/ext2.rst - ext4: see Documentation/filesystems/ext4/ - xfs: see Documentation/admin-guide/xfs.rst diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt index 15156883d321..08d575ece45d 100644 --- a/Documentation/filesystems/dnotify.txt +++ b/Documentation/filesystems/dnotify.txt @@ -67,4 +67,4 @@ See tools/testing/selftests/filesystems/dnotify_test.c for an example. NOTE ---- Beginning with Linux 2.6.13, dnotify has been replaced by inotify. -See Documentation/filesystems/inotify.txt for more information on it. +See Documentation/filesystems/inotify.rst for more information on it. diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.rst b/Documentation/filesystems/ramfs-rootfs-initramfs.rst index 6c576e241d86..3fddacc6bf14 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.rst +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.rst @@ -71,7 +71,7 @@ be allowed write access to a ramfs mount. A ramfs derivative called tmpfs was created to add size limits, and the ability to write the data to swap space. Normal users can be allowed write access to -tmpfs mounts. See Documentation/filesystems/tmpfs.txt for more information. +tmpfs mounts. See Documentation/filesystems/tmpfs.rst for more information. What is rootfs? 
--------------- diff --git a/Documentation/filesystems/sysfs.rst b/Documentation/filesystems/sysfs.rst index 290891c3fecb..ab0f7795792b 100644 --- a/Documentation/filesystems/sysfs.rst +++ b/Documentation/filesystems/sysfs.rst @@ -20,7 +20,7 @@ a means to export kernel data structures, their attributes, and the linkages between them to userspace. sysfs is tied inherently to the kobject infrastructure. Please read -Documentation/kobject.txt for more information concerning the kobject +Documentation/core-api/kobject.rst for more information concerning the kobject interface. diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index b3f3ee135dbe..20ea8cdee0aa 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -344,7 +344,7 @@ Here is the list of files under powerpc debugfs: NOTE: - Please refer to Documentation/filesystems/debugfs.txt on + Please refer to Documentation/filesystems/debugfs.rst on how to mount the debugfs filesystem. diff --git a/Documentation/process/adding-syscalls.rst b/Documentation/process/adding-syscalls.rst index 1c3a840d06b9..a6b4a3a5bf3f 100644 --- a/Documentation/process/adding-syscalls.rst +++ b/Documentation/process/adding-syscalls.rst @@ -33,7 +33,7 @@ interface. to a somewhat opaque API. - If you're just exposing runtime system information, a new node in sysfs - (see ``Documentation/filesystems/sysfs.txt``) or the ``/proc`` filesystem may + (see ``Documentation/filesystems/sysfs.rst``) or the ``/proc`` filesystem may be more appropriate. However, access to these mechanisms requires that the relevant filesystem is mounted, which might not always be the case (e.g. in a namespaced/sandboxed/chrooted environment). 
Avoid adding any API to diff --git a/Documentation/translations/it_IT/process/adding-syscalls.rst b/Documentation/translations/it_IT/process/adding-syscalls.rst index c3a3439595a6..bff0a82bf127 100644 --- a/Documentation/translations/it_IT/process/adding-syscalls.rst +++ b/Documentation/translations/it_IT/process/adding-syscalls.rst @@ -39,7 +39,7 @@ vostra interfaccia. un qualche modo opaca. - Se dovete esporre solo delle informazioni sul sistema, un nuovo nodo in - sysfs (vedere ``Documentation/filesystems/sysfs.txt``) o + sysfs (vedere ``Documentation/filesystems/sysfs.rst``) o in procfs potrebbe essere sufficiente. Tuttavia, l'accesso a questi meccanismi richiede che il filesystem sia montato, il che potrebbe non essere sempre vero (per esempio, in ambienti come namespace/sandbox/chroot). diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index a15c3ebdfa82..fcf620049d11 100644 --- a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -1,4 +1,4 @@ -Chinese translated version of Documentation/filesystems/sysfs.txt +Chinese translated version of Documentation/filesystems/sysfs.rst If you have any comment or update to the content, please contact the original document maintainer directly. 
However, if you have a problem @@ -10,7 +10,7 @@ Maintainer: Patrick Mochel Mike Murphy Chinese maintainer: Fu Wei --------------------------------------------------------------------- -Documentation/filesystems/sysfs.txt 的中文翻译 +Documentation/filesystems/sysfs.rst 的中文翻译 如果想评论或更新本文的内容,请直接联系原文档的维护者。如果你使用英文 交流有困难的话,也可以向中文版维护者求助。如果本翻译更新不及时或者翻 @@ -40,7 +40,7 @@ sysfs 是一个最初基于 ramfs 且位于内存的文件系统。它提供导 数据结构及其属性,以及它们之间的关联到用户空间的方法。 sysfs 始终与 kobject 的底层结构紧密相关。请阅读 -Documentation/kobject.txt 文档以获得更多关于 kobject 接口的 +Documentation/core-api/kobject.rst 文档以获得更多关于 kobject 接口的 信息。 diff --git a/drivers/base/core.c b/drivers/base/core.c index 139cdf7e7327..89fcc0a09f94 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1374,7 +1374,7 @@ static void device_release(struct kobject *kobj) else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); else - WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/kobject.txt.\n", + WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n", dev_name(dev)); kfree(p); } diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h index 211f5de99a44..9aba2910d83a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h @@ -170,7 +170,7 @@ struct dpu_global_state * * Main debugfs documentation is located at, * - * Documentation/filesystems/debugfs.txt + * Documentation/filesystems/debugfs.rst * * @dpu_debugfs_setup_regset32: Initialize data for dpu_debugfs_create_regset32 * @dpu_debugfs_create_regset32: Create 32-bit register dump file diff --git a/fs/Kconfig b/fs/Kconfig index f08fbbfafd9a..d1ad3935fb85 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -166,7 +166,7 @@ config TMPFS space. If you unmount a tmpfs instance, everything stored therein is lost. - See for details. + See for details. 
config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 62dc4f577ba1..3fbbd54f50fd 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -72,7 +72,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS The core dump behavior can be controlled per process using the /proc/PID/coredump_filter pseudo-file; this setting is - inherited. See Documentation/filesystems/proc.txt for details. + inherited. See Documentation/filesystems/proc.rst for details. This config option changes the default setting of coredump_filter seen at boot time. If unsure, say Y. diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index df4650dccf68..44738fed6625 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -12,7 +12,7 @@ config ADFS_FS The ADFS partition should be the first partition (i.e., /dev/[hs]d?1) on each of your drives. Please read the file - for further details. + for further details. To compile this code as a module, choose M here: the module will be called adfs. diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 84c46b9025c5..eb9d0ab850cb 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -9,7 +9,7 @@ config AFFS_FS FFS partition on your hard drive. Amiga floppies however cannot be read with this driver due to an incompatibility of the floppy controller used in an Amiga and the standard floppy controller in - PCs and workstations. Read + PCs and workstations. Read and . With this driver you can also mount disk files used by Bernd diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 3fb1f559e317..1ad211d72b3b 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -8,7 +8,7 @@ config AFS_FS If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. - See for more information. + See for more information. If unsure, say N. @@ -18,7 +18,7 @@ config AFS_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See for more information. 
+ See for more information. If unsure, say N. @@ -37,6 +37,6 @@ config AFS_DEBUG_CURSOR the dmesg log if the server rotation algorithm fails to successfully contact a server. - See for more information. + See for more information. If unsure, say N. diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3e1247f07913..3a757805b585 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -11,7 +11,7 @@ config BFS_FS on your /stand slice from within Linux. You then also need to say Y to "UnixWare slices support", below. More information about the BFS file system is contained in the file - . + . If you don't know what this is about, say N. diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index c8bebb70a971..d98cef0dbb6b 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -9,7 +9,7 @@ config CRAMFS limited to 256MB file systems (with 16MB files), and doesn't support 16/32 bits uid/gid, hard links and timestamps. - See and + See and for further information. To compile this as a module, choose M here: the module will be called diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 522c35d5292b..1bdeaa6d5790 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -7,7 +7,7 @@ config ECRYPT_FS select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See - to learn more about + to learn more about eCryptfs. Userspace components are required and can be obtained from . diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index 44f6e89bcb75..129926b5142d 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -6,7 +6,7 @@ config HFS_FS help If you say Y here, you will be able to mount Macintosh-formatted floppy disks and hard drive partitions with full read-write access. - Please read to learn about + Please read to learn about the available mount options. 
To compile this file system support as a module, choose M here: the diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 56aa0336254a..2b36dc6f0a10 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -9,7 +9,7 @@ config HPFS_FS write files to an OS/2 HPFS partition on your hard drive. OS/2 floppies however are in regular MSDOS format, so you don't need this option in order to be able to read them. Read - . + . To compile this file system support as a module, choose M here: the module will be called hpfs. If unsure, say N. diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 5e7419599f50..08ffd37b9bb8 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -8,7 +8,7 @@ config ISO9660_FS long Unix filenames and symbolic links are also supported by this driver. If you have a CD-ROM drive and want to do more with it than just listen to audio CDs and watch its LEDs, say Y (and read - and the CD-ROM-HOWTO, + and the CD-ROM-HOWTO, available from ), thereby enlarging your kernel by about 27 KB; otherwise say N. diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..5f036dc711b6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3595,7 +3595,7 @@ EXPORT_SYMBOL(path_is_under); * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 6736e47d94d8..7715fadd5fff 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -12,6 +12,6 @@ config INOTIFY_USER new features including multiple file events, one-shot support, and unmount notification. - For more information, see + For more information, see If unsure, say Y. 
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index de9fb5cff226..1667a7e590d8 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -18,7 +18,7 @@ config NTFS_FS the Linux 2.4 kernel series is separately available as a patch from the project web site. - For more information see + For more information see and . To compile this file system support as a module, choose M here: the diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 46bba20da6b5..1177c33df895 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -21,7 +21,7 @@ config OCFS2_FS OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ For more information on OCFS2, see the file - . + . config OCFS2_FS_O2CB tristate "O2CB Kernelspace Clustering" diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 27ef84d99f59..971a42f6357d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -23,7 +23,7 @@ config PROC_FS /proc" or the equivalent line in /etc/fstab does the job. The /proc file system is explained in the file - and on the proc(5) manpage + and on the proc(5) manpage ("man 5 proc"). This option will enlarge your kernel by about 67 KB. Several @@ -95,7 +95,7 @@ config PROC_CHILDREN default n help Provides a fast way to retrieve first level children pids of a task. See - for more information. + for more information. Say Y if you are running any user-space software which takes benefit from this interface. For example, rkt is such a piece of software. diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index ad4c45788896..9737b8e68878 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -6,7 +6,7 @@ config ROMFS_FS This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for other read-only media as well. Read - for details. + for details. To compile this file system support as a module, choose M here: the module will be called romfs. 
Note that the file system of your diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aa85f2874a9f..59dffd5ca517 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 26bbf960e2a2..f275fcda62fb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db81cfbab9d6..e747c135c1d1 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c4deecc80f67..5603530a1a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index d4edf7d9ae10..b4e23e03fbeb 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -28,7 +28,7 @@ config SYSV_FS tar" or preferably "info tar"). Note also that this option has nothing whatsoever to do with the option "System V IPC". Read about the System V file system in - . + . Saying Y here will enlarge your kernel by about 27 KB. 
To compile this as a module, choose M here: the module will be called diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 6848de581ce1..26e1a49f3ba7 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -9,7 +9,7 @@ config UDF_FS compatible with standard unix file systems, it is also suitable for removable USB disks. Say Y if you intend to mount DVD discs or CDRW's written in packet mode, or if you want to use UDF for removable USB - disks. Please read . + disks. Please read . To compile this file system support as a module, choose M here: the module will be called udf. diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e2ca0a292e21..fc8d83e91379 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -7,7 +7,7 @@ * Copyright (c) 2006-2008 Greg Kroah-Hartman * Copyright (c) 2006-2008 Novell Inc. * - * Please read Documentation/kobject.txt before using the kobject + * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index 069aa2ebef90..2b5b64256cf4 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -8,7 +8,7 @@ * * Split from kobject.h by David Howells (dhowells@redhat.com) * - * Please read Documentation/kobject.txt before using the kobject + * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ diff --git a/include/linux/relay.h b/include/linux/relay.h index c759f96e39c1..e13a333e7c37 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -141,7 +141,7 @@ struct rchan_callbacks * cause relay_open() to create a single global buffer rather * than the default set of per-cpu buffers. * - * See Documentation/filesystems/relay.txt for more info. + * See Documentation/filesystems/relay.rst for more info. 
*/ struct dentry *(*create_buf_file)(const char *filename, struct dentry *parent, diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 80bb865b3a33..86067dbe7745 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -7,7 +7,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ diff --git a/kernel/relay.c b/kernel/relay.c index ade14fb7ce2e..d0c9c287680a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relay.txt for an overview. + * See Documentation/filesystems/relay.rst for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) diff --git a/lib/kobject.c b/lib/kobject.c index 83198cb37d8d..65fa7bf70c57 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -6,7 +6,7 @@ * Copyright (c) 2006-2007 Greg Kroah-Hartman * Copyright (c) 2006-2007 Novell Inc. * - * Please see the file Documentation/kobject.txt for critical information + * Please see the file Documentation/core-api/kobject.rst for critical information * about using the kobject interface. */ @@ -670,7 +670,7 @@ static void kobject_cleanup(struct kobject *kobj) kobject_name(kobj), kobj, __func__, kobj->parent); if (t && !t->release) - pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/kobject.txt.\n", + pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. 
See Documentation/core-api/kobject.rst.\n", kobject_name(kobj), kobj); /* send "remove" if the caller did not do it but sent "add" */ -- cgit v1.2.3 From d91589556b6a096d53f47ed3d91172a2788d4cdb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:38 +0200 Subject: docs: amu: supress some Sphinx warnings Add extra blank lines on some places, in order to avoid those warnings when building the docs: Documentation/arm64/amu.rst:26: WARNING: Unexpected indentation. Documentation/arm64/amu.rst:60: WARNING: Unexpected indentation. Documentation/arm64/amu.rst:81: WARNING: Unexpected indentation. Documentation/arm64/amu.rst:108: WARNING: Unexpected indentation. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/ab0881638fc41ed790b3307a8e022ec84b7cce7e.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/arm64/amu.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/arm64/amu.rst b/Documentation/arm64/amu.rst index 5057b11100ed..452ec8b115c2 100644 --- a/Documentation/arm64/amu.rst +++ b/Documentation/arm64/amu.rst @@ -23,6 +23,7 @@ optional external memory-mapped interface. Version 1 of the Activity Monitors architecture implements a counter group of four fixed and architecturally defined 64-bit event counters. + - CPU cycle counter: increments at the frequency of the CPU. - Constant counter: increments at the fixed frequency of the system clock. @@ -57,6 +58,7 @@ counters, only the presence of the extension. Firmware (code running at higher exception levels, e.g. arm-tf) support is needed to: + - Enable access for lower exception levels (EL2 and EL1) to the AMU registers. - Enable the counters. If not enabled these will read as 0. @@ -78,6 +80,7 @@ are not trapped in EL2/EL3. 
The fixed counters of AMUv1 are accessible though the following system register definitions: + - SYS_AMEVCNTR0_CORE_EL0 - SYS_AMEVCNTR0_CONST_EL0 - SYS_AMEVCNTR0_INST_RET_EL0 @@ -93,6 +96,7 @@ Userspace access ---------------- Currently, access from userspace to the AMU registers is disabled due to: + - Security reasons: they might expose information about code executed in secure mode. - Purpose: AMU counters are intended for system management use. @@ -105,6 +109,7 @@ Virtualization Currently, access from userspace (EL0) and kernelspace (EL1) on the KVM guest side is disabled due to: + - Security reasons: they might expose information about code executed by other guests or the host. -- cgit v1.2.3 From 877a37d31e0f9f97d04f8d211924dc16df74d31a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:39 +0200 Subject: docs: arm64: booting.rst: get rid of some warnings Get rid of those warnings: Documentation/arm64/booting.rst:253: WARNING: Unexpected indentation. Documentation/arm64/booting.rst:259: WARNING: Block quote ends without a blank line; unexpected unindent. By adding an extra blank lines where needed. While here, use list markups on some places, as otherwise Sphinx will consider the next lines as continuation of the privious ones. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/121b267be0a102fde73498c31792e5a9309013cc.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/arm64/booting.rst | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst index a3f1a47b6f1c..e50186092948 100644 --- a/Documentation/arm64/booting.rst +++ b/Documentation/arm64/booting.rst @@ -173,7 +173,9 @@ Before jumping into the kernel, the following conditions must be met: - Caches, MMUs The MMU must be off. + Instruction cache may be on or off. 
+ The address range corresponding to the loaded kernel image must be cleaned to the PoC. In the presence of a system cache or other coherent masters with caches enabled, this will typically require @@ -238,6 +240,7 @@ Before jumping into the kernel, the following conditions must be met: - The DT or ACPI tables must describe a GICv2 interrupt controller. For CPUs with pointer authentication functionality: + - If EL3 is present: - SCR_EL3.APK (bit 16) must be initialised to 0b1 @@ -249,18 +252,22 @@ Before jumping into the kernel, the following conditions must be met: - HCR_EL2.API (bit 41) must be initialised to 0b1 For CPUs with Activity Monitors Unit v1 (AMUv1) extension present: + - If EL3 is present: - CPTR_EL3.TAM (bit 30) must be initialised to 0b0 - CPTR_EL2.TAM (bit 30) must be initialised to 0b0 - AMCNTENSET0_EL0 must be initialised to 0b1111 - AMCNTENSET1_EL0 must be initialised to a platform specific value - having 0b1 set for the corresponding bit for each of the auxiliary - counters present. + + - CPTR_EL3.TAM (bit 30) must be initialised to 0b0 + - CPTR_EL2.TAM (bit 30) must be initialised to 0b0 + - AMCNTENSET0_EL0 must be initialised to 0b1111 + - AMCNTENSET1_EL0 must be initialised to a platform specific value + having 0b1 set for the corresponding bit for each of the auxiliary + counters present. + - If the kernel is entered at EL1: - AMCNTENSET0_EL0 must be initialised to 0b1111 - AMCNTENSET1_EL0 must be initialised to a platform specific value - having 0b1 set for the corresponding bit for each of the auxiliary - counters present. + + - AMCNTENSET0_EL0 must be initialised to 0b1111 + - AMCNTENSET1_EL0 must be initialised to a platform specific value + having 0b1 set for the corresponding bit for each of the auxiliary + counters present. The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. 
All CPUs must @@ -304,7 +311,8 @@ following manner: Documentation/devicetree/bindings/arm/psci.yaml. - Secondary CPU general-purpose register settings - x0 = 0 (reserved for future use) - x1 = 0 (reserved for future use) - x2 = 0 (reserved for future use) - x3 = 0 (reserved for future use) + + - x0 = 0 (reserved for future use) + - x1 = 0 (reserved for future use) + - x2 = 0 (reserved for future use) + - x3 = 0 (reserved for future use) -- cgit v1.2.3 From a588332fba0b13d5090537812865f3170fdf19c2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:40 +0200 Subject: docs: pci: boot-interrupts.rst: improve html output There are some warnings with this file: /Documentation/PCI/boot-interrupts.rst:42: WARNING: Unexpected indentation. /Documentation/PCI/boot-interrupts.rst:52: WARNING: Block quote ends without a blank line; unexpected unindent. /Documentation/PCI/boot-interrupts.rst:92: WARNING: Unexpected indentation. /Documentation/PCI/boot-interrupts.rst:98: WARNING: Unexpected indentation. /Documentation/PCI/boot-interrupts.rst:136: WARNING: Unexpected indentation. It turns that this file conversion to ReST could be improved, in order to remove the warnings and provide a better output. So, fix the warnings by adjusting blank lines, add a table and some list markups. Also, mark endnodes as such. 
Acked-by: Bjorn Helgaas Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/a6a9eb16eede10731bcce69a600ab12d92e6ba47.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/PCI/boot-interrupts.rst | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/Documentation/PCI/boot-interrupts.rst b/Documentation/PCI/boot-interrupts.rst index d078ef3eb192..2ec70121bfca 100644 --- a/Documentation/PCI/boot-interrupts.rst +++ b/Documentation/PCI/boot-interrupts.rst @@ -32,12 +32,13 @@ interrupt goes unhandled over time, they are tracked by the Linux kernel as Spurious Interrupts. The IRQ will be disabled by the Linux kernel after it reaches a specific count with the error "nobody cared". This disabled IRQ now prevents valid usage by an existing interrupt which may happen to share -the IRQ line. +the IRQ line:: irq 19: nobody cared (try booting with the "irqpoll" option) CPU: 0 PID: 2988 Comm: irq/34-nipalk Tainted: 4.14.87-rt49-02410-g4a640ec-dirty #1 Hardware name: National Instruments NI PXIe-8880/NI PXIe-8880, BIOS 2.1.5f1 01/09/2020 Call Trace: + ? dump_stack+0x46/0x5e ? __report_bad_irq+0x2e/0xb0 @@ -85,15 +86,18 @@ Mitigations The mitigations take the form of PCI quirks. The preference has been to first identify and make use of a means to disable the routing to the PCH. In such a case a quirk to disable boot interrupt generation can be -added.[1] +added. [1]_ - Intel® 6300ESB I/O Controller Hub +Intel® 6300ESB I/O Controller Hub Alternate Base Address Register: BIE: Boot Interrupt Enable - 0 = Boot interrupt is enabled. - 1 = Boot interrupt is disabled. - Intel® Sandy Bridge through Sky Lake based Xeon servers: + == =========================== + 0 Boot interrupt is enabled. + 1 Boot interrupt is disabled. 
+ == =========================== + +Intel® Sandy Bridge through Sky Lake based Xeon servers: Coherent Interface Protocol Interrupt Control dis_intx_route2pch/dis_intx_route2ich/dis_intx_route2dmi2: When this bit is set. Local INTx messages received from the @@ -109,12 +113,12 @@ line by default. Therefore, on chipsets where this INTx routing cannot be disabled, the Linux kernel will reroute the valid interrupt to its legacy interrupt. This redirection of the handler will prevent the occurrence of the spurious interrupt detection which would ordinarily disable the IRQ -line due to excessive unhandled counts.[2] +line due to excessive unhandled counts. [2]_ The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS exists to enable (or disable) the redirection of the interrupt handler to the PCH interrupt line. The option can be overridden by either pci=ioapicreroute or -pci=noioapicreroute.[3] +pci=noioapicreroute. [3]_ More Documentation @@ -127,19 +131,19 @@ into the evolution of its handling with chipsets. 
Example of disabling of the boot interrupt ------------------------------------------ -Intel® 6300ESB I/O Controller Hub (Document # 300641-004US) + - Intel® 6300ESB I/O Controller Hub (Document # 300641-004US) 5.7.3 Boot Interrupt https://www.intel.com/content/dam/doc/datasheet/6300esb-io-controller-hub-datasheet.pdf -Intel® Xeon® Processor E5-1600/2400/2600/4600 v3 Product Families -Datasheet - Volume 2: Registers (Document # 330784-003) + - Intel® Xeon® Processor E5-1600/2400/2600/4600 v3 Product Families + Datasheet - Volume 2: Registers (Document # 330784-003) 6.6.41 cipintrc Coherent Interface Protocol Interrupt Control https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf Example of handler rerouting ---------------------------- -Intel® 6700PXH 64-bit PCI Hub (Document # 302628) + - Intel® 6700PXH 64-bit PCI Hub (Document # 302628) 2.15.2 PCI Express Legacy INTx Support and Boot Interrupt https://www.intel.com/content/dam/doc/datasheet/6700pxh-64-bit-pci-hub-datasheet.pdf @@ -150,6 +154,6 @@ Cheers, Sean V Kelley sean.v.kelley@linux.intel.com -[1] https://lore.kernel.org/r/12131949181903-git-send-email-sassmann@suse.de/ -[2] https://lore.kernel.org/r/12131949182094-git-send-email-sassmann@suse.de/ -[3] https://lore.kernel.org/r/487C8EA7.6020205@suse.de/ +.. [1] https://lore.kernel.org/r/12131949181903-git-send-email-sassmann@suse.de/ +.. [2] https://lore.kernel.org/r/12131949182094-git-send-email-sassmann@suse.de/ +.. [3] https://lore.kernel.org/r/487C8EA7.6020205@suse.de/ -- cgit v1.2.3 From cfa204984d57ef205b09c7f6db9b562821b51dbd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:41 +0200 Subject: docs: ras: get rid of some warnings Sphinx produce some warnings due to a bad table format: Documentation/admin-guide/ras.rst:358: WARNING: Definition list ends without a blank line; unexpected unindent. 
Documentation/admin-guide/ras.rst:358: WARNING: Definition list ends without a blank line; unexpected unindent. Documentation/admin-guide/ras.rst:363: WARNING: Definition list ends without a blank line; unexpected unindent. Documentation/admin-guide/ras.rst:363: WARNING: Definition list ends without a blank line; unexpected unindent. Rearrange the things there in order to supress the warnings while being precise at the Sphinx output about how ranks are mapped into csrows. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9e1bb44d6dbedb5b6f049d081b47da1f9620de16.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/ras.rst | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index 0310db624964..22b31bc7e129 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -351,15 +351,17 @@ controllers. 
The following example will assume 2 channels: +------------+-----------+-----------+ | | ``ch0`` | ``ch1`` | +============+===========+===========+ - | ``csrow0`` | DIMM_A0 | DIMM_B0 | - | | rank0 | rank0 | - +------------+ - | - | + | |**DIMM_A0**|**DIMM_B0**| + +------------+-----------+-----------+ + | ``csrow0`` | rank0 | rank0 | + +------------+-----------+-----------+ | ``csrow1`` | rank1 | rank1 | +------------+-----------+-----------+ - | ``csrow2`` | DIMM_A1 | DIMM_B1 | - | | rank0 | rank0 | - +------------+ - | - | - | ``csrow3`` | rank1 | rank1 | + | |**DIMM_A1**|**DIMM_B1**| + +------------+-----------+-----------+ + | ``csrow2`` | rank0 | rank0 | + +------------+-----------+-----------+ + | ``csrow3`` | rank1 | rank1 | +------------+-----------+-----------+ In the above example, there are 4 physical slots on the motherboard -- cgit v1.2.3 From 00aff95659616a131ae4b461aa21e429c33907b5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:42 +0200 Subject: docs: ras: don't need to repeat twice the same thing We don't need to say twice "for the first time" at the same paragraph. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/f76dcae96de8b1bb8ee37a79781c111c825e26d6.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/ras.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index 22b31bc7e129..6cbaab975ee5 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -212,7 +212,7 @@ EDAC - Error Detection And Correction purposes. When the subsystem was pushed upstream for the first time, on - Kernel 2.6.16, for the first time, it was renamed to ``EDAC``. + Kernel 2.6.16, it was renamed to ``EDAC``. 
Purpose ------- -- cgit v1.2.3 From ad89c8852fde7ab0c4007f5b753517cf508d12be Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:44 +0200 Subject: docs: spi: spi.h: fix a doc building warning We need to add a blank line to avoid this warning: ./include/linux/spi/spi.h:401: WARNING: Unexpected indentation. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/1c701b3ac903dc0bc304dca958fbdee53bd38dc3.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/spi/spi.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 38286de779e3..aac57b5b7c21 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -394,6 +394,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * for example doing DMA mapping. Called from threaded * context. * @transfer_one: transfer a single spi_transfer. + * * - return 0 if the transfer is finished, * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must -- cgit v1.2.3 From f08252469ef5b10863424e502bb1ed049d18390c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:45 +0200 Subject: docs: drivers: fix some warnings at base/platform.c when building docs Currrently, two warnings are generated when building docs: ./drivers/base/platform.c:136: WARNING: Unexpected indentation. ./drivers/base/platform.c:214: WARNING: Unexpected indentation. As examples are code blocks, they should use "::" markup. However, Example:: Is currently interpreted as a new section. 
While we could fix kernel-doc to accept such new syntax, it is easier to just replace it with: For Example:: Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/564273815a76136fb5e453969b1012a786d99e28.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- drivers/base/platform.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 5255550b7c34..a07e28eab7d7 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -147,7 +147,8 @@ EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource_byname); * request_irq() APIs. This is the same as platform_get_irq(), except that it * does not print an error message if an IRQ can not be obtained. * - * Example: + * For example:: + * * int irq = platform_get_irq_optional(pdev, 0); * if (irq < 0) * return irq; @@ -226,7 +227,8 @@ EXPORT_SYMBOL_GPL(platform_get_irq_optional); * IRQ fails. Device drivers should check the return value for errors so as to * not pass a negative integer value to the request_irq() APIs. * - * Example: + * For example:: + * * int irq = platform_get_irq(pdev, 0); * if (irq < 0) * return irq; -- cgit v1.2.3 From 14a7e51ff184281a8c02c1cf803df15fd1ac19c8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:46 +0200 Subject: docs: mm: userfaultfd.rst: use ``foo`` for literals Several parts of this document define literals: ioctl names, function calls, directory patches, etc. Mark those as literal blocks, in order to improve its readability (both at text mode and after parsed by Sphinx. This fixes those two warnings: Documentation/admin-guide/mm/userfaultfd.rst:139: WARNING: Inline emphasis start-string without end-string. Documentation/admin-guide/mm/userfaultfd.rst:139: WARNING: Inline emphasis start-string without end-string. produced during documentation build. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/2ae061761baf8fe00cdf8a7e6dae293756849a05.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/mm/userfaultfd.rst | 207 ++++++++++++++------------- 1 file changed, 104 insertions(+), 103 deletions(-) diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index c30176e67900..740d111faf1c 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -12,107 +12,107 @@ and more generally they allow userland to take control of various memory page faults, something otherwise only the kernel code could do. For example userfaults allows a proper and more optimal implementation -of the PROT_NONE+SIGSEGV trick. +of the ``PROT_NONE+SIGSEGV`` trick. Design ====== -Userfaults are delivered and resolved through the userfaultfd syscall. +Userfaults are delivered and resolved through the ``userfaultfd`` syscall. 
-The userfaultfd (aside from registering and unregistering virtual +The ``userfaultfd`` (aside from registering and unregistering virtual memory ranges) provides two primary functionalities: -1) read/POLLIN protocol to notify a userland thread of the faults +1) ``read/POLLIN`` protocol to notify a userland thread of the faults happening -2) various UFFDIO_* ioctls that can manage the virtual memory regions - registered in the userfaultfd that allows userland to efficiently +2) various ``UFFDIO_*`` ioctls that can manage the virtual memory regions + registered in the ``userfaultfd`` that allows userland to efficiently resolve the userfaults it receives via 1) or to manage the virtual memory in the background The real advantage of userfaults if compared to regular virtual memory management of mremap/mprotect is that the userfaults in all their operations never involve heavyweight structures like vmas (in fact the -userfaultfd runtime load never takes the mmap_sem for writing). +``userfaultfd`` runtime load never takes the mmap_sem for writing). Vmas are not suitable for page- (or hugepage) granular fault tracking when dealing with virtual address spaces that could span Terabytes. Too many vmas would be needed for that. -The userfaultfd once opened by invoking the syscall, can also be +The ``userfaultfd`` once opened by invoking the syscall, can also be passed using unix domain sockets to a manager process, so the same manager process could handle the userfaults of a multitude of different processes without them being aware about what is going on -(well of course unless they later try to use the userfaultfd +(well of course unless they later try to use the ``userfaultfd`` themselves on the same region the manager is already tracking, which -is a corner case that would currently return -EBUSY). +is a corner case that would currently return ``-EBUSY``). 
API === -When first opened the userfaultfd must be enabled invoking the -UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or -a later API version) which will specify the read/POLLIN protocol -userland intends to speak on the UFFD and the uffdio_api.features -userland requires. The UFFDIO_API ioctl if successful (i.e. if the -requested uffdio_api.api is spoken also by the running kernel and the +When first opened the ``userfaultfd`` must be enabled invoking the +``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or +a later API version) which will specify the ``read/POLLIN`` protocol +userland intends to speak on the ``UFFD`` and the ``uffdio_api.features`` +userland requires. The ``UFFDIO_API`` ioctl if successful (i.e. if the +requested ``uffdio_api.api`` is spoken also by the running kernel and the requested features are going to be enabled) will return into -uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of +``uffdio_api.features`` and ``uffdio_api.ioctls`` two 64bit bitmasks of respectively all the available features of the read(2) protocol and the generic ioctl available. -The uffdio_api.features bitmask returned by the UFFDIO_API ioctl -defines what memory types are supported by the userfaultfd and what +The ``uffdio_api.features`` bitmask returned by the ``UFFDIO_API`` ioctl +defines what memory types are supported by the ``userfaultfd`` and what events, except page fault notifications, may be generated. -If the kernel supports registering userfaultfd ranges on hugetlbfs -virtual memory areas, UFFD_FEATURE_MISSING_HUGETLBFS will be set in -uffdio_api.features. Similarly, UFFD_FEATURE_MISSING_SHMEM will be -set if the kernel supports registering userfaultfd ranges on shared -memory (covering all shmem APIs, i.e. tmpfs, IPCSHM, /dev/zero -MAP_SHARED, memfd_create, etc). 
+If the kernel supports registering ``userfaultfd`` ranges on hugetlbfs +virtual memory areas, ``UFFD_FEATURE_MISSING_HUGETLBFS`` will be set in +``uffdio_api.features``. Similarly, ``UFFD_FEATURE_MISSING_SHMEM`` will be +set if the kernel supports registering ``userfaultfd`` ranges on shared +memory (covering all shmem APIs, i.e. tmpfs, ``IPCSHM``, ``/dev/zero``, +``MAP_SHARED``, ``memfd_create``, etc). -The userland application that wants to use userfaultfd with hugetlbfs +The userland application that wants to use ``userfaultfd`` with hugetlbfs or shared memory need to set the corresponding flag in -uffdio_api.features to enable those features. +``uffdio_api.features`` to enable those features. If the userland desires to receive notifications for events other than -page faults, it has to verify that uffdio_api.features has appropriate -UFFD_FEATURE_EVENT_* bits set. These events are described in more +page faults, it has to verify that ``uffdio_api.features`` has appropriate +``UFFD_FEATURE_EVENT_*`` bits set. These events are described in more detail below in "Non-cooperative userfaultfd" section. -Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should -be invoked (if present in the returned uffdio_api.ioctls bitmask) to -register a memory range in the userfaultfd by setting the -uffdio_register structure accordingly. The uffdio_register.mode +Once the ``userfaultfd`` has been enabled the ``UFFDIO_REGISTER`` ioctl should +be invoked (if present in the returned ``uffdio_api.ioctls`` bitmask) to +register a memory range in the ``userfaultfd`` by setting the +uffdio_register structure accordingly. The ``uffdio_register.mode`` bitmask will specify to the kernel which kind of faults to track for -the range (UFFDIO_REGISTER_MODE_MISSING would track missing -pages). The UFFDIO_REGISTER ioctl will return the -uffdio_register.ioctls bitmask of ioctls that are suitable to resolve +the range (``UFFDIO_REGISTER_MODE_MISSING`` would track missing +pages). 
The ``UFFDIO_REGISTER`` ioctl will return the +``uffdio_register.ioctls`` bitmask of ioctls that are suitable to resolve userfaults on the range registered. Not all ioctls will necessarily be supported for all memory types depending on the underlying virtual memory backend (anonymous memory vs tmpfs vs real filebacked mappings). -Userland can use the uffdio_register.ioctls to manage the virtual +Userland can use the ``uffdio_register.ioctls`` to manage the virtual address space in the background (to add or potentially also remove -memory from the userfaultfd registered range). This means a userfault +memory from the ``userfaultfd`` registered range). This means a userfault could be triggering just before userland maps in the background the user-faulted page. -The primary ioctl to resolve userfaults is UFFDIO_COPY. That +The primary ioctl to resolve userfaults is ``UFFDIO_COPY``. That atomically copies a page into the userfault registered range and wakes -up the blocked userfaults (unless uffdio_copy.mode & -UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctl works similarly to -UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an -half copied page since it'll keep userfaulting until the copy has -finished. +up the blocked userfaults +(unless ``uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE`` is set). +Other ioctl works similarly to ``UFFDIO_COPY``. They're atomic as in +guaranteeing that nothing can see an half copied page since it'll +keep userfaulting until the copy has finished. Notes: -- If you requested UFFDIO_REGISTER_MODE_MISSING when registering then +- If you requested ``UFFDIO_REGISTER_MODE_MISSING`` when registering then you must provide some kind of page in your thread after reading from - the uffd. You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE. + the uffd. You must provide either ``UFFDIO_COPY`` or ``UFFDIO_ZEROPAGE``. The normal behavior of the OS automatically providing a zero page on an annonymous mmaping is not in place. 
@@ -122,13 +122,13 @@ Notes: - You get the address of the access that triggered the missing page event out of a struct uffd_msg that you read in the thread from the - uffd. You can supply as many pages as you want with UFFDIO_COPY or - UFFDIO_ZEROPAGE. Keep in mind that unless you used DONTWAKE then + uffd. You can supply as many pages as you want with ``UFFDIO_COPY`` or + ``UFFDIO_ZEROPAGE``. Keep in mind that unless you used DONTWAKE then the first of any of those IOCTLs wakes up the faulting thread. -- Be sure to test for all errors including (pollfd[0].revents & - POLLERR). This can happen, e.g. when ranges supplied were - incorrect. +- Be sure to test for all errors including + (``pollfd[0].revents & POLLERR``). This can happen, e.g. when ranges + supplied were incorrect. Write Protect Notifications --------------------------- @@ -136,41 +136,42 @@ Write Protect Notifications This is equivalent to (but faster than) using mprotect and a SIGSEGV signal handler. -Firstly you need to register a range with UFFDIO_REGISTER_MODE_WP. -Instead of using mprotect(2) you use ioctl(uffd, UFFDIO_WRITEPROTECT, -struct *uffdio_writeprotect) while mode = UFFDIO_WRITEPROTECT_MODE_WP +Firstly you need to register a range with ``UFFDIO_REGISTER_MODE_WP``. +Instead of using mprotect(2) you use +``ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect)`` +while ``mode = UFFDIO_WRITEPROTECT_MODE_WP`` in the struct passed in. The range does not default to and does not have to be identical to the range you registered with. You can write protect as many ranges as you like (inside the registered range). Then, in the thread reading from uffd the struct will have -msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send -ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again -while pagefault.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set. -This wakes up the thread which will continue to run with writes. 
This +``msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP`` set. Now you send +``ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect)`` +again while ``pagefault.mode`` does not have ``UFFDIO_WRITEPROTECT_MODE_WP`` +set. This wakes up the thread which will continue to run with writes. This allows you to do the bookkeeping about the write in the uffd reading thread before the ioctl. -If you registered with both UFFDIO_REGISTER_MODE_MISSING and -UFFDIO_REGISTER_MODE_WP then you need to think about the sequence in +If you registered with both ``UFFDIO_REGISTER_MODE_MISSING`` and +``UFFDIO_REGISTER_MODE_WP`` then you need to think about the sequence in which you supply a page and undo write protect. Note that there is a difference between writes into a WP area and into a !WP area. The -former will have UFFD_PAGEFAULT_FLAG_WP set, the latter -UFFD_PAGEFAULT_FLAG_WRITE. The latter did not fail on protection but -you still need to supply a page when UFFDIO_REGISTER_MODE_MISSING was +former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter +``UFFD_PAGEFAULT_FLAG_WRITE``. The latter did not fail on protection but +you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was used. QEMU/KVM ======== -QEMU/KVM is using the userfaultfd syscall to implement postcopy live +QEMU/KVM is using the ``userfaultfd`` syscall to implement postcopy live migration. Postcopy live migration is one form of memory externalization consisting of a virtual machine running with part or all of its memory residing on a different node in the cloud. The -userfaultfd abstraction is generic enough that not a single line of +``userfaultfd`` abstraction is generic enough that not a single line of KVM kernel code had to be modified in order to add postcopy live migration to QEMU. -Guest async page faults, FOLL_NOWAIT and all other GUP features work +Guest async page faults, ``FOLL_NOWAIT`` and all other ``GUP*`` features work just fine in combination with userfaults. 
Userfaults trigger async page faults in the guest scheduler so those guest processes that aren't waiting for userfaults (i.e. network bound) can keep running in @@ -183,19 +184,19 @@ generating userfaults for readonly guest regions. The implementation of postcopy live migration currently uses one single bidirectional socket but in the future two different sockets will be used (to reduce the latency of the userfaults to the minimum -possible without having to decrease /proc/sys/net/ipv4/tcp_wmem). +possible without having to decrease ``/proc/sys/net/ipv4/tcp_wmem``). The QEMU in the source node writes all pages that it knows are missing in the destination node, into the socket, and the migration thread of -the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE -ioctls on the userfaultfd in order to map the received pages into the -guest (UFFDIO_ZEROCOPY is used if the source page was a zero page). +the QEMU running in the destination node runs ``UFFDIO_COPY|ZEROPAGE`` +ioctls on the ``userfaultfd`` in order to map the received pages into the +guest (``UFFDIO_ZEROCOPY`` is used if the source page was a zero page). A different postcopy thread in the destination node listens with -poll() to the userfaultfd in parallel. When a POLLIN event is +poll() to the ``userfaultfd`` in parallel. When a ``POLLIN`` event is generated after a userfault triggers, the postcopy thread read() from -the userfaultfd and receives the fault address (or -EAGAIN in case the -userfault was already resolved and waken by a UFFDIO_COPY|ZEROPAGE run +the ``userfaultfd`` and receives the fault address (or ``-EAGAIN`` in case the +userfault was already resolved and waken by a ``UFFDIO_COPY|ZEROPAGE`` run by the parallel QEMU migration thread). After the QEMU postcopy thread (running in the destination node) gets @@ -206,7 +207,7 @@ remaining missing pages from that new page offset. 
Soon after that (just the time to flush the tcp_wmem queue through the network) the migration thread in the QEMU running in the destination node will receive the page that triggered the userfault and it'll map it as -usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it +usual with the ``UFFDIO_COPY|ZEROPAGE`` (without actually knowing if it was spontaneously sent by the source or if it was an urgent page requested through a userfault). @@ -219,74 +220,74 @@ checked to find which missing pages to send in round robin and we seek over it when receiving incoming userfaults. After sending each page of course the bitmap is updated accordingly. It's also useful to avoid sending the same page twice (in case the userfault is read by the -postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration +postcopy thread just before ``UFFDIO_COPY|ZEROPAGE`` runs in the migration thread). Non-cooperative userfaultfd =========================== -When the userfaultfd is monitored by an external manager, the manager +When the ``userfaultfd`` is monitored by an external manager, the manager must be able to track changes in the process virtual memory layout. Userfaultfd can notify the manager about such changes using the same read(2) protocol as for the page fault notifications. The manager has to explicitly enable these events by setting appropriate -bits in uffdio_api.features passed to UFFDIO_API ioctl: +bits in ``uffdio_api.features`` passed to ``UFFDIO_API`` ioctl: -UFFD_FEATURE_EVENT_FORK - enable userfaultfd hooks for fork(). When this feature is - enabled, the userfaultfd context of the parent process is +``UFFD_FEATURE_EVENT_FORK`` + enable ``userfaultfd`` hooks for fork(). When this feature is + enabled, the ``userfaultfd`` context of the parent process is duplicated into the newly created process. The manager - receives UFFD_EVENT_FORK with file descriptor of the new - userfaultfd context in the uffd_msg.fork. 
+ receives ``UFFD_EVENT_FORK`` with file descriptor of the new + ``userfaultfd`` context in the ``uffd_msg.fork``. -UFFD_FEATURE_EVENT_REMAP +``UFFD_FEATURE_EVENT_REMAP`` enable notifications about mremap() calls. When the non-cooperative process moves a virtual memory area to a different location, the manager will receive - UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and + ``UFFD_EVENT_REMAP``. The ``uffd_msg.remap`` will contain the old and new addresses of the area and its original length. -UFFD_FEATURE_EVENT_REMOVE +``UFFD_FEATURE_EVENT_REMOVE`` enable notifications about madvise(MADV_REMOVE) and - madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will - be generated upon these calls to madvise. The uffd_msg.remove + madvise(MADV_DONTNEED) calls. The event ``UFFD_EVENT_REMOVE`` will + be generated upon these calls to madvise(). The ``uffd_msg.remove`` will contain start and end addresses of the removed area. -UFFD_FEATURE_EVENT_UNMAP +``UFFD_FEATURE_EVENT_UNMAP`` enable notifications about memory unmapping. The manager will - get UFFD_EVENT_UNMAP with uffd_msg.remove containing start and + get ``UFFD_EVENT_UNMAP`` with ``uffd_msg.remove`` containing start and end addresses of the unmapped area. -Although the UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP +Although the ``UFFD_FEATURE_EVENT_REMOVE`` and ``UFFD_FEATURE_EVENT_UNMAP`` are pretty similar, they quite differ in the action expected from the -userfaultfd manager. In the former case, the virtual memory is +``userfaultfd`` manager. In the former case, the virtual memory is removed, but the area is not, the area remains monitored by the -userfaultfd, and if a page fault occurs in that area it will be +``userfaultfd``, and if a page fault occurs in that area it will be delivered to the manager. The proper resolution for such page fault is to zeromap the faulting address. 
However, in the latter case, when an area is unmapped, either explicitly (with munmap() system call), or implicitly (e.g. during mremap()), the area is removed and in turn the -userfaultfd context for such area disappears too and the manager will +``userfaultfd`` context for such area disappears too and the manager will not get further userland page faults from the removed area. Still, the notification is required in order to prevent manager from using -UFFDIO_COPY on the unmapped area. +``UFFDIO_COPY`` on the unmapped area. Unlike userland page faults which have to be synchronous and require explicit or implicit wakeup, all the events are delivered asynchronously and the non-cooperative process resumes execution as -soon as manager executes read(). The userfaultfd manager should -carefully synchronize calls to UFFDIO_COPY with the events -processing. To aid the synchronization, the UFFDIO_COPY ioctl will -return -ENOSPC when the monitored process exits at the time of -UFFDIO_COPY, and -ENOENT, when the non-cooperative process has changed -its virtual memory layout simultaneously with outstanding UFFDIO_COPY +soon as manager executes read(). The ``userfaultfd`` manager should +carefully synchronize calls to ``UFFDIO_COPY`` with the events +processing. To aid the synchronization, the ``UFFDIO_COPY`` ioctl will +return ``-ENOSPC`` when the monitored process exits at the time of +``UFFDIO_COPY``, and ``-ENOENT``, when the non-cooperative process has changed +its virtual memory layout simultaneously with outstanding ``UFFDIO_COPY`` operation. The current asynchronous model of the event delivery is optimal for -single threaded non-cooperative userfaultfd manager implementations. A +single threaded non-cooperative ``userfaultfd`` manager implementations. 
A synchronous event delivery model can be added later as a new -userfaultfd feature to facilitate multithreading enhancements of the -non cooperative manager, for example to allow UFFDIO_COPY ioctls to +``userfaultfd`` feature to facilitate multithreading enhancements of the +non cooperative manager, for example to allow ``UFFDIO_COPY`` ioctls to run in parallel to the event reception. Single threaded implementations should continue to use the current async event delivery model instead. -- cgit v1.2.3 From 4a3fe6541c8cde8cb69e6453752b7fea1aac7d55 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:47 +0200 Subject: docs: mm: userfaultfd.rst: use a cross-reference for a section Instead of using "foo", let's use `foo`_, with is a ReST way of saying that foo is a section of the document. With that, after building the docs, an hyperlink is generated. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/f46b45f1aaec233217f2e0b0438bbd8cc16fe17b.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/mm/userfaultfd.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 740d111faf1c..0bf49d7313ad 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -79,7 +79,7 @@ or shared memory need to set the corresponding flag in If the userland desires to receive notifications for events other than page faults, it has to verify that ``uffdio_api.features`` has appropriate ``UFFD_FEATURE_EVENT_*`` bits set. These events are described in more -detail below in "Non-cooperative userfaultfd" section. +detail below in `Non-cooperative userfaultfd`_ section. 
Once the ``userfaultfd`` has been enabled the ``UFFDIO_REGISTER`` ioctl should be invoked (if present in the returned ``uffdio_api.ioctls`` bitmask) to -- cgit v1.2.3 From 9070492b10c36faba0d9c2cb13d3e0a5a0ffaf6a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:48 +0200 Subject: docs: vm: index.rst: add an orphan doc to the building system The new free_page_reporting.rst file is not listed at the index. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/146432ae6965a2bb62c929a6b62f9d4010986622.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/vm/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index e8d943b21cf9..611140ffef7e 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -31,6 +31,7 @@ descriptions of data structures and algorithms. active_mm balance cleancache + free_page_reporting frontswap highmem hmm -- cgit v1.2.3 From b4c6d8efdcdde455e0bd9b5df5c42b28c131ca6c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:49 +0200 Subject: docs: dt: qcom,dwc3.txt: fix cross-reference for a converted file The qcom-qusb2-phy.txt file was converted and renamed to yaml. Update cross-reference accordingly. 
Fixes: 8ce65d8d38df ("dt-bindings: phy: qcom,qusb2: Convert QUSB2 phy bindings to yaml") Reviewed-by: Stephen Boyd Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/a055c564f2a79aa748064329d938db8b3c8edd58.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/devicetree/bindings/usb/qcom,dwc3.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/usb/qcom,dwc3.txt b/Documentation/devicetree/bindings/usb/qcom,dwc3.txt index cb695aa3fba4..fbdd01756752 100644 --- a/Documentation/devicetree/bindings/usb/qcom,dwc3.txt +++ b/Documentation/devicetree/bindings/usb/qcom,dwc3.txt @@ -52,8 +52,8 @@ A child node must exist to represent the core DWC3 IP block. The name of the node is not important. The content of the node is defined in dwc3.txt. Phy documentation is provided in the following places: -Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt - USB3 QMP PHY -Documentation/devicetree/bindings/phy/qcom-qusb2-phy.txt - USB2 QUSB2 PHY +Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt - USB3 QMP PHY +Documentation/devicetree/bindings/phy/qcom,qusb2-phy.yaml - USB2 QUSB2 PHY Example device nodes: -- cgit v1.2.3 From 3f4a6c925a42adc15d3d50a333b179743e31df45 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:50 +0200 Subject: docs: dt: fix a broken reference for a file converted to json Changeset 32ced09d7903 ("dt-bindings: serial: Convert slave-device bindings to json-schema") moved a binding to json and updated the links. Yet, one link was not changed, due to a merge conflict. Update this one too. 
Fixes: 32ced09d7903 ("dt-bindings: serial: Convert slave-device bindings to json-schema") Reviewed-by: Geert Uytterhoeven Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9b1603e254d39c9607bfedefeedaafd2c44aeb19.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt index beca6466d59a..d2202791c1d4 100644 --- a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt +++ b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt @@ -29,7 +29,7 @@ Required properties for compatible string qcom,wcn399x-bt: Optional properties for compatible string qcom,wcn399x-bt: - - max-speed: see Documentation/devicetree/bindings/serial/slave-device.txt + - max-speed: see Documentation/devicetree/bindings/serial/serial.yaml - firmware-name: specify the name of nvm firmware to load - clocks: clock provided to the controller -- cgit v1.2.3 From 8f97986ccbd75ff7f4deab8e33fd33be1e9d43c4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:51 +0200 Subject: docs: powerpc: cxl.rst: mark two section titles as such The User API chapter contains two sub-chapters. Mark them as such. Signed-off-by: Mauro Carvalho Chehab Acked-by: Andrew Donnellan Link: https://lore.kernel.org/r/190d67397cd63e419de8d85b92e8018d48e8c345.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/powerpc/cxl.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/powerpc/cxl.rst b/Documentation/powerpc/cxl.rst index 920546d81326..d2d77057610e 100644 --- a/Documentation/powerpc/cxl.rst +++ b/Documentation/powerpc/cxl.rst @@ -133,6 +133,7 @@ User API ======== 1. 
AFU character devices +^^^^^^^^^^^^^^^^^^^^^^^^ For AFUs operating in AFU directed mode, two character device files will be created. /dev/cxl/afu0.0m will correspond to a @@ -395,6 +396,7 @@ read 2. Card character device (powerVM guest only) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In a powerVM guest, an extra character device is created for the card. The device is only used to write (flash) a new image on the -- cgit v1.2.3 From 36536a02e5542ea4909eea27fa7b173e81565312 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:52 +0200 Subject: docs: i2c: rename i2c.svg to i2c_bus.svg When generating the PDF output, the Documentation/i2c dir will generate an i2c.pdf. The same happens with i2c.svg: it will also produce a file with the same name, at the same dir. This causes errors when building the PDF output. So, rename the image to i2c_bus.svg. Signed-off-by: Mauro Carvalho Chehab Acked-by: Wolfram Sang Link: https://lore.kernel.org/r/ecf3d51909ce46b3e84a1df4b36f07d76989e5da.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/i2c/i2c.svg | 1341 ----------------------------------------- Documentation/i2c/i2c_bus.svg | 1341 +++++++++++++++++++++++++++++++++++++++++ Documentation/i2c/summary.rst | 2 +- 3 files changed, 1342 insertions(+), 1342 deletions(-) delete mode 100644 Documentation/i2c/i2c.svg create mode 100644 Documentation/i2c/i2c_bus.svg diff --git a/Documentation/i2c/i2c.svg b/Documentation/i2c/i2c.svg deleted file mode 100644 index 5979405ad1c3..000000000000 --- a/Documentation/i2c/i2c.svg +++ /dev/null @@ -1,1341 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - Luca Ceresoli - - - 2020 - - - - - - - - - - - - - - - I2CMaster - - - I2CSlave - - SCL - SDA - - I2CSlave - - I2CSlave - - - - - - - - - - - - VDD - - diff --git a/Documentation/i2c/i2c_bus.svg b/Documentation/i2c/i2c_bus.svg new file mode 100644 index 000000000000..3170de976373 --- /dev/null +++ b/Documentation/i2c/i2c_bus.svg @@ -0,0 +1,1341 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + Luca Ceresoli + + + 2020 + + + + + + + + + + + + + + + I2CMaster + + + I2CSlave + + SCL + SDA + + I2CSlave + + I2CSlave + + + + + + + + + + + + VDD + + diff --git a/Documentation/i2c/summary.rst b/Documentation/i2c/summary.rst index ce7230025b33..136c4e333be7 100644 --- a/Documentation/i2c/summary.rst +++ b/Documentation/i2c/summary.rst @@ -34,7 +34,7 @@ Terminology Using the terminology from the official documentation, the I2C bus connects one or more *master* chips and one or more *slave* chips. -.. kernel-figure:: i2c.svg +.. kernel-figure:: i2c_bus.svg :alt: Simple I2C bus with one master and 3 slaves Simple I2C bus -- cgit v1.2.3 From baeb2d5cb8ea84dfb932cbde0e2adcd527e51bb5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:53 +0200 Subject: docs: Makefile: place final pdf docs on a separate dir The Sphinx build system for PDF is too complex and generate lots of ancillary files, including one PDF file for each image. So, at the end, the main latex dir has 156 pdf files, instead of the 71 ones that would match each generated book. 
That's confusing and it makes harder to identify when something didn't work. So, instead, let's move the final PDF output(s) to a separate dir. This way, the latex/ dir will have the temporary and the final *.tex files, while the final pdf files that built ok will be under the pdf/ directory. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/832752cbc9678a6e8d3d634bc3356d655d44684f.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/Makefile b/Documentation/Makefile index cc786d11a028..db1fc35ded50 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -98,7 +98,11 @@ else # HAVE_PDFLATEX pdfdocs: latexdocs @$(srctree)/scripts/sphinx-pre-install --version-check - $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;) + $(foreach var,$(SPHINXDIRS), \ + $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit; \ + mkdir -p $(BUILDDIR)/$(var)/pdf; \ + mv $(subst .tex,.pdf,$(wildcard $(BUILDDIR)/$(var)/latex/*.tex)) $(BUILDDIR)/$(var)/pdf/; \ + ) endif # HAVE_PDFLATEX -- cgit v1.2.3 From 77c34b2c18d4d88aed9ab3407407c492e0ce18b8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:54 +0200 Subject: docs: dt: rockchip,dwc3.txt: fix a pointer to a renamed file phy-rockchip-inno-usb2.txt was converted to yaml. Fix the corresponding reference. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/287bd271f5c542e9d12a132a6b6a17672c9fd67c.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/devicetree/bindings/usb/rockchip,dwc3.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt b/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt index c8c4b00ecb94..94520493233b 100644 --- a/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt +++ b/Documentation/devicetree/bindings/usb/rockchip,dwc3.txt @@ -16,7 +16,7 @@ A child node must exist to represent the core DWC3 IP block. The name of the node is not important. The content of the node is defined in dwc3.txt. Phy documentation is provided in the following places: -Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.txt - USB2.0 PHY +Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.yaml - USB2.0 PHY Documentation/devicetree/bindings/phy/phy-rockchip-typec.txt - Type-C PHY Example device nodes: -- cgit v1.2.3 From a31a6997e6df4bd0d7782dbe945a59c321475669 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:55 +0200 Subject: ata: libata-core: fix a doc warning The docs toolchain doesn't recognise this pattern: @link->[hw_]sata_spd_limit As it can't really process it. So, instead, let's mark it with a literal block markup: ``link->[hw_]sata_spd_limit`` in order to get rid of the following warning: ./drivers/ata/libata-core.c:5974: WARNING: Unknown target name: "hw". 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9a21444df75c46095c4b1839d2061d19c9addcff.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- drivers/ata/libata-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index beca5f91bb4c..69361ec43db5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5209,7 +5209,7 @@ void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp) * sata_link_init_spd - Initialize link->sata_spd_limit * @link: Link to configure sata_spd_limit for * - * Initialize @link->[hw_]sata_spd_limit to the currently + * Initialize ``link->[hw_]sata_spd_limit`` to the currently * configured value. * * LOCKING: -- cgit v1.2.3 From af690f459393017ce40bb9beef6a00e79511574d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:56 +0200 Subject: firewire: firewire-cdev.h: get rid of a docs warning This warning: ./include/uapi/linux/firewire-cdev.h:312: WARNING: Inline literal start-string without end-string. is because %FOO doesn't work if there's a parenthesis at the string (as a parenthesis may indicate a function). So, mark the literal block using the alternate ``FOO`` syntax. 
Acked-by: Stefan Richter Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9b2501a41eba27ccdd4603cac2353c0efba7a90a.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/uapi/linux/firewire-cdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 1acd2b179aef..7e5b5c10a49c 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -308,7 +308,7 @@ struct fw_cdev_event_iso_interrupt_mc { /** * struct fw_cdev_event_iso_resource - Iso resources were allocated or freed * @closure: See &fw_cdev_event_common; - * set by %FW_CDEV_IOC_(DE)ALLOCATE_ISO_RESOURCE(_ONCE) ioctl + * set by``FW_CDEV_IOC_(DE)ALLOCATE_ISO_RESOURCE(_ONCE)`` ioctl * @type: %FW_CDEV_EVENT_ISO_RESOURCE_ALLOCATED or * %FW_CDEV_EVENT_ISO_RESOURCE_DEALLOCATED * @handle: Reference by which an allocated resource can be deallocated -- cgit v1.2.3 From 2b8e8b5599a1831e9a4cf9aa86476887ff425e57 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:57 +0200 Subject: fs: inode.c: get rid of docs warnings Using *foo makes the toolchain think that this is an emphasis, causing those warnings: ./fs/inode.c:1609: WARNING: Inline emphasis start-string without end-string. ./fs/inode.c:1609: WARNING: Inline emphasis start-string without end-string. ./fs/inode.c:1615: WARNING: Inline emphasis start-string without end-string. So, use, instead, ``*foo``, in order to mark it as a literal block. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/e8da46a0e57f2af6d63a0c53665495075698e28a.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- fs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..37226a9cfa4f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1606,14 +1606,14 @@ EXPORT_SYMBOL(iput); * @inode: inode owning the block number being requested * @block: pointer containing the block to find * - * Replaces the value in *block with the block number on the device holding + * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the - * 4 in *block, with disk block relative to the disk start that holds that + * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a - * hole, returns 0 and *block is also set to 0. + * hole, returns 0 and ``*block`` is also set to 0. */ int bmap(struct inode *inode, sector_t *block) { -- cgit v1.2.3 From 03c109d66867767ddbb0fe09223410a79193f5ea Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:58 +0200 Subject: futex: get rid of a kernel-docs build warning Adjust whitespaces and blank lines in order to get rid of this: ./kernel/futex.c:491: WARNING: Definition list ends without a blank line; unexpected unindent. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/57788af7889161483e0c97f91c079cfb3986c4b3.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- kernel/futex.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index b59532862bc0..b4b9f960b610 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -486,10 +486,13 @@ static u64 get_inode_sequence_number(struct inode *inode) * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: + * * ( inode->i_sequence, page->index, offset_within_page ) + * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: + * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex -- cgit v1.2.3 From 4642289b5f6609b1af152a81b603ae8451df2626 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:59 +0200 Subject: lib: bitmap.c: get rid of some doc warnings There are two ascii art drawings there. Use a block markup tag there in order to get rid of those warnings: ./lib/bitmap.c:189: WARNING: Unexpected indentation. ./lib/bitmap.c:190: WARNING: Block quote ends without a blank line; unexpected unindent. ./lib/bitmap.c:190: WARNING: Unexpected indentation. ./lib/bitmap.c:191: WARNING: Line block ends without a blank line. It should be noticed that there's actually a syntax violation right now, as something like: /** ... @src: will be handled as a definition for @src parameter, and not as part of a diagram. So, we need to add something before it, in order for this to be processed the way it should. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/1e2568fdfa838c1a0d8cc2a1d70dd4b6de99bfb1.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- lib/bitmap.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index 89260aa342d6..21a7640c5eed 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -182,21 +182,22 @@ EXPORT_SYMBOL(__bitmap_shift_left); * * In pictures, example for a big-endian 32-bit architecture: * - * @src: - * 31 63 - * | | - * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 - * | | | | - * 16 14 0 32 - * - * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is: - * - * 31 63 - * | | - * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 - * | | | - * 14 (bit 17 0 32 - * from @src) + * The @src bitmap is:: + * + * 31 63 + * | | + * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 + * | | | | + * 16 14 0 32 + * + * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is:: + * + * 31 63 + * | | + * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 + * | | | + * 14 (bit 17 0 32 + * from @src) * * Note that @dst and @src might overlap partially or entirely. * -- cgit v1.2.3 From 5d8e5aee0e9391141fe2ab892d0ac98de5783537 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Sun, 15 Mar 2020 13:26:48 +0100 Subject: docs: sysctl/kernel: document BPF entries Based on the implementation in kernel/bpf/syscall.c, kernel/bpf/trampoline.c, include/linux/filter.h, and the documentation in bpftool-prog.rst. 
Signed-off-by: Stephen Kitt Link: https://lore.kernel.org/r/20200315122648.20558-1-steve@sk2.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/sysctl/kernel.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 0d427fd10941..55b24eada13c 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -102,6 +102,20 @@ See the ``type_of_loader`` and ``ext_loader_ver`` fields in :doc:`/x86/boot` for additional information. +bpf_stats_enabled +================= + +Controls whether the kernel should collect statistics on BPF programs +(total time spent running, number of times run...). Enabling +statistics causes a slight reduction in performance on each program +run. The statistics can be seen using ``bpftool``. + += =================================== +0 Don't collect statistics (default). +1 Collect statistics. += =================================== + + cap_last_cap ============ @@ -1178,6 +1192,16 @@ NMI switch that most IA32 servers have fires unknown NMI up, for example. If a system hangs up, try pressing the NMI switch. +unprivileged_bpf_disabled +========================= + +Writing 1 to this entry will disable unprivileged calls to ``bpf()``; +once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` will return +``-EPERM``. + +Once set, this can't be cleared. + + watchdog ======== -- cgit v1.2.3 From c7e1cc318d4a19549d55a4f114f3a3d472e9531e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 20 Apr 2020 10:41:15 +0300 Subject: dma-buf: Couple of documentation typo fixes Fix a couple of typos: "as" -> "has" and "int" -> "in". 
Signed-off-by: Gal Pressman Link: https://lore.kernel.org/r/20200420074115.23931-1-galpress@amazon.com Signed-off-by: Jonathan Corbet --- Documentation/driver-api/dma-buf.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst index c78db28519f7..63dec76d1d8d 100644 --- a/Documentation/driver-api/dma-buf.rst +++ b/Documentation/driver-api/dma-buf.rst @@ -11,7 +11,7 @@ course not limited to GPU use cases. The three main components of this are: (1) dma-buf, representing a sg_table and exposed to userspace as a file descriptor to allow passing between devices, (2) fence, which provides a mechanism to signal when -one device as finished access, and (3) reservation, which manages the +one device has finished access, and (3) reservation, which manages the shared or exclusive fence(s) associated with the buffer. Shared DMA Buffers @@ -31,7 +31,7 @@ The exporter - implements and manages operations in :c:type:`struct dma_buf_ops ` for the buffer, - allows other users to share the buffer by using dma_buf sharing APIs, - - manages the details of buffer allocation, wrapped int a :c:type:`struct + - manages the details of buffer allocation, wrapped in a :c:type:`struct dma_buf `, - decides about the actual backing storage where this allocation happens, - and takes care of any migration of scatterlist - for all (shared) users of -- cgit v1.2.3 From d8e8ff1fe30278e46afcb3029468e678bbcffddd Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 18 Apr 2020 20:41:32 +0300 Subject: docs: ioctl-number.rst: add habanalabs driver IOCTL Habanalabs driver in misc exposes several IOCTLs to userspace. Document the letter and IOCTLs number range in ioctl-number.rst. 
Signed-off-by: Oded Gabbay Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20200418174132.10597-1-oded.gabbay@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index f759edafd938..52bf58417653 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -146,6 +146,7 @@ Code Seq# Include File Comments 'H' 40-4F sound/hdspm.h conflict! 'H' 40-4F sound/hdsp.h conflict! 'H' 90 sound/usb/usx2y/usb_stream.h +'H' 00-0F uapi/misc/habanalabs.h conflict! 'H' A0 uapi/linux/usb/cdc-wdm.h 'H' C0-F0 net/bluetooth/hci.h conflict! 'H' C0-DF net/bluetooth/hidp/hidp.h conflict! -- cgit v1.2.3 From 7dbffd3f84b08c6c9f48f211dd7e5b41dade3092 Mon Sep 17 00:00:00 2001 From: Cristian Souza Date: Fri, 10 Apr 2020 22:02:01 -0300 Subject: docs: admin-guide: Clarify sentences Changes to make the text more formal and organized. The reasons are now cited and described at the same time. Minor grammatical problems have also been fixed. Signed-off-by: Cristian Souza Link: https://lore.kernel.org/r/20200411010201.GA22706@darkstar Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/init.rst | 76 ++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/Documentation/admin-guide/init.rst b/Documentation/admin-guide/init.rst index e89d97f31eaf..41f06a09152e 100644 --- a/Documentation/admin-guide/init.rst +++ b/Documentation/admin-guide/init.rst @@ -1,52 +1,48 @@ -Explaining the dreaded "No init found." boot hang message +Explaining the "No working init found." 
boot hang message ========================================================= +:Authors: Andreas Mohr + Cristian Souza -OK, so you've got this pretty unintuitive message (currently located -in init/main.c) and are wondering what the H*** went wrong. -Some high-level reasons for failure (listed roughly in order of execution) -to load the init binary are: - -A) Unable to mount root FS -B) init binary doesn't exist on rootfs -C) broken console device -D) binary exists but dependencies not available -E) binary cannot be loaded - -Detailed explanations: - -A) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE) - to get more detailed kernel messages. -B) make sure you have the correct root FS type - (and ``root=`` kernel parameter points to the correct partition), - required drivers such as storage hardware (such as SCSI or USB!) - and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules, - to be pre-loaded by an initrd) -C) Possibly a conflict in ``console= setup`` --> initial console unavailable. - E.g. some serial consoles are unreliable due to serial IRQ issues (e.g. - missing interrupt-based configuration). +This document provides some high-level reasons for failure +(listed roughly in order of execution) to load the init binary. + +1) **Unable to mount root FS**: Set "debug" kernel parameter (in bootloader + config file or CONFIG_CMDLINE) to get more detailed kernel messages. + +2) **init binary doesn't exist on rootfs**: Make sure you have the correct + root FS type (and ``root=`` kernel parameter points to the correct + partition), required drivers such as storage hardware (such as SCSI or + USB!) and filesystem (ext3, jffs2, etc.) are builtin (alternatively as + modules, to be pre-loaded by an initrd). + +3) **Broken console device**: Possibly a conflict in ``console= setup`` + --> initial console unavailable. E.g. some serial consoles are unreliable + due to serial IRQ issues (e.g. missing interrupt-based configuration). 
Try using a different ``console= device`` or e.g. ``netconsole=``. -D) e.g. required library dependencies of the init binary such as - ``/lib/ld-linux.so.2`` missing or broken. Use - ``readelf -d |grep NEEDED`` to find out which libraries are required. -E) make sure the binary's architecture matches your hardware. - E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware. - In case you tried loading a non-binary file here (shell script?), - you should make sure that the script specifies an interpreter in its shebang - header line (``#!/...``) that is fully working (including its library - dependencies). And before tackling scripts, better first test a simple - non-script binary such as ``/bin/sh`` and confirm its successful execution. - To find out more, add code ``to init/main.c`` to display kernel_execve()s - return values. + +4) **Binary exists but dependencies not available**: E.g. required library + dependencies of the init binary such as ``/lib/ld-linux.so.2`` missing or + broken. Use ``readelf -d |grep NEEDED`` to find out which libraries + are required. + +5) **Binary cannot be loaded**: Make sure the binary's architecture matches + your hardware. E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM + hardware. In case you tried loading a non-binary file here (shell script?), + you should make sure that the script specifies an interpreter in its + shebang header line (``#!/...``) that is fully working (including its + library dependencies). And before tackling scripts, better first test a + simple non-script binary such as ``/bin/sh`` and confirm its successful + execution. To find out more, add code ``to init/main.c`` to display + kernel_execve()s return values. Please extend this explanation whenever you find new failure causes (after all loading the init binary is a CRITICAL and hard transition step -which needs to be made as painless as possible), then submit patch to LKML. 
+which needs to be made as painless as possible), then submit a patch to LKML. Further TODOs: - Implement the various ``run_init_process()`` invocations via a struct array which can then store the ``kernel_execve()`` result value and on failure log it all by iterating over **all** results (very important usability fix). -- try to make the implementation itself more helpful in general, - e.g. by providing additional error messages at affected places. +- Try to make the implementation itself more helpful in general, e.g. by + providing additional error messages at affected places. -Andreas Mohr -- cgit v1.2.3 From fc965497d5b3204d7b726c27fefe92d2e885f994 Mon Sep 17 00:00:00 2001 From: Alessia Mantegazza Date: Mon, 13 Apr 2020 18:34:57 +0200 Subject: doc:it_IT: translation of documents in process/ Translations for the following documents in process/: - email-clients - management-style Signed-off-by: Alessia Mantegazza Signed-off-by: Federico Vaga Link: https://lore.kernel.org/r/20200413163457.52669-1-federico.vaga@vaga.pv.it Signed-off-by: Jonathan Corbet --- .../translations/it_IT/process/email-clients.rst | 332 ++++++++++++++++++++- .../it_IT/process/management-style.rst | 293 +++++++++++++++++- 2 files changed, 614 insertions(+), 11 deletions(-) diff --git a/Documentation/translations/it_IT/process/email-clients.rst b/Documentation/translations/it_IT/process/email-clients.rst index 224ab031ffd3..89abf6d325f2 100644 --- a/Documentation/translations/it_IT/process/email-clients.rst +++ b/Documentation/translations/it_IT/process/email-clients.rst @@ -1,12 +1,334 @@ .. include:: ../disclaimer-ita.rst -:Original: :ref:`Documentation/process/email-clients.rst ` - -.. _it_email_clients: +:Original: :doc:`../../../process/email-clients` +:Translator: Alessia Mantegazza Informazioni sui programmi di posta elettronica per Linux ========================================================= -.. 
warning:: +Git +--- + +Oggigiorno, la maggior parte degli sviluppatori utilizza ``git send-email`` +al posto dei classici programmi di posta elettronica. Le pagine man sono +abbastanza buone. Dal lato del ricevente, i manutentori utilizzano ``git am`` +per applicare le patch. + +Se siete dei novelli utilizzatori di ``git`` allora inviate la patch a voi +stessi. Salvatela come testo includendo tutte le intestazioni. Poi eseguite +il comando ``git am messaggio-formato-testo.txt`` e revisionatene il risultato +con ``git log``. Quando tutto funziona correttamente, allora potete inviare +la patch alla lista di discussione più appropriata. + +Panoramica delle opzioni +------------------------ + +Le patch per il kernel vengono inviate per posta elettronica, preferibilmente +come testo integrante del messaggio. Alcuni manutentori accettano gli +allegati, ma in questo caso gli allegati devono avere il *content-type* +impostato come ``text/plain``. Tuttavia, generalmente gli allegati non sono +ben apprezzati perché rende più difficile citare porzioni di patch durante il +processo di revisione. + +I programmi di posta elettronica che vengono usati per inviare le patch per il +kernel Linux dovrebbero inviarle senza alterazioni. Per esempio, non +dovrebbero modificare o rimuovere tabulazioni o spazi, nemmeno all'inizio o +alla fine delle righe. + +Non inviate patch con ``format=flowed``. Questo potrebbe introdurre +interruzioni di riga inaspettate e indesiderate. + +Non lasciate che il vostro programma di posta vada a capo automaticamente. +Questo può corrompere le patch. + +I programmi di posta non dovrebbero modificare la codifica dei caratteri nel +testo. Le patch inviate per posta elettronica dovrebbero essere codificate in +ASCII o UTF-8. +Se configurate il vostro programma per inviare messaggi codificati con UTF-8 +eviterete possibili problemi di codifica. 
+ +I programmi di posta dovrebbero generare e mantenere le intestazioni +"References" o "In-Reply-To:" cosicché la discussione non venga interrotta. + +Di solito, il copia-e-incolla (o taglia-e-incolla) non funziona con le patch +perché le tabulazioni vengono convertite in spazi. Usando xclipboard, xclip +e/o xcutsel potrebbe funzionare, ma è meglio che lo verifichiate o meglio +ancora: non usate il copia-e-incolla. + +Non usate firme PGP/GPG nei messaggi che contengono delle patch. Questo +impedisce il corretto funzionamento di alcuni script per leggere o applicare +patch (questo si dovrebbe poter correggere). + +Prima di inviare le patch sulle liste di discussione Linux, può essere una +buona idea quella di inviare la patch a voi stessi, salvare il messaggio +ricevuto, e applicarlo ai sorgenti con successo. + + +Alcuni suggerimenti per i programmi di posta elettronica (MUA) +-------------------------------------------------------------- + +Qui troverete alcuni suggerimenti per configurare i vostri MUA allo scopo +di modificare ed inviare patch per il kernel Linux. Tuttavia, questi +suggerimenti non sono da considerarsi come un riassunto di una configurazione +completa. + +Legenda: + +- TUI = interfaccia utente testuale (*text-based user interface*) +- GUI = interfaccia utente grafica (*graphical user interface*) + +Alpine (TUI) +************ + +Opzioni per la configurazione: + +Nella sezione :menuselection:`Sending Preferences`: + +- :menuselection:`Do Not Send Flowed Text` deve essere ``enabled`` +- :menuselection:`Strip Whitespace Before Sending` deve essere ``disabled`` + +Quando state scrivendo un messaggio, il cursore dev'essere posizionato +dove volete che la patch inizi, poi premendo :kbd:`CTRL-R` vi verrà chiesto +di selezionare il file patch da inserire nel messaggio. + +Claws Mail (GUI) +**************** + +Funziona. Alcune persone riescono ad usarlo con successo per inviare le patch. 
+ +Per inserire una patch usate :menuselection:`Messaggio-->Inserisci file` +(:kbd:`CTRL-I`) oppure un editor esterno. + +Se la patch che avete inserito dev'essere modificata usato la finestra di +scrittura di Claws, allora assicuratevi che l'"auto-interruzione" sia +disabilitata :menuselection:`Configurazione-->Preferenze-->Composizione-->Interruzione riga`. + +Evolution (GUI) +*************** + +Alcune persone riescono ad usarlo con successo per inviare le patch. + +Quando state scrivendo una lettera selezionate: Preformattato + da :menuselection:`Formato-->Stile del paragrafo-->Preformattato` + (:kbd:`CTRL-7`) o dalla barra degli strumenti + +Poi per inserire la patch usate: +:menuselection:`Inserisci--> File di testo...` (:kbd:`ALT-N x`) + +Potete anche eseguire ``diff -Nru old.c new.c | xclip``, selezionare +:menuselection:`Preformattato`, e poi usare il tasto centrale del mouse. + +Kmail (GUI) +*********** + +Alcune persone riescono ad usarlo con successo per inviare le patch. + +La configurazione base che disabilita la composizione di messaggi HTML è +corretta; non abilitatela. + +Quando state scrivendo un messaggio, nel menu opzioni, togliete la selezione a +"A capo automatico". L'unico svantaggio sarà che qualsiasi altra cosa scriviate +nel messaggio non verrà mandata a capo in automatico ma dovrete farlo voi. +Il modo più semplice per ovviare a questo problema è quello di scrivere il +messaggio con l'opzione abilitata e poi di salvarlo nelle bozze. Riaprendo ora +il messaggio dalle bozze le andate a capo saranno parte integrante del +messaggio, per cui togliendo l'opzione "A capo automatico" non perderete nulla. + +Alla fine del vostro messaggio, appena prima di inserire la vostra patch, +aggiungete il delimitatore di patch: tre trattini (``---``). + +Ora, dal menu :menuselection:`Messaggio`, selezionate :menuselection:`Inserisci file di testo...` +quindi scegliete la vostra patch. 
+Come soluzione aggiuntiva potreste personalizzare la vostra barra degli +strumenti aggiungendo un'icona per :menuselection:`Inserisci file di testo...`. + +Allargate la finestra di scrittura abbastanza da evitare andate a capo. +Questo perché in Kmail 1.13.5 (KDE 4.5.4), Kmail aggiunge andate a capo +automaticamente al momento dell'invio per tutte quelle righe che graficamente, +nella vostra finestra di composizione, si sono estete su una riga successiva. +Disabilitare l'andata a capo automatica non è sufficiente. Dunque, se la vostra +patch contiene delle righe molto lunghe, allora dovrete allargare la finestra +di composizione per evitare che quelle righe vadano a capo. Vedere: +https://bugs.kde.org/show_bug.cgi?id=174034 + +Potete firmare gli allegati con GPG, ma per le patch si preferisce aggiungerle +al testo del messaggio per cui non usate la firma GPG. Firmare le patch +inserite come testo del messaggio le rende più difficili da estrarre dalla loro +codifica a 7-bit. + +Se dovete assolutamente inviare delle patch come allegati invece di integrarle +nel testo del messaggio, allora premete il tasto destro sull'allegato e +selezionate :menuselection:`Proprietà`, e poi attivate +:menuselection:`Suggerisci visualizzazione automatica` per far si che +l'allegato sia più leggibile venendo visualizzato come parte del messaggio. + +Per salvare le patch inviate come parte di un messaggio, selezionate il +messaggio che la contiene, premete il tasto destro e selezionate +:menuselection:`Salva come`. Se il messaggio fu ben preparato, allora potrete +usarlo interamente senza alcuna modifica. +I messaggi vengono salvati con permessi di lettura-scrittura solo per l'utente, +nel caso in cui vogliate copiarli altrove per renderli disponibili ad altri +gruppi o al mondo, ricordatevi di usare ``chmod`` per cambiare i permessi. + +Lotus Notes (GUI) +***************** + +Scappate finché potete. + +IBM Verse (Web GUI) +******************* + +Vedi il commento per Lotus Notes. 
+ +Mutt (TUI) +********** + +Un sacco di sviluppatori Linux usano ``mutt``, per cui deve funzionare +abbastanza bene. + +Mutt non ha un proprio editor, quindi qualunque sia il vostro editor dovrete +configurarlo per non aggiungere automaticamente le andate a capo. Molti +editor hanno un'opzione :menuselection:`Inserisci file` che inserisce il +contenuto di un file senza alterarlo. + +Per usare ``vim`` come editor per mutt:: + + set editor="vi" + +Se per inserire la patch nel messaggio usate xclip, scrivete il comando:: + + :set paste + +prima di premere il tasto centrale o shift-insert. Oppure usate il +comando:: + + :r filename + +(a)llega funziona bene senza ``set paste`` + +Potete generare le patch con ``git format-patch`` e usare Mutt per inviarle:: + + $ mutt -H 0001-some-bug-fix.patch + +Opzioni per la configurazione: + +Tutto dovrebbe funzionare già nella configurazione base. +Tuttavia, è una buona idea quella di impostare ``send_charset``:: + + set send_charset="us-ascii:utf-8" + +Mutt è molto personalizzabile. 
Qui di seguito trovate la configurazione minima +per iniziare ad usare Mutt per inviare patch usando Gmail:: + + # .muttrc + # ================ IMAP ==================== + set imap_user = 'yourusername@gmail.com' + set imap_pass = 'yourpassword' + set spoolfile = imaps://imap.gmail.com/INBOX + set folder = imaps://imap.gmail.com/ + set record="imaps://imap.gmail.com/[Gmail]/Sent Mail" + set postponed="imaps://imap.gmail.com/[Gmail]/Drafts" + set mbox="imaps://imap.gmail.com/[Gmail]/All Mail" + + # ================ SMTP ==================== + set smtp_url = "smtp://username@smtp.gmail.com:587/" + set smtp_pass = $imap_pass + set ssl_force_tls = yes # Require encrypted connection + + # ================ Composition ==================== + set editor = `echo \$EDITOR` + set edit_headers = yes # See the headers when editing + set charset = UTF-8 # value of $LANG; also fallback for send_charset + # Sender, email address, and sign-off line must match + unset use_domain # because joe@localhost is just embarrassing + set realname = "YOUR NAME" + set from = "username@gmail.com" + set use_from = yes + +La documentazione di Mutt contiene molte più informazioni: + + https://gitlab.com/muttmua/mutt/-/wikis/UseCases/Gmail + + http://www.mutt.org/doc/manual/ + +Pine (TUI) +********** + +Pine aveva alcuni problemi con gli spazi vuoti, ma questi dovrebbero essere +stati risolti. + +Se potete usate alpine (il successore di pine). 
+ +Opzioni di configurazione: + +- Nelle versioni più recenti è necessario avere ``quell-flowed-text`` +- l'opzione ``no-strip-whitespace-before-send`` è necessaria + +Sylpheed (GUI) +************** + +- funziona bene per aggiungere testo in linea (o usando allegati) +- permette di utilizzare editor esterni +- è lento su cartelle grandi +- non farà l'autenticazione TLS SMTP su una connessione non SSL +- ha un utile righello nella finestra di scrittura +- la rubrica non comprende correttamente il nome da visualizzare e + l'indirizzo associato + +Thunderbird (GUI) +***************** + +Thunderbird è un clone di Outlook a cui piace maciullare il testo, ma esistono +modi per impedirglielo. + +- permettere l'uso di editor esterni: + La cosa più semplice da fare con Thunderbird e le patch è quello di usare + l'estensione "external editor" e di usare il vostro ``$EDITOR`` preferito per + leggere/includere patch nel vostro messaggio. Per farlo, scaricate ed + installate l'estensione e aggiungete un bottone per chiamarla rapidamente + usando :menuselection:`Visualizza-->Barra degli strumenti-->Personalizza...`; + una volta fatto potrete richiamarlo premendo sul bottone mentre siete nella + finestra :menuselection:`Scrivi` + + Tenete presente che "external editor" richiede che il vostro editor non + faccia alcun fork, in altre parole, l'editor non deve ritornare prima di + essere stato chiuso. Potreste dover passare dei parametri aggiuntivi al + vostro editor oppure cambiargli la configurazione. Per esempio, usando + gvim dovrete aggiungere l'opzione -f ``/usr/bin/gvim -f`` (Se il binario + si trova in ``/usr/bin``) nell'apposito campo nell'interfaccia di + configurazione di :menuselection:`external editor`. Se usate altri editor + consultate il loro manuale per sapere come configurarli. + +Per rendere l'editor interno un po' più sensato, fate così: + +- Modificate le impostazioni di Thunderbird per far si che non usi + ``format=flowed``. 
Andate in :menuselection:`Modifica-->Preferenze-->Avanzate-->Editor di configurazione` + per invocare il registro delle impostazioni. + +- impostate ``mailnews.send_plaintext_flowed`` a ``false`` + +- impostate ``mailnews.wraplength`` da ``72`` a ``0`` + +- :menuselection:`Visualizza-->Corpo del messaggio come-->Testo semplice` + +- :menuselection:`Visualizza-->Codifica del testo-->Unicode` + + +TkRat (GUI) +*********** + +Funziona. Usare "Inserisci file..." o un editor esterno. + +Gmail (Web GUI) +*************** + +Non funziona per inviare le patch. + +Il programma web Gmail converte automaticamente i tab in spazi. + +Allo stesso tempo aggiunge andata a capo ogni 78 caratteri. Comunque +il problema della conversione fra spazi e tab può essere risolto usando +un editor esterno. - TODO ancora da tradurre +Un altro problema è che Gmail usa la codifica base64 per tutti quei messaggi +che contengono caratteri non ASCII. Questo include cose tipo i nomi europei. diff --git a/Documentation/translations/it_IT/process/management-style.rst b/Documentation/translations/it_IT/process/management-style.rst index 07e68bfb8402..c709285138a7 100644 --- a/Documentation/translations/it_IT/process/management-style.rst +++ b/Documentation/translations/it_IT/process/management-style.rst @@ -1,12 +1,293 @@ .. include:: ../disclaimer-ita.rst -:Original: :ref:`Documentation/process/management-style.rst ` +:Original: :doc:`../../../process/management-style` +:Translator: Alessia Mantegazza -.. _it_managementstyle: +Il modello di gestione del kernel Linux +======================================= -Tipo di gestione del kernel Linux -================================= +Questo breve documento descrive il modello di gestione del kernel Linux. +Per certi versi, esso rispecchia il documento +:ref:`translations/it_IT/process/coding-style.rst `, +ed è principalmente scritto per evitare di rispondere [#f1]_ in continuazione +alle stesse identiche (o quasi) domande. -.. 
warning:: +Il modello di gestione è qualcosa di molto personale e molto più difficile da +qualificare rispetto a delle semplici regole di codifica, quindi questo +documento potrebbe avere più o meno a che fare con la realtà. È cominciato +come un gioco, ma ciò non significa che non possa essere vero. +Lo dovrete decidere voi stessi. - TODO ancora da tradurre +In ogni caso, quando si parla del "dirigente del kernel", ci si riferisce +sempre alla persona che dirige tecnicamente, e non a coloro che +tradizionalmente hanno un ruolo direttivo all'interno delle aziende. Se vi +occupate di convalidare acquisti o avete una qualche idea sul budget del vostro +gruppo, probabilmente non siete un dirigente del kernel. Quindi i suggerimenti +qui indicati potrebbero fare al caso vostro, oppure no. + +Prima di tutto, suggerirei di acquistare "Le sette regole per avere successo", +e di non leggerlo. Bruciatelo, è un grande gesto simbolico. + +.. [#f1] Questo documento non fa molto per risponde alla domanda, ma rende + così dannatamente ovvio a chi la pone che non abbiamo la minima idea + di come rispondere. + +Comunque, partiamo: + +.. _it_decisions: + +1) Le decisioni +--------------- + +Tutti pensano che i dirigenti decidano, e che questo prendere decisioni +sia importante. Più grande e dolorosa è la decisione, più importante deve +essere il dirigente che la prende. Questo è molto profondo ed ovvio, ma non è +del tutto vero. + +Il gioco consiste nell'"evitare" di dover prendere decisioni. In particolare +se qualcuno vi chiede di "Decidere" tra (a) o (b), e vi dice che ha +davvero bisogno di voi per questo, come dirigenti siete nei guai. +Le persone che gestite devono conoscere i dettagli più di quanto li conosciate +voi, quindi se vengono da voi per una decisione tecnica, siete fottuti. +Non sarete chiaramente competente per prendere quella decisione per loro. 
+ +(Corollario: se le persone che gestite non conoscono i dettagli meglio di voi, +anche in questo caso sarete fregati, tuttavia per altre ragioni. Ossia state +facendo il lavoro sbagliato, e che invece dovrebbero essere "loro" a gestirvi) + +Quindi il gioco si chiama "evitare" decisioni, almeno le più grandi e +difficili. Prendere decisioni piccoli e senza conseguenze va bene, e vi fa +sembrare competenti in quello che state facendo, quindi quello che un dirigente +del kernel ha bisogno di fare è trasformare le decisioni grandi e difficili +in minuzie delle quali nessuno importa. + +Ciò aiuta a capire che la differenza chiave tra una grande decisione ed una +piccola sta nella possibilità di modificare tale decisione in seguito. +Qualsiasi decisione importante può essere ridotta in decisioni meno importanti, +ma dovete assicurarvi che possano essere reversibili in caso di errori +(presenti o futuri). Improvvisamente, dovrete essere doppiamente dirigenti +per **due** decisioni non sequenziali - quella sbagliata **e** quella giusta. + +E le persone vedranno tutto ciò come prova di vera capacità di comando +(*cough* cavolata *cough*) + +Così la chiave per evitare le decisioni difficili diviene l'evitare +di fare cose che non possono essere disfatte. Non infilatevi in un angolo +dal quale non potrete sfuggire. Un topo messo all'angolo può rivelarsi +pericoloso - un dirigente messo all'angolo è solo pietoso. + +**In ogni caso** dato che nessuno è stupido al punto da lasciare veramente ad +un dirigente del kernel un enorme responsabilità, solitamente è facile fare +marcia indietro. Annullare una decisione è molto facile: semplicemente dite a +tutti che siete stati degli scemi incompetenti, dite che siete dispiaciuti, ed +annullate tutto l'inutile lavoro sul quale gli altri hanno lavorato nell'ultimo +anno. Improvvisamente la decisione che avevate preso un anno fa non era poi +così grossa, dato che può essere facilmente annullata. 
+ +È emerso che alcune persone hanno dei problemi con questo tipo di approccio, +questo per due ragioni: + + - ammettere di essere degli idioti è più difficile di quanto sembri. A tutti + noi piace mantenere le apparenze, ed uscire allo scoperto in pubblico per + ammettere che ci si è sbagliati è qualcosa di davvero impegnativo. + - avere qualcuno che ti dice che ciò su cui hai lavorato nell'ultimo anno + non era del tutto valido, può rivelarsi difficile anche per un povero ed + umile ingegnere, e mentre il **lavoro** vero era abbastanza facile da + cancellare, dall'altro canto potreste aver irrimediabilmente perso la + fiducia di quell'ingegnere. E ricordate che l'"irrevocabile" era quello + che avevamo cercato di evitare fin dall'inizio, e la vostra decisione + ha finito per esserlo. + +Fortunatamente, entrambe queste ragioni possono essere mitigate semplicemente +ammettendo fin dal principio che non avete una cavolo di idea, dicendo +agli altri in anticipo che la vostra decisione è puramente ipotetica, e che +potrebbe essere sbagliata. Dovreste sempre riservarvi il diritto di cambiare +la vostra opinione, e rendere gli altri ben **consapevoli** di ciò. +Ed è molto più facile ammettere di essere stupidi quando non avete **ancora** +fatto quella cosa stupida. + +Poi, quando è realmente emersa la vostra stupidità, le persone semplicemente +roteeranno gli occhi e diranno "Uffa, no, ancora". + +Questa ammissione preventiva di incompetenza potrebbe anche portare le persone +che stanno facendo il vero lavoro, a pensarci due volte. Dopo tutto, se +**loro** non sono certi se sia una buona idea, voi, sicuro come la morte, +non dovreste incoraggiarli promettendogli che ciò su cui stanno lavorando +verrà incluso. Fate si che ci pensino due volte prima che si imbarchino in un +grosso lavoro. + +Ricordate: loro devono sapere più cose sui dettagli rispetto a voi, e +solitamente pensano di avere già la risposta a tutto. 
La miglior cosa che +potete fare in qualità di dirigente è di non instillare troppa fiducia, ma +invece fornire una salutare dose di pensiero critico su quanto stanno facendo. + +Comunque, un altro modo di evitare una decisione è quello di lamentarsi +malinconicamente dicendo : "non possiamo farli entrambi e basta?" e con uno +sguardo pietoso. Fidatevi, funziona. Se non è chiaro quale sia il miglior +approccio, lo scopriranno. La risposta potrebbe essere data dal fatto che +entrambe i gruppi di lavoro diventano frustati al punto di rinunciarvi. + +Questo può suonare come un fallimento, ma di solito questo è un segno che +c'era qualcosa che non andava in entrambe i progetti, e il motivo per +il quale le persone coinvolte non abbiano potuto decidere era che entrambe +sbagliavano. Voi ne uscirete freschi come una rosa, e avrete evitato un'altra +decisione con la quale avreste potuto fregarvi. + + +2) Le persone +------------- + +Ci sono molte persone stupide, ed essere un dirigente significa che dovrete +scendere a patti con questo, e molto più importate, che **loro** devono avere +a che fare con **voi**. + +Ne emerge che mentre è facile annullare degli errori tecnici, non è invece +così facile rimuovere i disordini della personalità. Dovrete semplicemente +convivere con i loro, ed i vostri, problemi. + +Comunque, al fine di preparavi in qualità di dirigenti del kernel, è meglio +ricordare di non abbattere alcun ponte, bombardare alcun paesano innocente, +o escludere troppi sviluppatori kernel. Ne emerge che escludere le persone +è piuttosto facile, mentre includerle nuovamente è difficile. Così +"l'esclusione" immediatamente cade sotto il titolo di "non reversibile", e +diviene un no-no secondo la sezione :ref:`it_decisions`. 
+ +Esistono alcune semplici regole qui: + + (1) non chiamate le persone teste di c*** (al meno, non in pubblico) + (2) imparate a scusarvi quando dimenticate la regola (1) + +Il problema del punto numero 1 è che è molto facile da rispettare, dato che +è possibile dire "sei una testa di c***" in milioni di modi differenti [#f2]_, +a volte senza nemmeno pensarci, e praticamente sempre con la calda convinzione +di essere nel giusto. + +E più convinti sarete che avete ragione (e diciamolo, potete chiamare +praticamente **tutti** testa di c**, e spesso **sarete** nel giusto), più +difficile sarà scusarvi successivamente. + +Per risolvere questo problema, avete due possibilità: + + - diventare davvero bravi nello scusarsi + - essere amabili così che nessuno finirà col sentirsi preso di mira. Siate + creativi abbastanza, e potrebbero esserne divertiti. + +L'opzione dell'essere immancabilmente educati non esiste proprio. Nessuno +si fiderà di qualcuno che chiaramente sta nascondendo il suo vero carattere. + +.. [#f2] Paul Simon cantava: "50 modi per lasciare il vostro amante", perché, + molto francamente, "Un milione di modi per dire ad uno sviluppatore + Testa di c***" non avrebbe funzionato. Ma sono sicuro che ci abbia + pensato. + + +3) Le persone II - quelle buone +------------------------------- + +Mentre emerge che la maggior parte delle persone sono stupide, il corollario +a questo è il triste fatto che anche voi siete fra queste, e che mentre +possiamo tutti crogiolarci nella sicurezza di essere migliori della media +delle persone (diciamocelo, nessuno crede di essere nelle media o sotto di +essa), dovremmo anche ammettere che non siamo il "coltello più affilato" del +circondario, e che ci saranno altre persone che sono meno stupide di quanto +lo siete voi. + +Molti reagiscono male davanti alle persone intelligenti. Altri le usano a +proprio vantaggio. + +Assicuratevi che voi, in qualità di manutentori del kernel, siate nel secondo +gruppo. 
Inchinatevi dinanzi a loro perché saranno le persone che vi renderanno +il lavoro più facile. In particolare, prenderanno le decisioni per voi, che è +l'oggetto di questo gioco. + +Quindi quando trovate qualcuno più sveglio di voi, prendetevela comoda. +Le vostre responsabilità dirigenziali si ridurranno in gran parte nel dire +"Sembra una buona idea - Vai", oppure "Sembra buono, ma invece circa questo e +quello?". La seconda versione in particolare è una gran modo per imparare +qualcosa di nuovo circa "questo e quello" o di sembrare **extra** dirigenziali +sottolineando qualcosa alla quale i più svegli non avevano pensato. In +entrambe i casi, vincete. + +Una cosa alla quale dovete fare attenzione è che l'essere grandi in qualcosa +non si traduce automaticamente nell'essere grandi anche in altre cose. Quindi +dovreste dare una spintarella alle persone in una specifica direzione, ma +diciamocelo, potrebbero essere bravi in ciò che fanno e far schifo in tutto +il resto. La buona notizia è che le persone tendono a gravitare attorno a ciò +in cui sono bravi, quindi non state facendo nulla di irreversibile quando li +spingete verso una certa direzione, solo non spingete troppo. + + +4) Addossare le colpe +--------------------- + +Le cose andranno male, e le persone vogliono qualcuno da incolpare. Sarete voi. + +Non è poi così difficile accettare la colpa, specialmente se le persone +riescono a capire che non era **tutta** colpa vostra. Il che ci porta +sulla miglior strada per assumersi la colpa: fatelo per qualcun'altro. +Vi sentirete bene nel assumervi la responsabilità, e loro si sentiranno +bene nel non essere incolpati, e coloro che hanno perso i loro 36GB di +pornografia a causa della vostra incompetenza ammetteranno a malincuore che +almeno non avete cercato di fare il furbetto. + +Successivamente fate in modo che gli sviluppatori che in realtà hanno fallito +(se riuscite a trovarli) sappiano **in privato** che sono "fottuti". 
+Questo non per fargli sapere che la prossima volta possono evitarselo ma per +fargli capire che sono in debito. E, forse cosa più importante, sono loro che +devono sistemare la cosa. Perché, ammettiamolo, è sicuro non sarete voi a +farlo. + +Assumersi la colpa è anche ciò che vi rendere dirigenti in prima battuta. +È parte di ciò che spinge gli altri a fidarsi di voi, e vi garantisce +la gloria potenziale, perché siete gli unici a dire "Ho fatto una cavolata". +E se avete seguito le regole precedenti, sarete decisamente bravi nel dirlo. + + +5) Le cose da evitare +--------------------- + +Esiste una cosa che le persone odiano più che essere chiamate "teste di c****", +ed è essere chiamate "teste di c****" con fare da bigotto. Se per il primo +caso potrete comunque scusarvi, per il secondo non ve ne verrà data nemmeno +l'opportunità. Probabilmente smetteranno di ascoltarvi anche se tutto sommato +state svolgendo un buon lavoro. + +Tutti crediamo di essere migliori degli altri, il che significa che quando +qualcuno inizia a darsi delle arie, ci da **davvero** fastidio. Potreste anche +essere moralmente ed intellettualmente superiore a tutti quelli attorno a voi, +ma non cercate di renderlo ovvio per gli altri a meno che non **vogliate** +veramente far arrabbiare qualcuno [#f3]_. + +Allo stesso modo evitate di essere troppo gentili e pacati. Le buone maniere +facilmente finiscono per strabordare e nascondere i problemi, e come si usa +dire, "su internet nessuno può sentire la vostra pacatezza". Usate argomenti +diretti per farvi capire, non potete sperare che la gente capisca in altro +modo. + +Un po' di umorismo può aiutare a smorzare sia la franchezza che la moralità. +Andare oltre i limiti al punto d'essere ridicolo può portare dei punti a casa +senza renderlo spiacevole per i riceventi, i quali penseranno che stavate +facendo gli scemi. Può anche aiutare a lasciare andare quei blocchi mentali +che abbiamo nei confronti delle critiche. + +.. 
[#f3] Suggerimento: i forum di discussione su internet, che non sono + collegati col vostro lavoro, sono ottimi modi per sfogare la frustrazione + verso altre persone. Di tanto in tanto scrivete messaggi offensivi col ghigno + in faccia per infiammare qualche discussione: vi sentirete purificati. Solo + cercate di non cagare troppo vicino a casa. + +6) Perché io? +------------- + +Dato che la vostra responsabilità principale è quella di prendervi le colpe +d'altri, e rendere dolorosamente ovvio a tutti che siete degli incompetenti, +la domanda naturale che ne segue sarà : perché dovrei fare tutto ciò? + +Innanzitutto, potreste diventare o no popolari al punto da avere la fila di +ragazzine (o ragazzini, evitiamo pregiudizi o sessismo) che gridano e bussano +alla porta del vostro camerino, ma comunque **proverete** un immenso senso di +realizzazione personale dall'essere "in carica". Dimenticate il fatto che voi +state discutendo con tutti e che cercate di inseguirli il più velocemente che +potete. Tutti continueranno a pensare che voi siete la persona in carica. + +È un bel lavoro se riuscite ad adattarlo a voi. -- cgit v1.2.3 From 4951d27b099bd24f87a093f67d91618742bd3a65 Mon Sep 17 00:00:00 2001 From: Bumsik Kim Date: Fri, 3 Apr 2020 12:15:07 +0900 Subject: watchdog: clarify that stop() is optional The commit d0684c8a9354 ("watchdog: Make stop function optional") made stop function not mandatory, but the comments and the doc weren't reflected. Fix it to clarify. 
Signed-off-by: Bumsik Kim Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20200403031507.63487-1-k.bumsik@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/watchdog/convert_drivers_to_kernel_api.rst | 2 +- Documentation/watchdog/watchdog-kernel-api.rst | 2 +- include/linux/watchdog.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/watchdog/convert_drivers_to_kernel_api.rst b/Documentation/watchdog/convert_drivers_to_kernel_api.rst index dd934cc08e40..51b999b5551a 100644 --- a/Documentation/watchdog/convert_drivers_to_kernel_api.rst +++ b/Documentation/watchdog/convert_drivers_to_kernel_api.rst @@ -115,7 +115,7 @@ Add the watchdog operations --------------------------- All possible callbacks are defined in 'struct watchdog_ops'. You can find it -explained in 'watchdog-kernel-api.txt' in this directory. start(), stop() and +explained in 'watchdog-kernel-api.txt' in this directory. start() and owner must be set, the rest are optional. You will easily find corresponding functions in the old driver. 
Note that you will now get a pointer to the watchdog_device as a parameter to these functions, so you probably have to diff --git a/Documentation/watchdog/watchdog-kernel-api.rst b/Documentation/watchdog/watchdog-kernel-api.rst index 864edbe932c1..068a55ee0d4a 100644 --- a/Documentation/watchdog/watchdog-kernel-api.rst +++ b/Documentation/watchdog/watchdog-kernel-api.rst @@ -123,8 +123,8 @@ The list of watchdog operations is defined as:: struct module *owner; /* mandatory operations */ int (*start)(struct watchdog_device *); - int (*stop)(struct watchdog_device *); /* optional operations */ + int (*stop)(struct watchdog_device *); int (*ping)(struct watchdog_device *); unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 417d9f37077a..1464ce6ffa31 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -37,15 +37,15 @@ struct watchdog_governor; * * The watchdog_ops structure contains a list of low-level operations * that control a watchdog device. It also contains the module that owns - * these operations. The start and stop function are mandatory, all other + * these operations. The start function is mandatory, all other * functions are optional. 
*/ struct watchdog_ops { struct module *owner; /* mandatory operations */ int (*start)(struct watchdog_device *); - int (*stop)(struct watchdog_device *); /* optional operations */ + int (*stop)(struct watchdog_device *); int (*ping)(struct watchdog_device *); unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); -- cgit v1.2.3 From 90c165f0de3adad4719e65ab0c31d59edf5bd481 Mon Sep 17 00:00:00 2001 From: Ricardo Cañuelo Date: Fri, 3 Apr 2020 11:36:17 +0200 Subject: docs: pr_*() kerneldocs and basic printk docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kerneldocs comments to the pr_*() macros in printk.h. Add a new rst node in the core-api manual describing the basic usage of printk and the related macro aliases. Signed-off-by: Ricardo Cañuelo Link: https://lore.kernel.org/r/20200403093617.18003-1-ricardo.canuelo@collabora.com Signed-off-by: Jonathan Corbet --- Documentation/core-api/index.rst | 1 + Documentation/core-api/printk-basics.rst | 115 ++++++++++++++++++++++++++++++ Documentation/core-api/printk-formats.rst | 2 + include/linux/printk.h | 112 +++++++++++++++++++++++++---- 4 files changed, 218 insertions(+), 12 deletions(-) create mode 100644 Documentation/core-api/printk-basics.rst diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 0897ad12c119..49e3da910d9e 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -18,6 +18,7 @@ it. kernel-api workqueue + printk-basics printk-formats symbol-namespaces diff --git a/Documentation/core-api/printk-basics.rst b/Documentation/core-api/printk-basics.rst new file mode 100644 index 000000000000..563a9ce5fe1d --- /dev/null +++ b/Documentation/core-api/printk-basics.rst @@ -0,0 +1,115 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +=========================== +Message logging with printk +=========================== + +printk() is one of the most widely known functions in the Linux kernel. It's the +standard tool we have for printing messages and usually the most basic way of +tracing and debugging. If you're familiar with printf(3) you can tell printk() +is based on it, although it has some functional differences: + + - printk() messages can specify a log level. + + - the format string, while largely compatible with C99, doesn't follow the + exact same specification. It has some extensions and a few limitations + (no ``%n`` or floating point conversion specifiers). See :ref:`How to get + printk format specifiers right `. + +All printk() messages are printed to the kernel log buffer, which is a ring +buffer exported to userspace through /dev/kmsg. The usual way to read it is +using ``dmesg``. + +printk() is typically used like this:: + + printk(KERN_INFO "Message: %s\n", arg); + +where ``KERN_INFO`` is the log level (note that it's concatenated to the format +string, the log level is not a separate argument). 
The available log levels are: + ++----------------+--------+-----------------------------------------------+ +| Name | String | Alias function | ++================+========+===============================================+ +| KERN_EMERG | "0" | pr_emerg() | ++----------------+--------+-----------------------------------------------+ +| KERN_ALERT | "1" | pr_alert() | ++----------------+--------+-----------------------------------------------+ +| KERN_CRIT | "2" | pr_crit() | ++----------------+--------+-----------------------------------------------+ +| KERN_ERR | "3" | pr_err() | ++----------------+--------+-----------------------------------------------+ +| KERN_WARNING | "4" | pr_warn() | ++----------------+--------+-----------------------------------------------+ +| KERN_NOTICE | "5" | pr_notice() | ++----------------+--------+-----------------------------------------------+ +| KERN_INFO | "6" | pr_info() | ++----------------+--------+-----------------------------------------------+ +| KERN_DEBUG | "7" | pr_debug() and pr_devel() if DEBUG is defined | ++----------------+--------+-----------------------------------------------+ +| KERN_DEFAULT | "" | | ++----------------+--------+-----------------------------------------------+ +| KERN_CONT | "c" | pr_cont() | ++----------------+--------+-----------------------------------------------+ + + +The log level specifies the importance of a message. The kernel decides whether +to show the message immediately (printing it to the current console) depending +on its log level and the current *console_loglevel* (a kernel variable). If the +message priority is higher (lower log level value) than the *console_loglevel* +the message will be printed to the console. + +If the log level is omitted, the message is printed with ``KERN_DEFAULT`` +level. 
+ +You can check the current *console_loglevel* with:: + + $ cat /proc/sys/kernel/printk + 4 4 1 7 + +The result shows the *current*, *default*, *minimum* and *boot-time-default* log +levels. + +To change the current console_loglevel simply write the desired level to +``/proc/sys/kernel/printk``. For example, to print all messages to the console:: + + # echo 8 > /proc/sys/kernel/printk + +Another way, using ``dmesg``:: + + # dmesg -n 5 + +sets the console_loglevel to print KERN_WARNING (4) or more severe messages to +console. See ``dmesg(1)`` for more information. + +As an alternative to printk() you can use the ``pr_*()`` aliases for +logging. This family of macros embed the log level in the macro names. For +example:: + + pr_info("Info message no. %d\n", msg_num); + +prints a ``KERN_INFO`` message. + +Besides being more concise than the equivalent printk() calls, they can use a +common definition for the format string through the pr_fmt() macro. For +instance, defining this at the top of a source file (before any ``#include`` +directive):: + + #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +would prefix every pr_*() message in that file with the module and function name +that originated the message. + +For debugging purposes there are also two conditionally-compiled macros: +pr_debug() and pr_devel(), which are compiled-out unless ``DEBUG`` (or +also ``CONFIG_DYNAMIC_DEBUG`` in the case of pr_debug()) is defined. + + +Function reference +================== + +.. kernel-doc:: kernel/printk/printk.c + :functions: printk + +.. 
kernel-doc:: include/linux/printk.h + :functions: pr_emerg pr_alert pr_crit pr_err pr_warn pr_notice pr_info + pr_fmt pr_debug pr_devel pr_cont diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 8ebe46b1af39..1e3838652348 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -2,6 +2,8 @@ How to get printk format specifiers right ========================================= +.. _printk-specifiers: + :Author: Randy Dunlap :Author: Andrew Murray diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..ccea5d4a509e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -279,39 +279,116 @@ static inline void printk_safe_flush_on_panic(void) extern int kptr_restrict; +/** + * pr_fmt - used by the pr_*() macros to generate the printk format string + * @fmt: format string passed from a pr_*() macro + * + * This macro can be used to generate a unified format string for pr_*() + * macros. A common use is to prefix all pr_*() messages in a file with a common + * string. For example, defining this at the top of a source file: + * + * #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + * + * would prefix all pr_info, pr_emerg... messages in the file with the module + * name. + */ #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif -/* - * These can be used to print at the various log levels. - * All of these will print unconditionally, although note that pr_debug() - * and other debug macros are compiled out unless either DEBUG is defined - * or CONFIG_DYNAMIC_DEBUG is set. +/** + * pr_emerg - Print an emergency-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to + * generate the format string. */ #define pr_emerg(fmt, ...) 
\ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_alert - Print an alert-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_alert(fmt, ...) \ printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_crit - Print a critical-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_crit(fmt, ...) \ printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_err - Print an error-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_err(fmt, ...) \ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_warn - Print a warning-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt() + * to generate the format string. + */ #define pr_warn(fmt, ...) \ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_notice - Print a notice-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_notice(fmt, ...) \ printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) +/** + * pr_info - Print an info-level message + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to + * generate the format string. + */ #define pr_info(fmt, ...) 
\ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -/* - * Like KERN_CONT, pr_cont() should only be used when continuing - * a line with no newline ('\n') enclosed. Otherwise it defaults - * back to KERN_DEFAULT. + +/** + * pr_cont - Continues a previous log message in the same line. + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_CONT loglevel. It should only be + * used when continuing a log message with no newline ('\n') enclosed. Otherwise + * it defaults back to KERN_DEFAULT loglevel. */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) -/* pr_devel() should produce zero code unless DEBUG is defined */ +/** + * pr_devel - Print a debug-level message conditionally + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is + * defined. Otherwise it does nothing. + * + * It uses pr_fmt() to generate the format string. + */ #ifdef DEBUG #define pr_devel(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) @@ -325,8 +402,19 @@ extern int kptr_restrict; #if defined(CONFIG_DYNAMIC_DEBUG) #include -/* dynamic_pr_debug() uses pr_fmt() internally so we don't need it here */ -#define pr_debug(fmt, ...) \ +/** + * pr_debug - Print a debug-level message conditionally + * @fmt: format string + * @...: arguments for the format string + * + * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is + * set. Otherwise, if DEBUG is defined, it's equivalent to a printk with + * KERN_DEBUG loglevel. If DEBUG is not defined it does nothing. + * + * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses + * pr_fmt() internally). + */ +#define pr_debug(fmt, ...) \ dynamic_pr_debug(fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define pr_debug(fmt, ...) 
\ -- cgit v1.2.3 From 14bbe3e33710be52f21d61253a94c5f44a696d02 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 1 Apr 2020 10:33:43 -0700 Subject: docs: Add rbtree documentation to the core-api This file is close enough to being in rst format that I didn't feel the need to alter it in any way. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Michel Lespinasse Link: https://lore.kernel.org/r/20200401173343.17472-1-willy@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/core-api/index.rst | 1 + Documentation/core-api/rbtree.rst | 429 +++++++++++++++++++++++++++++++++ Documentation/rbtree.txt | 429 --------------------------------- include/linux/rbtree.h | 2 +- include/linux/rbtree_augmented.h | 2 +- lib/Kconfig | 2 +- tools/include/linux/rbtree.h | 2 +- tools/include/linux/rbtree_augmented.h | 2 +- 8 files changed, 435 insertions(+), 434 deletions(-) create mode 100644 Documentation/core-api/rbtree.rst delete mode 100644 Documentation/rbtree.txt diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 49e3da910d9e..b29c4a07beda 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -35,6 +35,7 @@ Library functionality that is used throughout the kernel. xarray idr circular-buffers + rbtree generic-radix-tree packing timekeeping diff --git a/Documentation/core-api/rbtree.rst b/Documentation/core-api/rbtree.rst new file mode 100644 index 000000000000..523d54b60087 --- /dev/null +++ b/Documentation/core-api/rbtree.rst @@ -0,0 +1,429 @@ +================================= +Red-black Trees (rbtree) in Linux +================================= + + +:Date: January 18, 2007 +:Author: Rob Landley + +What are red-black trees, and what are they for? +------------------------------------------------ + +Red-black trees are a type of self-balancing binary search tree, used for +storing sortable key/value data pairs. 
This differs from radix trees (which +are used to efficiently store sparse arrays and thus use long integer indexes +to insert/access/delete nodes) and hash tables (which are not kept sorted to +be easily traversed in order, and must be tuned for a specific size and +hash function where rbtrees scale gracefully storing arbitrary keys). + +Red-black trees are similar to AVL trees, but provide faster real-time bounded +worst case performance for insertion and deletion (at most two rotations and +three rotations, respectively, to balance the tree), with slightly slower +(but still O(log n)) lookup time. + +To quote Linux Weekly News: + + There are a number of red-black trees in use in the kernel. + The deadline and CFQ I/O schedulers employ rbtrees to + track requests; the packet CD/DVD driver does the same. + The high-resolution timer code uses an rbtree to organize outstanding + timer requests. The ext3 filesystem tracks directory entries in a + red-black tree. Virtual memory areas (VMAs) are tracked with red-black + trees, as are epoll file descriptors, cryptographic keys, and network + packets in the "hierarchical token bucket" scheduler. + +This document covers use of the Linux rbtree implementation. For more +information on the nature and implementation of Red Black Trees, see: + + Linux Weekly News article on red-black trees + http://lwn.net/Articles/184495/ + + Wikipedia entry on red-black trees + http://en.wikipedia.org/wiki/Red-black_tree + +Linux implementation of red-black trees +--------------------------------------- + +Linux's rbtree implementation lives in the file "lib/rbtree.c". To use it, +"#include ". + +The Linux rbtree implementation is optimized for speed, and thus has one +less layer of indirection (and better cache locality) than more traditional +tree implementations. Instead of using pointers to separate rb_node and data +structures, each instance of struct rb_node is embedded in the data structure +it organizes. 
And instead of using a comparison callback function pointer, +users are expected to write their own tree search and insert functions +which call the provided rbtree functions. Locking is also left up to the +user of the rbtree code. + +Creating a new rbtree +--------------------- + +Data nodes in an rbtree tree are structures containing a struct rb_node member:: + + struct mytype { + struct rb_node node; + char *keystring; + }; + +When dealing with a pointer to the embedded struct rb_node, the containing data +structure may be accessed with the standard container_of() macro. In addition, +individual members may be accessed directly via rb_entry(node, type, member). + +At the root of each rbtree is an rb_root structure, which is initialized to be +empty via: + + struct rb_root mytree = RB_ROOT; + +Searching for a value in an rbtree +---------------------------------- + +Writing a search function for your tree is fairly straightforward: start at the +root, compare each value, and follow the left or right branch as necessary. + +Example:: + + struct mytype *my_search(struct rb_root *root, char *string) + { + struct rb_node *node = root->rb_node; + + while (node) { + struct mytype *data = container_of(node, struct mytype, node); + int result; + + result = strcmp(string, data->keystring); + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + return NULL; + } + +Inserting data into an rbtree +----------------------------- + +Inserting data in the tree involves first searching for the place to insert the +new node, then inserting the node and rebalancing ("recoloring") the tree. + +The search for insertion differs from the previous search by finding the +location of the pointer on which to graft the new node. The new node also +needs a link to its parent node for rebalancing purposes. 
+ +Example:: + + int my_insert(struct rb_root *root, struct mytype *data) + { + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct mytype *this = container_of(*new, struct mytype, node); + int result = strcmp(data->keystring, this->keystring); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + return FALSE; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + + return TRUE; + } + +Removing or replacing existing data in an rbtree +------------------------------------------------ + +To remove an existing node from a tree, call:: + + void rb_erase(struct rb_node *victim, struct rb_root *tree); + +Example:: + + struct mytype *data = mysearch(&mytree, "walrus"); + + if (data) { + rb_erase(&data->node, &mytree); + myfree(data); + } + +To replace an existing node in a tree with a new one with the same key, call:: + + void rb_replace_node(struct rb_node *old, struct rb_node *new, + struct rb_root *tree); + +Replacing a node this way does not re-sort the tree: If the new node doesn't +have the same key as the old node, the rbtree will probably become corrupted. + +Iterating through the elements stored in an rbtree (in sort order) +------------------------------------------------------------------ + +Four functions are provided for iterating through an rbtree's contents in +sorted order. 
These work on arbitrary trees, and should not need to be +modified or wrapped (except for locking purposes):: + + struct rb_node *rb_first(struct rb_root *tree); + struct rb_node *rb_last(struct rb_root *tree); + struct rb_node *rb_next(struct rb_node *node); + struct rb_node *rb_prev(struct rb_node *node); + +To start iterating, call rb_first() or rb_last() with a pointer to the root +of the tree, which will return a pointer to the node structure contained in +the first or last element in the tree. To continue, fetch the next or previous +node by calling rb_next() or rb_prev() on the current node. This will return +NULL when there are no more nodes left. + +The iterator functions return a pointer to the embedded struct rb_node, from +which the containing data structure may be accessed with the container_of() +macro, and individual members may be accessed directly via +rb_entry(node, type, member). + +Example:: + + struct rb_node *node; + for (node = rb_first(&mytree); node; node = rb_next(node)) + printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); + +Cached rbtrees +-------------- + +Computing the leftmost (smallest) node is quite a common task for binary +search trees, such as for traversals or users relying on a the particular +order for their own logic. To this end, users can use 'struct rb_root_cached' +to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding +potentially expensive tree iterations. This is done at negligible runtime +overhead for maintanence; albeit larger memory footprint. + +Similar to the rb_root structure, cached rbtrees are initialized to be +empty via:: + + struct rb_root_cached mytree = RB_ROOT_CACHED; + +Cached rbtree is simply a regular rb_root with an extra pointer to cache the +leftmost node. 
This allows rb_root_cached to exist wherever rb_root does, +which permits augmented trees to be supported as well as only a few extra +interfaces:: + + struct rb_node *rb_first_cached(struct rb_root_cached *tree); + void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool); + void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); + +Both insert and erase calls have their respective counterpart of augmented +trees:: + + void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *, + bool, struct rb_augment_callbacks *); + void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *, + struct rb_augment_callbacks *); + + +Support for Augmented rbtrees +----------------------------- + +Augmented rbtree is an rbtree with "some" additional data stored in +each node, where the additional data for node N must be a function of +the contents of all nodes in the subtree rooted at N. This data can +be used to augment some new functionality to rbtree. Augmented rbtree +is an optional feature built on top of basic rbtree infrastructure. +An rbtree user who wants this feature will have to call the augmentation +functions with the user provided augmentation callback when inserting +and erasing nodes. + +C files implementing augmented rbtree manipulation must include + instead of . Note that +linux/rbtree_augmented.h exposes some rbtree implementations details +you are not expected to rely on; please stick to the documented APIs +there and do not include from header files +either so as to minimize chances of your users accidentally relying on +such implementation details. + +On insertion, the user must update the augmented information on the path +leading to the inserted node, then call rb_link_node() as usual and +rb_augment_inserted() instead of the usual rb_insert_color() call. 
+If rb_augment_inserted() rebalances the rbtree, it will callback into +a user provided function to update the augmented information on the +affected subtrees. + +When erasing a node, the user must call rb_erase_augmented() instead of +rb_erase(). rb_erase_augmented() calls back into user provided functions +to updated the augmented information on affected subtrees. + +In both cases, the callbacks are provided through struct rb_augment_callbacks. +3 callbacks must be defined: + +- A propagation callback, which updates the augmented value for a given + node and its ancestors, up to a given stop point (or NULL to update + all the way to the root). + +- A copy callback, which copies the augmented value for a given subtree + to a newly assigned subtree root. + +- A tree rotation callback, which copies the augmented value for a given + subtree to a newly assigned subtree root AND recomputes the augmented + information for the former subtree root. + +The compiled code for rb_erase_augmented() may inline the propagation and +copy callbacks, which results in a large function, so each augmented rbtree +user should have a single rb_erase_augmented() call site in order to limit +compiled code size. + + +Sample usage +^^^^^^^^^^^^ + +Interval tree is an example of augmented rb tree. Reference - +"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. +More details about interval trees: + +Classical rbtree has a single key and it cannot be directly used to store +interval ranges like [lo:hi] and do a quick lookup for any overlap with a new +lo:hi or to find whether there is an exact match for a new lo:hi. + +However, rbtree can be augmented to store such interval ranges in a structured +way making it possible to do efficient lookup and exact match. + +This "extra information" stored in each node is the maximum hi +(max_hi) value among all the nodes that are its descendants. 
This +information can be maintained at each node just be looking at the node +and its immediate children. And this will be used in O(log n) lookup +for lowest match (lowest start address among all possible matches) +with something like:: + + struct interval_tree_node * + interval_tree_first_match(struct rb_root *root, + unsigned long start, unsigned long last) + { + struct interval_tree_node *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, struct interval_tree_node, rb); + + while (true) { + if (node->rb.rb_left) { + struct interval_tree_node *left = + rb_entry(node->rb.rb_left, + struct interval_tree_node, rb); + if (left->__subtree_last >= start) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. + */ + node = left; + continue; + } + } + if (node->start <= last) { /* Cond1 */ + if (node->last >= start) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->rb.rb_right) { + node = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb); + if (node->__subtree_last >= start) + continue; + } + } + return NULL; /* No match */ + } + } + +Insertion/removal are defined using the following augmented callbacks:: + + static inline unsigned long + compute_subtree_last(struct interval_tree_node *node) + { + unsigned long max = node->last, subtree_last; + if (node->rb.rb_left) { + subtree_last = rb_entry(node->rb.rb_left, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + if (node->rb.rb_right) { + subtree_last = rb_entry(node->rb.rb_right, + struct interval_tree_node, rb)->__subtree_last; + if (max < subtree_last) + max = subtree_last; + } + return max; + } + + static void augment_propagate(struct rb_node *rb, struct rb_node *stop) + { + while (rb != stop) 
{ + struct interval_tree_node *node = + rb_entry(rb, struct interval_tree_node, rb); + unsigned long subtree_last = compute_subtree_last(node); + if (node->__subtree_last == subtree_last) + break; + node->__subtree_last = subtree_last; + rb = rb_parent(&node->rb); + } + } + + static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) + { + struct interval_tree_node *old = + rb_entry(rb_old, struct interval_tree_node, rb); + struct interval_tree_node *new = + rb_entry(rb_new, struct interval_tree_node, rb); + + new->__subtree_last = old->__subtree_last; + } + + static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) + { + struct interval_tree_node *old = + rb_entry(rb_old, struct interval_tree_node, rb); + struct interval_tree_node *new = + rb_entry(rb_new, struct interval_tree_node, rb); + + new->__subtree_last = old->__subtree_last; + old->__subtree_last = compute_subtree_last(old); + } + + static const struct rb_augment_callbacks augment_callbacks = { + augment_propagate, augment_copy, augment_rotate + }; + + void interval_tree_insert(struct interval_tree_node *node, + struct rb_root *root) + { + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + unsigned long start = node->start, last = node->last; + struct interval_tree_node *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct interval_tree_node, rb); + if (parent->__subtree_last < last) + parent->__subtree_last = last; + if (start < parent->start) + link = &parent->rb.rb_left; + else + link = &parent->rb.rb_right; + } + + node->__subtree_last = last; + rb_link_node(&node->rb, rb_parent, link); + rb_insert_augmented(&node->rb, root, &augment_callbacks); + } + + void interval_tree_remove(struct interval_tree_node *node, + struct rb_root *root) + { + rb_erase_augmented(&node->rb, root, &augment_callbacks); + } diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt deleted file mode 100644 index 523d54b60087..000000000000 --- 
a/Documentation/rbtree.txt +++ /dev/null @@ -1,429 +0,0 @@ -================================= -Red-black Trees (rbtree) in Linux -================================= - - -:Date: January 18, 2007 -:Author: Rob Landley - -What are red-black trees, and what are they for? ------------------------------------------------- - -Red-black trees are a type of self-balancing binary search tree, used for -storing sortable key/value data pairs. This differs from radix trees (which -are used to efficiently store sparse arrays and thus use long integer indexes -to insert/access/delete nodes) and hash tables (which are not kept sorted to -be easily traversed in order, and must be tuned for a specific size and -hash function where rbtrees scale gracefully storing arbitrary keys). - -Red-black trees are similar to AVL trees, but provide faster real-time bounded -worst case performance for insertion and deletion (at most two rotations and -three rotations, respectively, to balance the tree), with slightly slower -(but still O(log n)) lookup time. - -To quote Linux Weekly News: - - There are a number of red-black trees in use in the kernel. - The deadline and CFQ I/O schedulers employ rbtrees to - track requests; the packet CD/DVD driver does the same. - The high-resolution timer code uses an rbtree to organize outstanding - timer requests. The ext3 filesystem tracks directory entries in a - red-black tree. Virtual memory areas (VMAs) are tracked with red-black - trees, as are epoll file descriptors, cryptographic keys, and network - packets in the "hierarchical token bucket" scheduler. - -This document covers use of the Linux rbtree implementation. 
For more -information on the nature and implementation of Red Black Trees, see: - - Linux Weekly News article on red-black trees - http://lwn.net/Articles/184495/ - - Wikipedia entry on red-black trees - http://en.wikipedia.org/wiki/Red-black_tree - -Linux implementation of red-black trees ---------------------------------------- - -Linux's rbtree implementation lives in the file "lib/rbtree.c". To use it, -"#include ". - -The Linux rbtree implementation is optimized for speed, and thus has one -less layer of indirection (and better cache locality) than more traditional -tree implementations. Instead of using pointers to separate rb_node and data -structures, each instance of struct rb_node is embedded in the data structure -it organizes. And instead of using a comparison callback function pointer, -users are expected to write their own tree search and insert functions -which call the provided rbtree functions. Locking is also left up to the -user of the rbtree code. - -Creating a new rbtree ---------------------- - -Data nodes in an rbtree tree are structures containing a struct rb_node member:: - - struct mytype { - struct rb_node node; - char *keystring; - }; - -When dealing with a pointer to the embedded struct rb_node, the containing data -structure may be accessed with the standard container_of() macro. In addition, -individual members may be accessed directly via rb_entry(node, type, member). - -At the root of each rbtree is an rb_root structure, which is initialized to be -empty via: - - struct rb_root mytree = RB_ROOT; - -Searching for a value in an rbtree ----------------------------------- - -Writing a search function for your tree is fairly straightforward: start at the -root, compare each value, and follow the left or right branch as necessary. 
- -Example:: - - struct mytype *my_search(struct rb_root *root, char *string) - { - struct rb_node *node = root->rb_node; - - while (node) { - struct mytype *data = container_of(node, struct mytype, node); - int result; - - result = strcmp(string, data->keystring); - - if (result < 0) - node = node->rb_left; - else if (result > 0) - node = node->rb_right; - else - return data; - } - return NULL; - } - -Inserting data into an rbtree ------------------------------ - -Inserting data in the tree involves first searching for the place to insert the -new node, then inserting the node and rebalancing ("recoloring") the tree. - -The search for insertion differs from the previous search by finding the -location of the pointer on which to graft the new node. The new node also -needs a link to its parent node for rebalancing purposes. - -Example:: - - int my_insert(struct rb_root *root, struct mytype *data) - { - struct rb_node **new = &(root->rb_node), *parent = NULL; - - /* Figure out where to put new node */ - while (*new) { - struct mytype *this = container_of(*new, struct mytype, node); - int result = strcmp(data->keystring, this->keystring); - - parent = *new; - if (result < 0) - new = &((*new)->rb_left); - else if (result > 0) - new = &((*new)->rb_right); - else - return FALSE; - } - - /* Add new node and rebalance tree. 
*/ - rb_link_node(&data->node, parent, new); - rb_insert_color(&data->node, root); - - return TRUE; - } - -Removing or replacing existing data in an rbtree ------------------------------------------------- - -To remove an existing node from a tree, call:: - - void rb_erase(struct rb_node *victim, struct rb_root *tree); - -Example:: - - struct mytype *data = mysearch(&mytree, "walrus"); - - if (data) { - rb_erase(&data->node, &mytree); - myfree(data); - } - -To replace an existing node in a tree with a new one with the same key, call:: - - void rb_replace_node(struct rb_node *old, struct rb_node *new, - struct rb_root *tree); - -Replacing a node this way does not re-sort the tree: If the new node doesn't -have the same key as the old node, the rbtree will probably become corrupted. - -Iterating through the elements stored in an rbtree (in sort order) ------------------------------------------------------------------- - -Four functions are provided for iterating through an rbtree's contents in -sorted order. These work on arbitrary trees, and should not need to be -modified or wrapped (except for locking purposes):: - - struct rb_node *rb_first(struct rb_root *tree); - struct rb_node *rb_last(struct rb_root *tree); - struct rb_node *rb_next(struct rb_node *node); - struct rb_node *rb_prev(struct rb_node *node); - -To start iterating, call rb_first() or rb_last() with a pointer to the root -of the tree, which will return a pointer to the node structure contained in -the first or last element in the tree. To continue, fetch the next or previous -node by calling rb_next() or rb_prev() on the current node. This will return -NULL when there are no more nodes left. - -The iterator functions return a pointer to the embedded struct rb_node, from -which the containing data structure may be accessed with the container_of() -macro, and individual members may be accessed directly via -rb_entry(node, type, member). 
- -Example:: - - struct rb_node *node; - for (node = rb_first(&mytree); node; node = rb_next(node)) - printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); - -Cached rbtrees --------------- - -Computing the leftmost (smallest) node is quite a common task for binary -search trees, such as for traversals or users relying on a the particular -order for their own logic. To this end, users can use 'struct rb_root_cached' -to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding -potentially expensive tree iterations. This is done at negligible runtime -overhead for maintanence; albeit larger memory footprint. - -Similar to the rb_root structure, cached rbtrees are initialized to be -empty via:: - - struct rb_root_cached mytree = RB_ROOT_CACHED; - -Cached rbtree is simply a regular rb_root with an extra pointer to cache the -leftmost node. This allows rb_root_cached to exist wherever rb_root does, -which permits augmented trees to be supported as well as only a few extra -interfaces:: - - struct rb_node *rb_first_cached(struct rb_root_cached *tree); - void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool); - void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); - -Both insert and erase calls have their respective counterpart of augmented -trees:: - - void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *, - bool, struct rb_augment_callbacks *); - void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *, - struct rb_augment_callbacks *); - - -Support for Augmented rbtrees ------------------------------ - -Augmented rbtree is an rbtree with "some" additional data stored in -each node, where the additional data for node N must be a function of -the contents of all nodes in the subtree rooted at N. This data can -be used to augment some new functionality to rbtree. Augmented rbtree -is an optional feature built on top of basic rbtree infrastructure. 
-An rbtree user who wants this feature will have to call the augmentation -functions with the user provided augmentation callback when inserting -and erasing nodes. - -C files implementing augmented rbtree manipulation must include - instead of . Note that -linux/rbtree_augmented.h exposes some rbtree implementations details -you are not expected to rely on; please stick to the documented APIs -there and do not include from header files -either so as to minimize chances of your users accidentally relying on -such implementation details. - -On insertion, the user must update the augmented information on the path -leading to the inserted node, then call rb_link_node() as usual and -rb_augment_inserted() instead of the usual rb_insert_color() call. -If rb_augment_inserted() rebalances the rbtree, it will callback into -a user provided function to update the augmented information on the -affected subtrees. - -When erasing a node, the user must call rb_erase_augmented() instead of -rb_erase(). rb_erase_augmented() calls back into user provided functions -to updated the augmented information on affected subtrees. - -In both cases, the callbacks are provided through struct rb_augment_callbacks. -3 callbacks must be defined: - -- A propagation callback, which updates the augmented value for a given - node and its ancestors, up to a given stop point (or NULL to update - all the way to the root). - -- A copy callback, which copies the augmented value for a given subtree - to a newly assigned subtree root. - -- A tree rotation callback, which copies the augmented value for a given - subtree to a newly assigned subtree root AND recomputes the augmented - information for the former subtree root. - -The compiled code for rb_erase_augmented() may inline the propagation and -copy callbacks, which results in a large function, so each augmented rbtree -user should have a single rb_erase_augmented() call site in order to limit -compiled code size. 
- - -Sample usage -^^^^^^^^^^^^ - -Interval tree is an example of augmented rb tree. Reference - -"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. -More details about interval trees: - -Classical rbtree has a single key and it cannot be directly used to store -interval ranges like [lo:hi] and do a quick lookup for any overlap with a new -lo:hi or to find whether there is an exact match for a new lo:hi. - -However, rbtree can be augmented to store such interval ranges in a structured -way making it possible to do efficient lookup and exact match. - -This "extra information" stored in each node is the maximum hi -(max_hi) value among all the nodes that are its descendants. This -information can be maintained at each node just be looking at the node -and its immediate children. And this will be used in O(log n) lookup -for lowest match (lowest start address among all possible matches) -with something like:: - - struct interval_tree_node * - interval_tree_first_match(struct rb_root *root, - unsigned long start, unsigned long last) - { - struct interval_tree_node *node; - - if (!root->rb_node) - return NULL; - node = rb_entry(root->rb_node, struct interval_tree_node, rb); - - while (true) { - if (node->rb.rb_left) { - struct interval_tree_node *left = - rb_entry(node->rb.rb_left, - struct interval_tree_node, rb); - if (left->__subtree_last >= start) { - /* - * Some nodes in left subtree satisfy Cond2. - * Iterate to find the leftmost such node N. - * If it also satisfies Cond1, that's the match - * we are looking for. Otherwise, there is no - * matching interval as nodes to the right of N - * can't satisfy Cond1 either. 
- */ - node = left; - continue; - } - } - if (node->start <= last) { /* Cond1 */ - if (node->last >= start) /* Cond2 */ - return node; /* node is leftmost match */ - if (node->rb.rb_right) { - node = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb); - if (node->__subtree_last >= start) - continue; - } - } - return NULL; /* No match */ - } - } - -Insertion/removal are defined using the following augmented callbacks:: - - static inline unsigned long - compute_subtree_last(struct interval_tree_node *node) - { - unsigned long max = node->last, subtree_last; - if (node->rb.rb_left) { - subtree_last = rb_entry(node->rb.rb_left, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - if (node->rb.rb_right) { - subtree_last = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - return max; - } - - static void augment_propagate(struct rb_node *rb, struct rb_node *stop) - { - while (rb != stop) { - struct interval_tree_node *node = - rb_entry(rb, struct interval_tree_node, rb); - unsigned long subtree_last = compute_subtree_last(node); - if (node->__subtree_last == subtree_last) - break; - node->__subtree_last = subtree_last; - rb = rb_parent(&node->rb); - } - } - - static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) - { - struct interval_tree_node *old = - rb_entry(rb_old, struct interval_tree_node, rb); - struct interval_tree_node *new = - rb_entry(rb_new, struct interval_tree_node, rb); - - new->__subtree_last = old->__subtree_last; - } - - static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) - { - struct interval_tree_node *old = - rb_entry(rb_old, struct interval_tree_node, rb); - struct interval_tree_node *new = - rb_entry(rb_new, struct interval_tree_node, rb); - - new->__subtree_last = old->__subtree_last; - old->__subtree_last = compute_subtree_last(old); - } - - static const struct 
rb_augment_callbacks augment_callbacks = { - augment_propagate, augment_copy, augment_rotate - }; - - void interval_tree_insert(struct interval_tree_node *node, - struct rb_root *root) - { - struct rb_node **link = &root->rb_node, *rb_parent = NULL; - unsigned long start = node->start, last = node->last; - struct interval_tree_node *parent; - - while (*link) { - rb_parent = *link; - parent = rb_entry(rb_parent, struct interval_tree_node, rb); - if (parent->__subtree_last < last) - parent->__subtree_last = last; - if (start < parent->start) - link = &parent->rb.rb_left; - else - link = &parent->rb.rb_right; - } - - node->__subtree_last = last; - rb_link_node(&node->rb, rb_parent, link); - rb_insert_augmented(&node->rb, root, &augment_callbacks); - } - - void interval_tree_remove(struct interval_tree_node *node, - struct rb_root *root) - { - rb_erase_augmented(&node->rb, root, &augment_callbacks); - } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 1fd61a9af45c..d7db17996322 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -11,7 +11,7 @@ I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... - See Documentation/rbtree.txt for documentation and samples. + See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 724b0d036b57..d1c53e9d8c75 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -21,7 +21,7 @@ * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * - * See Documentation/rbtree.txt for documentation and samples. + * See Documentation/core-api/rbtree.rst for documentation and samples. 
*/ struct rb_augment_callbacks { diff --git a/lib/Kconfig b/lib/Kconfig index 5d53f9609c25..8005e36c3459 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -427,7 +427,7 @@ config INTERVAL_TREE See: - Documentation/rbtree.txt + Documentation/core-api/rbtree.rst for more information. diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index e03b1ea23e0e..30dd21f976c3 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -11,7 +11,7 @@ I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... - See Documentation/rbtree.txt for documentation and samples. + See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef __TOOLS_LINUX_PERF_RBTREE_H diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 381aa948610d..570bb9794421 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -23,7 +23,7 @@ * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * - * See Documentation/rbtree.txt for documentation and samples. + * See Documentation/core-api/rbtree.rst for documentation and samples. 
*/ struct rb_augment_callbacks { -- cgit v1.2.3 From 5af438d0dcdbdbc1da4f3abcdf08fe9b1291ca02 Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Sun, 5 Apr 2020 23:06:47 +0200 Subject: doc:it_IT: add RISC-V maintenance guidelines Add translation for the RISC-V maintenance guidelines as part of the translation of things related to "process/" Signed-off-by: Federico Vaga Link: https://lore.kernel.org/r/20200405210647.24991-1-federico.vaga@vaga.pv.it Signed-off-by: Jonathan Corbet --- Documentation/translations/it_IT/process/index.rst | 1 + .../translations/it_IT/riscv/patch-acceptance.rst | 40 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 Documentation/translations/it_IT/riscv/patch-acceptance.rst diff --git a/Documentation/translations/it_IT/process/index.rst b/Documentation/translations/it_IT/process/index.rst index 012de0f3154a..c4c867132c88 100644 --- a/Documentation/translations/it_IT/process/index.rst +++ b/Documentation/translations/it_IT/process/index.rst @@ -59,6 +59,7 @@ perché non si è trovato un posto migliore. magic-number volatile-considered-harmful clang-format + ../riscv/patch-acceptance .. only:: subproject and html diff --git a/Documentation/translations/it_IT/riscv/patch-acceptance.rst b/Documentation/translations/it_IT/riscv/patch-acceptance.rst new file mode 100644 index 000000000000..edf67252b3fb --- /dev/null +++ b/Documentation/translations/it_IT/riscv/patch-acceptance.rst @@ -0,0 +1,40 @@ +.. include:: ../disclaimer-ita.rst + +:Original: :doc:`../../../riscv/patch-acceptance` +:Translator: Federico Vaga + +arch/riscv linee guida alla manutenzione per gli sviluppatori +============================================================= + +Introduzione +------------ + +L'insieme di istruzioni RISC-V sono sviluppate in modo aperto: le +bozze in fase di sviluppo sono disponibili a tutti per essere +revisionate e per essere sperimentare nelle implementazioni. 
Le bozze +dei nuovi moduli o estensioni possono cambiare in fase di sviluppo - a +volte in modo incompatibile rispetto a bozze precedenti. Questa +flessibilità può portare a dei problemi di manutenzione per il +supporto RISC-V nel kernel Linux. I manutentori Linux non amano +l'abbandono del codice, e il processo di sviluppo del kernel +preferisce codice ben revisionato e testato rispetto a quello +sperimentale. Desideriamo estendere questi stessi principi al codice +relativo all'architettura RISC-V che verrà accettato per l'inclusione +nel kernel. + +In aggiunta alla lista delle verifiche da fare prima di inviare una patch +------------------------------------------------------------------------- + +Accetteremo le patch per un nuovo modulo o estensione se la fondazione +RISC-V li classifica come "Frozen" o "Ratified". (Ovviamente, gli +sviluppatori sono liberi di mantenere una copia del kernel Linux +contenente il codice per una bozza di estensione). + +In aggiunta, la specifica RISC-V permette agli implementatori di +creare le proprie estensioni. Queste estensioni non passano +attraverso il processo di revisione della fondazione RISC-V. Per +questo motivo, al fine di evitare complicazioni o problemi di +prestazioni, accetteremo patch solo per quelle estensioni che sono +state ufficialmente accettate dalla fondazione RISC-V. (Ovviamente, +gli implementatori sono liberi di mantenere una copia del kernel Linux +contenente il codice per queste specifiche estensioni). -- cgit v1.2.3 From 7b9121040d83eb9e332f7dbc140eca17643d3586 Mon Sep 17 00:00:00 2001 From: Adrian Freund Date: Tue, 7 Apr 2020 15:05:25 +0200 Subject: Documentation: scheduler: fix outdated information on sched groups The documentation claims that two sched groups must not overlap. This is no longer true, as overlapping sched groups are used on NUMA systems. This change has been introduced by commit e3589f6c81e47 and was documented by an in-code comment in commit 35a566e6e8a18. 
Signed-off-by: Adrian Freund Link: https://lore.kernel.org/r/20200407130525.76663-1-adrian@freund.io Signed-off-by: Jonathan Corbet --- Documentation/scheduler/sched-domains.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index f7504226f445..5c4b7f4f0062 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -19,10 +19,12 @@ CPUs". Each scheduling domain must have one or more CPU groups (struct sched_group) which are organised as a circular one way linked list from the ->groups pointer. The union of cpumasks of these groups MUST be the same as the -domain's span. The intersection of cpumasks from any two of these groups -MUST be the empty set. The group pointed to by the ->groups pointer MUST -contain the CPU to which the domain belongs. Groups may be shared among -CPUs as they contain read only data after they have been set up. +domain's span. The group pointed to by the ->groups pointer MUST contain the CPU +to which the domain belongs. Groups may be shared among CPUs as they contain +read only data after they have been set up. The intersection of cpumasks from +any two of these groups may be non empty. If this is the case the SD_OVERLAP +flag is set on the corresponding scheduling domain and its groups may not be +shared between CPUs. Balancing within a sched domain occurs between groups. That is, each group is treated as one entity. 
The load of a group is defined as the sum of the -- cgit v1.2.3 From 2d5694796b6b4afe188270388813c6c89e4d3f46 Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Tue, 28 Apr 2020 15:32:25 +0200 Subject: Documentation: x86: fix space instead of tab in uefi doc Signed-off-by: Flavio Suligoi Link: https://lore.kernel.org/r/1588080745-21999-1-git-send-email-f.suligoi@asem.it Signed-off-by: Jonathan Corbet --- Documentation/x86/x86_64/uefi.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/x86/x86_64/uefi.rst b/Documentation/x86/x86_64/uefi.rst index 88c3ba32546f..3b894103a734 100644 --- a/Documentation/x86/x86_64/uefi.rst +++ b/Documentation/x86/x86_64/uefi.rst @@ -36,7 +36,7 @@ Mechanics elilo bootloader with x86_64 support, elilo configuration file, kernel image built in first step and corresponding - initrd. Instructions on building elilo and its dependencies + initrd. Instructions on building elilo and its dependencies can be found in the elilo sourceforge project. - Boot to EFI shell and invoke elilo choosing the kernel image built -- cgit v1.2.3 From 08ce0c1e1116080c2fe13b2dc2a05b884d710ef7 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Mon, 27 Apr 2020 23:44:40 -0300 Subject: mailmap: Add entry for Leonardo Bras Add an entry to connect my email addresses. Signed-off-by: Leonardo Bras Link: https://lore.kernel.org/r/20200428024439.215806-1-leobras.c@gmail.com Signed-off-by: Jonathan Corbet --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index db3754a41018..4600dcfed0d3 100644 --- a/.mailmap +++ b/.mailmap @@ -152,6 +152,7 @@ Krzysztof Kozlowski Kuninori Morimoto Leon Romanovsky Leon Romanovsky +Leonardo Bras Leonid I Ananiev Linas Vepstas Linus Lüssing -- cgit v1.2.3 From b529c06f9dc74a7ad8cb875e840967737d0455f3 Mon Sep 17 00:00:00 2001 From: Juan Manuel Méndez Rey Date: Sun, 26 Apr 2020 03:52:50 +0200 Subject: Update the documentation referencing Plan 9 from User Space. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page originally referenced for checking out Plan9 applications and libraries has been missing for quite some time and the development is carried out in github and documented on this new site. Signed-off-by: Juan Manuel Méndez Rey Link: https://lore.kernel.org/r/20200426015250.GA35090@camelot Signed-off-by: Jonathan Corbet --- Documentation/filesystems/9p.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/9p.rst b/Documentation/filesystems/9p.rst index 671fef39a802..2995279ddc24 100644 --- a/Documentation/filesystems/9p.rst +++ b/Documentation/filesystems/9p.rst @@ -192,4 +192,4 @@ For more information on the Plan 9 Operating System check out http://plan9.bell-labs.com/plan9 For information on Plan 9 from User Space (Plan 9 applications and libraries -ported to Linux/BSD/OSX/etc) check out http://swtch.com/plan9 +ported to Linux/BSD/OSX/etc) check out https://9fans.github.io/plan9port/ -- cgit v1.2.3 From 6feb76dbd14a3addce915f251580f7b89d68d356 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 25 Apr 2020 12:06:16 +0200 Subject: Documentation: zh_CN: convert to use i2c_new_client_device() Move away from the deprecated API and advertise the new one. 
Signed-off-by: Wolfram Sang Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20200425100616.3363-1-wsa+renesas@sang-engineering.com Signed-off-by: Jonathan Corbet --- Documentation/translations/zh_CN/video4linux/v4l2-framework.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt index 9c39ee58ea50..a96abcdec777 100644 --- a/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt +++ b/Documentation/translations/zh_CN/video4linux/v4l2-framework.txt @@ -488,7 +488,7 @@ struct v4l2_subdev *sd = v4l2_i2c_new_subdev(v4l2_dev, adapter, 这个函数会加载给定的模块(如果没有模块需要加载,可以为 NULL), 并用给定的 i2c 适配器结构体指针(i2c_adapter)和 器件地址(chip/address) -作为参数调用 i2c_new_device()。如果一切顺利,则就在 v4l2_device +作为参数调用 i2c_new_client_device()。如果一切顺利,则就在 v4l2_device 中注册了子设备。 你也可以利用 v4l2_i2c_new_subdev()的最后一个参数,传递一个可能的 -- cgit v1.2.3 From 920af1ce1b6e713d187f178b3fe37d3e3b12381a Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 24 Apr 2020 17:35:15 +0200 Subject: docs: orangefs: fix pvfs2tab literal block Following a merge fix-up, the literal block is introduced too early; this patch merges the localhost mention with the introduction, fixing Documentation/filesystems/orangefs.rst:124: WARNING: Literal block expected; none found. 
Signed-off-by: Stephen Kitt Link: https://lore.kernel.org/r/20200424153515.134500-1-steve@sk2.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/orangefs.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Documentation/filesystems/orangefs.rst b/Documentation/filesystems/orangefs.rst index e41369709c5b..463e37694250 100644 --- a/Documentation/filesystems/orangefs.rst +++ b/Documentation/filesystems/orangefs.rst @@ -119,9 +119,7 @@ it comes to that question:: /opt/ofs/bin/pvfs2-genconfig /etc/pvfs2.conf -Create an /etc/pvfs2tab file:: - -Localhost is fine for your pvfs2tab file: +Create an /etc/pvfs2tab file (localhost is fine):: echo tcp://localhost:3334/orangefs /pvfsmnt pvfs2 defaults,noauto 0 0 > \ /etc/pvfs2tab -- cgit v1.2.3 From 2ad9a844fc83282d7f4e71baa82ffa2c0ae88c63 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 24 Apr 2020 17:26:37 +0200 Subject: docs: virt/kvm: close inline string literal This fixes Documentation/virt/kvm/amd-memory-encryption.rst:76: WARNING: Inline literal start-string without end-string. Fixes: 2da1ed62d55c ("KVM: SVM: document KVM_MEM_ENCRYPT_OP, let userspace detect if SEV is available") Signed-off-by: Stephen Kitt Link: https://lore.kernel.org/r/20200424152637.120876-1-steve@sk2.org Signed-off-by: Jonathan Corbet --- Documentation/virt/kvm/amd-memory-encryption.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst index c3129b9ba5cb..57c01f531e61 100644 --- a/Documentation/virt/kvm/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -74,7 +74,7 @@ should point to a file descriptor that is opened on the ``/dev/sev`` device, if needed (see individual commands). On output, ``error`` is zero on success, or an error code. Error codes -are defined in ```. +are defined in ````. 
KVM implements the following commands to support common lifecycle events of SEV guests, such as launching, running, snapshotting, migrating and decommissioning. -- cgit v1.2.3 From 6bc47621cbf357a9b7bab3890489804f38916a35 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Thu, 23 Apr 2020 20:36:49 +0200 Subject: docs: sysctl/kernel: document cad_pid Based on the implementation in kernel/sysctl.c (the proc_do_cad_pid() function), kernel/reboot.c, and include/linux/sched/signal.h. Signed-off-by: Stephen Kitt Link: https://lore.kernel.org/r/20200423183651.15365-1-steve@sk2.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/sysctl/kernel.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 55b24eada13c..82bfd5892663 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -116,6 +116,16 @@ run. The statistics can be seen using ``bpftool``. = =================================== +cad_pid +======= + +This is the pid which will be signalled on reboot (notably, by +Ctrl-Alt-Delete). Writing a value to this file which doesn't +correspond to a running process will result in ``-ESRCH``. + +See also `ctrl-alt-del`_. + + cap_last_cap ============ -- cgit v1.2.3 From 1f5ea8720e8db71d768533bf61773ecdbf9fdcc2 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Wed, 15 Apr 2020 23:16:50 +0200 Subject: doc: nvdimm: remove reference to non-existent CONFIG_NFIT_TEST The test driver is in tools/testing/nvdimm and cannot be selected by a config option. 
Signed-off-by: Michal Suchanek Link: https://lore.kernel.org/r/20200415211654.10827-1-msuchanek@suse.de Signed-off-by: Jonathan Corbet --- Documentation/driver-api/nvdimm/nvdimm.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst index 08f855cbb4e6..79c0fd39f2af 100644 --- a/Documentation/driver-api/nvdimm/nvdimm.rst +++ b/Documentation/driver-api/nvdimm/nvdimm.rst @@ -278,8 +278,8 @@ by a region device with a dynamically assigned id (REGION0 - REGION5). be contiguous in DPA-space. This bus is provided by the kernel under the device - /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and - the nfit_test.ko module is loaded. This not only test LIBNVDIMM but the + /sys/devices/platform/nfit_test.0 when the nfit_test.ko module from + tools/testing/nvdimm is loaded. This not only test LIBNVDIMM but the acpi_nfit.ko driver as well. -- cgit v1.2.3 From a8b380c379efcade6008cb5e2359073b72e73ad8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 16:31:05 +0200 Subject: scripts: sphinx-pre-install: only ask to activate valid venvs If a venv doesn't contain Sphinx, or has an older Sphinx version, ignore it. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/d11a00d88514e8a0357e1b0a05ebd518952a1d39.1587478901.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 53 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index d4dfe1e59989..249edb3932af 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -231,6 +231,27 @@ sub get_sphinx_fname() return ""; } +sub get_sphinx_version($) +{ + my $cmd = shift; + my $ver; + + open IN, "$cmd --version 2>&1 |"; + while () { + if (m/^\s*sphinx-build\s+([\d\.]+)(\+\/[\da-f]+)?$/) { + $ver=$1; + last; + } + # Sphinx 1.2.x uses a different format + if (m/^\s*Sphinx.*\s+([\d\.]+)$/) { + $ver=$1; + last; + } + } + close IN; + return $ver; +} + sub check_sphinx() { my $rec_version; @@ -266,19 +287,8 @@ sub check_sphinx() return; } - open IN, "$sphinx --version 2>&1 |" or die "$sphinx returned an error"; - while () { - if (m/^\s*sphinx-build\s+([\d\.]+)(\+\/[\da-f]+)?$/) { - $cur_version=$1; - last; - } - # Sphinx 1.2.x uses a different format - if (m/^\s*Sphinx.*\s+([\d\.]+)$/) { - $cur_version=$1; - last; - } - } - close IN; + $cur_version = get_sphinx_version($sphinx); + die ("$sphinx returned an error") if (!$cur_version); die "$sphinx didn't return its version" if (!$cur_version); @@ -765,10 +775,21 @@ sub check_needs() my @activates = glob "$ENV{'PWD'}/${virtenv_prefix}*/bin/activate"; @activates = sort {$b cmp $a} @activates; + my ($activate, $ver); + foreach my $f (@activates) { + $activate = $f; + next if ($activate lt $min_activate); + + my $sphinx_cmd = $activate; + $sphinx_cmd =~ s/activate/sphinx-build/; + next if (! -f $sphinx_cmd); - if ($need_sphinx && scalar @activates > 0 && $activates[0] ge $min_activate) { - printf "\nNeed to activate a compatible Sphinx version on virtualenv with:\n"; - printf "\t. 
$activates[0]\n"; + $ver = get_sphinx_version($sphinx_cmd); + last if ($ver ge $min_version); + } + if ($need_sphinx && ($activate ne "")) { + printf "\nNeed to activate Sphinx (version $ver) on virtualenv with:\n"; + printf "\t. $activate\n"; deactivate_help(); exit (1); } else { -- cgit v1.2.3 From 1ef70ced55976368275c468dfd436881d5580111 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 16:31:06 +0200 Subject: scripts: sphinx-pre-install: change the warning for version < 2.4.4 As requested by Jon, change the version check, in order to not emit a warning if version is >= 1.7.9, but below 2.4.4. After this patch, if someone used an older version, it will say: ./scripts/sphinx-pre-install Sphinx version 1.7.9 Note: It is recommended at least Sphinx version 2.4.4 if you need PDF support. Detected OS: Fedora release 31 (Thirty One). To upgrade Sphinx, use: /devel/v4l/docs/sphinx_1.7.9/bin/python3 -m venv sphinx_2.4.4 . sphinx_2.4.4/bin/activate pip install -r ./Documentation/sphinx/requirements.txt If you want to exit the virtualenv, you can use: deactivate All optional dependencies are met. Needed package dependencies are met. 
If Sphinx is not detected at all, it Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/79584d317ba16f5d4f37801c5ee57cf04085f962.1587478901.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 249edb3932af..0d5684c08bbc 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -29,6 +29,8 @@ my $install = ""; my $virtenv_dir = ""; my $python_cmd = ""; my $min_version; +my $rec_version = "1.7.9"; # PDF won't build here +my $min_pdf_version = "2.4.4"; # Min version where pdf builds # # Command line arguments @@ -254,7 +256,7 @@ sub get_sphinx_version($) sub check_sphinx() { - my $rec_version; + my $default_version; my $cur_version; open IN, $conf or die "Can't open $conf"; @@ -271,15 +273,15 @@ sub check_sphinx() open IN, $requirement_file or die "Can't open $requirement_file"; while () { if (m/^\s*Sphinx\s*==\s*([\d\.]+)$/) { - $rec_version=$1; + $default_version=$1; last; } } close IN; - die "Can't get recommended sphinx version from $requirement_file" if (!$min_version); + die "Can't get default sphinx version from $requirement_file" if (!$default_version); - $virtenv_dir = $virtenv_prefix . $rec_version; + $virtenv_dir = $virtenv_prefix . $default_version; my $sphinx = get_sphinx_fname(); if ($sphinx eq "") { @@ -294,7 +296,7 @@ sub check_sphinx() if ($cur_version lt $min_version) { printf "ERROR: Sphinx version is %s. 
It should be >= %s (recommended >= %s)\n", - $cur_version, $min_version, $rec_version;; + $cur_version, $min_version, $default_version; $need_sphinx = 1; return; } @@ -302,6 +304,13 @@ sub check_sphinx() if ($cur_version lt $rec_version) { printf "Sphinx version %s\n", $cur_version; print "Warning: It is recommended at least Sphinx version $rec_version.\n"; + print " If you want pdf, you need at least $min_pdf_version.\n"; + $rec_sphinx_upgrade = 1; + return; + } + if ($cur_version lt $min_pdf_version) { + printf "Sphinx version %s\n", $cur_version; + print "Note: It is recommended at least Sphinx version $min_pdf_version if you need PDF support.\n"; $rec_sphinx_upgrade = 1; return; } -- cgit v1.2.3 From 2834a7412bb1a74722231d7198518f9456279156 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 16:31:07 +0200 Subject: scripts: sphinx-pre-install: change recommendation text if venv exists If one is running a Sphinx version older than what's recommended, but there's already a newer working virtual env, change the text, as it is just a matter of switching to the new venv, instead of creating a new one from scratch. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/bcf79d0399a1c3444ca938dcdce599c3273980ab.1587478901.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 0d5684c08bbc..938b65d40fc8 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -29,6 +29,7 @@ my $install = ""; my $virtenv_dir = ""; my $python_cmd = ""; my $min_version; +my $cur_version; my $rec_version = "1.7.9"; # PDF won't build here my $min_pdf_version = "2.4.4"; # Min version where pdf builds @@ -257,7 +258,6 @@ sub get_sphinx_version($) sub check_sphinx() { my $default_version; - my $cur_version; open IN, $conf or die "Can't open $conf"; while () { @@ -703,8 +703,6 @@ sub check_needs() print "Unknown OS\n\n"; } - print "To upgrade Sphinx, use:\n\n" if ($rec_sphinx_upgrade); - # Check python command line, trying first python3 $python_cmd = findprog("python3"); $python_cmd = check_program("python", 0) if (!$python_cmd); @@ -786,24 +784,36 @@ sub check_needs() @activates = sort {$b cmp $a} @activates; my ($activate, $ver); foreach my $f (@activates) { - $activate = $f; - next if ($activate lt $min_activate); + next if ($f lt $min_activate); - my $sphinx_cmd = $activate; + my $sphinx_cmd = $f; $sphinx_cmd =~ s/activate/sphinx-build/; next if (! -f $sphinx_cmd); $ver = get_sphinx_version($sphinx_cmd); - last if ($ver ge $min_version); + if ($need_sphinx && ($ver ge $min_version)) { + $activate = $f; + last; + } elsif ($ver gt $cur_version) { + $activate = $f; + last; + } } - if ($need_sphinx && ($activate ne "")) { - printf "\nNeed to activate Sphinx (version $ver) on virtualenv with:\n"; - printf "\t. 
$activate\n"; - deactivate_help(); - exit (1); + if ($activate ne "") { + if ($need_sphinx) { + printf "\nNeed to activate Sphinx (version $ver) on virtualenv with:\n"; + printf "\t. $activate\n"; + deactivate_help(); + exit (1); + } else { + printf "\nYou may also use a newer Sphinx (version $ver) with:\n"; + printf "\tdeactivate && . $activate\n"; + } } else { my $rec_activate = "$virtenv_dir/bin/activate"; + print "To upgrade Sphinx, use:\n\n" if ($rec_sphinx_upgrade); + if ($need_venv) { printf "\t$python_cmd -m venv $virtenv_dir\n"; } else { -- cgit v1.2.3 From 412b09ddadd34e9be0d9a26e6de252361fafc237 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 16:31:08 +0200 Subject: scripts: sphinx-pre-install: fix a bug when using with venv When python3 creates a venv, it adds python into it! This causes any upgrade recommendation to look like this: /devel/v4l/docs/sphinx_1.7.9/bin/python3 -m venv sphinx_2.4.4 . sphinx_2.4.4/bin/activate pip install -r ./Documentation/sphinx/requirements.txt With is wrong (and it may not work). So, when recomending an upgrade, exclude the venv dir from the search path, and get the system's python. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/aa622ff71bebf6960fc0262fb90e7ebc7a999a02.1587478901.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 938b65d40fc8..987aebf7e3a0 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -148,6 +148,24 @@ sub findprog($) } } +sub find_python_no_venv() +{ + my $prog = shift; + + my $cur_dir = qx(pwd); + $cur_dir =~ s/\s+$//; + + foreach my $dir (split(/:/, $ENV{PATH})) { + next if ($dir =~ m,($cur_dir)/sphinx,); + return "$dir/python3" if(-x "$dir/python3"); + } + foreach my $dir (split(/:/, $ENV{PATH})) { + next if ($dir =~ m,($cur_dir)/sphinx,); + return "$dir/python" if(-x "$dir/python"); + } + return "python"; +} + sub check_program($$) { my $prog = shift; @@ -814,6 +832,8 @@ sub check_needs() print "To upgrade Sphinx, use:\n\n" if ($rec_sphinx_upgrade); + $python_cmd = find_python_no_venv(); + if ($need_venv) { printf "\t$python_cmd -m venv $virtenv_dir\n"; } else { -- cgit v1.2.3 From ec43a27fffd025f718428d9f4f8df22fa765405f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 21 Apr 2020 18:27:58 +0200 Subject: scripts: sphinx-pre-install: change the output order When the script detects the need for an upgrade, it will print either a warning or a note. Let's change a little bit the order where messages will be displayed, in order to make easier for the user to identify the more important messages. It should now be like this: Detected OS: Fedora release 31 (Thirty One). Sphinx version: 1.7.9 Note: It is recommended at least Sphinx version 2.4.4 if you need PDF support. To upgrade Sphinx, use: /usr/bin/python3 -m venv sphinx_2.4.4 . 
sphinx_2.4.4/bin/activate pip install -r ./Documentation/sphinx/requirements.txt If you want to exit the virtualenv, you can use: deactivate All optional dependencies are met. Needed package dependencies are met. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/20200421182758.04e0a53e@coco.lan Signed-off-by: Jonathan Corbet --- scripts/sphinx-pre-install | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 987aebf7e3a0..c680c3efb176 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -320,15 +320,10 @@ sub check_sphinx() } if ($cur_version lt $rec_version) { - printf "Sphinx version %s\n", $cur_version; - print "Warning: It is recommended at least Sphinx version $rec_version.\n"; - print " If you want pdf, you need at least $min_pdf_version.\n"; $rec_sphinx_upgrade = 1; return; } if ($cur_version lt $min_pdf_version) { - printf "Sphinx version %s\n", $cur_version; - print "Note: It is recommended at least Sphinx version $min_pdf_version if you need PDF support.\n"; $rec_sphinx_upgrade = 1; return; } @@ -716,10 +711,11 @@ sub check_needs() check_sphinx(); if ($system_release) { - print "Detected OS: $system_release.\n\n"; + print "Detected OS: $system_release.\n"; } else { - print "Unknown OS\n\n"; + print "Unknown OS\n"; } + printf "Sphinx version: %s\n\n", $cur_version if ($cur_version); # Check python command line, trying first python3 $python_cmd = findprog("python3"); @@ -799,6 +795,13 @@ sub check_needs() my $min_activate = "$ENV{'PWD'}/${virtenv_prefix}${min_version}/bin/activate"; my @activates = glob "$ENV{'PWD'}/${virtenv_prefix}*/bin/activate"; + if ($cur_version lt $rec_version) { + print "Warning: It is recommended at least Sphinx version $rec_version.\n"; + print " If you want pdf, you need at least $min_pdf_version.\n"; + } + if ($cur_version lt $min_pdf_version) { + print "Note: It is recommended at least Sphinx 
version $min_pdf_version if you need PDF support.\n"; + } @activates = sort {$b cmp $a} @activates; my ($activate, $ver); foreach my $f (@activates) { -- cgit v1.2.3 From 67145c23e70b066443a3c0736e74aa2342a013fd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:53 +0200 Subject: docs: filesystems: convert caching/object.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Comment out text ToC for html/pdf output; - Some whitespace fixes and new line breaks; - Adjust the events list to make them look better for html output; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/49026a8ea7e714c2e0f003aa26b975b1025476b7.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/caching/fscache.txt | 2 +- Documentation/filesystems/caching/index.rst | 9 + Documentation/filesystems/caching/object.rst | 313 +++++++++++++++++++++++++ Documentation/filesystems/caching/object.txt | 320 -------------------------- Documentation/filesystems/index.rst | 2 + fs/fscache/object.c | 2 +- 6 files changed, 326 insertions(+), 322 deletions(-) create mode 100644 Documentation/filesystems/caching/index.rst create mode 100644 Documentation/filesystems/caching/object.rst delete mode 100644 Documentation/filesystems/caching/object.txt diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index 50f0a5757f48..071ff50a774d 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt @@ -191,7 +191,7 @@ The cache backend API to FS-Cache can be found in: A description of the internal representations and object state machine can be found in: - Documentation/filesystems/caching/object.txt + Documentation/filesystems/caching/object.rst ======================= diff --git a/Documentation/filesystems/caching/index.rst 
b/Documentation/filesystems/caching/index.rst new file mode 100644 index 000000000000..e5ec95ff0be2 --- /dev/null +++ b/Documentation/filesystems/caching/index.rst @@ -0,0 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Filesystem Caching +================== + +.. toctree:: + :maxdepth: 2 + + object diff --git a/Documentation/filesystems/caching/object.rst b/Documentation/filesystems/caching/object.rst new file mode 100644 index 000000000000..ce0e043ccd33 --- /dev/null +++ b/Documentation/filesystems/caching/object.rst @@ -0,0 +1,313 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================================== +In-Kernel Cache Object Representation and Management +==================================================== + +By: David Howells + +.. Contents: + + (*) Representation + + (*) Object management state machine. + + - Provision of cpu time. + - Locking simplification. + + (*) The set of states. + + (*) The set of events. + + +Representation +============== + +FS-Cache maintains an in-kernel representation of each object that a netfs is +currently interested in. Such objects are represented by the fscache_cookie +struct and are referred to as cookies. + +FS-Cache also maintains a separate in-kernel representation of the objects that +a cache backend is currently actively caching. Such objects are represented by +the fscache_object struct. The cache backends allocate these upon request, and +are expected to embed them in their own representations. These are referred to +as objects. + +There is a 1:N relationship between cookies and objects. A cookie may be +represented by multiple objects - an index may exist in more than one cache - +or even by no objects (it may not be cached). + +Furthermore, both cookies and objects are hierarchical. 
The two hierarchies +correspond, but the cookies tree is a superset of the union of the object trees +of multiple caches:: + + NETFS INDEX TREE : CACHE 1 : CACHE 2 + : : + : +-----------+ : + +----------->| IObject | : + +-----------+ | : +-----------+ : + | ICookie |-------+ : | : + +-----------+ | : | : +-----------+ + | +------------------------------>| IObject | + | : | : +-----------+ + | : V : | + | : +-----------+ : | + V +----------->| IObject | : | + +-----------+ | : +-----------+ : | + | ICookie |-------+ : | : V + +-----------+ | : | : +-----------+ + | +------------------------------>| IObject | + +-----+-----+ : | : +-----------+ + | | : | : | + V | : V : | + +-----------+ | : +-----------+ : | + | ICookie |------------------------->| IObject | : | + +-----------+ | : +-----------+ : | + | V : | : V + | +-----------+ : | : +-----------+ + | | ICookie |-------------------------------->| IObject | + | +-----------+ : | : +-----------+ + V | : V : | + +-----------+ | : +-----------+ : | + | DCookie |------------------------->| DObject | : | + +-----------+ | : +-----------+ : | + | : : | + +-------+-------+ : : | + | | : : | + V V : : V + +-----------+ +-----------+ : : +-----------+ + | DCookie | | DCookie |------------------------>| DObject | + +-----------+ +-----------+ : : +-----------+ + : : + +In the above illustration, ICookie and IObject represent indices and DCookie +and DObject represent data storage objects. Indices may have representation in +multiple caches, but currently, non-index objects may not. Objects of any type +may also be entirely unrepresented. + +As far as the netfs API goes, the netfs is only actually permitted to see +pointers to the cookies. The cookies themselves and any objects attached to +those cookies are hidden from it. + + +Object Management State Machine +=============================== + +Within FS-Cache, each active object is managed by its own individual state +machine. 
The state for an object is kept in the fscache_object struct, in +object->state. A cookie may point to a set of objects that are in different +states. + +Each state has an action associated with it that is invoked when the machine +wakes up in that state. There are four logical sets of states: + + (1) Preparation: states that wait for the parent objects to become ready. The + representations are hierarchical, and it is expected that an object must + be created or accessed with respect to its parent object. + + (2) Initialisation: states that perform lookups in the cache and validate + what's found and that create on disk any missing metadata. + + (3) Normal running: states that allow netfs operations on objects to proceed + and that update the state of objects. + + (4) Termination: states that detach objects from their netfs cookies, that + delete objects from disk, that handle disk and system errors and that free + up in-memory resources. + + +In most cases, transitioning between states is in response to signalled events. +When a state has finished processing, it will usually set the mask of events in +which it is interested (object->event_mask) and relinquish the worker thread. +Then when an event is raised (by calling fscache_raise_event()), if the event +is not masked, the object will be queued for processing (by calling +fscache_enqueue_object()). + + +Provision of CPU Time +--------------------- + +The work to be done by the various states was given CPU time by the threads of +the slow work facility. This was used in preference to the workqueue facility +because: + + (1) Threads may be completely occupied for very long periods of time by a + particular work item. These state actions may be doing sequences of + synchronous, journalled disk accesses (lookup, mkdir, create, setxattr, + getxattr, truncate, unlink, rmdir, rename). + + (2) Threads may do little actual work, but may rather spend a lot of time + sleeping on I/O. 
This means that single-threaded and 1-per-CPU-threaded + workqueues don't necessarily have the right numbers of threads. + + +Locking Simplification +---------------------- + +Because only one worker thread may be operating on any particular object's +state machine at once, this simplifies the locking, particularly with respect +to disconnecting the netfs's representation of a cache object (fscache_cookie) +from the cache backend's representation (fscache_object) - which may be +requested from either end. + + +The Set of States +================= + +The object state machine has a set of states that it can be in. There are +preparation states in which the object sets itself up and waits for its parent +object to transit to a state that allows access to its children: + + (1) State FSCACHE_OBJECT_INIT. + + Initialise the object and wait for the parent object to become active. In + the cache, it is expected that it will not be possible to look an object + up from the parent object, until that parent object itself has been looked + up. + +There are initialisation states in which the object sets itself up and accesses +disk for the object metadata: + + (2) State FSCACHE_OBJECT_LOOKING_UP. + + Look up the object on disk, using the parent as a starting point. + FS-Cache expects the cache backend to probe the cache to see whether this + object is represented there, and if it is, to see if it's valid (coherency + management). + + The cache should call fscache_object_lookup_negative() to indicate lookup + failure for whatever reason, and should call fscache_obtained_object() to + indicate success. + + At the completion of lookup, FS-Cache will let the netfs go ahead with + read operations, no matter whether the file is yet cached. 
If not yet + cached, read operations will be immediately rejected with ENODATA until + the first known page is uncached - as to that point there can be no data + to be read out of the cache for that file that isn't currently also held + in the pagecache. + + (3) State FSCACHE_OBJECT_CREATING. + + Create an object on disk, using the parent as a starting point. This + happens if the lookup failed to find the object, or if the object's + coherency data indicated what's on disk is out of date. In this state, + FS-Cache expects the cache to create + + The cache should call fscache_obtained_object() if creation completes + successfully, fscache_object_lookup_negative() otherwise. + + At the completion of creation, FS-Cache will start processing write + operations the netfs has queued for an object. If creation failed, the + write ops will be transparently discarded, and nothing recorded in the + cache. + +There are some normal running states in which the object spends its time +servicing netfs requests: + + (4) State FSCACHE_OBJECT_AVAILABLE. + + A transient state in which pending operations are started, child objects + are permitted to advance from FSCACHE_OBJECT_INIT state, and temporary + lookup data is freed. + + (5) State FSCACHE_OBJECT_ACTIVE. + + The normal running state. In this state, requests the netfs makes will be + passed on to the cache. + + (6) State FSCACHE_OBJECT_INVALIDATING. + + The object is undergoing invalidation. When the state comes here, it + discards all pending read, write and attribute change operations as it is + going to clear out the cache entirely and reinitialise it. It will then + continue to the FSCACHE_OBJECT_UPDATING state. + + (7) State FSCACHE_OBJECT_UPDATING. + + The state machine comes here to update the object in the cache from the + netfs's records. This involves updating the auxiliary data that is used + to maintain coherency. 
+ +And there are terminal states in which an object cleans itself up, deallocates +memory and potentially deletes stuff from disk: + + (8) State FSCACHE_OBJECT_LC_DYING. + + The object comes here if it is dying because of a lookup or creation + error. This would be due to a disk error or system error of some sort. + Temporary data is cleaned up, and the parent is released. + + (9) State FSCACHE_OBJECT_DYING. + + The object comes here if it is dying due to an error, because its parent + cookie has been relinquished by the netfs or because the cache is being + withdrawn. + + Any child objects waiting on this one are given CPU time so that they too + can destroy themselves. This object waits for all its children to go away + before advancing to the next state. + +(10) State FSCACHE_OBJECT_ABORT_INIT. + + The object comes to this state if it was waiting on its parent in + FSCACHE_OBJECT_INIT, but its parent died. The object will destroy itself + so that the parent may proceed from the FSCACHE_OBJECT_DYING state. + +(11) State FSCACHE_OBJECT_RELEASING. +(12) State FSCACHE_OBJECT_RECYCLING. + + The object comes to one of these two states when dying once it is rid of + all its children, if it is dying because the netfs relinquished its + cookie. In the first state, the cached data is expected to persist, and + in the second it will be deleted. + +(13) State FSCACHE_OBJECT_WITHDRAWING. + + The object transits to this state if the cache decides it wants to + withdraw the object from service, perhaps to make space, but also due to + error or just because the whole cache is being withdrawn. + +(14) State FSCACHE_OBJECT_DEAD. + + The object transits to this state when the in-memory object record is + ready to be deleted. The object processor shouldn't ever see an object in + this state. 
+ + +The Set of Events +----------------- + +There are a number of events that can be raised to an object state machine: + + FSCACHE_OBJECT_EV_UPDATE + The netfs requested that an object be updated. The state machine will ask + the cache backend to update the object, and the cache backend will ask the + netfs for details of the change through its cookie definition ops. + + FSCACHE_OBJECT_EV_CLEARED + This is signalled in two circumstances: + + (a) when an object's last child object is dropped and + + (b) when the last operation outstanding on an object is completed. + + This is used to proceed from the dying state. + + FSCACHE_OBJECT_EV_ERROR + This is signalled when an I/O error occurs during the processing of some + object. + + FSCACHE_OBJECT_EV_RELEASE, FSCACHE_OBJECT_EV_RETIRE + These are signalled when the netfs relinquishes a cookie it was using. + The event selected depends on whether the netfs asks for the backing + object to be retired (deleted) or retained. + + FSCACHE_OBJECT_EV_WITHDRAW + This is signalled when the cache backend wants to withdraw an object. + This means that the object will have to be detached from the netfs's + cookie. + +Because the withdrawing releasing/retiring events are all handled by the object +state machine, it doesn't matter if there's a collision with both ends trying +to sever the connection at the same time. The state machine can just pick +which one it wants to honour, and that effects the other. diff --git a/Documentation/filesystems/caching/object.txt b/Documentation/filesystems/caching/object.txt deleted file mode 100644 index 100ff41127e4..000000000000 --- a/Documentation/filesystems/caching/object.txt +++ /dev/null @@ -1,320 +0,0 @@ - ==================================================== - IN-KERNEL CACHE OBJECT REPRESENTATION AND MANAGEMENT - ==================================================== - -By: David Howells - -Contents: - - (*) Representation - - (*) Object management state machine. - - - Provision of cpu time. 
- - Locking simplification. - - (*) The set of states. - - (*) The set of events. - - -============== -REPRESENTATION -============== - -FS-Cache maintains an in-kernel representation of each object that a netfs is -currently interested in. Such objects are represented by the fscache_cookie -struct and are referred to as cookies. - -FS-Cache also maintains a separate in-kernel representation of the objects that -a cache backend is currently actively caching. Such objects are represented by -the fscache_object struct. The cache backends allocate these upon request, and -are expected to embed them in their own representations. These are referred to -as objects. - -There is a 1:N relationship between cookies and objects. A cookie may be -represented by multiple objects - an index may exist in more than one cache - -or even by no objects (it may not be cached). - -Furthermore, both cookies and objects are hierarchical. The two hierarchies -correspond, but the cookies tree is a superset of the union of the object trees -of multiple caches: - - NETFS INDEX TREE : CACHE 1 : CACHE 2 - : : - : +-----------+ : - +----------->| IObject | : - +-----------+ | : +-----------+ : - | ICookie |-------+ : | : - +-----------+ | : | : +-----------+ - | +------------------------------>| IObject | - | : | : +-----------+ - | : V : | - | : +-----------+ : | - V +----------->| IObject | : | - +-----------+ | : +-----------+ : | - | ICookie |-------+ : | : V - +-----------+ | : | : +-----------+ - | +------------------------------>| IObject | - +-----+-----+ : | : +-----------+ - | | : | : | - V | : V : | - +-----------+ | : +-----------+ : | - | ICookie |------------------------->| IObject | : | - +-----------+ | : +-----------+ : | - | V : | : V - | +-----------+ : | : +-----------+ - | | ICookie |-------------------------------->| IObject | - | +-----------+ : | : +-----------+ - V | : V : | - +-----------+ | : +-----------+ : | - | DCookie |------------------------->| DObject | : | - 
+-----------+ | : +-----------+ : | - | : : | - +-------+-------+ : : | - | | : : | - V V : : V - +-----------+ +-----------+ : : +-----------+ - | DCookie | | DCookie |------------------------>| DObject | - +-----------+ +-----------+ : : +-----------+ - : : - -In the above illustration, ICookie and IObject represent indices and DCookie -and DObject represent data storage objects. Indices may have representation in -multiple caches, but currently, non-index objects may not. Objects of any type -may also be entirely unrepresented. - -As far as the netfs API goes, the netfs is only actually permitted to see -pointers to the cookies. The cookies themselves and any objects attached to -those cookies are hidden from it. - - -=============================== -OBJECT MANAGEMENT STATE MACHINE -=============================== - -Within FS-Cache, each active object is managed by its own individual state -machine. The state for an object is kept in the fscache_object struct, in -object->state. A cookie may point to a set of objects that are in different -states. - -Each state has an action associated with it that is invoked when the machine -wakes up in that state. There are four logical sets of states: - - (1) Preparation: states that wait for the parent objects to become ready. The - representations are hierarchical, and it is expected that an object must - be created or accessed with respect to its parent object. - - (2) Initialisation: states that perform lookups in the cache and validate - what's found and that create on disk any missing metadata. - - (3) Normal running: states that allow netfs operations on objects to proceed - and that update the state of objects. - - (4) Termination: states that detach objects from their netfs cookies, that - delete objects from disk, that handle disk and system errors and that free - up in-memory resources. - - -In most cases, transitioning between states is in response to signalled events. 
-When a state has finished processing, it will usually set the mask of events in -which it is interested (object->event_mask) and relinquish the worker thread. -Then when an event is raised (by calling fscache_raise_event()), if the event -is not masked, the object will be queued for processing (by calling -fscache_enqueue_object()). - - -PROVISION OF CPU TIME ---------------------- - -The work to be done by the various states was given CPU time by the threads of -the slow work facility. This was used in preference to the workqueue facility -because: - - (1) Threads may be completely occupied for very long periods of time by a - particular work item. These state actions may be doing sequences of - synchronous, journalled disk accesses (lookup, mkdir, create, setxattr, - getxattr, truncate, unlink, rmdir, rename). - - (2) Threads may do little actual work, but may rather spend a lot of time - sleeping on I/O. This means that single-threaded and 1-per-CPU-threaded - workqueues don't necessarily have the right numbers of threads. - - -LOCKING SIMPLIFICATION ----------------------- - -Because only one worker thread may be operating on any particular object's -state machine at once, this simplifies the locking, particularly with respect -to disconnecting the netfs's representation of a cache object (fscache_cookie) -from the cache backend's representation (fscache_object) - which may be -requested from either end. - - -================= -THE SET OF STATES -================= - -The object state machine has a set of states that it can be in. There are -preparation states in which the object sets itself up and waits for its parent -object to transit to a state that allows access to its children: - - (1) State FSCACHE_OBJECT_INIT. - - Initialise the object and wait for the parent object to become active. In - the cache, it is expected that it will not be possible to look an object - up from the parent object, until that parent object itself has been looked - up. 
- -There are initialisation states in which the object sets itself up and accesses -disk for the object metadata: - - (2) State FSCACHE_OBJECT_LOOKING_UP. - - Look up the object on disk, using the parent as a starting point. - FS-Cache expects the cache backend to probe the cache to see whether this - object is represented there, and if it is, to see if it's valid (coherency - management). - - The cache should call fscache_object_lookup_negative() to indicate lookup - failure for whatever reason, and should call fscache_obtained_object() to - indicate success. - - At the completion of lookup, FS-Cache will let the netfs go ahead with - read operations, no matter whether the file is yet cached. If not yet - cached, read operations will be immediately rejected with ENODATA until - the first known page is uncached - as to that point there can be no data - to be read out of the cache for that file that isn't currently also held - in the pagecache. - - (3) State FSCACHE_OBJECT_CREATING. - - Create an object on disk, using the parent as a starting point. This - happens if the lookup failed to find the object, or if the object's - coherency data indicated what's on disk is out of date. In this state, - FS-Cache expects the cache to create - - The cache should call fscache_obtained_object() if creation completes - successfully, fscache_object_lookup_negative() otherwise. - - At the completion of creation, FS-Cache will start processing write - operations the netfs has queued for an object. If creation failed, the - write ops will be transparently discarded, and nothing recorded in the - cache. - -There are some normal running states in which the object spends its time -servicing netfs requests: - - (4) State FSCACHE_OBJECT_AVAILABLE. - - A transient state in which pending operations are started, child objects - are permitted to advance from FSCACHE_OBJECT_INIT state, and temporary - lookup data is freed. - - (5) State FSCACHE_OBJECT_ACTIVE. - - The normal running state. 
In this state, requests the netfs makes will be - passed on to the cache. - - (6) State FSCACHE_OBJECT_INVALIDATING. - - The object is undergoing invalidation. When the state comes here, it - discards all pending read, write and attribute change operations as it is - going to clear out the cache entirely and reinitialise it. It will then - continue to the FSCACHE_OBJECT_UPDATING state. - - (7) State FSCACHE_OBJECT_UPDATING. - - The state machine comes here to update the object in the cache from the - netfs's records. This involves updating the auxiliary data that is used - to maintain coherency. - -And there are terminal states in which an object cleans itself up, deallocates -memory and potentially deletes stuff from disk: - - (8) State FSCACHE_OBJECT_LC_DYING. - - The object comes here if it is dying because of a lookup or creation - error. This would be due to a disk error or system error of some sort. - Temporary data is cleaned up, and the parent is released. - - (9) State FSCACHE_OBJECT_DYING. - - The object comes here if it is dying due to an error, because its parent - cookie has been relinquished by the netfs or because the cache is being - withdrawn. - - Any child objects waiting on this one are given CPU time so that they too - can destroy themselves. This object waits for all its children to go away - before advancing to the next state. - -(10) State FSCACHE_OBJECT_ABORT_INIT. - - The object comes to this state if it was waiting on its parent in - FSCACHE_OBJECT_INIT, but its parent died. The object will destroy itself - so that the parent may proceed from the FSCACHE_OBJECT_DYING state. - -(11) State FSCACHE_OBJECT_RELEASING. -(12) State FSCACHE_OBJECT_RECYCLING. - - The object comes to one of these two states when dying once it is rid of - all its children, if it is dying because the netfs relinquished its - cookie. In the first state, the cached data is expected to persist, and - in the second it will be deleted. 
- -(13) State FSCACHE_OBJECT_WITHDRAWING. - - The object transits to this state if the cache decides it wants to - withdraw the object from service, perhaps to make space, but also due to - error or just because the whole cache is being withdrawn. - -(14) State FSCACHE_OBJECT_DEAD. - - The object transits to this state when the in-memory object record is - ready to be deleted. The object processor shouldn't ever see an object in - this state. - - -THE SET OF EVENTS ------------------ - -There are a number of events that can be raised to an object state machine: - - (*) FSCACHE_OBJECT_EV_UPDATE - - The netfs requested that an object be updated. The state machine will ask - the cache backend to update the object, and the cache backend will ask the - netfs for details of the change through its cookie definition ops. - - (*) FSCACHE_OBJECT_EV_CLEARED - - This is signalled in two circumstances: - - (a) when an object's last child object is dropped and - - (b) when the last operation outstanding on an object is completed. - - This is used to proceed from the dying state. - - (*) FSCACHE_OBJECT_EV_ERROR - - This is signalled when an I/O error occurs during the processing of some - object. - - (*) FSCACHE_OBJECT_EV_RELEASE - (*) FSCACHE_OBJECT_EV_RETIRE - - These are signalled when the netfs relinquishes a cookie it was using. - The event selected depends on whether the netfs asks for the backing - object to be retired (deleted) or retained. - - (*) FSCACHE_OBJECT_EV_WITHDRAW - - This is signalled when the cache backend wants to withdraw an object. - This means that the object will have to be detached from the netfs's - cookie. - -Because the withdrawing releasing/retiring events are all handled by the object -state machine, it doesn't matter if there's a collision with both ends trying -to sever the connection at the same time. The state machine can just pick -which one it wants to honour, and that effects the other. 
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e7b46dac7079..773af390c5e5 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -25,6 +25,8 @@ algorithms work. locking directory-locking + caching/index + porting Filesystem support layers diff --git a/fs/fscache/object.c b/fs/fscache/object.c index cfeba839a0f2..efaa003b8323 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -4,7 +4,7 @@ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/object.txt for a description of the + * See Documentation/filesystems/caching/object.rst for a description of the * object state machine and the in-kernel representations. */ -- cgit v1.2.3 From fd299b2a7339ad0b8e2050189e20619cd817ba7e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:54 +0200 Subject: docs: filesystems: convert caching/fscache.txt to ReST format - Add a SPDX header; - Adjust document and section titles; - Comment out text ToC for html/pdf output; - Some whitespace fixes and new line breaks; - Add table markups; - Add it to filesystems/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/e33ec382a53cf10ffcbd802f6de3f384159cddba.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/caching/fscache.rst | 565 ++++++++++++++++++++++++++ Documentation/filesystems/caching/fscache.txt | 448 -------------------- Documentation/filesystems/caching/index.rst | 1 + fs/fscache/Kconfig | 8 +- 4 files changed, 570 insertions(+), 452 deletions(-) create mode 100644 Documentation/filesystems/caching/fscache.rst delete mode 100644 Documentation/filesystems/caching/fscache.txt diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst new file mode 100644 index 000000000000..950602147aa5 --- /dev/null +++ b/Documentation/filesystems/caching/fscache.rst @@ -0,0 +1,565 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +General Filesystem Caching +========================== + +Overview +======== + +This facility is a general purpose cache for network filesystems, though it +could be used for caching other things such as ISO9660 filesystems too. 
+ +FS-Cache mediates between cache backends (such as CacheFS) and network +filesystems:: + + +---------+ + | | +--------------+ + | NFS |--+ | | + | | | +-->| CacheFS | + +---------+ | +----------+ | | /dev/hda5 | + | | | | +--------------+ + +---------+ +-->| | | + | | | |--+ + | AFS |----->| FS-Cache | + | | | |--+ + +---------+ +-->| | | + | | | | +--------------+ + +---------+ | +----------+ | | | + | | | +-->| CacheFiles | + | ISOFS |--+ | /var/cache | + | | +--------------+ + +---------+ + +Or to look at it another way, FS-Cache is a module that provides a caching +facility to a network filesystem such that the cache is transparent to the +user:: + + +---------+ + | | + | Server | + | | + +---------+ + | NETWORK + ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + | + | +----------+ + V | | + +---------+ | | + | | | | + | NFS |----->| FS-Cache | + | | | |--+ + +---------+ | | | +--------------+ +--------------+ + | | | | | | | | + V +----------+ +-->| CacheFiles |-->| Ext3 | + +---------+ | /var/cache | | /dev/sda6 | + | | +--------------+ +--------------+ + | VFS | ^ ^ + | | | | + +---------+ +--------------+ | + | KERNEL SPACE | | + ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~ + | USER SPACE | | + V | | + +---------+ +--------------+ + | | | | + | Process | | cachefilesd | + | | | | + +---------+ +--------------+ + + +FS-Cache does not follow the idea of completely loading every netfs file +opened in its entirety into a cache before permitting it to be accessed and +then serving the pages out of that cache rather than the netfs inode because: + + (1) It must be practical to operate without a cache. + + (2) The size of any accessible file must not be limited to the size of the + cache. + + (3) The combined size of all opened files (this includes mapped libraries) + must not be limited to the size of the cache. 
+ + (4) The user should not be forced to download an entire file just to do a + one-off access of a small portion of it (such as might be done with the + "file" program). + +It instead serves the cache out in PAGE_SIZE chunks as and when requested by +the netfs('s) using it. + + +FS-Cache provides the following facilities: + + (1) More than one cache can be used at once. Caches can be selected + explicitly by use of tags. + + (2) Caches can be added / removed at any time. + + (3) The netfs is provided with an interface that allows either party to + withdraw caching facilities from a file (required for (2)). + + (4) The interface to the netfs returns as few errors as possible, preferring + rather to let the netfs remain oblivious. + + (5) Cookies are used to represent indices, files and other objects to the + netfs. The simplest cookie is just a NULL pointer - indicating nothing + cached there. + + (6) The netfs is allowed to propose - dynamically - any index hierarchy it + desires, though it must be aware that the index search function is + recursive, stack space is limited, and indices can only be children of + indices. + + (7) Data I/O is done direct to and from the netfs's pages. The netfs + indicates that page A is at index B of the data-file represented by cookie + C, and that it should be read or written. The cache backend may or may + not start I/O on that page, but if it does, a netfs callback will be + invoked to indicate completion. The I/O may be either synchronous or + asynchronous. + + (8) Cookies can be "retired" upon release. At this point FS-Cache will mark + them as obsolete and the index hierarchy rooted at that point will get + recycled. + + (9) The netfs provides a "match" function for index searches. In addition to + saying whether a match was made or not, this can also specify that an + entry should be updated or deleted. + +(10) As much as possible is done asynchronously. 
+ + +FS-Cache maintains a virtual indexing tree in which all indices, files, objects +and pages are kept. Bits of this tree may actually reside in one or more +caches:: + + FSDEF + | + +------------------------------------+ + | | + NFS AFS + | | + +--------------------------+ +-----------+ + | | | | + homedir mirror afs.org redhat.com + | | | + +------------+ +---------------+ +----------+ + | | | | | | + 00001 00002 00007 00125 vol00001 vol00002 + | | | | | + +---+---+ +-----+ +---+ +------+------+ +-----+----+ + | | | | | | | | | | | | | + PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak + | | + PG0 +-------+ + | | + 00001 00003 + | + +---+---+ + | | | + PG0 PG1 PG2 + +In the example above, you can see two netfs's being backed: NFS and AFS. These +have different index hierarchies: + + * The NFS primary index contains per-server indices. Each server index is + indexed by NFS file handles to get data file objects. Each data file + objects can have an array of pages, but may also have further child + objects, such as extended attributes and directory entries. Extended + attribute objects themselves have page-array contents. + + * The AFS primary index contains per-cell indices. Each cell index contains + per-logical-volume indices. Each of volume index contains up to three + indices for the read-write, read-only and backup mirrors of those volumes. + Each of these contains vnode data file objects, each of which contains an + array of pages. + +The very top index is the FS-Cache master index in which individual netfs's +have entries. + +Any index object may reside in more than one cache, provided it only has index +children. Any index with non-index object children will be assumed to only +reside in one cache. 
+ + +The netfs API to FS-Cache can be found in: + + Documentation/filesystems/caching/netfs-api.txt + +The cache backend API to FS-Cache can be found in: + + Documentation/filesystems/caching/backend-api.txt + +A description of the internal representations and object state machine can be +found in: + + Documentation/filesystems/caching/object.rst + + +Statistical Information +======================= + +If FS-Cache is compiled with the following options enabled:: + + CONFIG_FSCACHE_STATS=y + CONFIG_FSCACHE_HISTOGRAM=y + +then it will gather certain statistics and display them through a number of +proc files. + +/proc/fs/fscache/stats +---------------------- + + This shows counts of a number of events that can happen in FS-Cache: + ++--------------+-------+-------------------------------------------------------+ +|CLASS |EVENT |MEANING | ++==============+=======+=======================================================+ +|Cookies |idx=N |Number of index cookies allocated | ++ +-------+-------------------------------------------------------+ +| |dat=N |Number of data storage cookies allocated | ++ +-------+-------------------------------------------------------+ +| |spc=N |Number of special cookies allocated | ++--------------+-------+-------------------------------------------------------+ +|Objects |alc=N |Number of objects allocated | ++ +-------+-------------------------------------------------------+ +| |nal=N |Number of object allocation failures | ++ +-------+-------------------------------------------------------+ +| |avl=N |Number of objects that reached the available state | ++ +-------+-------------------------------------------------------+ +| |ded=N |Number of objects that reached the dead state | ++--------------+-------+-------------------------------------------------------+ +|ChkAux |non=N |Number of objects that didn't have a coherency check | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of objects that passed a 
coherency check | ++ +-------+-------------------------------------------------------+ +| |upd=N |Number of objects that needed a coherency data update | ++ +-------+-------------------------------------------------------+ +| |obs=N |Number of objects that were declared obsolete | ++--------------+-------+-------------------------------------------------------+ +|Pages |mrk=N |Number of pages marked as being cached | +| |unc=N |Number of uncache page requests seen | ++--------------+-------+-------------------------------------------------------+ +|Acquire |n=N |Number of acquire cookie requests seen | ++ +-------+-------------------------------------------------------+ +| |nul=N |Number of acq reqs given a NULL parent | ++ +-------+-------------------------------------------------------+ +| |noc=N |Number of acq reqs rejected due to no cache available | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of acq reqs succeeded | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of acq reqs rejected due to error | ++ +-------+-------------------------------------------------------+ +| |oom=N |Number of acq reqs failed on ENOMEM | ++--------------+-------+-------------------------------------------------------+ +|Lookups |n=N |Number of lookup calls made on cache backends | ++ +-------+-------------------------------------------------------+ +| |neg=N |Number of negative lookups made | ++ +-------+-------------------------------------------------------+ +| |pos=N |Number of positive lookups made | ++ +-------+-------------------------------------------------------+ +| |crt=N |Number of objects created by lookup | ++ +-------+-------------------------------------------------------+ +| |tmo=N |Number of lookups timed out and requeued | ++--------------+-------+-------------------------------------------------------+ +|Updates |n=N |Number of update cookie requests seen | ++ 
+-------+-------------------------------------------------------+ +| |nul=N |Number of upd reqs given a NULL parent | ++ +-------+-------------------------------------------------------+ +| |run=N |Number of upd reqs granted CPU time | ++--------------+-------+-------------------------------------------------------+ +|Relinqs |n=N |Number of relinquish cookie requests seen | ++ +-------+-------------------------------------------------------+ +| |nul=N |Number of rlq reqs given a NULL parent | ++ +-------+-------------------------------------------------------+ +| |wcr=N |Number of rlq reqs waited on completion of creation | ++--------------+-------+-------------------------------------------------------+ +|AttrChg |n=N |Number of attribute changed requests seen | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of attr changed requests queued | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of attr changed rejected -ENOBUFS | ++ +-------+-------------------------------------------------------+ +| |oom=N |Number of attr changed failed -ENOMEM | ++ +-------+-------------------------------------------------------+ +| |run=N |Number of attr changed ops given CPU time | ++--------------+-------+-------------------------------------------------------+ +|Allocs |n=N |Number of allocation requests seen | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of successful alloc reqs | ++ +-------+-------------------------------------------------------+ +| |wt=N |Number of alloc reqs that waited on lookup completion | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of alloc reqs rejected -ENOBUFS | ++ +-------+-------------------------------------------------------+ +| |int=N |Number of alloc reqs aborted -ERESTARTSYS | ++ +-------+-------------------------------------------------------+ +| |ops=N |Number of alloc reqs submitted 
| ++ +-------+-------------------------------------------------------+ +| |owt=N |Number of alloc reqs waited for CPU time | ++ +-------+-------------------------------------------------------+ +| |abt=N |Number of alloc reqs aborted due to object death | ++--------------+-------+-------------------------------------------------------+ +|Retrvls |n=N |Number of retrieval (read) requests seen | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of successful retr reqs | ++ +-------+-------------------------------------------------------+ +| |wt=N |Number of retr reqs that waited on lookup completion | ++ +-------+-------------------------------------------------------+ +| |nod=N |Number of retr reqs returned -ENODATA | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of retr reqs rejected -ENOBUFS | ++ +-------+-------------------------------------------------------+ +| |int=N |Number of retr reqs aborted -ERESTARTSYS | ++ +-------+-------------------------------------------------------+ +| |oom=N |Number of retr reqs failed -ENOMEM | ++ +-------+-------------------------------------------------------+ +| |ops=N |Number of retr reqs submitted | ++ +-------+-------------------------------------------------------+ +| |owt=N |Number of retr reqs waited for CPU time | ++ +-------+-------------------------------------------------------+ +| |abt=N |Number of retr reqs aborted due to object death | ++--------------+-------+-------------------------------------------------------+ +|Stores |n=N |Number of storage (write) requests seen | ++ +-------+-------------------------------------------------------+ +| |ok=N |Number of successful store reqs | ++ +-------+-------------------------------------------------------+ +| |agn=N |Number of store reqs on a page already pending storage | ++ +-------+-------------------------------------------------------+ +| |nbf=N |Number of store reqs rejected -ENOBUFS | ++ 
+-------+-------------------------------------------------------+ +| |oom=N |Number of store reqs failed -ENOMEM | ++ +-------+-------------------------------------------------------+ +| |ops=N |Number of store reqs submitted | ++ +-------+-------------------------------------------------------+ +| |run=N |Number of store reqs granted CPU time | ++ +-------+-------------------------------------------------------+ +| |pgs=N |Number of pages given store req processing time | ++ +-------+-------------------------------------------------------+ +| |rxd=N |Number of store reqs deleted from tracking tree | ++ +-------+-------------------------------------------------------+ +| |olm=N |Number of store reqs over store limit | ++--------------+-------+-------------------------------------------------------+ +|VmScan |nos=N |Number of release reqs against pages with no | +| | |pending store | ++ +-------+-------------------------------------------------------+ +| |gon=N |Number of release reqs against pages stored by | +| | |time lock granted | ++ +-------+-------------------------------------------------------+ +| |bsy=N |Number of release reqs ignored due to in-progress store| ++ +-------+-------------------------------------------------------+ +| |can=N |Number of page stores cancelled due to release req | ++--------------+-------+-------------------------------------------------------+ +|Ops |pend=N |Number of times async ops added to pending queues | ++ +-------+-------------------------------------------------------+ +| |run=N |Number of times async ops given CPU time | ++ +-------+-------------------------------------------------------+ +| |enq=N |Number of times async ops queued for processing | ++ +-------+-------------------------------------------------------+ +| |can=N |Number of async ops cancelled | ++ +-------+-------------------------------------------------------+ +| |rej=N |Number of async ops rejected due to object | +| | |lookup/create failure | ++ 
+-------+-------------------------------------------------------+ +| |ini=N |Number of async ops initialised | ++ +-------+-------------------------------------------------------+ +| |dfr=N |Number of async ops queued for deferred release | ++ +-------+-------------------------------------------------------+ +| |rel=N |Number of async ops released | +| | |(should equal ini=N when idle) | ++ +-------+-------------------------------------------------------+ +| |gc=N |Number of deferred-release async ops garbage collected | ++--------------+-------+-------------------------------------------------------+ +|CacheOp |alo=N |Number of in-progress alloc_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |luo=N |Number of in-progress lookup_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |luc=N |Number of in-progress lookup_complete() cache ops | ++ +-------+-------------------------------------------------------+ +| |gro=N |Number of in-progress grab_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |upo=N |Number of in-progress update_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |dro=N |Number of in-progress drop_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |pto=N |Number of in-progress put_object() cache ops | ++ +-------+-------------------------------------------------------+ +| |syn=N |Number of in-progress sync_cache() cache ops | ++ +-------+-------------------------------------------------------+ +| |atc=N |Number of in-progress attr_changed() cache ops | ++ +-------+-------------------------------------------------------+ +| |rap=N |Number of in-progress read_or_alloc_page() cache ops | ++ +-------+-------------------------------------------------------+ +| |ras=N |Number of in-progress read_or_alloc_pages() cache ops | ++ 
+-------+-------------------------------------------------------+ +| |alp=N |Number of in-progress allocate_page() cache ops | ++ +-------+-------------------------------------------------------+ +| |als=N |Number of in-progress allocate_pages() cache ops | ++ +-------+-------------------------------------------------------+ +| |wrp=N |Number of in-progress write_page() cache ops | ++ +-------+-------------------------------------------------------+ +| |ucp=N |Number of in-progress uncache_page() cache ops | ++ +-------+-------------------------------------------------------+ +| |dsp=N |Number of in-progress dissociate_pages() cache ops | ++--------------+-------+-------------------------------------------------------+ +|CacheEv |nsp=N |Number of object lookups/creations rejected due to | +| | |lack of space | ++ +-------+-------------------------------------------------------+ +| |stl=N |Number of stale objects deleted | ++ +-------+-------------------------------------------------------+ +| |rtr=N |Number of objects retired when relinquished | ++ +-------+-------------------------------------------------------+ +| |cul=N |Number of objects culled | ++--------------+-------+-------------------------------------------------------+ + + + +/proc/fs/fscache/histogram +-------------------------- + + :: + + cat /proc/fs/fscache/histogram + JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS + ===== ===== ========= ========= ========= ========= ========= + + This shows the breakdown of the number of times each amount of time + between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. 
The + columns are as follows: + + ========= ======================================================= + COLUMN TIME MEASUREMENT + ========= ======================================================= + OBJ INST Length of time to instantiate an object + OP RUNS Length of time a call to process an operation took + OBJ RUNS Length of time a call to process an object event took + RETRV DLY Time between requesting a read and lookup completing + RETRIEVLS Time between beginning and end of a retrieval + ========= ======================================================= + + Each row shows the number of events that took a particular range of times. + Each step is 1 jiffy in size. The JIFS column indicates the particular + jiffy range covered, and the SECS field the equivalent number of seconds. + + + +Object List +=========== + +If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a +list of all the objects currently allocated and allow them to be viewed +through:: + + /proc/fs/fscache/objects + +This will look something like:: + + [root@andromeda ~]# head /proc/fs/fscache/objects + OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA + ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================ + 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a + 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a + +where the first set of columns before the '|' describe the object: + + ======= =============================================================== + COLUMN DESCRIPTION + ======= 
=============================================================== + OBJECT Object debugging ID (appears as OBJ%x in some debug messages) + PARENT Debugging ID of parent object + STAT Object state + CHLDN Number of child objects of this object + OPS Number of outstanding operations on this object + OOP Number of outstanding child object management operations + IPR + EX Number of outstanding exclusive operations + READS Number of outstanding read operations + EM Object's event mask + EV Events raised on this object + F Object flags + S Object work item busy state mask (1:pending 2:running) + ======= =============================================================== + +and the second set of columns describe the object's cookie, if present: + + ================ ====================================================== + COLUMN DESCRIPTION + ================ ====================================================== + NETFS_COOKIE_DEF Name of netfs cookie definition + TY Cookie type (IX - index, DT - data, hex - special) + FL Cookie flags + NETFS_DATA Netfs private data stored in the cookie + OBJECT_KEY Object key } 1 column, with separating comma + AUX_DATA Object aux data } presence may be configured + ================ ====================================================== + +The data shown may be filtered by attaching a key to an appropriate keyring +before viewing the file. 
Something like:: + + keyctl add user fscache:objlist <restrictions> @s + +where <restrictions> are a selection of the following letters: + + == ========================================================= + K Show hexdump of object key (don't show if not given) + A Show hexdump of object aux data (don't show if not given) + == ========================================================= + +and the following paired letters: + + == ========================================================= + C Show objects that have a cookie + c Show objects that don't have a cookie + B Show objects that are busy + b Show objects that aren't busy + W Show objects that have pending writes + w Show objects that don't have pending writes + R Show objects that have outstanding reads + r Show objects that don't have outstanding reads + S Show objects that have work queued + s Show objects that don't have work queued + == ========================================================= + +If neither side of a letter pair is given, then both are implied. For example: + + keyctl add user fscache:objlist KB @s + +shows objects that are busy, and lists their object keys, but does not dump +their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is +not implied. + +By default all objects and all fields will be shown. 
+ + +Debugging +========= + +If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime +debugging enabled by adjusting the value in:: + + /sys/module/fscache/parameters/debug + +This is a bitmask of debugging streams to enable: + + ======= ======= =============================== ======================= + BIT VALUE STREAM POINT + ======= ======= =============================== ======================= + 0 1 Cache management Function entry trace + 1 2 Function exit trace + 2 4 General + 3 8 Cookie management Function entry trace + 4 16 Function exit trace + 5 32 General + 6 64 Page handling Function entry trace + 7 128 Function exit trace + 8 256 General + 9 512 Operation management Function entry trace + 10 1024 Function exit trace + 11 2048 General + ======= ======= =============================== ======================= + +The appropriate set of values should be OR'd together and the result written to +the control file. For example:: + + echo $((1|8|64)) >/sys/module/fscache/parameters/debug + +will turn on all function entry debugging. diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt deleted file mode 100644 index 071ff50a774d..000000000000 --- a/Documentation/filesystems/caching/fscache.txt +++ /dev/null @@ -1,448 +0,0 @@ - ========================== - General Filesystem Caching - ========================== - -======== -OVERVIEW -======== - -This facility is a general purpose cache for network filesystems, though it -could be used for caching other things such as ISO9660 filesystems too. 
- -FS-Cache mediates between cache backends (such as CacheFS) and network -filesystems: - - +---------+ - | | +--------------+ - | NFS |--+ | | - | | | +-->| CacheFS | - +---------+ | +----------+ | | /dev/hda5 | - | | | | +--------------+ - +---------+ +-->| | | - | | | |--+ - | AFS |----->| FS-Cache | - | | | |--+ - +---------+ +-->| | | - | | | | +--------------+ - +---------+ | +----------+ | | | - | | | +-->| CacheFiles | - | ISOFS |--+ | /var/cache | - | | +--------------+ - +---------+ - -Or to look at it another way, FS-Cache is a module that provides a caching -facility to a network filesystem such that the cache is transparent to the -user: - - +---------+ - | | - | Server | - | | - +---------+ - | NETWORK - ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - | - | +----------+ - V | | - +---------+ | | - | | | | - | NFS |----->| FS-Cache | - | | | |--+ - +---------+ | | | +--------------+ +--------------+ - | | | | | | | | - V +----------+ +-->| CacheFiles |-->| Ext3 | - +---------+ | /var/cache | | /dev/sda6 | - | | +--------------+ +--------------+ - | VFS | ^ ^ - | | | | - +---------+ +--------------+ | - | KERNEL SPACE | | - ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~ - | USER SPACE | | - V | | - +---------+ +--------------+ - | | | | - | Process | | cachefilesd | - | | | | - +---------+ +--------------+ - - -FS-Cache does not follow the idea of completely loading every netfs file -opened in its entirety into a cache before permitting it to be accessed and -then serving the pages out of that cache rather than the netfs inode because: - - (1) It must be practical to operate without a cache. - - (2) The size of any accessible file must not be limited to the size of the - cache. - - (3) The combined size of all opened files (this includes mapped libraries) - must not be limited to the size of the cache. 
- - (4) The user should not be forced to download an entire file just to do a - one-off access of a small portion of it (such as might be done with the - "file" program). - -It instead serves the cache out in PAGE_SIZE chunks as and when requested by -the netfs('s) using it. - - -FS-Cache provides the following facilities: - - (1) More than one cache can be used at once. Caches can be selected - explicitly by use of tags. - - (2) Caches can be added / removed at any time. - - (3) The netfs is provided with an interface that allows either party to - withdraw caching facilities from a file (required for (2)). - - (4) The interface to the netfs returns as few errors as possible, preferring - rather to let the netfs remain oblivious. - - (5) Cookies are used to represent indices, files and other objects to the - netfs. The simplest cookie is just a NULL pointer - indicating nothing - cached there. - - (6) The netfs is allowed to propose - dynamically - any index hierarchy it - desires, though it must be aware that the index search function is - recursive, stack space is limited, and indices can only be children of - indices. - - (7) Data I/O is done direct to and from the netfs's pages. The netfs - indicates that page A is at index B of the data-file represented by cookie - C, and that it should be read or written. The cache backend may or may - not start I/O on that page, but if it does, a netfs callback will be - invoked to indicate completion. The I/O may be either synchronous or - asynchronous. - - (8) Cookies can be "retired" upon release. At this point FS-Cache will mark - them as obsolete and the index hierarchy rooted at that point will get - recycled. - - (9) The netfs provides a "match" function for index searches. In addition to - saying whether a match was made or not, this can also specify that an - entry should be updated or deleted. - -(10) As much as possible is done asynchronously. 
- - -FS-Cache maintains a virtual indexing tree in which all indices, files, objects -and pages are kept. Bits of this tree may actually reside in one or more -caches. - - FSDEF - | - +------------------------------------+ - | | - NFS AFS - | | - +--------------------------+ +-----------+ - | | | | - homedir mirror afs.org redhat.com - | | | - +------------+ +---------------+ +----------+ - | | | | | | - 00001 00002 00007 00125 vol00001 vol00002 - | | | | | - +---+---+ +-----+ +---+ +------+------+ +-----+----+ - | | | | | | | | | | | | | -PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak - | | - PG0 +-------+ - | | - 00001 00003 - | - +---+---+ - | | | - PG0 PG1 PG2 - -In the example above, you can see two netfs's being backed: NFS and AFS. These -have different index hierarchies: - - (*) The NFS primary index contains per-server indices. Each server index is - indexed by NFS file handles to get data file objects. Each data file - objects can have an array of pages, but may also have further child - objects, such as extended attributes and directory entries. Extended - attribute objects themselves have page-array contents. - - (*) The AFS primary index contains per-cell indices. Each cell index contains - per-logical-volume indices. Each of volume index contains up to three - indices for the read-write, read-only and backup mirrors of those volumes. - Each of these contains vnode data file objects, each of which contains an - array of pages. - -The very top index is the FS-Cache master index in which individual netfs's -have entries. - -Any index object may reside in more than one cache, provided it only has index -children. Any index with non-index object children will be assumed to only -reside in one cache. 
- - -The netfs API to FS-Cache can be found in: - - Documentation/filesystems/caching/netfs-api.txt - -The cache backend API to FS-Cache can be found in: - - Documentation/filesystems/caching/backend-api.txt - -A description of the internal representations and object state machine can be -found in: - - Documentation/filesystems/caching/object.rst - - -======================= -STATISTICAL INFORMATION -======================= - -If FS-Cache is compiled with the following options enabled: - - CONFIG_FSCACHE_STATS=y - CONFIG_FSCACHE_HISTOGRAM=y - -then it will gather certain statistics and display them through a number of -proc files. - - (*) /proc/fs/fscache/stats - - This shows counts of a number of events that can happen in FS-Cache: - - CLASS EVENT MEANING - ======= ======= ======================================================= - Cookies idx=N Number of index cookies allocated - dat=N Number of data storage cookies allocated - spc=N Number of special cookies allocated - Objects alc=N Number of objects allocated - nal=N Number of object allocation failures - avl=N Number of objects that reached the available state - ded=N Number of objects that reached the dead state - ChkAux non=N Number of objects that didn't have a coherency check - ok=N Number of objects that passed a coherency check - upd=N Number of objects that needed a coherency data update - obs=N Number of objects that were declared obsolete - Pages mrk=N Number of pages marked as being cached - unc=N Number of uncache page requests seen - Acquire n=N Number of acquire cookie requests seen - nul=N Number of acq reqs given a NULL parent - noc=N Number of acq reqs rejected due to no cache available - ok=N Number of acq reqs succeeded - nbf=N Number of acq reqs rejected due to error - oom=N Number of acq reqs failed on ENOMEM - Lookups n=N Number of lookup calls made on cache backends - neg=N Number of negative lookups made - pos=N Number of positive lookups made - crt=N Number of objects created by lookup - 
tmo=N Number of lookups timed out and requeued - Updates n=N Number of update cookie requests seen - nul=N Number of upd reqs given a NULL parent - run=N Number of upd reqs granted CPU time - Relinqs n=N Number of relinquish cookie requests seen - nul=N Number of rlq reqs given a NULL parent - wcr=N Number of rlq reqs waited on completion of creation - AttrChg n=N Number of attribute changed requests seen - ok=N Number of attr changed requests queued - nbf=N Number of attr changed rejected -ENOBUFS - oom=N Number of attr changed failed -ENOMEM - run=N Number of attr changed ops given CPU time - Allocs n=N Number of allocation requests seen - ok=N Number of successful alloc reqs - wt=N Number of alloc reqs that waited on lookup completion - nbf=N Number of alloc reqs rejected -ENOBUFS - int=N Number of alloc reqs aborted -ERESTARTSYS - ops=N Number of alloc reqs submitted - owt=N Number of alloc reqs waited for CPU time - abt=N Number of alloc reqs aborted due to object death - Retrvls n=N Number of retrieval (read) requests seen - ok=N Number of successful retr reqs - wt=N Number of retr reqs that waited on lookup completion - nod=N Number of retr reqs returned -ENODATA - nbf=N Number of retr reqs rejected -ENOBUFS - int=N Number of retr reqs aborted -ERESTARTSYS - oom=N Number of retr reqs failed -ENOMEM - ops=N Number of retr reqs submitted - owt=N Number of retr reqs waited for CPU time - abt=N Number of retr reqs aborted due to object death - Stores n=N Number of storage (write) requests seen - ok=N Number of successful store reqs - agn=N Number of store reqs on a page already pending storage - nbf=N Number of store reqs rejected -ENOBUFS - oom=N Number of store reqs failed -ENOMEM - ops=N Number of store reqs submitted - run=N Number of store reqs granted CPU time - pgs=N Number of pages given store req processing time - rxd=N Number of store reqs deleted from tracking tree - olm=N Number of store reqs over store limit - VmScan nos=N Number of release reqs 
against pages with no pending store - gon=N Number of release reqs against pages stored by time lock granted - bsy=N Number of release reqs ignored due to in-progress store - can=N Number of page stores cancelled due to release req - Ops pend=N Number of times async ops added to pending queues - run=N Number of times async ops given CPU time - enq=N Number of times async ops queued for processing - can=N Number of async ops cancelled - rej=N Number of async ops rejected due to object lookup/create failure - ini=N Number of async ops initialised - dfr=N Number of async ops queued for deferred release - rel=N Number of async ops released (should equal ini=N when idle) - gc=N Number of deferred-release async ops garbage collected - CacheOp alo=N Number of in-progress alloc_object() cache ops - luo=N Number of in-progress lookup_object() cache ops - luc=N Number of in-progress lookup_complete() cache ops - gro=N Number of in-progress grab_object() cache ops - upo=N Number of in-progress update_object() cache ops - dro=N Number of in-progress drop_object() cache ops - pto=N Number of in-progress put_object() cache ops - syn=N Number of in-progress sync_cache() cache ops - atc=N Number of in-progress attr_changed() cache ops - rap=N Number of in-progress read_or_alloc_page() cache ops - ras=N Number of in-progress read_or_alloc_pages() cache ops - alp=N Number of in-progress allocate_page() cache ops - als=N Number of in-progress allocate_pages() cache ops - wrp=N Number of in-progress write_page() cache ops - ucp=N Number of in-progress uncache_page() cache ops - dsp=N Number of in-progress dissociate_pages() cache ops - CacheEv nsp=N Number of object lookups/creations rejected due to lack of space - stl=N Number of stale objects deleted - rtr=N Number of objects retired when relinquished - cul=N Number of objects culled - - - (*) /proc/fs/fscache/histogram - - cat /proc/fs/fscache/histogram - JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS - ===== ===== 
========= ========= ========= ========= ========= - - This shows the breakdown of the number of times each amount of time - between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The - columns are as follows: - - COLUMN TIME MEASUREMENT - ======= ======================================================= - OBJ INST Length of time to instantiate an object - OP RUNS Length of time a call to process an operation took - OBJ RUNS Length of time a call to process an object event took - RETRV DLY Time between an requesting a read and lookup completing - RETRIEVLS Time between beginning and end of a retrieval - - Each row shows the number of events that took a particular range of times. - Each step is 1 jiffy in size. The JIFS column indicates the particular - jiffy range covered, and the SECS field the equivalent number of seconds. - - -=========== -OBJECT LIST -=========== - -If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a -list of all the objects currently allocated and allow them to be viewed -through: - - /proc/fs/fscache/objects - -This will look something like: - - [root@andromeda ~]# head /proc/fs/fscache/objects - OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA - ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================ - 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a - 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a - -where the first set of columns before the '|' describe the object: - - COLUMN DESCRIPTION - ======= 
=============================================================== - OBJECT Object debugging ID (appears as OBJ%x in some debug messages) - PARENT Debugging ID of parent object - STAT Object state - CHLDN Number of child objects of this object - OPS Number of outstanding operations on this object - OOP Number of outstanding child object management operations - IPR - EX Number of outstanding exclusive operations - READS Number of outstanding read operations - EM Object's event mask - EV Events raised on this object - F Object flags - S Object work item busy state mask (1:pending 2:running) - -and the second set of columns describe the object's cookie, if present: - - COLUMN DESCRIPTION - =============== ======================================================= - NETFS_COOKIE_DEF Name of netfs cookie definition - TY Cookie type (IX - index, DT - data, hex - special) - FL Cookie flags - NETFS_DATA Netfs private data stored in the cookie - OBJECT_KEY Object key } 1 column, with separating comma - AUX_DATA Object aux data } presence may be configured - -The data shown may be filtered by attaching a key to an appropriate keyring -before viewing the file. Something like: - - keyctl add user fscache:objlist <restrictions> @s - -where <restrictions> are a selection of the following letters: - - K Show hexdump of object key (don't show if not given) - A Show hexdump of object aux data (don't show if not given) - -and the following paired letters: - - C Show objects that have a cookie - c Show objects that don't have a cookie - B Show objects that are busy - b Show objects that aren't busy - W Show objects that have pending writes - w Show objects that don't have pending writes - R Show objects that have outstanding reads - r Show objects that don't have outstanding reads - S Show objects that have work queued - s Show objects that don't have work queued - -If neither side of a letter pair is given, then both are implied. 
For example: - - keyctl add user fscache:objlist KB @s - -shows objects that are busy, and lists their object keys, but does not dump -their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is -not implied. - -By default all objects and all fields will be shown. - - -========= -DEBUGGING -========= - -If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime -debugging enabled by adjusting the value in: - - /sys/module/fscache/parameters/debug - -This is a bitmask of debugging streams to enable: - - BIT VALUE STREAM POINT - ======= ======= =============================== ======================= - 0 1 Cache management Function entry trace - 1 2 Function exit trace - 2 4 General - 3 8 Cookie management Function entry trace - 4 16 Function exit trace - 5 32 General - 6 64 Page handling Function entry trace - 7 128 Function exit trace - 8 256 General - 9 512 Operation management Function entry trace - 10 1024 Function exit trace - 11 2048 General - -The appropriate set of values should be OR'd together and the result written to -the control file. For example: - - echo $((1|8|64)) >/sys/module/fscache/parameters/debug - -will turn on all function entry debugging. diff --git a/Documentation/filesystems/caching/index.rst b/Documentation/filesystems/caching/index.rst index e5ec95ff0be2..f488747630aa 100644 --- a/Documentation/filesystems/caching/index.rst +++ b/Documentation/filesystems/caching/index.rst @@ -6,4 +6,5 @@ Filesystem Caching .. toctree:: :maxdepth: 2 + fscache object diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 506c5e643f0d..5e796e6c38e5 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -8,7 +8,7 @@ config FSCACHE Different sorts of caches can be plugged in, depending on the resources available. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. 
config FSCACHE_STATS bool "Gather statistical information on local caching" @@ -25,7 +25,7 @@ config FSCACHE_STATS between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_HISTOGRAM bool "Gather latency information on local caching" @@ -42,7 +42,7 @@ config FSCACHE_HISTOGRAM bouncing between CPUs. On the other hand, the histogram may be useful for debugging purposes. Saying 'N' here is recommended. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_DEBUG bool "Debug FS-Cache" @@ -52,7 +52,7 @@ config FSCACHE_DEBUG management module. If this is set, the debugging output may be enabled by setting bits in /sys/modules/fscache/parameter/debug. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_OBJECT_LIST bool "Maintain global object list for debugging purposes" -- cgit v1.2.3 From efc930fa1d844869eb65586f8cdd6a7837db3607 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:55 +0200 Subject: docs: filesystems: caching/netfs-api.txt: convert it to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/caching/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/cfe4cb1bf8e1f0093d44c30801ec42e74721e543.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/caching/fscache.rst | 2 +- Documentation/filesystems/caching/index.rst | 1 + Documentation/filesystems/caching/netfs-api.rst | 896 +++++++++++++++++++++++ Documentation/filesystems/caching/netfs-api.txt | 910 ------------------------ fs/fscache/cookie.c | 2 +- include/linux/fscache.h | 42 +- 6 files changed, 920 insertions(+), 933 deletions(-) create mode 100644 Documentation/filesystems/caching/netfs-api.rst delete mode 100644 Documentation/filesystems/caching/netfs-api.txt diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst index 950602147aa5..dd1297d884d0 100644 --- a/Documentation/filesystems/caching/fscache.rst +++ b/Documentation/filesystems/caching/fscache.rst @@ -183,7 +183,7 @@ reside in one cache. The netfs API to FS-Cache can be found in: - Documentation/filesystems/caching/netfs-api.txt + Documentation/filesystems/caching/netfs-api.rst The cache backend API to FS-Cache can be found in: diff --git a/Documentation/filesystems/caching/index.rst b/Documentation/filesystems/caching/index.rst index f488747630aa..d0651db450fb 100644 --- a/Documentation/filesystems/caching/index.rst +++ b/Documentation/filesystems/caching/index.rst @@ -8,3 +8,4 @@ Filesystem Caching fscache object + netfs-api diff --git a/Documentation/filesystems/caching/netfs-api.rst b/Documentation/filesystems/caching/netfs-api.rst new file mode 100644 index 000000000000..d9f14b8610ba --- /dev/null +++ b/Documentation/filesystems/caching/netfs-api.rst @@ -0,0 +1,896 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================== +FS-Cache Network Filesystem API +=============================== + +There's an API by which a network filesystem can make use of the FS-Cache +facilities. 
This is based around a number of principles: + + (1) Caches can store a number of different object types. There are two main + object types: indices and files. The first is a special type used by + FS-Cache to make finding objects faster and to make retiring of groups of + objects easier. + + (2) Every index, file or other object is represented by a cookie. This cookie + may or may not have anything associated with it, but the netfs doesn't + need to care. + + (3) Barring the top-level index (one entry per cached netfs), the index + hierarchy for each netfs is structured according to the whim of the netfs. + +This API is declared in <linux/fscache.h>. + +.. This document contains the following sections: + + (1) Network filesystem definition + (2) Index definition + (3) Object definition + (4) Network filesystem (un)registration + (5) Cache tag lookup + (6) Index registration + (7) Data file registration + (8) Miscellaneous object registration + (9) Setting the data file size + (10) Page alloc/read/write + (11) Page uncaching + (12) Index and data file consistency + (13) Cookie enablement + (14) Miscellaneous cookie operations + (15) Cookie unregistration + (16) Index invalidation + (17) Data file invalidation + (18) FS-Cache specific page flags. + + +Network Filesystem Definition +============================= + +FS-Cache needs a description of the network filesystem. This is specified +using a record of the following structure:: + + struct fscache_netfs { + uint32_t version; + const char *name; + struct fscache_cookie *primary_index; + ... + }; + +The first two fields should be filled in before registration, and the third +will be filled in by the registration function; any other fields should just be +ignored and are for internal use only. + +The fields are: + + (1) The name of the netfs (used as the key in the toplevel index). 
+ + (2) The version of the netfs (if the name matches but the version doesn't, the + entire in-cache hierarchy for this netfs will be scrapped and begun + afresh). + + (3) The cookie representing the primary index will be allocated according to + another parameter passed into the registration function. + +For example, kAFS (linux/fs/afs/) uses the following definitions to describe +itself:: + + struct fscache_netfs afs_cache_netfs = { + .version = 0, + .name = "afs", + }; + + +Index Definition +================ + +Indices are used for two purposes: + + (1) To aid the finding of a file based on a series of keys (such as AFS's + "cell", "volume ID", "vnode ID"). + + (2) To make it easier to discard a subset of all the files cached based around + a particular key - for instance to mirror the removal of an AFS volume. + +However, since it's unlikely that any two netfs's are going to want to define +their index hierarchies in quite the same way, FS-Cache tries to impose as few +restraints as possible on how an index is structured and where it is placed in +the tree. The netfs can even mix indices and data files at the same level, but +it's not recommended. + +Each index entry consists of a key of indeterminate length plus some auxiliary +data, also of indeterminate length. + +There are some limits on indices: + + (1) Any index containing non-index objects should be restricted to a single + cache. Any such objects created within an index will be created in the + first cache only. The cache in which an index is created can be + controlled by cache tags (see below). + + (2) The entry data must be atomically journallable, so it is limited to about + 400 bytes at present. At least 400 bytes will be available. + + (3) The depth of the index tree should be judged with care as the search + function is recursive. Too many layers will run the kernel out of stack. 
+ + +Object Definition +================= + +To define an object, a structure of the following type should be filled out:: + + struct fscache_cookie_def + { + uint8_t name[16]; + uint8_t type; + + struct fscache_cache_tag *(*select_cache)( + const void *parent_netfs_data, + const void *cookie_netfs_data); + + enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, + const void *data, + uint16_t datalen, + loff_t object_size); + + void (*get_context)(void *cookie_netfs_data, void *context); + + void (*put_context)(void *cookie_netfs_data, void *context); + + void (*mark_pages_cached)(void *cookie_netfs_data, + struct address_space *mapping, + struct pagevec *cached_pvec); + }; + +This has the following fields: + + (1) The type of the object [mandatory]. + + This is one of the following values: + + FSCACHE_COOKIE_TYPE_INDEX + This defines an index, which is a special FS-Cache type. + + FSCACHE_COOKIE_TYPE_DATAFILE + This defines an ordinary data file. + + Any other value between 2 and 255 + This defines an extraordinary object such as an XATTR. + + (2) The name of the object type (NUL terminated unless all 16 chars are used) + [optional]. + + (3) A function to select the cache in which to store an index [optional]. + + This function is invoked when an index needs to be instantiated in a cache + during the instantiation of a non-index object. Only the immediate index + parent for the non-index object will be queried. Any indices above that + in the hierarchy may be stored in multiple caches. This function does not + need to be supplied for any non-index object or any index that will only + have index children. + + If this function is not supplied or if it returns NULL then the first + cache in the parent's list will be chosen, or failing that, the first + cache in the master list. + + (4) A function to check the auxiliary data [optional]. + + This function will be called to check that a match found in the cache for + this object is valid. 
For instance with AFS it could check the auxiliary + data against the data version number returned by the server to determine + whether the index entry in a cache is still valid. + + If this function is absent, it will be assumed that matching objects in a + cache are always valid. + + The function is also passed the cache's idea of the object size and may + use this to manage coherency also. + + If present, the function should return one of the following values: + + FSCACHE_CHECKAUX_OKAY + - the entry is okay as is + + FSCACHE_CHECKAUX_NEEDS_UPDATE + - the entry requires update + + FSCACHE_CHECKAUX_OBSOLETE + - the entry should be deleted + + This function can also be used to extract data from the auxiliary data in + the cache and copy it into the netfs's structures. + + (5) A pair of functions to manage contexts for the completion callback + [optional]. + + The cache read/write functions are passed a context which is then passed + to the I/O completion callback function. To ensure this context remains + valid until after the I/O completion is called, two functions may be + provided: one to get an extra reference on the context, and one to drop a + reference to it. + + If the context is not used or is a type of object that won't go out of + scope, then these functions are not required. These functions are not + required for indices as indices may not contain data. These functions may + be called in interrupt context and so may not sleep. + + (6) A function to mark a page as retaining cache metadata [optional]. + + This is called by the cache to indicate that it is retaining in-memory + information for this page and that the netfs should uncache the page when + it has finished. This does not indicate whether there's data on the disk + or not. Note that several pages at once may be presented for marking. + + The PG_fscache bit is set on the pages before this function would be + called, so the function need not be provided if this is sufficient. 
+ + This function is not required for indices as they're not permitted data. + + (7) A function to unmark all the pages retaining cache metadata [mandatory]. + + This is called by FS-Cache to indicate that a backing store is being + unbound from a cookie and that all the marks on the pages should be + cleared to prevent confusion. Note that the cache will have torn down all + its tracking information so that the pages don't need to be explicitly + uncached. + + This function is not required for indices as they're not permitted data. + + +Network Filesystem (Un)registration +=================================== + +The first step is to declare the network filesystem to the cache. This also +involves specifying the layout of the primary index (for AFS, this would be the +"cell" level). + +The registration function is:: + + int fscache_register_netfs(struct fscache_netfs *netfs); + +It just takes a pointer to the netfs definition. It returns 0 or an error as +appropriate. + +For kAFS, registration is done as follows:: + + ret = fscache_register_netfs(&afs_cache_netfs); + +The last step is, of course, unregistration:: + + void fscache_unregister_netfs(struct fscache_netfs *netfs); + + +Cache Tag Lookup +================ + +FS-Cache permits the use of more than one cache. To permit particular index +subtrees to be bound to particular caches, the second step is to look up cache +representation tags. This step is optional; it can be left entirely up to +FS-Cache as to which cache should be used. The problem with doing that is that +FS-Cache will always pick the first cache that was registered. + +To get the representation for a named tag:: + + struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name); + +This takes a text string as the name and returns a representation of a tag. It +will never return an error. It may return a dummy tag, however, if it runs out +of memory; this will inhibit caching with this tag. 
+ +Any representation so obtained must be released by passing it to this function:: + + void fscache_release_cache_tag(struct fscache_cache_tag *tag); + +The tag will be retrieved by FS-Cache when it calls the object definition +operation select_cache(). + + +Index Registration +================== + +The third step is to inform FS-Cache about part of an index hierarchy that can +be used to locate files. This is done by requesting a cookie for each index in +the path to the file:: + + struct fscache_cookie * + fscache_acquire_cookie(struct fscache_cookie *parent, + const struct fscache_object_def *def, + const void *index_key, + size_t index_key_len, + const void *aux_data, + size_t aux_data_len, + void *netfs_data, + loff_t object_size, + bool enable); + +This function creates an index entry in the index represented by parent, +filling in the index entry by calling the operations pointed to by def. + +A unique key that represents the object within the parent must be pointed to by +index_key and is of length index_key_len. + +An optional blob of auxiliary data that is to be stored within the cache can be +pointed to with aux_data and should be of length aux_data_len. This would +typically be used for storing coherency data. + +The netfs may pass an arbitrary value in netfs_data and this will be presented +to it in the event of any calling back. This may also be used in tracing or +logging of messages. + +The cache tracks the size of the data attached to an object and this set to be +object_size. For indices, this should be 0. This value will be passed to the +->check_aux() callback. + +Note that this function never returns an error - all errors are handled +internally. It may, however, return NULL to indicate no cookie. It is quite +acceptable to pass this token back to this function as the parent to another +acquisition (or even to the relinquish cookie, read page and write page +functions - see below). 
+ +Note also that no indices are actually created in a cache until a non-index +object needs to be created somewhere down the hierarchy. Furthermore, an index +may be created in several different caches independently at different times. +This is all handled transparently, and the netfs doesn't see any of it. + +A cookie will be created in the disabled state if enabled is false. A cookie +must be enabled to do anything with it. A disabled cookie can be enabled by +calling fscache_enable_cookie() (see below). + +For example, with AFS, a cell would be added to the primary index. This index +entry would have a dependent inode containing volume mappings within this cell:: + + cell->cache = + fscache_acquire_cookie(afs_cache_netfs.primary_index, + &afs_cell_cache_index_def, + cell->name, strlen(cell->name), + NULL, 0, + cell, 0, true); + +And then a particular volume could be added to that index by ID, creating +another index for vnodes (AFS inode equivalents):: + + volume->cache = + fscache_acquire_cookie(volume->cell->cache, + &afs_volume_cache_index_def, + &volume->vid, sizeof(volume->vid), + NULL, 0, + volume, 0, true); + + +Data File Registration +====================== + +The fourth step is to request a data file be created in the cache. This is +identical to index cookie acquisition. The only difference is that the type in +the object definition should be something other than index type:: + + vnode->cache = + fscache_acquire_cookie(volume->cache, + &afs_vnode_cache_object_def, + &key, sizeof(key), + &aux, sizeof(aux), + vnode, vnode->status.size, true); + + +Miscellaneous Object Registration +================================= + +An optional step is to request an object of miscellaneous type be created in +the cache. This is almost identical to index cookie acquisition. The only +difference is that the type in the object definition should be something other +than index type. 
While the parent object could be an index, it's more likely +it would be some other type of object such as a data file:: + + xattr->cache = + fscache_acquire_cookie(vnode->cache, + &afs_xattr_cache_object_def, + &xattr->name, strlen(xattr->name), + NULL, 0, + xattr, strlen(xattr->val), true); + +Miscellaneous objects might be used to store extended attributes or directory +entries for example. + + +Setting the Data File Size +========================== + +The fifth step is to set the physical attributes of the file, such as its size. +This doesn't automatically reserve any space in the cache, but permits the +cache to adjust its metadata for data tracking appropriately:: + + int fscache_attr_changed(struct fscache_cookie *cookie); + +The cache will return -ENOBUFS if there is no backing cache or if there is no +space to allocate any extra metadata required in the cache. + +Note that attempts to read or write data pages in the cache over this size may +be rebuffed with -ENOBUFS. + +This operation schedules an attribute adjustment to happen asynchronously at +some point in the future, and as such, it may happen after the function returns +to the caller. The attribute adjustment excludes read and write operations. + + +Page alloc/read/write +===================== + +And the sixth step is to store and retrieve pages in the cache. There are +three functions that are used to do this. + +Note: + + (1) A page should not be re-read or re-allocated without uncaching it first. + + (2) A read or allocated page must be uncached when the netfs page is released + from the pagecache. + + (3) A page should only be written to the cache if previous read or allocated. + +This permits the cache to maintain its page tracking in proper order. 
+ + +PAGE READ +--------- + +Firstly, the netfs should ask FS-Cache to examine the caches and read the +contents cached for a particular page of a particular file if present, or else +allocate space to store the contents if not:: + + typedef + void (*fscache_rw_complete_t)(struct page *page, + void *context, + int error); + + int fscache_read_or_alloc_page(struct fscache_cookie *cookie, + struct page *page, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp); + +The cookie argument must specify a cookie for an object that isn't an index, +the page specified will have the data loaded into it (and is also used to +specify the page number), and the gfp argument is used to control how any +memory allocations made are satisfied. + +If the cookie indicates the inode is not cached: + + (1) The function will return -ENOBUFS. + +Else if there's a copy of the page resident in the cache: + + (1) The mark_pages_cached() cookie operation will be called on that page. + + (2) The function will submit a request to read the data from the cache's + backing device directly into the page specified. + + (3) The function will return 0. + + (4) When the read is complete, end_io_func() will be invoked with: + + * The netfs data supplied when the cookie was created. + + * The page descriptor. + + * The context argument passed to the above function. This will be + maintained with the get_context/put_context functions mentioned above. + + * An argument that's 0 on success or negative for an error code. + + If an error occurs, it should be assumed that the page contains no usable + data. fscache_readpages_cancel() may need to be called. + + end_io_func() will be called in process context if the read results in + an error, but it might be called in interrupt context if the read is + successful. + +Otherwise, if there's not a copy available in cache, but the cache may be able +to store the page: + + (1) The mark_pages_cached() cookie operation will be called on that page. 
+ + (2) A block may be reserved in the cache and attached to the object at the + appropriate place. + + (3) The function will return -ENODATA. + +This function may also return -ENOMEM or -EINTR, in which case it won't have +read any data from the cache. + + +Page Allocate +------------- + +Alternatively, if there's not expected to be any data in the cache for a page +because the file has been extended, a block can simply be allocated instead:: + + int fscache_alloc_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp); + +This is similar to the fscache_read_or_alloc_page() function, except that it +never reads from the cache. It will return 0 if a block has been allocated, +rather than -ENODATA as the other would. One or the other must be performed +before writing to the cache. + +The mark_pages_cached() cookie operation will be called on the page if +successful. + + +Page Write +---------- + +Secondly, if the netfs changes the contents of the page (either due to an +initial download or if a user performs a write), then the page should be +written back to the cache:: + + int fscache_write_page(struct fscache_cookie *cookie, + struct page *page, + loff_t object_size, + gfp_t gfp); + +The cookie argument must specify a data file cookie, the page specified should +contain the data to be written (and is also used to specify the page number), +object_size is the revised size of the object and the gfp argument is used to +control how any memory allocations made are satisfied. + +The page must have first been read or allocated successfully and must not have +been uncached before writing is performed. + +If the cookie indicates the inode is not cached then: + + (1) The function will return -ENOBUFS. + +Else if space can be allocated in the cache to hold this page: + + (1) PG_fscache_write will be set on the page. + + (2) The function will submit a request to write the data to cache's backing + device directly from the page specified. 
+ + (3) The function will return 0. + + (4) When the write is complete PG_fscache_write is cleared on the page and + anyone waiting for that bit will be woken up. + +Else if there's no space available in the cache, -ENOBUFS will be returned. It +is also possible for the PG_fscache_write bit to be cleared when no write took +place if unforeseen circumstances arose (such as a disk error). + +Writing takes place asynchronously. + + +Multiple Page Read +------------------ + +A facility is provided to read several pages at once, as requested by the +readpages() address space operation:: + + int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, + struct address_space *mapping, + struct list_head *pages, + int *nr_pages, + fscache_rw_complete_t end_io_func, + void *context, + gfp_t gfp); + +This works in a similar way to fscache_read_or_alloc_page(), except: + + (1) Any page it can retrieve data for is removed from pages and nr_pages and + dispatched for reading to the disk. Reads of adjacent pages on disk may + be merged for greater efficiency. + + (2) The mark_pages_cached() cookie operation will be called on several pages + at once if they're being read or allocated. + + (3) If there was an general error, then that error will be returned. + + Else if some pages couldn't be allocated or read, then -ENOBUFS will be + returned. + + Else if some pages couldn't be read but were allocated, then -ENODATA will + be returned. + + Otherwise, if all pages had reads dispatched, then 0 will be returned, the + list will be empty and ``*nr_pages`` will be 0. + + (4) end_io_func will be called once for each page being read as the reads + complete. It will be called in process context if error != 0, but it may + be called in interrupt context if there is no error. + +Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude +some of the pages being read and some being allocated. Those pages will have +been marked appropriately and will need uncaching. 
+ + +Cancellation of Unread Pages +---------------------------- + +If one or more pages are passed to fscache_read_or_alloc_pages() but not then +read from the cache and also not read from the underlying filesystem then +those pages will need to have any marks and reservations removed. This can be +done by calling:: + + void fscache_readpages_cancel(struct fscache_cookie *cookie, + struct list_head *pages); + +prior to returning to the caller. The cookie argument should be as passed to +fscache_read_or_alloc_pages(). Every page in the pages list will be examined +and any that have PG_fscache set will be uncached. + + +Page Uncaching +============== + +To uncache a page, this function should be called:: + + void fscache_uncache_page(struct fscache_cookie *cookie, + struct page *page); + +This function permits the cache to release any in-memory representation it +might be holding for this netfs page. This function must be called once for +each page on which the read or write page functions above have been called to +make sure the cache's in-memory tracking information gets torn down. + +Note that pages can't be explicitly deleted from a data file. The whole +data file must be retired (see the relinquish cookie function below). + +Furthermore, note that this does not cancel the asynchronous read or write +operation started by the read/alloc and write functions, so the page +invalidation functions must use:: + + bool fscache_check_page_write(struct fscache_cookie *cookie, + struct page *page); + +to see if a page is being written to the cache, and:: + + void fscache_wait_on_page_write(struct fscache_cookie *cookie, + struct page *page); + +to wait for it to finish if it is. 
+ + +When releasepage() is being implemented, a special FS-Cache function exists to +manage the heuristics of coping with vmscan trying to eject pages, which may +conflict with the cache trying to write pages to the cache (which may itself +need to allocate memory):: + + bool fscache_maybe_release_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp); + +This takes the netfs cookie, and the page and gfp arguments as supplied to +releasepage(). It will return false if the page cannot be released yet for +some reason and if it returns true, the page has been uncached and can now be +released. + +To make a page available for release, this function may wait for an outstanding +storage request to complete, or it may attempt to cancel the storage request - +in which case the page will not be stored in the cache this time. + + +Bulk Image Page Uncache +----------------------- + +A convenience routine is provided to perform an uncache on all the pages +attached to an inode. This assumes that the pages on the inode correspond on a +1:1 basis with the pages in the cache:: + + void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode); + +This takes the netfs cookie that the pages were cached with and the inode that +the pages are attached to. This function will wait for pages to finish being +written to the cache and for the cache to finish with the page generally. No +error is returned. + + +Index and Data File consistency +=============================== + +To find out whether auxiliary data for an object is up to date within the +cache, the following function can be called:: + + int fscache_check_consistency(struct fscache_cookie *cookie, + const void *aux_data); + +This will call back to the netfs to check whether the auxiliary data associated +with a cookie is correct; if aux_data is non-NULL, it will update the auxiliary +data buffer first. 
It returns 0 if it is and -ESTALE if it isn't; it may also +return -ENOMEM and -ERESTARTSYS. + +To request an update of the index data for an index or other object, the +following function should be called:: + + void fscache_update_cookie(struct fscache_cookie *cookie, + const void *aux_data); + +This function will update the cookie's auxiliary data buffer from aux_data if +that is non-NULL and then schedule this to be stored on disk. The update +method in the parent index definition will be called to transfer the data. + +Note that partial updates may happen automatically at other times, such as when +data blocks are added to a data file object. + + +Cookie Enablement +================= + +Cookies exist in one of two states: enabled and disabled. If a cookie is +disabled, it ignores all attempts to acquire child cookies; check, update or +invalidate its state; allocate, read or write backing pages - though it is +still possible to uncache pages and relinquish the cookie. + +The initial enablement state is set by fscache_acquire_cookie(), but the cookie +can be enabled or disabled later. To disable a cookie, call:: + + void fscache_disable_cookie(struct fscache_cookie *cookie, + const void *aux_data, + bool invalidate); + +If the cookie is not already disabled, this locks the cookie against other +enable and disable ops, marks the cookie as being disabled, discards or +invalidates any backing objects and waits for cessation of activity on any +associated object before unlocking the cookie. + +All possible failures are handled internally. The caller should consider +calling fscache_uncache_all_inode_pages() afterwards to make sure all page +markings are cleared up. 
+ +Cookies can be enabled or reenabled with:: + + void fscache_enable_cookie(struct fscache_cookie *cookie, + const void *aux_data, + loff_t object_size, + bool (*can_enable)(void *data), + void *data) + +If the cookie is not already enabled, this locks the cookie against other +enable and disable ops, invokes can_enable() and, if the cookie is not an index +cookie, will begin the procedure of acquiring backing objects. + +The optional can_enable() function is passed the data argument and returns a +ruling as to whether or not enablement should actually be permitted to begin. + +All possible failures are handled internally. The cookie will only be marked +as enabled if provisional backing objects are allocated. + +The object's data size is updated from object_size and is passed to the +->check_aux() function. + +In both cases, the cookie's auxiliary data buffer is updated from aux_data if +that is non-NULL inside the enablement lock before proceeding. + + +Miscellaneous Cookie operations +=============================== + +There are a number of operations that can be used to control cookies: + + * Cookie pinning:: + + int fscache_pin_cookie(struct fscache_cookie *cookie); + void fscache_unpin_cookie(struct fscache_cookie *cookie); + + These operations permit data cookies to be pinned into the cache and to + have the pinning removed. They are not permitted on index cookies. + + The pinning function will return 0 if successful, -ENOBUFS if the cookie + isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning, + -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or + -EIO if there's any other problem. + + * Data space reservation:: + + int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size); + + This permits a netfs to request cache space be reserved to store up to the + given amount of a file. It is permitted to ask for more than the current + size of the file to allow for future file expansion. 
+ + If size is given as zero then the reservation will be cancelled. + + The function will return 0 if successful, -ENOBUFS if the cookie isn't + backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations, + -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or + -EIO if there's any other problem. + + Note that this doesn't pin an object in a cache; it can still be culled to + make space if it's not in use. + + +Cookie Unregistration +===================== + +To get rid of a cookie, this function should be called:: + + void fscache_relinquish_cookie(struct fscache_cookie *cookie, + const void *aux_data, + bool retire); + +If retire is non-zero, then the object will be marked for recycling, and all +copies of it will be removed from all active caches in which it is present. +Not only that but all child objects will also be retired. + +If retire is zero, then the object may be available again when next the +acquisition function is called. Retirement here will overrule the pinning on a +cookie. + +The cookie's auxiliary data will be updated from aux_data if that is non-NULL +so that the cache can lazily update it on disk. + +One very important note - relinquish must NOT be called for a cookie unless all +the cookies for "child" indices, objects and pages have been relinquished +first. + + +Index Invalidation +================== + +There is no direct way to invalidate an index subtree. To do this, the caller +should relinquish and retire the cookie they have, and then acquire a new one. + + +Data File Invalidation +====================== + +Sometimes it will be necessary to invalidate an object that contains data. +Typically this will be necessary when the server tells the netfs of a foreign +change - at which point the netfs has to throw away all the state it had for an +inode and reload from the server. 
+ +To indicate that a cache object should be invalidated, the following function +can be called:: + + void fscache_invalidate(struct fscache_cookie *cookie); + +This can be called with spinlocks held as it defers the work to a thread pool. +All extant storage, retrieval and attribute change ops at this point are +cancelled and discarded. Some future operations will be rejected until the +cache has had a chance to insert a barrier in the operations queue. After +that, operations will be queued again behind the invalidation operation. + +The invalidation operation will perform an attribute change operation and an +auxiliary data update operation as it is very likely these will have changed. + +Using the following function, the netfs can wait for the invalidation operation +to have reached a point at which it can start submitting ordinary operations +once again:: + + void fscache_wait_on_invalidate(struct fscache_cookie *cookie); + + +FS-cache Specific Page Flag +=========================== + +FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is +given the alternative name PG_fscache. + +PG_fscache is used to indicate that the page is known by the cache, and that +the cache must be informed if the page is going to go away. It's an indication +to the netfs that the cache has an interest in this page, where an interest may +be a pointer to it, resources allocated or reserved for it, or I/O in progress +upon it. + +The netfs can use this information in methods such as releasepage() to +determine whether it needs to uncache a page or update it. + +Furthermore, if this bit is set, releasepage() and invalidatepage() operations +will be called on a page to get rid of it, even if PG_private is not set. This +allows caching to be attempted on a page before read_cache_pages() to be called +after fscache_read_or_alloc_pages() as the former will try and release pages it +was given under certain circumstances. 
+ +This bit does not overlap with such as PG_private. This means that FS-Cache +can be used with a filesystem that uses the block buffering code. + +There are a number of operations defined on this flag:: + + int PageFsCache(struct page *page); + void SetPageFsCache(struct page *page) + void ClearPageFsCache(struct page *page) + int TestSetPageFsCache(struct page *page) + int TestClearPageFsCache(struct page *page) + +These functions are bit test, bit set, bit clear, bit test and set and bit +test and clear operations on PG_fscache. diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt deleted file mode 100644 index ba968e8f5704..000000000000 --- a/Documentation/filesystems/caching/netfs-api.txt +++ /dev/null @@ -1,910 +0,0 @@ - =============================== - FS-CACHE NETWORK FILESYSTEM API - =============================== - -There's an API by which a network filesystem can make use of the FS-Cache -facilities. This is based around a number of principles: - - (1) Caches can store a number of different object types. There are two main - object types: indices and files. The first is a special type used by - FS-Cache to make finding objects faster and to make retiring of groups of - objects easier. - - (2) Every index, file or other object is represented by a cookie. This cookie - may or may not have anything associated with it, but the netfs doesn't - need to care. - - (3) Barring the top-level index (one entry per cached netfs), the index - hierarchy for each netfs is structured according the whim of the netfs. - -This API is declared in . 
- -This document contains the following sections: - - (1) Network filesystem definition - (2) Index definition - (3) Object definition - (4) Network filesystem (un)registration - (5) Cache tag lookup - (6) Index registration - (7) Data file registration - (8) Miscellaneous object registration - (9) Setting the data file size - (10) Page alloc/read/write - (11) Page uncaching - (12) Index and data file consistency - (13) Cookie enablement - (14) Miscellaneous cookie operations - (15) Cookie unregistration - (16) Index invalidation - (17) Data file invalidation - (18) FS-Cache specific page flags. - - -============================= -NETWORK FILESYSTEM DEFINITION -============================= - -FS-Cache needs a description of the network filesystem. This is specified -using a record of the following structure: - - struct fscache_netfs { - uint32_t version; - const char *name; - struct fscache_cookie *primary_index; - ... - }; - -This first two fields should be filled in before registration, and the third -will be filled in by the registration function; any other fields should just be -ignored and are for internal use only. - -The fields are: - - (1) The name of the netfs (used as the key in the toplevel index). - - (2) The version of the netfs (if the name matches but the version doesn't, the - entire in-cache hierarchy for this netfs will be scrapped and begun - afresh). - - (3) The cookie representing the primary index will be allocated according to - another parameter passed into the registration function. - -For example, kAFS (linux/fs/afs/) uses the following definitions to describe -itself: - - struct fscache_netfs afs_cache_netfs = { - .version = 0, - .name = "afs", - }; - - -================ -INDEX DEFINITION -================ - -Indices are used for two purposes: - - (1) To aid the finding of a file based on a series of keys (such as AFS's - "cell", "volume ID", "vnode ID"). 
- - (2) To make it easier to discard a subset of all the files cached based around - a particular key - for instance to mirror the removal of an AFS volume. - -However, since it's unlikely that any two netfs's are going to want to define -their index hierarchies in quite the same way, FS-Cache tries to impose as few -restraints as possible on how an index is structured and where it is placed in -the tree. The netfs can even mix indices and data files at the same level, but -it's not recommended. - -Each index entry consists of a key of indeterminate length plus some auxiliary -data, also of indeterminate length. - -There are some limits on indices: - - (1) Any index containing non-index objects should be restricted to a single - cache. Any such objects created within an index will be created in the - first cache only. The cache in which an index is created can be - controlled by cache tags (see below). - - (2) The entry data must be atomically journallable, so it is limited to about - 400 bytes at present. At least 400 bytes will be available. - - (3) The depth of the index tree should be judged with care as the search - function is recursive. Too many layers will run the kernel out of stack. - - -================= -OBJECT DEFINITION -================= - -To define an object, a structure of the following type should be filled out: - - struct fscache_cookie_def - { - uint8_t name[16]; - uint8_t type; - - struct fscache_cache_tag *(*select_cache)( - const void *parent_netfs_data, - const void *cookie_netfs_data); - - enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size); - - void (*get_context)(void *cookie_netfs_data, void *context); - - void (*put_context)(void *cookie_netfs_data, void *context); - - void (*mark_pages_cached)(void *cookie_netfs_data, - struct address_space *mapping, - struct pagevec *cached_pvec); - }; - -This has the following fields: - - (1) The type of the object [mandatory]. 
- - This is one of the following values: - - (*) FSCACHE_COOKIE_TYPE_INDEX - - This defines an index, which is a special FS-Cache type. - - (*) FSCACHE_COOKIE_TYPE_DATAFILE - - This defines an ordinary data file. - - (*) Any other value between 2 and 255 - - This defines an extraordinary object such as an XATTR. - - (2) The name of the object type (NUL terminated unless all 16 chars are used) - [optional]. - - (3) A function to select the cache in which to store an index [optional]. - - This function is invoked when an index needs to be instantiated in a cache - during the instantiation of a non-index object. Only the immediate index - parent for the non-index object will be queried. Any indices above that - in the hierarchy may be stored in multiple caches. This function does not - need to be supplied for any non-index object or any index that will only - have index children. - - If this function is not supplied or if it returns NULL then the first - cache in the parent's list will be chosen, or failing that, the first - cache in the master list. - - (4) A function to check the auxiliary data [optional]. - - This function will be called to check that a match found in the cache for - this object is valid. For instance with AFS it could check the auxiliary - data against the data version number returned by the server to determine - whether the index entry in a cache is still valid. - - If this function is absent, it will be assumed that matching objects in a - cache are always valid. - - The function is also passed the cache's idea of the object size and may - use this to manage coherency also. 
- - If present, the function should return one of the following values: - - (*) FSCACHE_CHECKAUX_OKAY - the entry is okay as is - (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update - (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted - - This function can also be used to extract data from the auxiliary data in - the cache and copy it into the netfs's structures. - - (5) A pair of functions to manage contexts for the completion callback - [optional]. - - The cache read/write functions are passed a context which is then passed - to the I/O completion callback function. To ensure this context remains - valid until after the I/O completion is called, two functions may be - provided: one to get an extra reference on the context, and one to drop a - reference to it. - - If the context is not used or is a type of object that won't go out of - scope, then these functions are not required. These functions are not - required for indices as indices may not contain data. These functions may - be called in interrupt context and so may not sleep. - - (6) A function to mark a page as retaining cache metadata [optional]. - - This is called by the cache to indicate that it is retaining in-memory - information for this page and that the netfs should uncache the page when - it has finished. This does not indicate whether there's data on the disk - or not. Note that several pages at once may be presented for marking. - - The PG_fscache bit is set on the pages before this function would be - called, so the function need not be provided if this is sufficient. - - This function is not required for indices as they're not permitted data. - - (7) A function to unmark all the pages retaining cache metadata [mandatory]. - - This is called by FS-Cache to indicate that a backing store is being - unbound from a cookie and that all the marks on the pages should be - cleared to prevent confusion. 
Note that the cache will have torn down all - its tracking information so that the pages don't need to be explicitly - uncached. - - This function is not required for indices as they're not permitted data. - - -=================================== -NETWORK FILESYSTEM (UN)REGISTRATION -=================================== - -The first step is to declare the network filesystem to the cache. This also -involves specifying the layout of the primary index (for AFS, this would be the -"cell" level). - -The registration function is: - - int fscache_register_netfs(struct fscache_netfs *netfs); - -It just takes a pointer to the netfs definition. It returns 0 or an error as -appropriate. - -For kAFS, registration is done as follows: - - ret = fscache_register_netfs(&afs_cache_netfs); - -The last step is, of course, unregistration: - - void fscache_unregister_netfs(struct fscache_netfs *netfs); - - -================ -CACHE TAG LOOKUP -================ - -FS-Cache permits the use of more than one cache. To permit particular index -subtrees to be bound to particular caches, the second step is to look up cache -representation tags. This step is optional; it can be left entirely up to -FS-Cache as to which cache should be used. The problem with doing that is that -FS-Cache will always pick the first cache that was registered. - -To get the representation for a named tag: - - struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name); - -This takes a text string as the name and returns a representation of a tag. It -will never return an error. It may return a dummy tag, however, if it runs out -of memory; this will inhibit caching with this tag. - -Any representation so obtained must be released by passing it to this function: - - void fscache_release_cache_tag(struct fscache_cache_tag *tag); - -The tag will be retrieved by FS-Cache when it calls the object definition -operation select_cache(). 
- - -================== -INDEX REGISTRATION -================== - -The third step is to inform FS-Cache about part of an index hierarchy that can -be used to locate files. This is done by requesting a cookie for each index in -the path to the file: - - struct fscache_cookie * - fscache_acquire_cookie(struct fscache_cookie *parent, - const struct fscache_object_def *def, - const void *index_key, - size_t index_key_len, - const void *aux_data, - size_t aux_data_len, - void *netfs_data, - loff_t object_size, - bool enable); - -This function creates an index entry in the index represented by parent, -filling in the index entry by calling the operations pointed to by def. - -A unique key that represents the object within the parent must be pointed to by -index_key and is of length index_key_len. - -An optional blob of auxiliary data that is to be stored within the cache can be -pointed to with aux_data and should be of length aux_data_len. This would -typically be used for storing coherency data. - -The netfs may pass an arbitrary value in netfs_data and this will be presented -to it in the event of any calling back. This may also be used in tracing or -logging of messages. - -The cache tracks the size of the data attached to an object and this set to be -object_size. For indices, this should be 0. This value will be passed to the -->check_aux() callback. - -Note that this function never returns an error - all errors are handled -internally. It may, however, return NULL to indicate no cookie. It is quite -acceptable to pass this token back to this function as the parent to another -acquisition (or even to the relinquish cookie, read page and write page -functions - see below). - -Note also that no indices are actually created in a cache until a non-index -object needs to be created somewhere down the hierarchy. Furthermore, an index -may be created in several different caches independently at different times. 
-This is all handled transparently, and the netfs doesn't see any of it. - -A cookie will be created in the disabled state if enabled is false. A cookie -must be enabled to do anything with it. A disabled cookie can be enabled by -calling fscache_enable_cookie() (see below). - -For example, with AFS, a cell would be added to the primary index. This index -entry would have a dependent inode containing volume mappings within this cell: - - cell->cache = - fscache_acquire_cookie(afs_cache_netfs.primary_index, - &afs_cell_cache_index_def, - cell->name, strlen(cell->name), - NULL, 0, - cell, 0, true); - -And then a particular volume could be added to that index by ID, creating -another index for vnodes (AFS inode equivalents): - - volume->cache = - fscache_acquire_cookie(volume->cell->cache, - &afs_volume_cache_index_def, - &volume->vid, sizeof(volume->vid), - NULL, 0, - volume, 0, true); - - -====================== -DATA FILE REGISTRATION -====================== - -The fourth step is to request a data file be created in the cache. This is -identical to index cookie acquisition. The only difference is that the type in -the object definition should be something other than index type. - - vnode->cache = - fscache_acquire_cookie(volume->cache, - &afs_vnode_cache_object_def, - &key, sizeof(key), - &aux, sizeof(aux), - vnode, vnode->status.size, true); - - -================================= -MISCELLANEOUS OBJECT REGISTRATION -================================= - -An optional step is to request an object of miscellaneous type be created in -the cache. This is almost identical to index cookie acquisition. The only -difference is that the type in the object definition should be something other -than index type. While the parent object could be an index, it's more likely -it would be some other type of object such as a data file. 
- - xattr->cache = - fscache_acquire_cookie(vnode->cache, - &afs_xattr_cache_object_def, - &xattr->name, strlen(xattr->name), - NULL, 0, - xattr, strlen(xattr->val), true); - -Miscellaneous objects might be used to store extended attributes or directory -entries for example. - - -========================== -SETTING THE DATA FILE SIZE -========================== - -The fifth step is to set the physical attributes of the file, such as its size. -This doesn't automatically reserve any space in the cache, but permits the -cache to adjust its metadata for data tracking appropriately: - - int fscache_attr_changed(struct fscache_cookie *cookie); - -The cache will return -ENOBUFS if there is no backing cache or if there is no -space to allocate any extra metadata required in the cache. - -Note that attempts to read or write data pages in the cache over this size may -be rebuffed with -ENOBUFS. - -This operation schedules an attribute adjustment to happen asynchronously at -some point in the future, and as such, it may happen after the function returns -to the caller. The attribute adjustment excludes read and write operations. - - -===================== -PAGE ALLOC/READ/WRITE -===================== - -And the sixth step is to store and retrieve pages in the cache. There are -three functions that are used to do this. - -Note: - - (1) A page should not be re-read or re-allocated without uncaching it first. - - (2) A read or allocated page must be uncached when the netfs page is released - from the pagecache. - - (3) A page should only be written to the cache if previous read or allocated. - -This permits the cache to maintain its page tracking in proper order. 
- - -PAGE READ ---------- - -Firstly, the netfs should ask FS-Cache to examine the caches and read the -contents cached for a particular page of a particular file if present, or else -allocate space to store the contents if not: - - typedef - void (*fscache_rw_complete_t)(struct page *page, - void *context, - int error); - - int fscache_read_or_alloc_page(struct fscache_cookie *cookie, - struct page *page, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp); - -The cookie argument must specify a cookie for an object that isn't an index, -the page specified will have the data loaded into it (and is also used to -specify the page number), and the gfp argument is used to control how any -memory allocations made are satisfied. - -If the cookie indicates the inode is not cached: - - (1) The function will return -ENOBUFS. - -Else if there's a copy of the page resident in the cache: - - (1) The mark_pages_cached() cookie operation will be called on that page. - - (2) The function will submit a request to read the data from the cache's - backing device directly into the page specified. - - (3) The function will return 0. - - (4) When the read is complete, end_io_func() will be invoked with: - - (*) The netfs data supplied when the cookie was created. - - (*) The page descriptor. - - (*) The context argument passed to the above function. This will be - maintained with the get_context/put_context functions mentioned above. - - (*) An argument that's 0 on success or negative for an error code. - - If an error occurs, it should be assumed that the page contains no usable - data. fscache_readpages_cancel() may need to be called. - - end_io_func() will be called in process context if the read is results in - an error, but it might be called in interrupt context if the read is - successful. 
- -Otherwise, if there's not a copy available in cache, but the cache may be able -to store the page: - - (1) The mark_pages_cached() cookie operation will be called on that page. - - (2) A block may be reserved in the cache and attached to the object at the - appropriate place. - - (3) The function will return -ENODATA. - -This function may also return -ENOMEM or -EINTR, in which case it won't have -read any data from the cache. - - -PAGE ALLOCATE -------------- - -Alternatively, if there's not expected to be any data in the cache for a page -because the file has been extended, a block can simply be allocated instead: - - int fscache_alloc_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp); - -This is similar to the fscache_read_or_alloc_page() function, except that it -never reads from the cache. It will return 0 if a block has been allocated, -rather than -ENODATA as the other would. One or the other must be performed -before writing to the cache. - -The mark_pages_cached() cookie operation will be called on the page if -successful. - - -PAGE WRITE ----------- - -Secondly, if the netfs changes the contents of the page (either due to an -initial download or if a user performs a write), then the page should be -written back to the cache: - - int fscache_write_page(struct fscache_cookie *cookie, - struct page *page, - loff_t object_size, - gfp_t gfp); - -The cookie argument must specify a data file cookie, the page specified should -contain the data to be written (and is also used to specify the page number), -object_size is the revised size of the object and the gfp argument is used to -control how any memory allocations made are satisfied. - -The page must have first been read or allocated successfully and must not have -been uncached before writing is performed. - -If the cookie indicates the inode is not cached then: - - (1) The function will return -ENOBUFS. 
- -Else if space can be allocated in the cache to hold this page: - - (1) PG_fscache_write will be set on the page. - - (2) The function will submit a request to write the data to cache's backing - device directly from the page specified. - - (3) The function will return 0. - - (4) When the write is complete PG_fscache_write is cleared on the page and - anyone waiting for that bit will be woken up. - -Else if there's no space available in the cache, -ENOBUFS will be returned. It -is also possible for the PG_fscache_write bit to be cleared when no write took -place if unforeseen circumstances arose (such as a disk error). - -Writing takes place asynchronously. - - -MULTIPLE PAGE READ ------------------- - -A facility is provided to read several pages at once, as requested by the -readpages() address space operation: - - int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, - struct address_space *mapping, - struct list_head *pages, - int *nr_pages, - fscache_rw_complete_t end_io_func, - void *context, - gfp_t gfp); - -This works in a similar way to fscache_read_or_alloc_page(), except: - - (1) Any page it can retrieve data for is removed from pages and nr_pages and - dispatched for reading to the disk. Reads of adjacent pages on disk may - be merged for greater efficiency. - - (2) The mark_pages_cached() cookie operation will be called on several pages - at once if they're being read or allocated. - - (3) If there was an general error, then that error will be returned. - - Else if some pages couldn't be allocated or read, then -ENOBUFS will be - returned. - - Else if some pages couldn't be read but were allocated, then -ENODATA will - be returned. - - Otherwise, if all pages had reads dispatched, then 0 will be returned, the - list will be empty and *nr_pages will be 0. - - (4) end_io_func will be called once for each page being read as the reads - complete. 
It will be called in process context if error != 0, but it may - be called in interrupt context if there is no error. - -Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude -some of the pages being read and some being allocated. Those pages will have -been marked appropriately and will need uncaching. - - -CANCELLATION OF UNREAD PAGES ----------------------------- - -If one or more pages are passed to fscache_read_or_alloc_pages() but not then -read from the cache and also not read from the underlying filesystem then -those pages will need to have any marks and reservations removed. This can be -done by calling: - - void fscache_readpages_cancel(struct fscache_cookie *cookie, - struct list_head *pages); - -prior to returning to the caller. The cookie argument should be as passed to -fscache_read_or_alloc_pages(). Every page in the pages list will be examined -and any that have PG_fscache set will be uncached. - - -============== -PAGE UNCACHING -============== - -To uncache a page, this function should be called: - - void fscache_uncache_page(struct fscache_cookie *cookie, - struct page *page); - -This function permits the cache to release any in-memory representation it -might be holding for this netfs page. This function must be called once for -each page on which the read or write page functions above have been called to -make sure the cache's in-memory tracking information gets torn down. - -Note that pages can't be explicitly deleted from the a data file. The whole -data file must be retired (see the relinquish cookie function below). 
- -Furthermore, note that this does not cancel the asynchronous read or write -operation started by the read/alloc and write functions, so the page -invalidation functions must use: - - bool fscache_check_page_write(struct fscache_cookie *cookie, - struct page *page); - -to see if a page is being written to the cache, and: - - void fscache_wait_on_page_write(struct fscache_cookie *cookie, - struct page *page); - -to wait for it to finish if it is. - - -When releasepage() is being implemented, a special FS-Cache function exists to -manage the heuristics of coping with vmscan trying to eject pages, which may -conflict with the cache trying to write pages to the cache (which may itself -need to allocate memory): - - bool fscache_maybe_release_page(struct fscache_cookie *cookie, - struct page *page, - gfp_t gfp); - -This takes the netfs cookie, and the page and gfp arguments as supplied to -releasepage(). It will return false if the page cannot be released yet for -some reason and if it returns true, the page has been uncached and can now be -released. - -To make a page available for release, this function may wait for an outstanding -storage request to complete, or it may attempt to cancel the storage request - -in which case the page will not be stored in the cache this time. - - -BULK INODE PAGE UNCACHE ------------------------ - -A convenience routine is provided to perform an uncache on all the pages -attached to an inode. This assumes that the pages on the inode correspond on a -1:1 basis with the pages in the cache. - - void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, - struct inode *inode); - -This takes the netfs cookie that the pages were cached with and the inode that -the pages are attached to. This function will wait for pages to finish being -written to the cache and for the cache to finish with the page generally. No -error is returned. 
- - -=============================== -INDEX AND DATA FILE CONSISTENCY -=============================== - -To find out whether auxiliary data for an object is up to date within the -cache, the following function can be called: - - int fscache_check_consistency(struct fscache_cookie *cookie, - const void *aux_data); - -This will call back to the netfs to check whether the auxiliary data associated -with a cookie is correct; if aux_data is non-NULL, it will update the auxiliary -data buffer first. It returns 0 if it is and -ESTALE if it isn't; it may also -return -ENOMEM and -ERESTARTSYS. - -To request an update of the index data for an index or other object, the -following function should be called: - - void fscache_update_cookie(struct fscache_cookie *cookie, - const void *aux_data); - -This function will update the cookie's auxiliary data buffer from aux_data if -that is non-NULL and then schedule this to be stored on disk. The update -method in the parent index definition will be called to transfer the data. - -Note that partial updates may happen automatically at other times, such as when -data blocks are added to a data file object. - - -================= -COOKIE ENABLEMENT -================= - -Cookies exist in one of two states: enabled and disabled. If a cookie is -disabled, it ignores all attempts to acquire child cookies; check, update or -invalidate its state; allocate, read or write backing pages - though it is -still possible to uncache pages and relinquish the cookie. - -The initial enablement state is set by fscache_acquire_cookie(), but the cookie -can be enabled or disabled later. 
To disable a cookie, call: - - void fscache_disable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool invalidate); - -If the cookie is not already disabled, this locks the cookie against other -enable and disable ops, marks the cookie as being disabled, discards or -invalidates any backing objects and waits for cessation of activity on any -associated object before unlocking the cookie. - -All possible failures are handled internally. The caller should consider -calling fscache_uncache_all_inode_pages() afterwards to make sure all page -markings are cleared up. - -Cookies can be enabled or reenabled with: - - void fscache_enable_cookie(struct fscache_cookie *cookie, - const void *aux_data, - loff_t object_size, - bool (*can_enable)(void *data), - void *data) - -If the cookie is not already enabled, this locks the cookie against other -enable and disable ops, invokes can_enable() and, if the cookie is not an index -cookie, will begin the procedure of acquiring backing objects. - -The optional can_enable() function is passed the data argument and returns a -ruling as to whether or not enablement should actually be permitted to begin. - -All possible failures are handled internally. The cookie will only be marked -as enabled if provisional backing objects are allocated. - -The object's data size is updated from object_size and is passed to the -->check_aux() function. - -In both cases, the cookie's auxiliary data buffer is updated from aux_data if -that is non-NULL inside the enablement lock before proceeding. - - -=============================== -MISCELLANEOUS COOKIE OPERATIONS -=============================== - -There are a number of operations that can be used to control cookies: - - (*) Cookie pinning: - - int fscache_pin_cookie(struct fscache_cookie *cookie); - void fscache_unpin_cookie(struct fscache_cookie *cookie); - - These operations permit data cookies to be pinned into the cache and to - have the pinning removed. 
They are not permitted on index cookies. - - The pinning function will return 0 if successful, -ENOBUFS if the cookie - isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning, - -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or - -EIO if there's any other problem. - - (*) Data space reservation: - - int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size); - - This permits a netfs to request cache space be reserved to store up to the - given amount of a file. It is permitted to ask for more than the current - size of the file to allow for future file expansion. - - If size is given as zero then the reservation will be cancelled. - - The function will return 0 if successful, -ENOBUFS if the cookie isn't - backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations, - -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or - -EIO if there's any other problem. - - Note that this doesn't pin an object in a cache; it can still be culled to - make space if it's not in use. - - -===================== -COOKIE UNREGISTRATION -===================== - -To get rid of a cookie, this function should be called. - - void fscache_relinquish_cookie(struct fscache_cookie *cookie, - const void *aux_data, - bool retire); - -If retire is non-zero, then the object will be marked for recycling, and all -copies of it will be removed from all active caches in which it is present. -Not only that but all child objects will also be retired. - -If retire is zero, then the object may be available again when next the -acquisition function is called. Retirement here will overrule the pinning on a -cookie. - -The cookie's auxiliary data will be updated from aux_data if that is non-NULL -so that the cache can lazily update it on disk. - -One very important note - relinquish must NOT be called for a cookie unless all -the cookies for "child" indices, objects and pages have been relinquished -first. 
- - -================== -INDEX INVALIDATION -================== - -There is no direct way to invalidate an index subtree. To do this, the caller -should relinquish and retire the cookie they have, and then acquire a new one. - - -====================== -DATA FILE INVALIDATION -====================== - -Sometimes it will be necessary to invalidate an object that contains data. -Typically this will be necessary when the server tells the netfs of a foreign -change - at which point the netfs has to throw away all the state it had for an -inode and reload from the server. - -To indicate that a cache object should be invalidated, the following function -can be called: - - void fscache_invalidate(struct fscache_cookie *cookie); - -This can be called with spinlocks held as it defers the work to a thread pool. -All extant storage, retrieval and attribute change ops at this point are -cancelled and discarded. Some future operations will be rejected until the -cache has had a chance to insert a barrier in the operations queue. After -that, operations will be queued again behind the invalidation operation. - -The invalidation operation will perform an attribute change operation and an -auxiliary data update operation as it is very likely these will have changed. - -Using the following function, the netfs can wait for the invalidation operation -to have reached a point at which it can start submitting ordinary operations -once again: - - void fscache_wait_on_invalidate(struct fscache_cookie *cookie); - - -=========================== -FS-CACHE SPECIFIC PAGE FLAG -=========================== - -FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is -given the alternative name PG_fscache. - -PG_fscache is used to indicate that the page is known by the cache, and that -the cache must be informed if the page is going to go away. 
It's an indication -to the netfs that the cache has an interest in this page, where an interest may -be a pointer to it, resources allocated or reserved for it, or I/O in progress -upon it. - -The netfs can use this information in methods such as releasepage() to -determine whether it needs to uncache a page or update it. - -Furthermore, if this bit is set, releasepage() and invalidatepage() operations -will be called on a page to get rid of it, even if PG_private is not set. This -allows caching to be attempted on a page before read_cache_pages() to be called -after fscache_read_or_alloc_pages() as the former will try and release pages it -was given under certain circumstances. - -This bit does not overlap with such as PG_private. This means that FS-Cache -can be used with a filesystem that uses the block buffering code. - -There are a number of operations defined on this flag: - - int PageFsCache(struct page *page); - void SetPageFsCache(struct page *page) - void ClearPageFsCache(struct page *page) - int TestSetPageFsCache(struct page *page) - int TestClearPageFsCache(struct page *page) - -These functions are bit test, bit set, bit clear, bit test and set and bit -test and clear operations on PG_fscache. diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0ce39658a620..751bc5b1cddf 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -4,7 +4,7 @@ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/netfs-api.txt for more information on + * See Documentation/filesystems/caching/netfs-api.rst for more information on * the netfs API. */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ad044c0cb1f3..a1c928fe98e7 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -6,7 +6,7 @@ * * NOTE!!! 
See: * - * Documentation/filesystems/caching/netfs-api.txt + * Documentation/filesystems/caching/netfs-api.rst * * for a description of the network filesystem interface declared here. */ @@ -233,7 +233,7 @@ extern void __fscache_enable_cookie(struct fscache_cookie *, const void *, loff_ * * Register a filesystem as desiring caching services if they're available. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -253,7 +253,7 @@ int fscache_register_netfs(struct fscache_netfs *netfs) * Indicate that a filesystem no longer desires caching services for the * moment. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -270,7 +270,7 @@ void fscache_unregister_netfs(struct fscache_netfs *netfs) * Acquire a specific cache referral tag that can be used to select a specific * cache in which to cache an index. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -288,7 +288,7 @@ struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name) * * Release a reference to a cache referral tag previously looked up. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -315,7 +315,7 @@ void fscache_release_cache_tag(struct fscache_cache_tag *tag) * that can be used to locate files. This is done by requesting a cookie for * each index in the path to the file. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. 
*/ static inline @@ -351,7 +351,7 @@ struct fscache_cookie *fscache_acquire_cookie( * provided to update the auxiliary data in the cache before the object is * disconnected. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -394,7 +394,7 @@ int fscache_check_consistency(struct fscache_cookie *cookie, * cookie. The auxiliary data on the cookie will be updated first if @aux_data * is set. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -410,7 +410,7 @@ void fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data) * * Permit data-storage cache objects to be pinned in the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -425,7 +425,7 @@ int fscache_pin_cookie(struct fscache_cookie *cookie) * * Permit data-storage cache objects to be unpinned from the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -441,7 +441,7 @@ void fscache_unpin_cookie(struct fscache_cookie *cookie) * changed. This includes the data size. These attributes will be obtained * through the get_attr() cookie definition op. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -463,7 +463,7 @@ int fscache_attr_changed(struct fscache_cookie *cookie) * * This can be called with spinlocks held. 
* - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -479,7 +479,7 @@ void fscache_invalidate(struct fscache_cookie *cookie) * * Wait for the invalidation of an object to complete. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -498,7 +498,7 @@ void fscache_wait_on_invalidate(struct fscache_cookie *cookie) * cookie so that a write to that object within the space can always be * honoured. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -533,7 +533,7 @@ int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size) * Else, if the page is unbacked, -ENODATA is returned and a block may have * been allocated in the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -582,7 +582,7 @@ int fscache_read_or_alloc_page(struct fscache_cookie *cookie, * regard to different pages, the return values are prioritised in that order. * Any pages submitted for reading are removed from the pages list. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -617,7 +617,7 @@ int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, * Else, a block will be allocated if one wasn't already, and 0 will be * returned * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. 
*/ static inline @@ -667,7 +667,7 @@ void fscache_readpages_cancel(struct fscache_cookie *cookie, * be cleared at the completion of the write to indicate the success or failure * of the operation. Note that the completion may happen before the return. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -693,7 +693,7 @@ int fscache_write_page(struct fscache_cookie *cookie, * Note that this cannot cancel any outstanding I/O operations between this * page and the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -711,7 +711,7 @@ void fscache_uncache_page(struct fscache_cookie *cookie, * * Ask the cache if a page is being written to the cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline @@ -731,7 +731,7 @@ bool fscache_check_page_write(struct fscache_cookie *cookie, * Ask the cache to wake us up when a page is no longer being written to the * cache. * - * See Documentation/filesystems/caching/netfs-api.txt for a complete + * See Documentation/filesystems/caching/netfs-api.rst for a complete * description. */ static inline -- cgit v1.2.3 From 09eac7c53570038d1069d711001b478f285c30cf Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:56 +0200 Subject: docs: filesystems: caching/operations.txt: convert it to ReST - Add a SPDX header; - Adjust document and section titles; - Comment out text ToC for html/pdf output; - Mark literal blocks as such; - Add it to filesystems/caching/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/97e71cc598a4f61df484ebda3ec06b63530ceb62.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/caching/index.rst | 1 + Documentation/filesystems/caching/operations.rst | 210 ++++++++++++++++++++++ Documentation/filesystems/caching/operations.txt | 213 ----------------------- fs/fscache/operation.c | 2 +- 4 files changed, 212 insertions(+), 214 deletions(-) create mode 100644 Documentation/filesystems/caching/operations.rst delete mode 100644 Documentation/filesystems/caching/operations.txt diff --git a/Documentation/filesystems/caching/index.rst b/Documentation/filesystems/caching/index.rst index d0651db450fb..75492b7c8ea0 100644 --- a/Documentation/filesystems/caching/index.rst +++ b/Documentation/filesystems/caching/index.rst @@ -9,3 +9,4 @@ Filesystem Caching fscache object netfs-api + operations diff --git a/Documentation/filesystems/caching/operations.rst b/Documentation/filesystems/caching/operations.rst new file mode 100644 index 000000000000..f7ddcc028939 --- /dev/null +++ b/Documentation/filesystems/caching/operations.rst @@ -0,0 +1,210 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================ +Asynchronous Operations Handling +================================ + +By: David Howells + +.. Contents: + + (*) Overview. + + (*) Operation record initialisation. + + (*) Parameters. + + (*) Procedure. + + (*) Asynchronous callback. + + +Overview +======== + +FS-Cache has an asynchronous operations handling facility that it uses for its +data storage and retrieval routines. Its operations are represented by +fscache_operation structs, though these are usually embedded into some other +structure. + +This facility is available to and expected to be used by the cache backends, +and FS-Cache will create operations and pass them off to the appropriate cache +backend for completion. + +To make use of this facility, <linux/fscache-cache.h> should be #included. 
+ + +Operation Record Initialisation +=============================== + +An operation is recorded in an fscache_operation struct:: + + struct fscache_operation { + union { + struct work_struct fast_work; + struct slow_work slow_work; + }; + unsigned long flags; + fscache_operation_processor_t processor; + ... + }; + +Someone wanting to issue an operation should allocate something with this +struct embedded in it. They should initialise it by calling:: + + void fscache_operation_init(struct fscache_operation *op, + fscache_operation_release_t release); + +with the operation to be initialised and the release function to use. + +The op->flags parameter should be set to indicate the CPU time provision and +the exclusivity (see the Parameters section). + +The op->fast_work, op->slow_work and op->processor flags should be set as +appropriate for the CPU time provision (see the Parameters section). + +FSCACHE_OP_WAITING may be set in op->flags prior to each submission of the +operation and waited for afterwards. + + +Parameters +========== + +There are a number of parameters that can be set in the operation record's flag +parameter. There are three options for the provision of CPU time in these +operations: + + (1) The operation may be done synchronously (FSCACHE_OP_MYTHREAD). A thread + may decide it wants to handle an operation itself without deferring it to + another thread. + + This is, for example, used in read operations for calling readpages() on + the backing filesystem in CacheFiles. Although readpages() does an + asynchronous data fetch, the determination of whether pages exist is done + synchronously - and the netfs does not proceed until this has been + determined. 
+ + If this option is to be used, FSCACHE_OP_WAITING must be set in op->flags + before submitting the operation, and the operating thread must wait for it + to be cleared before proceeding:: + + wait_on_bit(&op->flags, FSCACHE_OP_WAITING, + TASK_UNINTERRUPTIBLE); + + + (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it + will be given to keventd to process. Such an operation is not permitted + to sleep on I/O. + + This is, for example, used by CacheFiles to copy data from a backing fs + page to a netfs page after the backing fs has read the page in. + + If this option is used, op->fast_work and op->processor must be + initialised before submitting the operation:: + + INIT_WORK(&op->fast_work, do_some_work); + + + (3) The operation may be slow asynchronous (FSCACHE_OP_SLOW), in which case it + will be given to the slow work facility to process. Such an operation is + permitted to sleep on I/O. + + This is, for example, used by FS-Cache to handle background writes of + pages that have just been fetched from a remote server. + + If this option is used, op->slow_work and op->processor must be + initialised before submitting the operation:: + + fscache_operation_init_slow(op, processor) + + +Furthermore, operations may be one of two types: + + (1) Exclusive (FSCACHE_OP_EXCLUSIVE). Operations of this type may not run in + conjunction with any other operation on the object being operated upon. + + An example of this is the attribute change operation, in which the file + being written to may need truncation. + + (2) Shareable. Operations of this type may be running simultaneously. It's + up to the operation implementation to prevent interference between other + operations running at the same time. + + +Procedure +========= + +Operations are used through the following procedure: + + (1) The submitting thread must allocate the operation and initialise it + itself. 
Normally this would be part of a more specific structure with the + generic op embedded within. + + (2) The submitting thread must then submit the operation for processing using + one of the following two functions:: + + int fscache_submit_op(struct fscache_object *object, + struct fscache_operation *op); + + int fscache_submit_exclusive_op(struct fscache_object *object, + struct fscache_operation *op); + + The first function should be used to submit non-exclusive ops and the + second to submit exclusive ones. The caller must still set the + FSCACHE_OP_EXCLUSIVE flag. + + If successful, both functions will assign the operation to the specified + object and return 0. -ENOBUFS will be returned if the object specified is + permanently unavailable. + + The operation manager will defer operations on an object that is still + undergoing lookup or creation. The operation will also be deferred if an + operation of conflicting exclusivity is in progress on the object. + + If the operation is asynchronous, the manager will retain a reference to + it, so the caller should put their reference to it by passing it to:: + + void fscache_put_operation(struct fscache_operation *op); + + (3) If the submitting thread wants to do the work itself, and has marked the + operation with FSCACHE_OP_MYTHREAD, then it should monitor + FSCACHE_OP_WAITING as described above and check the state of the object if + necessary (the object might have died while the thread was waiting). + + When it has finished doing its processing, it should call + fscache_op_complete() and fscache_put_operation() on it. + + (4) The operation holds an effective lock upon the object, preventing other + exclusive ops conflicting until it is released. 
The operation can be + enqueued for further immediate asynchronous processing by adjusting the + CPU time provisioning option if necessary, eg:: + + op->flags &= ~FSCACHE_OP_TYPE; + op->flags |= ~FSCACHE_OP_FAST; + + and calling:: + + void fscache_enqueue_operation(struct fscache_operation *op) + + This can be used to allow other things to have use of the worker thread + pools. + + +Asynchronous Callback +===================== + +When used in asynchronous mode, the worker thread pool will invoke the +processor method with a pointer to the operation. This should then get at the +container struct by using container_of():: + + static void fscache_write_op(struct fscache_operation *_op) + { + struct fscache_storage *op = + container_of(_op, struct fscache_storage, op); + ... + } + +The caller holds a reference on the operation, and will invoke +fscache_put_operation() when the processor function returns. The processor +function is at liberty to call fscache_enqueue_operation() or to take extra +references. diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt deleted file mode 100644 index d8976c434718..000000000000 --- a/Documentation/filesystems/caching/operations.txt +++ /dev/null @@ -1,213 +0,0 @@ - ================================ - ASYNCHRONOUS OPERATIONS HANDLING - ================================ - -By: David Howells - -Contents: - - (*) Overview. - - (*) Operation record initialisation. - - (*) Parameters. - - (*) Procedure. - - (*) Asynchronous callback. - - -======== -OVERVIEW -======== - -FS-Cache has an asynchronous operations handling facility that it uses for its -data storage and retrieval routines. Its operations are represented by -fscache_operation structs, though these are usually embedded into some other -structure. 
- -This facility is available to and expected to be be used by the cache backends, -and FS-Cache will create operations and pass them off to the appropriate cache -backend for completion. - -To make use of this facility, should be #included. - - -=============================== -OPERATION RECORD INITIALISATION -=============================== - -An operation is recorded in an fscache_operation struct: - - struct fscache_operation { - union { - struct work_struct fast_work; - struct slow_work slow_work; - }; - unsigned long flags; - fscache_operation_processor_t processor; - ... - }; - -Someone wanting to issue an operation should allocate something with this -struct embedded in it. They should initialise it by calling: - - void fscache_operation_init(struct fscache_operation *op, - fscache_operation_release_t release); - -with the operation to be initialised and the release function to use. - -The op->flags parameter should be set to indicate the CPU time provision and -the exclusivity (see the Parameters section). - -The op->fast_work, op->slow_work and op->processor flags should be set as -appropriate for the CPU time provision (see the Parameters section). - -FSCACHE_OP_WAITING may be set in op->flags prior to each submission of the -operation and waited for afterwards. - - -========== -PARAMETERS -========== - -There are a number of parameters that can be set in the operation record's flag -parameter. There are three options for the provision of CPU time in these -operations: - - (1) The operation may be done synchronously (FSCACHE_OP_MYTHREAD). A thread - may decide it wants to handle an operation itself without deferring it to - another thread. - - This is, for example, used in read operations for calling readpages() on - the backing filesystem in CacheFiles. Although readpages() does an - asynchronous data fetch, the determination of whether pages exist is done - synchronously - and the netfs does not proceed until this has been - determined. 
- - If this option is to be used, FSCACHE_OP_WAITING must be set in op->flags - before submitting the operation, and the operating thread must wait for it - to be cleared before proceeding: - - wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - TASK_UNINTERRUPTIBLE); - - - (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it - will be given to keventd to process. Such an operation is not permitted - to sleep on I/O. - - This is, for example, used by CacheFiles to copy data from a backing fs - page to a netfs page after the backing fs has read the page in. - - If this option is used, op->fast_work and op->processor must be - initialised before submitting the operation: - - INIT_WORK(&op->fast_work, do_some_work); - - - (3) The operation may be slow asynchronous (FSCACHE_OP_SLOW), in which case it - will be given to the slow work facility to process. Such an operation is - permitted to sleep on I/O. - - This is, for example, used by FS-Cache to handle background writes of - pages that have just been fetched from a remote server. - - If this option is used, op->slow_work and op->processor must be - initialised before submitting the operation: - - fscache_operation_init_slow(op, processor) - - -Furthermore, operations may be one of two types: - - (1) Exclusive (FSCACHE_OP_EXCLUSIVE). Operations of this type may not run in - conjunction with any other operation on the object being operated upon. - - An example of this is the attribute change operation, in which the file - being written to may need truncation. - - (2) Shareable. Operations of this type may be running simultaneously. It's - up to the operation implementation to prevent interference between other - operations running at the same time. - - -========= -PROCEDURE -========= - -Operations are used through the following procedure: - - (1) The submitting thread must allocate the operation and initialise it - itself. 
Normally this would be part of a more specific structure with the - generic op embedded within. - - (2) The submitting thread must then submit the operation for processing using - one of the following two functions: - - int fscache_submit_op(struct fscache_object *object, - struct fscache_operation *op); - - int fscache_submit_exclusive_op(struct fscache_object *object, - struct fscache_operation *op); - - The first function should be used to submit non-exclusive ops and the - second to submit exclusive ones. The caller must still set the - FSCACHE_OP_EXCLUSIVE flag. - - If successful, both functions will assign the operation to the specified - object and return 0. -ENOBUFS will be returned if the object specified is - permanently unavailable. - - The operation manager will defer operations on an object that is still - undergoing lookup or creation. The operation will also be deferred if an - operation of conflicting exclusivity is in progress on the object. - - If the operation is asynchronous, the manager will retain a reference to - it, so the caller should put their reference to it by passing it to: - - void fscache_put_operation(struct fscache_operation *op); - - (3) If the submitting thread wants to do the work itself, and has marked the - operation with FSCACHE_OP_MYTHREAD, then it should monitor - FSCACHE_OP_WAITING as described above and check the state of the object if - necessary (the object might have died while the thread was waiting). - - When it has finished doing its processing, it should call - fscache_op_complete() and fscache_put_operation() on it. - - (4) The operation holds an effective lock upon the object, preventing other - exclusive ops conflicting until it is released. 
The operation can be - enqueued for further immediate asynchronous processing by adjusting the - CPU time provisioning option if necessary, eg: - - op->flags &= ~FSCACHE_OP_TYPE; - op->flags |= ~FSCACHE_OP_FAST; - - and calling: - - void fscache_enqueue_operation(struct fscache_operation *op) - - This can be used to allow other things to have use of the worker thread - pools. - - -===================== -ASYNCHRONOUS CALLBACK -===================== - -When used in asynchronous mode, the worker thread pool will invoke the -processor method with a pointer to the operation. This should then get at the -container struct by using container_of(): - - static void fscache_write_op(struct fscache_operation *_op) - { - struct fscache_storage *op = - container_of(_op, struct fscache_storage, op); - ... - } - -The caller holds a reference on the operation, and will invoke -fscache_put_operation() when the processor function returns. The processor -function is at liberty to call fscache_enqueue_operation() or to take extra -references. diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 1a22a55f75a0..4a5651d4904e 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/operations.txt + * See Documentation/filesystems/caching/operations.rst */ #define FSCACHE_DEBUG_LEVEL OPERATION -- cgit v1.2.3 From d74802ade7de6b6771ac87f7104f8b587bba37fa Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:57 +0200 Subject: docs: filesystems: caching/cachefiles.txt: convert to ReST - Add a SPDX header; - Adjust document title; - Mark literal blocks as such; - Add table markups; - Comment out text ToC for html/pdf output; - Add lists markups; - Add it to filesystems/caching/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/eec0cfc268e8dca348f760224685100c9c2caba6.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/caching/cachefiles.rst | 484 ++++++++++++++++++++++ Documentation/filesystems/caching/cachefiles.txt | 501 ----------------------- Documentation/filesystems/caching/index.rst | 1 + MAINTAINERS | 2 +- fs/cachefiles/Kconfig | 4 +- 5 files changed, 488 insertions(+), 504 deletions(-) create mode 100644 Documentation/filesystems/caching/cachefiles.rst delete mode 100644 Documentation/filesystems/caching/cachefiles.txt diff --git a/Documentation/filesystems/caching/cachefiles.rst b/Documentation/filesystems/caching/cachefiles.rst new file mode 100644 index 000000000000..65d3db476765 --- /dev/null +++ b/Documentation/filesystems/caching/cachefiles.rst @@ -0,0 +1,484 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM +=============================================== + +.. Contents: + + (*) Overview. + + (*) Requirements. + + (*) Configuration. + + (*) Starting the cache. + + (*) Things to avoid. + + (*) Cache culling. + + (*) Cache structure. + + (*) Security model and SELinux. + + (*) A note on security. + + (*) Statistical information. + + (*) Debugging. + + + +Overview +======== + +CacheFiles is a caching backend that's meant to use as a cache a directory on +an already mounted filesystem of a local type (such as Ext3). + +CacheFiles uses a userspace daemon to do some of the cache management - such as +reaping stale nodes and culling. This is called cachefilesd and lives in +/sbin. + +The filesystem and data integrity of the cache are only as good as those of the +filesystem providing the backing services. Note that CacheFiles does not +attempt to journal anything since the journalling interfaces of the various +filesystems are very specific in nature. 
+ +CacheFiles creates a misc character device - "/dev/cachefiles" - that is used +to communicate with the daemon. Only one thing may have this open at once, +and while it is open, a cache is at least partially in existence. The daemon +opens this and sends commands down it to control the cache. + +CacheFiles is currently limited to a single cache. + +CacheFiles attempts to maintain at least a certain percentage of free space on +the filesystem, shrinking the cache by culling the objects it contains to make +space if necessary - see the "Cache Culling" section. This means it can be +placed on the same medium as a live set of data, and will expand to make use of +spare space and automatically contract when the set of data requires more +space. + + + +Requirements +============ + +The use of CacheFiles and its daemon requires the following features to be +available in the system and in the cache filesystem: + + - dnotify. + + - extended attributes (xattrs). + + - openat() and friends. + + - bmap() support on files in the filesystem (FIBMAP ioctl). + + - The use of bmap() to detect a partial page at the end of the file. + +It is strongly recommended that the "dir_index" option is enabled on Ext3 +filesystems being used as a cache. + + +Configuration +============= + +The cache is configured by a script in /etc/cachefilesd.conf. These commands +set up cache ready for use. The following script commands are available: + + brun %, bcull %, bstop %, frun %, fcull %, fstop % + Configure the culling limits. Optional. See the section on culling + The defaults are 7% (run), 5% (cull) and 1% (stop) respectively. + + The commands beginning with a 'b' are file space (block) limits, those + beginning with an 'f' are file count limits. + + dir + Specify the directory containing the root of the cache. Mandatory. + + tag + Specify a tag to FS-Cache to use in distinguishing multiple caches. + Optional. The default is "CacheFiles". 
+ + debug + Specify a numeric bitmask to control debugging in the kernel module. + Optional. The default is zero (all off). The following values can be + OR'd into the mask to collect various information: + + == ================================================= + 1 Turn on trace of function entry (_enter() macros) + 2 Turn on trace of function exit (_leave() macros) + 4 Turn on trace of internal debug points (_debug()) + == ================================================= + + This mask can also be set through sysfs, eg:: + + echo 5 >/sys/modules/cachefiles/parameters/debug + + +Starting the Cache +================== + +The cache is started by running the daemon. The daemon opens the cache device, +configures the cache and tells it to begin caching. At that point the cache +binds to fscache and the cache becomes live. + +The daemon is run as follows:: + + /sbin/cachefilesd [-d]* [-s] [-n] [-f ] + +The flags are: + + ``-d`` + Increase the debugging level. This can be specified multiple times and + is cumulative with itself. + + ``-s`` + Send messages to stderr instead of syslog. + + ``-n`` + Don't daemonise and go into background. + + ``-f `` + Use an alternative configuration file rather than the default one. + + +Things to Avoid +=============== + +Do not mount other things within the cache as this will cause problems. The +kernel module contains its own very cut-down path walking facility that ignores +mountpoints, but the daemon can't avoid them. + +Do not create, rename or unlink files and directories in the cache while the +cache is active, as this may cause the state to become uncertain. + +Renaming files in the cache might make objects appear to be other objects (the +filename is part of the lookup key). + +Do not change or remove the extended attributes attached to cache files by the +cache as this will cause the cache state management to get confused. + +Do not create files or directories in the cache, lest the cache get confused or +serve incorrect data. 
+ +Do not chmod files in the cache. The module creates things with minimal +permissions to prevent random users being able to access them directly. + + +Cache Culling +============= + +The cache may need culling occasionally to make space. This involves +discarding objects from the cache that have been used less recently than +anything else. Culling is based on the access time of data objects. Empty +directories are culled if not in use. + +Cache culling is done on the basis of the percentage of blocks and the +percentage of files available in the underlying filesystem. There are six +"limits": + + brun, frun + If the amount of free space and the number of available files in the cache + rises above both these limits, then culling is turned off. + + bcull, fcull + If the amount of available space or the number of available files in the + cache falls below either of these limits, then culling is started. + + bstop, fstop + If the amount of available space or the number of available files in the + cache falls below either of these limits, then no further allocation of + disk space or files is permitted until culling has raised things above + these limits again. + +These must be configured thusly:: + + 0 <= bstop < bcull < brun < 100 + 0 <= fstop < fcull < frun < 100 + +Note that these are percentages of available space and available files, and do +_not_ appear as 100 minus the percentage displayed by the "df" program. + +The userspace daemon scans the cache to build up a table of cullable objects. +These are then culled in least recently used order. A new scan of the cache is +started as soon as space is made in the table. Objects will be skipped if +their atimes have changed or if the kernel module says it is still using them. + + +Cache Structure +=============== + +The CacheFiles module will create two directories in the directory it was +given: + + * cache/ + * graveyard/ + +The active cache objects all reside in the first directory. 
The CacheFiles +kernel module moves any retired or culled objects that it can't simply unlink +to the graveyard from which the daemon will actually delete them. + +The daemon uses dnotify to monitor the graveyard directory, and will delete +anything that appears therein. + + +The module represents index objects as directories with the filename "I..." or +"J...". Note that the "cache/" directory is itself a special index. + +Data objects are represented as files if they have no children, or directories +if they do. Their filenames all begin "D..." or "E...". If represented as a +directory, data objects will have a file in the directory called "data" that +actually holds the data. + +Special objects are similar to data objects, except their filenames begin +"S..." or "T...". + + +If an object has children, then it will be represented as a directory. +Immediately in the representative directory are a collection of directories +named for hash values of the child object keys with an '@' prepended. Into +this directory, if possible, will be placed the representations of the child +objects:: + + /INDEX /INDEX /INDEX /DATA FILES + /=========/==========/=================================/================ + cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400 + cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...DB1ry + cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...N22ry + cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...FP1ry + + +If the key is so long that it exceeds NAME_MAX with the decorations added on to +it, then it will be cut into pieces, the first few of which will be used to +make a nest of directories, and the last one of which will be the objects +inside the last directory. 
The names of the intermediate directories will have +'+' prepended:: + + J1223/@23/+xy...z/+kl...m/Epqr + + +Note that keys are raw data, and not only may they exceed NAME_MAX in size, +they may also contain things like '/' and NUL characters, and so they may not +be suitable for turning directly into a filename. + +To handle this, CacheFiles will use a suitably printable filename directly and +"base-64" encode ones that aren't directly suitable. The two versions of +object filenames indicate the encoding: + + =============== =============== =============== + OBJECT TYPE PRINTABLE ENCODED + =============== =============== =============== + Index "I..." "J..." + Data "D..." "E..." + Special "S..." "T..." + =============== =============== =============== + +Intermediate directories are always "@" or "+" as appropriate. + + +Each object in the cache has an extended attribute label that holds the object +type ID (required to distinguish special objects) and the auxiliary data from +the netfs. The latter is used to detect stale objects in the cache and update +or retire them. + + +Note that CacheFiles will erase from the cache any file it doesn't recognise or +any file of an incorrect type (such as a FIFO file or a device file). + + +Security Model and SELinux +========================== + +CacheFiles is implemented to deal properly with the LSM security features of +the Linux kernel and the SELinux facility. + +One of the problems that CacheFiles faces is that it is generally acting on +behalf of a process, and running in that process's context, and that includes a +security context that is not appropriate for accessing the cache - either +because the files in the cache are inaccessible to that process, or because if +the process creates a file in the cache, that file may be inaccessible to other +processes. 
+ +The way CacheFiles works is to temporarily change the security context (fsuid, +fsgid and actor security label) that the process acts as - without changing the +security context of the process when it is the target of an operation performed by +some other process (so signalling and suchlike still work correctly). + + +When the CacheFiles module is asked to bind to its cache, it: + + (1) Finds the security label attached to the root cache directory and uses + that as the security label with which it will create files. By default, + this is:: + + cachefiles_var_t + + (2) Finds the security label of the process which issued the bind request + (presumed to be the cachefilesd daemon), which by default will be:: + + cachefilesd_t + + and asks LSM to supply a security ID as which it should act given the + daemon's label. By default, this will be:: + + cachefiles_kernel_t + + SELinux transitions the daemon's security ID to the module's security ID + based on a rule of this form in the policy:: + + type_transition kernel_t : process ; + + For instance:: + + type_transition cachefilesd_t kernel_t : process cachefiles_kernel_t; + + +The module's security ID gives it permission to create, move and remove files +and directories in the cache, to find and access directories and files in the +cache, to set and access extended attributes on cache objects, and to read and +write files in the cache. + +The daemon's security ID gives it only a very restricted set of permissions: it +may scan directories, stat files and erase files and directories. It may +not read or write files in the cache, and so it is precluded from accessing the +data cached therein; nor is it permitted to create new files in the cache. + + +There are policy source files available in: + + http://people.redhat.com/~dhowells/fscache/cachefilesd-0.8.tar.bz2 + +and later versions. In that tarball, see the files:: + + cachefilesd.te + cachefilesd.fc + cachefilesd.if + +They are built and installed directly by the RPM. 
+ +If a non-RPM based system is being used, then copy the above files to their own +directory and run:: + + make -f /usr/share/selinux/devel/Makefile + semodule -i cachefilesd.pp + +You will need checkpolicy and selinux-policy-devel installed prior to the +build. + + +By default, the cache is located in /var/fscache, but if it is desirable that +it should be elsewhere, then either the above policy files must be altered, or +an auxiliary policy must be installed to label the alternate location of the +cache. + +For instructions on how to add an auxiliary policy to enable the cache to be +located elsewhere when SELinux is in enforcing mode, please see:: + + /usr/share/doc/cachefilesd-*/move-cache.txt + +When the cachefilesd rpm is installed; alternatively, the document can be found +in the sources. + + +A Note on Security +================== + +CacheFiles makes use of the split security in the task_struct. It allocates +its own task_security structure, and redirects current->cred to point to it +when it acts on behalf of another process, in that process's context. + +The reason it does this is that it calls vfs_mkdir() and suchlike rather than +bypassing security and calling inode ops directly. Therefore the VFS and LSM +may deny the CacheFiles access to the cache data because under some +circumstances the caching code is running in the security context of whatever +process issued the original syscall on the netfs. + +Furthermore, should CacheFiles create a file or directory, the security +parameters with which that object is created (UID, GID, security label) would be +derived from that process that issued the system call, thus potentially +preventing other processes from accessing the cache - including CacheFiles's +cache management daemon (cachefilesd). + +What is required is to temporarily override the security of the process that +issued the system call. 
We can't, however, just do an in-place change of the +security data as that affects the process as an object, not just as a subject. +This means it may lose signals or ptrace events for example, and affects what +the process looks like in /proc. + +So CacheFiles makes use of a logical split in the security between the +objective security (task->real_cred) and the subjective security (task->cred). +The objective security holds the intrinsic security properties of a process and +is never overridden. This is what appears in /proc, and is what is used when a +process is the target of an operation by some other process (SIGKILL for +example). + +The subjective security holds the active security properties of a process, and +may be overridden. This is not seen externally, and is used when a process +acts upon another object, for example SIGKILLing another process or opening a +file. + +LSM hooks exist that allow SELinux (or Smack or whatever) to reject a request +for CacheFiles to run in a context of a specific security label, or to create +files and directories with another security label. + + +Statistical Information +======================= + +If FS-Cache is compiled with the following option enabled:: + + CONFIG_CACHEFILES_HISTOGRAM=y + +then it will gather certain statistics and display them through a proc file. + + /proc/fs/cachefiles/histogram + + :: + + cat /proc/fs/cachefiles/histogram + JIFS SECS LOOKUPS MKDIRS CREATES + ===== ===== ========= ========= ========= + + This shows the breakdown of the number of times each amount of time + between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. 
The + columns are as follows: + + ======= ======================================================= + COLUMN TIME MEASUREMENT + ======= ======================================================= + LOOKUPS Length of time to perform a lookup on the backing fs + MKDIRS Length of time to perform a mkdir on the backing fs + CREATES Length of time to perform a create on the backing fs + ======= ======================================================= + + Each row shows the number of events that took a particular range of times. + Each step is 1 jiffy in size. The JIFS column indicates the particular + jiffy range covered, and the SECS field the equivalent number of seconds. + + +Debugging +========= + +If CONFIG_CACHEFILES_DEBUG is enabled, the CacheFiles facility can have runtime +debugging enabled by adjusting the value in:: + + /sys/module/cachefiles/parameters/debug + +This is a bitmask of debugging streams to enable: + + ======= ======= =============================== ======================= + BIT VALUE STREAM POINT + ======= ======= =============================== ======================= + 0 1 General Function entry trace + 1 2 Function exit trace + 2 4 General + ======= ======= =============================== ======================= + +The appropriate set of values should be OR'd together and the result written to +the control file. For example:: + + echo $((1|4|8)) >/sys/module/cachefiles/parameters/debug + +will turn on all function entry debugging. diff --git a/Documentation/filesystems/caching/cachefiles.txt b/Documentation/filesystems/caching/cachefiles.txt deleted file mode 100644 index 28aefcbb1442..000000000000 --- a/Documentation/filesystems/caching/cachefiles.txt +++ /dev/null @@ -1,501 +0,0 @@ - =============================================== - CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM - =============================================== - -Contents: - - (*) Overview. - - (*) Requirements. - - (*) Configuration. - - (*) Starting the cache. 
- - (*) Things to avoid. - - (*) Cache culling. - - (*) Cache structure. - - (*) Security model and SELinux. - - (*) A note on security. - - (*) Statistical information. - - (*) Debugging. - - -======== -OVERVIEW -======== - -CacheFiles is a caching backend that's meant to use as a cache a directory on -an already mounted filesystem of a local type (such as Ext3). - -CacheFiles uses a userspace daemon to do some of the cache management - such as -reaping stale nodes and culling. This is called cachefilesd and lives in -/sbin. - -The filesystem and data integrity of the cache are only as good as those of the -filesystem providing the backing services. Note that CacheFiles does not -attempt to journal anything since the journalling interfaces of the various -filesystems are very specific in nature. - -CacheFiles creates a misc character device - "/dev/cachefiles" - that is used -to communication with the daemon. Only one thing may have this open at once, -and while it is open, a cache is at least partially in existence. The daemon -opens this and sends commands down it to control the cache. - -CacheFiles is currently limited to a single cache. - -CacheFiles attempts to maintain at least a certain percentage of free space on -the filesystem, shrinking the cache by culling the objects it contains to make -space if necessary - see the "Cache Culling" section. This means it can be -placed on the same medium as a live set of data, and will expand to make use of -spare space and automatically contract when the set of data requires more -space. - - -============ -REQUIREMENTS -============ - -The use of CacheFiles and its daemon requires the following features to be -available in the system and in the cache filesystem: - - - dnotify. - - - extended attributes (xattrs). - - - openat() and friends. - - - bmap() support on files in the filesystem (FIBMAP ioctl). - - - The use of bmap() to detect a partial page at the end of the file. 
- -It is strongly recommended that the "dir_index" option is enabled on Ext3 -filesystems being used as a cache. - - -============= -CONFIGURATION -============= - -The cache is configured by a script in /etc/cachefilesd.conf. These commands -set up cache ready for use. The following script commands are available: - - (*) brun % - (*) bcull % - (*) bstop % - (*) frun % - (*) fcull % - (*) fstop % - - Configure the culling limits. Optional. See the section on culling - The defaults are 7% (run), 5% (cull) and 1% (stop) respectively. - - The commands beginning with a 'b' are file space (block) limits, those - beginning with an 'f' are file count limits. - - (*) dir - - Specify the directory containing the root of the cache. Mandatory. - - (*) tag - - Specify a tag to FS-Cache to use in distinguishing multiple caches. - Optional. The default is "CacheFiles". - - (*) debug - - Specify a numeric bitmask to control debugging in the kernel module. - Optional. The default is zero (all off). The following values can be - OR'd into the mask to collect various information: - - 1 Turn on trace of function entry (_enter() macros) - 2 Turn on trace of function exit (_leave() macros) - 4 Turn on trace of internal debug points (_debug()) - - This mask can also be set through sysfs, eg: - - echo 5 >/sys/modules/cachefiles/parameters/debug - - -================== -STARTING THE CACHE -================== - -The cache is started by running the daemon. The daemon opens the cache device, -configures the cache and tells it to begin caching. At that point the cache -binds to fscache and the cache becomes live. - -The daemon is run as follows: - - /sbin/cachefilesd [-d]* [-s] [-n] [-f ] - -The flags are: - - (*) -d - - Increase the debugging level. This can be specified multiple times and - is cumulative with itself. - - (*) -s - - Send messages to stderr instead of syslog. - - (*) -n - - Don't daemonise and go into background. 
- - (*) -f - - Use an alternative configuration file rather than the default one. - - -=============== -THINGS TO AVOID -=============== - -Do not mount other things within the cache as this will cause problems. The -kernel module contains its own very cut-down path walking facility that ignores -mountpoints, but the daemon can't avoid them. - -Do not create, rename or unlink files and directories in the cache while the -cache is active, as this may cause the state to become uncertain. - -Renaming files in the cache might make objects appear to be other objects (the -filename is part of the lookup key). - -Do not change or remove the extended attributes attached to cache files by the -cache as this will cause the cache state management to get confused. - -Do not create files or directories in the cache, lest the cache get confused or -serve incorrect data. - -Do not chmod files in the cache. The module creates things with minimal -permissions to prevent random users being able to access them directly. - - -============= -CACHE CULLING -============= - -The cache may need culling occasionally to make space. This involves -discarding objects from the cache that have been used less recently than -anything else. Culling is based on the access time of data objects. Empty -directories are culled if not in use. - -Cache culling is done on the basis of the percentage of blocks and the -percentage of files available in the underlying filesystem. There are six -"limits": - - (*) brun - (*) frun - - If the amount of free space and the number of available files in the cache - rises above both these limits, then culling is turned off. - - (*) bcull - (*) fcull - - If the amount of available space or the number of available files in the - cache falls below either of these limits, then culling is started. 
- - (*) bstop - (*) fstop - - If the amount of available space or the number of available files in the - cache falls below either of these limits, then no further allocation of - disk space or files is permitted until culling has raised things above - these limits again. - -These must be configured thusly: - - 0 <= bstop < bcull < brun < 100 - 0 <= fstop < fcull < frun < 100 - -Note that these are percentages of available space and available files, and do -_not_ appear as 100 minus the percentage displayed by the "df" program. - -The userspace daemon scans the cache to build up a table of cullable objects. -These are then culled in least recently used order. A new scan of the cache is -started as soon as space is made in the table. Objects will be skipped if -their atimes have changed or if the kernel module says it is still using them. - - -=============== -CACHE STRUCTURE -=============== - -The CacheFiles module will create two directories in the directory it was -given: - - (*) cache/ - - (*) graveyard/ - -The active cache objects all reside in the first directory. The CacheFiles -kernel module moves any retired or culled objects that it can't simply unlink -to the graveyard from which the daemon will actually delete them. - -The daemon uses dnotify to monitor the graveyard directory, and will delete -anything that appears therein. - - -The module represents index objects as directories with the filename "I..." or -"J...". Note that the "cache/" directory is itself a special index. - -Data objects are represented as files if they have no children, or directories -if they do. Their filenames all begin "D..." or "E...". If represented as a -directory, data objects will have a file in the directory called "data" that -actually holds the data. - -Special objects are similar to data objects, except their filenames begin -"S..." or "T...". - - -If an object has children, then it will be represented as a directory. 
-Immediately in the representative directory are a collection of directories -named for hash values of the child object keys with an '@' prepended. Into -this directory, if possible, will be placed the representations of the child -objects: - - INDEX INDEX INDEX DATA FILES - ========= ========== ================================= ================ - cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400 - cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...DB1ry - cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...N22ry - cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...FP1ry - - -If the key is so long that it exceeds NAME_MAX with the decorations added on to -it, then it will be cut into pieces, the first few of which will be used to -make a nest of directories, and the last one of which will be the objects -inside the last directory. The names of the intermediate directories will have -'+' prepended: - - J1223/@23/+xy...z/+kl...m/Epqr - - -Note that keys are raw data, and not only may they exceed NAME_MAX in size, -they may also contain things like '/' and NUL characters, and so they may not -be suitable for turning directly into a filename. - -To handle this, CacheFiles will use a suitably printable filename directly and -"base-64" encode ones that aren't directly suitable. The two versions of -object filenames indicate the encoding: - - OBJECT TYPE PRINTABLE ENCODED - =============== =============== =============== - Index "I..." "J..." - Data "D..." "E..." - Special "S..." "T..." - -Intermediate directories are always "@" or "+" as appropriate. - - -Each object in the cache has an extended attribute label that holds the object -type ID (required to distinguish special objects) and the auxiliary data from -the netfs. The latter is used to detect stale objects in the cache and update -or retire them. 
- - -Note that CacheFiles will erase from the cache any file it doesn't recognise or -any file of an incorrect type (such as a FIFO file or a device file). - - -========================== -SECURITY MODEL AND SELINUX -========================== - -CacheFiles is implemented to deal properly with the LSM security features of -the Linux kernel and the SELinux facility. - -One of the problems that CacheFiles faces is that it is generally acting on -behalf of a process, and running in that process's context, and that includes a -security context that is not appropriate for accessing the cache - either -because the files in the cache are inaccessible to that process, or because if -the process creates a file in the cache, that file may be inaccessible to other -processes. - -The way CacheFiles works is to temporarily change the security context (fsuid, -fsgid and actor security label) that the process acts as - without changing the -security context of the process when it the target of an operation performed by -some other process (so signalling and suchlike still work correctly). - - -When the CacheFiles module is asked to bind to its cache, it: - - (1) Finds the security label attached to the root cache directory and uses - that as the security label with which it will create files. By default, - this is: - - cachefiles_var_t - - (2) Finds the security label of the process which issued the bind request - (presumed to be the cachefilesd daemon), which by default will be: - - cachefilesd_t - - and asks LSM to supply a security ID as which it should act given the - daemon's label. By default, this will be: - - cachefiles_kernel_t - - SELinux transitions the daemon's security ID to the module's security ID - based on a rule of this form in the policy. 
- - type_transition kernel_t : process ; - - For instance: - - type_transition cachefilesd_t kernel_t : process cachefiles_kernel_t; - - -The module's security ID gives it permission to create, move and remove files -and directories in the cache, to find and access directories and files in the -cache, to set and access extended attributes on cache objects, and to read and -write files in the cache. - -The daemon's security ID gives it only a very restricted set of permissions: it -may scan directories, stat files and erase files and directories. It may -not read or write files in the cache, and so it is precluded from accessing the -data cached therein; nor is it permitted to create new files in the cache. - - -There are policy source files available in: - - http://people.redhat.com/~dhowells/fscache/cachefilesd-0.8.tar.bz2 - -and later versions. In that tarball, see the files: - - cachefilesd.te - cachefilesd.fc - cachefilesd.if - -They are built and installed directly by the RPM. - -If a non-RPM based system is being used, then copy the above files to their own -directory and run: - - make -f /usr/share/selinux/devel/Makefile - semodule -i cachefilesd.pp - -You will need checkpolicy and selinux-policy-devel installed prior to the -build. - - -By default, the cache is located in /var/fscache, but if it is desirable that -it should be elsewhere, than either the above policy files must be altered, or -an auxiliary policy must be installed to label the alternate location of the -cache. - -For instructions on how to add an auxiliary policy to enable the cache to be -located elsewhere when SELinux is in enforcing mode, please see: - - /usr/share/doc/cachefilesd-*/move-cache.txt - -When the cachefilesd rpm is installed; alternatively, the document can be found -in the sources. - - -================== -A NOTE ON SECURITY -================== - -CacheFiles makes use of the split security in the task_struct. 
It allocates -its own task_security structure, and redirects current->cred to point to it -when it acts on behalf of another process, in that process's context. - -The reason it does this is that it calls vfs_mkdir() and suchlike rather than -bypassing security and calling inode ops directly. Therefore the VFS and LSM -may deny the CacheFiles access to the cache data because under some -circumstances the caching code is running in the security context of whatever -process issued the original syscall on the netfs. - -Furthermore, should CacheFiles create a file or directory, the security -parameters with that object is created (UID, GID, security label) would be -derived from that process that issued the system call, thus potentially -preventing other processes from accessing the cache - including CacheFiles's -cache management daemon (cachefilesd). - -What is required is to temporarily override the security of the process that -issued the system call. We can't, however, just do an in-place change of the -security data as that affects the process as an object, not just as a subject. -This means it may lose signals or ptrace events for example, and affects what -the process looks like in /proc. - -So CacheFiles makes use of a logical split in the security between the -objective security (task->real_cred) and the subjective security (task->cred). -The objective security holds the intrinsic security properties of a process and -is never overridden. This is what appears in /proc, and is what is used when a -process is the target of an operation by some other process (SIGKILL for -example). - -The subjective security holds the active security properties of a process, and -may be overridden. This is not seen externally, and is used whan a process -acts upon another object, for example SIGKILLing another process or opening a -file. 
- -LSM hooks exist that allow SELinux (or Smack or whatever) to reject a request -for CacheFiles to run in a context of a specific security label, or to create -files and directories with another security label. - - -======================= -STATISTICAL INFORMATION -======================= - -If FS-Cache is compiled with the following option enabled: - - CONFIG_CACHEFILES_HISTOGRAM=y - -then it will gather certain statistics and display them through a proc file. - - (*) /proc/fs/cachefiles/histogram - - cat /proc/fs/cachefiles/histogram - JIFS SECS LOOKUPS MKDIRS CREATES - ===== ===== ========= ========= ========= - - This shows the breakdown of the number of times each amount of time - between 0 jiffies and HZ-1 jiffies a variety of tasks took to run. The - columns are as follows: - - COLUMN TIME MEASUREMENT - ======= ======================================================= - LOOKUPS Length of time to perform a lookup on the backing fs - MKDIRS Length of time to perform a mkdir on the backing fs - CREATES Length of time to perform a create on the backing fs - - Each row shows the number of events that took a particular range of times. - Each step is 1 jiffy in size. The JIFS column indicates the particular - jiffy range covered, and the SECS field the equivalent number of seconds. - - -========= -DEBUGGING -========= - -If CONFIG_CACHEFILES_DEBUG is enabled, the CacheFiles facility can have runtime -debugging enabled by adjusting the value in: - - /sys/module/cachefiles/parameters/debug - -This is a bitmask of debugging streams to enable: - - BIT VALUE STREAM POINT - ======= ======= =============================== ======================= - 0 1 General Function entry trace - 1 2 Function exit trace - 2 4 General - -The appropriate set of values should be OR'd together and the result written to -the control file. For example: - - echo $((1|4|8)) >/sys/module/cachefiles/parameters/debug - -will turn on all function entry debugging. 
diff --git a/Documentation/filesystems/caching/index.rst b/Documentation/filesystems/caching/index.rst index 75492b7c8ea0..a2cf35f89e28 100644 --- a/Documentation/filesystems/caching/index.rst +++ b/Documentation/filesystems/caching/index.rst @@ -8,5 +8,6 @@ Filesystem Caching fscache object + cachefiles netfs-api operations diff --git a/MAINTAINERS b/MAINTAINERS index 82e4b0a0c921..a103117dec85 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3731,7 +3731,7 @@ CACHEFILES: FS-CACHE BACKEND FOR CACHING ON MOUNTED FILESYSTEMS M: David Howells L: linux-cachefs@redhat.com (moderated for non-subscribers) S: Supported -F: Documentation/filesystems/caching/cachefiles.txt +F: Documentation/filesystems/caching/cachefiles.rst F: fs/cachefiles/ CADENCE MIPI-CSI2 BRIDGES diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index ae559ed5b3b3..ff9ca55a9ae9 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -8,7 +8,7 @@ config CACHEFILES filesystems - primarily networking filesystems - thus allowing fast local disk to enhance the speed of slower devices. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. config CACHEFILES_DEBUG @@ -36,5 +36,5 @@ config CACHEFILES_HISTOGRAM bouncing between CPUs. On the other hand, the histogram may be useful for debugging purposes. Saying 'N' here is recommended. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. -- cgit v1.2.3 From 0e822145b564204cd5e9dd67a7fd37d4a7b8253b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:58 +0200 Subject: docs: filesystems: caching/backend-api.txt: convert it to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add table markups; - Add it to filesystems/caching/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/5d0a61abaa87bfe913b9e2f321e74ef7af0f3dfc.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/caching/backend-api.rst | 727 ++++++++++++++++++++++ Documentation/filesystems/caching/backend-api.txt | 726 --------------------- Documentation/filesystems/caching/fscache.rst | 2 +- Documentation/filesystems/caching/index.rst | 1 + fs/fscache/cache.c | 8 +- fs/fscache/object.c | 2 +- include/linux/fscache-cache.h | 4 +- 7 files changed, 736 insertions(+), 734 deletions(-) create mode 100644 Documentation/filesystems/caching/backend-api.rst delete mode 100644 Documentation/filesystems/caching/backend-api.txt diff --git a/Documentation/filesystems/caching/backend-api.rst b/Documentation/filesystems/caching/backend-api.rst new file mode 100644 index 000000000000..19fbf6b9aa36 --- /dev/null +++ b/Documentation/filesystems/caching/backend-api.rst @@ -0,0 +1,727 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +FS-Cache Cache backend API +========================== + +The FS-Cache system provides an API by which actual caches can be supplied to +FS-Cache for it to then serve out to network filesystems and other interested +parties. + +This API is declared in . + + +Initialising and Registering a Cache +==================================== + +To start off, a cache definition must be initialised and registered for each +cache the backend wants to make available. For instance, CacheFS does this in +the fill_super() operation on mounting. 
+ +The cache definition (struct fscache_cache) should be initialised by calling:: + + void fscache_init_cache(struct fscache_cache *cache, + struct fscache_cache_ops *ops, + const char *idfmt, + ...); + +Where: + + * "cache" is a pointer to the cache definition; + + * "ops" is a pointer to the table of operations that the backend supports on + this cache; and + + * "idfmt" is a format and printf-style arguments for constructing a label + for the cache. + + +The cache should then be registered with FS-Cache by passing a pointer to the +previously initialised cache definition to:: + + int fscache_add_cache(struct fscache_cache *cache, + struct fscache_object *fsdef, + const char *tagname); + +Two extra arguments should also be supplied: + + * "fsdef" which should point to the object representation for the FS-Cache + master index in this cache. Netfs primary index entries will be created + here. FS-Cache keeps the caller's reference to the index object if + successful and will release it upon withdrawal of the cache. + + * "tagname" which, if given, should be a text string naming this cache. If + this is NULL, the identifier will be used instead. For CacheFS, the + identifier is set to name the underlying block device and the tag can be + supplied by mount. + +This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag +is already in use. 0 will be returned on success. + + +Unregistering a Cache +===================== + +A cache can be withdrawn from the system by calling this function with a +pointer to the cache definition:: + + void fscache_withdraw_cache(struct fscache_cache *cache); + +In CacheFS's case, this is called by put_super(). + + +Security +======== + +The cache methods are executed one of two contexts: + + (1) that of the userspace process that issued the netfs operation that caused + the cache method to be invoked, or + + (2) that of one of the processes in the FS-Cache thread pool. 
+ +In either case, this may not be an appropriate context in which to access the +cache. + +The calling process's fsuid, fsgid and SELinux security identities may need to +be masqueraded for the duration of the cache driver's access to the cache. +This is left to the cache to handle; FS-Cache makes no effort in this regard. + + +Control and Statistics Presentation +=================================== + +The cache may present data to the outside world through FS-Cache's interfaces +in sysfs and procfs - the former for control and the latter for statistics. + +A sysfs directory called /sys/fs/fscache// is created if CONFIG_SYSFS +is enabled. This is accessible through the kobject struct fscache_cache::kobj +and is for use by the cache as it sees fit. + + +Relevant Data Structures +======================== + + * Index/Data file FS-Cache representation cookie:: + + struct fscache_cookie { + struct fscache_object_def *def; + struct fscache_netfs *netfs; + void *netfs_data; + ... + }; + + The fields that might be of use to the backend describe the object + definition, the netfs definition and the netfs's data for this cookie. + The object definition contain functions supplied by the netfs for loading + and matching index entries; these are required to provide some of the + cache operations. + + + * In-cache object representation:: + + struct fscache_object { + int debug_id; + enum { + FSCACHE_OBJECT_RECYCLING, + ... + } state; + spinlock_t lock + struct fscache_cache *cache; + struct fscache_cookie *cookie; + ... + }; + + Structures of this type should be allocated by the cache backend and + passed to FS-Cache when requested by the appropriate cache operation. In + the case of CacheFS, they're embedded in CacheFS's internal object + structures. + + The debug_id is a simple integer that can be used in debugging messages + that refer to a particular object. In such a case it should be printed + using "OBJ%x" to be consistent with FS-Cache. 
+ + Each object contains a pointer to the cookie that represents the object it + is backing. An object should retired when put_object() is called if it is + in state FSCACHE_OBJECT_RECYCLING. The fscache_object struct should be + initialised by calling fscache_object_init(object). + + + * FS-Cache operation record:: + + struct fscache_operation { + atomic_t usage; + struct fscache_object *object; + unsigned long flags; + #define FSCACHE_OP_EXCLUSIVE + void (*processor)(struct fscache_operation *op); + void (*release)(struct fscache_operation *op); + ... + }; + + FS-Cache has a pool of threads that it uses to give CPU time to the + various asynchronous operations that need to be done as part of driving + the cache. These are represented by the above structure. The processor + method is called to give the op CPU time, and the release method to get + rid of it when its usage count reaches 0. + + An operation can be made exclusive upon an object by setting the + appropriate flag before enqueuing it with fscache_enqueue_operation(). If + an operation needs more processing time, it should be enqueued again. + + + * FS-Cache retrieval operation record:: + + struct fscache_retrieval { + struct fscache_operation op; + struct address_space *mapping; + struct list_head *to_do; + ... + }; + + A structure of this type is allocated by FS-Cache to record retrieval and + allocation requests made by the netfs. This struct is then passed to the + backend to do the operation. The backend may get extra refs to it by + calling fscache_get_retrieval() and refs may be discarded by calling + fscache_put_retrieval(). + + A retrieval operation can be used by the backend to do retrieval work. To + do this, the retrieval->op.processor method pointer should be set + appropriately by the backend and fscache_enqueue_retrieval() called to + submit it to the thread pool. CacheFiles, for example, uses this to queue + page examination when it detects PG_lock being cleared. 
+ + The to_do field is an empty list available for the cache backend to use as + it sees fit. + + + * FS-Cache storage operation record:: + + struct fscache_storage { + struct fscache_operation op; + pgoff_t store_limit; + ... + }; + + A structure of this type is allocated by FS-Cache to record outstanding + writes to be made. FS-Cache itself enqueues this operation and invokes + the write_page() method on the object at appropriate times to effect + storage. + + +Cache Operations +================ + +The cache backend provides FS-Cache with a table of operations that can be +performed on the denizens of the cache. These are held in a structure of type: + + :: + + struct fscache_cache_ops + + * Name of cache provider [mandatory]:: + + const char *name + + This isn't strictly an operation, but should be pointed at a string naming + the backend. + + + * Allocate a new object [mandatory]:: + + struct fscache_object *(*alloc_object)(struct fscache_cache *cache, + struct fscache_cookie *cookie) + + This method is used to allocate a cache object representation to back a + cookie in a particular cache. fscache_object_init() should be called on + the object to initialise it prior to returning. + + This function may also be used to parse the index key to be used for + multiple lookup calls to turn it into a more convenient form. FS-Cache + will call the lookup_complete() method to allow the cache to release the + form once lookup is complete or aborted. + + + * Look up and create object [mandatory]:: + + void (*lookup_object)(struct fscache_object *object) + + This method is used to look up an object, given that the object is already + allocated and attached to the cookie. This should instantiate that object + in the cache if it can. + + The method should call fscache_object_lookup_negative() as soon as + possible if it determines the object doesn't exist in the cache. 
If the + object is found to exist and the netfs indicates that it is valid then + fscache_obtained_object() should be called once the object is in a + position to have data stored in it. Similarly, fscache_obtained_object() + should also be called once a non-present object has been created. + + If a lookup error occurs, fscache_object_lookup_error() should be called + to abort the lookup of that object. + + + * Release lookup data [mandatory]:: + + void (*lookup_complete)(struct fscache_object *object) + + This method is called to ask the cache to release any resources it was + using to perform a lookup. + + + * Increment object refcount [mandatory]:: + + struct fscache_object *(*grab_object)(struct fscache_object *object) + + This method is called to increment the reference count on an object. It + may fail (for instance if the cache is being withdrawn) by returning NULL. + It should return the object pointer if successful. + + + * Lock/Unlock object [mandatory]:: + + void (*lock_object)(struct fscache_object *object) + void (*unlock_object)(struct fscache_object *object) + + These methods are used to exclusively lock an object. It must be possible + to schedule with the lock held, so a spinlock isn't sufficient. + + + * Pin/Unpin object [optional]:: + + int (*pin_object)(struct fscache_object *object) + void (*unpin_object)(struct fscache_object *object) + + These methods are used to pin an object into the cache. Once pinned an + object cannot be reclaimed to make space. Return -ENOSPC if there's not + enough space in the cache to permit this. + + + * Check coherency state of an object [mandatory]:: + + int (*check_consistency)(struct fscache_object *object) + + This method is called to have the cache check the saved auxiliary data of + the object against the netfs's idea of the state. 0 should be returned + if they're consistent and -ESTALE otherwise. -ENOMEM and -ERESTARTSYS + may also be returned. 
+ + * Update object [mandatory]:: + + int (*update_object)(struct fscache_object *object) + + This is called to update the index entry for the specified object. The + new information should be in object->cookie->netfs_data. This can be + obtained by calling object->cookie->def->get_aux()/get_attr(). + + + * Invalidate data object [mandatory]:: + + int (*invalidate_object)(struct fscache_operation *op) + + This is called to invalidate a data object (as pointed to by op->object). + All the data stored for this object should be discarded and an + attr_changed operation should be performed. The caller will follow up + with an object update operation. + + fscache_op_complete() must be called on op before returning. + + + * Discard object [mandatory]:: + + void (*drop_object)(struct fscache_object *object) + + This method is called to indicate that an object has been unbound from its + cookie, and that the cache should release the object's resources and + retire it if it's in state FSCACHE_OBJECT_RECYCLING. + + This method should not attempt to release any references held by the + caller. The caller will invoke the put_object() method as appropriate. + + + * Release object reference [mandatory]:: + + void (*put_object)(struct fscache_object *object) + + This method is used to discard a reference to an object. The object may + be freed when all the references to it are released. + + + * Synchronise a cache [mandatory]:: + + void (*sync)(struct fscache_cache *cache) + + This is called to ask the backend to synchronise a cache with its backing + device. + + + * Dissociate a cache [mandatory]:: + + void (*dissociate_pages)(struct fscache_cache *cache) + + This is called to ask a cache to perform any page dissociations as part of + cache withdrawal. 
+ + + * Notification that the attributes on a netfs file changed [mandatory]:: + + int (*attr_changed)(struct fscache_object *object); + + This is called to indicate to the cache that certain attributes on a netfs + file have changed (for example the maximum size a file may reach). The + cache can read these from the netfs by calling the cookie's get_attr() + method. + + The cache may use the file size information to reserve space on the cache. + It should also call fscache_set_store_limit() to indicate to FS-Cache the + highest byte it's willing to store for an object. + + This method may return -ve if an error occurred or the cache object cannot + be expanded. In such a case, the object will be withdrawn from service. + + This operation is run asynchronously from FS-Cache's thread pool, and + storage and retrieval operations from the netfs are excluded during the + execution of this operation. + + + * Reserve cache space for an object's data [optional]:: + + int (*reserve_space)(struct fscache_object *object, loff_t size); + + This is called to request that cache space be reserved to hold the data + for an object and the metadata used to track it. Zero size should be + taken as request to cancel a reservation. + + This should return 0 if successful, -ENOSPC if there isn't enough space + available, or -ENOMEM or -EIO on other errors. + + The reservation may exceed the current size of the object, thus permitting + future expansion. If the amount of space consumed by an object would + exceed the reservation, it's permitted to refuse requests to allocate + pages, but not required. An object may be pruned down to its reservation + size if larger than that already. + + + * Request page be read from cache [mandatory]:: + + int (*read_or_alloc_page)(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) + + This is called to attempt to read a netfs page from the cache, or to + reserve a backing block if not. 
FS-Cache will have done as much checking + as it can before calling, but most of the work belongs to the backend. + + If there's no page in the cache, then -ENODATA should be returned if the + backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it + didn't. + + If there is suitable data in the cache, then a read operation should be + queued and 0 returned. When the read finishes, fscache_end_io() should be + called. + + The fscache_mark_pages_cached() should be called for the page if any cache + metadata is retained. This will indicate to the netfs that the page needs + explicit uncaching. This operation takes a pagevec, thus allowing several + pages to be marked at once. + + The retrieval record pointed to by op should be retained for each page + queued and released when I/O on the page has been formally ended. + fscache_get/put_retrieval() are available for this purpose. + + The retrieval record may be used to get CPU time via the FS-Cache thread + pool. If this is desired, the op->op.processor should be set to point to + the appropriate processing routine, and fscache_enqueue_retrieval() should + be called at an appropriate point to request CPU time. For instance, the + retrieval routine could be enqueued upon the completion of a disk read. + The to_do field in the retrieval record is provided to aid in this. + + If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS + returned if possible or fscache_end_io() called with a suitable error + code. + + fscache_put_retrieval() should be called after a page or pages are dealt + with. This will complete the operation when all pages are dealt with. + + + * Request pages be read from cache [mandatory]:: + + int (*read_or_alloc_pages)(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) + + This is like the read_or_alloc_page() method, except it is handed a list + of pages instead of one page. 
Any pages on which a read operation is + started must be added to the page cache for the specified mapping and also + to the LRU. Such pages must also be removed from the pages list and + ``*nr_pages`` decremented per page. + + If there was an error such as -ENOMEM, then that should be returned; else + if one or more pages couldn't be read or allocated, then -ENOBUFS should + be returned; else if one or more pages couldn't be read, then -ENODATA + should be returned. If all the pages are dispatched then 0 should be + returned. + + + * Request page be allocated in the cache [mandatory]:: + + int (*allocate_page)(struct fscache_retrieval *op, + struct page *page, + gfp_t gfp) + + This is like the read_or_alloc_page() method, except that it shouldn't + read from the cache, even if there's data there that could be retrieved. + It should, however, set up any internal metadata required such that + the write_page() method can write to the cache. + + If there's no backing block available, then -ENOBUFS should be returned + (or -ENOMEM if there were other problems). If a block is successfully + allocated, then the netfs page should be marked and 0 returned. + + + * Request pages be allocated in the cache [mandatory]:: + + int (*allocate_pages)(struct fscache_retrieval *op, + struct list_head *pages, + unsigned *nr_pages, + gfp_t gfp) + + This is an multiple page version of the allocate_page() method. pages and + nr_pages should be treated as for the read_or_alloc_pages() method. + + + * Request page be written to cache [mandatory]:: + + int (*write_page)(struct fscache_storage *op, + struct page *page); + + This is called to write from a page on which there was a previously + successful read_or_alloc_page() call or similar. FS-Cache filters out + pages that don't have mappings. + + This method is called asynchronously from the FS-Cache thread pool. It is + not required to actually store anything, provided -ENODATA is then + returned to the next read of this page. 
+ + If an error occurred, then a negative error code should be returned, + otherwise zero should be returned. FS-Cache will take appropriate action + in response to an error, such as withdrawing this object. + + If this method returns success then FS-Cache will inform the netfs + appropriately. + + + * Discard retained per-page metadata [mandatory]:: + + void (*uncache_page)(struct fscache_object *object, struct page *page) + + This is called when a netfs page is being evicted from the pagecache. The + cache backend should tear down any internal representation or tracking it + maintains for this page. + + +FS-Cache Utilities +================== + +FS-Cache provides some utilities that a cache backend may make use of: + + * Note occurrence of an I/O error in a cache:: + + void fscache_io_error(struct fscache_cache *cache) + + This tells FS-Cache that an I/O error occurred in the cache. After this + has been called, only resource dissociation operations (object and page + release) will be passed from the netfs to the cache backend for the + specified cache. + + This does not actually withdraw the cache. That must be done separately. + + + * Invoke the retrieval I/O completion function:: + + void fscache_end_io(struct fscache_retrieval *op, struct page *page, + int error); + + This is called to note the end of an attempt to retrieve a page. The + error value should be 0 if successful and an error otherwise. + + + * Record that one or more pages being retrieved or allocated have been dealt + with:: + + void fscache_retrieval_complete(struct fscache_retrieval *op, + int n_pages); + + This is called to record the fact that one or more pages have been dealt + with and are no longer the concern of this operation. When the number of + pages remaining in the operation reaches 0, the operation will be + completed. + + + * Record operation completion:: + + void fscache_op_complete(struct fscache_operation *op); + + This is called to record the completion of an operation. 
This deducts + this operation from the parent object's run state, potentially permitting + one or more pending operations to start running. + + + * Set highest store limit:: + + void fscache_set_store_limit(struct fscache_object *object, + loff_t i_size); + + This sets the limit FS-Cache imposes on the highest byte it's willing to + try and store for a netfs. Any page over this limit is automatically + rejected by fscache_read_alloc_page() and co with -ENOBUFS. + + + * Mark pages as being cached:: + + void fscache_mark_pages_cached(struct fscache_retrieval *op, + struct pagevec *pagevec); + + This marks a set of pages as being cached. After this has been called, + the netfs must call fscache_uncache_page() to unmark the pages. + + + * Perform coherency check on an object:: + + enum fscache_checkaux fscache_check_aux(struct fscache_object *object, + const void *data, + uint16_t datalen); + + This asks the netfs to perform a coherency check on an object that has + just been looked up. The cookie attached to the object will determine the + netfs to use. data and datalen should specify where the auxiliary data + retrieved from the cache can be found. + + One of three values will be returned: + + FSCACHE_CHECKAUX_OKAY + The coherency data indicates the object is valid as is. + + FSCACHE_CHECKAUX_NEEDS_UPDATE + The coherency data needs updating, but otherwise the object is + valid. + + FSCACHE_CHECKAUX_OBSOLETE + The coherency data indicates that the object is obsolete and should + be discarded. + + + * Initialise a freshly allocated object:: + + void fscache_object_init(struct fscache_object *object); + + This initialises all the fields in an object representation. + + + * Indicate the destruction of an object:: + + void fscache_object_destroyed(struct fscache_cache *cache); + + This must be called to inform FS-Cache that an object that belonged to a + cache has been destroyed and deallocated. 
This will allow continuation + of the cache withdrawal process when it is stopped pending destruction of + all the objects. + + + * Indicate negative lookup on an object:: + + void fscache_object_lookup_negative(struct fscache_object *object); + + This is called to indicate to FS-Cache that a lookup process for an object + found a negative result. + + This changes the state of an object to permit reads pending on lookup + completion to go off and start fetching data from the netfs server as it's + known at this point that there can't be any data in the cache. + + This may be called multiple times on an object. Only the first call is + significant - all subsequent calls are ignored. + + + * Indicate an object has been obtained:: + + void fscache_obtained_object(struct fscache_object *object); + + This is called to indicate to FS-Cache that a lookup process for an object + produced a positive result, or that an object was created. This should + only be called once for any particular object. + + This changes the state of an object to indicate: + + (1) if no call to fscache_object_lookup_negative() has been made on + this object, that there may be data available, and that reads can + now go and look for it; and + + (2) that writes may now proceed against this object. + + + * Indicate that object lookup failed:: + + void fscache_object_lookup_error(struct fscache_object *object); + + This marks an object as having encountered a fatal error (usually EIO) + and causes it to move into a state whereby it will be withdrawn as soon + as possible. + + + * Indicate that a stale object was found and discarded:: + + void fscache_object_retrying_stale(struct fscache_object *object); + + This is called to indicate that the lookup procedure found an object in + the cache that the netfs decided was stale. The object has been + discarded from the cache and the lookup will be performed again. 
+ + + * Indicate that the caching backend killed an object:: + + void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why); + + This is called to indicate that the cache backend preemptively killed an + object. The why parameter should be set to indicate the reason: + + FSCACHE_OBJECT_IS_STALE + - the object was stale and needs discarding. + + FSCACHE_OBJECT_NO_SPACE + - there was insufficient cache space + + FSCACHE_OBJECT_WAS_RETIRED + - the object was retired when relinquished. + + FSCACHE_OBJECT_WAS_CULLED + - the object was culled to make space. + + + * Get and release references on a retrieval record:: + + void fscache_get_retrieval(struct fscache_retrieval *op); + void fscache_put_retrieval(struct fscache_retrieval *op); + + These two functions are used to retain a retrieval record while doing + asynchronous data retrieval and block allocation. + + + * Enqueue a retrieval record for processing:: + + void fscache_enqueue_retrieval(struct fscache_retrieval *op); + + This enqueues a retrieval record for processing by the FS-Cache thread + pool. One of the threads in the pool will invoke the retrieval record's + op->op.processor callback function. This function may be called from + within the callback function. + + + * List of object state names:: + + const char *fscache_object_states[]; + + For debugging purposes, this may be used to turn the state that an object + is in into a text string for display purposes. diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt deleted file mode 100644 index c418280c915f..000000000000 --- a/Documentation/filesystems/caching/backend-api.txt +++ /dev/null @@ -1,726 +0,0 @@ - ========================== - FS-CACHE CACHE BACKEND API - ========================== - -The FS-Cache system provides an API by which actual caches can be supplied to -FS-Cache for it to then serve out to network filesystems and other interested -parties. 
- -This API is declared in . - - -==================================== -INITIALISING AND REGISTERING A CACHE -==================================== - -To start off, a cache definition must be initialised and registered for each -cache the backend wants to make available. For instance, CacheFS does this in -the fill_super() operation on mounting. - -The cache definition (struct fscache_cache) should be initialised by calling: - - void fscache_init_cache(struct fscache_cache *cache, - struct fscache_cache_ops *ops, - const char *idfmt, - ...); - -Where: - - (*) "cache" is a pointer to the cache definition; - - (*) "ops" is a pointer to the table of operations that the backend supports on - this cache; and - - (*) "idfmt" is a format and printf-style arguments for constructing a label - for the cache. - - -The cache should then be registered with FS-Cache by passing a pointer to the -previously initialised cache definition to: - - int fscache_add_cache(struct fscache_cache *cache, - struct fscache_object *fsdef, - const char *tagname); - -Two extra arguments should also be supplied: - - (*) "fsdef" which should point to the object representation for the FS-Cache - master index in this cache. Netfs primary index entries will be created - here. FS-Cache keeps the caller's reference to the index object if - successful and will release it upon withdrawal of the cache. - - (*) "tagname" which, if given, should be a text string naming this cache. If - this is NULL, the identifier will be used instead. For CacheFS, the - identifier is set to name the underlying block device and the tag can be - supplied by mount. - -This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag -is already in use. 0 will be returned on success. 
- - -===================== -UNREGISTERING A CACHE -===================== - -A cache can be withdrawn from the system by calling this function with a -pointer to the cache definition: - - void fscache_withdraw_cache(struct fscache_cache *cache); - -In CacheFS's case, this is called by put_super(). - - -======== -SECURITY -======== - -The cache methods are executed one of two contexts: - - (1) that of the userspace process that issued the netfs operation that caused - the cache method to be invoked, or - - (2) that of one of the processes in the FS-Cache thread pool. - -In either case, this may not be an appropriate context in which to access the -cache. - -The calling process's fsuid, fsgid and SELinux security identities may need to -be masqueraded for the duration of the cache driver's access to the cache. -This is left to the cache to handle; FS-Cache makes no effort in this regard. - - -=================================== -CONTROL AND STATISTICS PRESENTATION -=================================== - -The cache may present data to the outside world through FS-Cache's interfaces -in sysfs and procfs - the former for control and the latter for statistics. - -A sysfs directory called /sys/fs/fscache// is created if CONFIG_SYSFS -is enabled. This is accessible through the kobject struct fscache_cache::kobj -and is for use by the cache as it sees fit. - - -======================== -RELEVANT DATA STRUCTURES -======================== - - (*) Index/Data file FS-Cache representation cookie: - - struct fscache_cookie { - struct fscache_object_def *def; - struct fscache_netfs *netfs; - void *netfs_data; - ... - }; - - The fields that might be of use to the backend describe the object - definition, the netfs definition and the netfs's data for this cookie. - The object definition contain functions supplied by the netfs for loading - and matching index entries; these are required to provide some of the - cache operations. 
- - - (*) In-cache object representation: - - struct fscache_object { - int debug_id; - enum { - FSCACHE_OBJECT_RECYCLING, - ... - } state; - spinlock_t lock - struct fscache_cache *cache; - struct fscache_cookie *cookie; - ... - }; - - Structures of this type should be allocated by the cache backend and - passed to FS-Cache when requested by the appropriate cache operation. In - the case of CacheFS, they're embedded in CacheFS's internal object - structures. - - The debug_id is a simple integer that can be used in debugging messages - that refer to a particular object. In such a case it should be printed - using "OBJ%x" to be consistent with FS-Cache. - - Each object contains a pointer to the cookie that represents the object it - is backing. An object should retired when put_object() is called if it is - in state FSCACHE_OBJECT_RECYCLING. The fscache_object struct should be - initialised by calling fscache_object_init(object). - - - (*) FS-Cache operation record: - - struct fscache_operation { - atomic_t usage; - struct fscache_object *object; - unsigned long flags; - #define FSCACHE_OP_EXCLUSIVE - void (*processor)(struct fscache_operation *op); - void (*release)(struct fscache_operation *op); - ... - }; - - FS-Cache has a pool of threads that it uses to give CPU time to the - various asynchronous operations that need to be done as part of driving - the cache. These are represented by the above structure. The processor - method is called to give the op CPU time, and the release method to get - rid of it when its usage count reaches 0. - - An operation can be made exclusive upon an object by setting the - appropriate flag before enqueuing it with fscache_enqueue_operation(). If - an operation needs more processing time, it should be enqueued again. - - - (*) FS-Cache retrieval operation record: - - struct fscache_retrieval { - struct fscache_operation op; - struct address_space *mapping; - struct list_head *to_do; - ... 
- }; - - A structure of this type is allocated by FS-Cache to record retrieval and - allocation requests made by the netfs. This struct is then passed to the - backend to do the operation. The backend may get extra refs to it by - calling fscache_get_retrieval() and refs may be discarded by calling - fscache_put_retrieval(). - - A retrieval operation can be used by the backend to do retrieval work. To - do this, the retrieval->op.processor method pointer should be set - appropriately by the backend and fscache_enqueue_retrieval() called to - submit it to the thread pool. CacheFiles, for example, uses this to queue - page examination when it detects PG_lock being cleared. - - The to_do field is an empty list available for the cache backend to use as - it sees fit. - - - (*) FS-Cache storage operation record: - - struct fscache_storage { - struct fscache_operation op; - pgoff_t store_limit; - ... - }; - - A structure of this type is allocated by FS-Cache to record outstanding - writes to be made. FS-Cache itself enqueues this operation and invokes - the write_page() method on the object at appropriate times to effect - storage. - - -================ -CACHE OPERATIONS -================ - -The cache backend provides FS-Cache with a table of operations that can be -performed on the denizens of the cache. These are held in a structure of type: - - struct fscache_cache_ops - - (*) Name of cache provider [mandatory]: - - const char *name - - This isn't strictly an operation, but should be pointed at a string naming - the backend. - - - (*) Allocate a new object [mandatory]: - - struct fscache_object *(*alloc_object)(struct fscache_cache *cache, - struct fscache_cookie *cookie) - - This method is used to allocate a cache object representation to back a - cookie in a particular cache. fscache_object_init() should be called on - the object to initialise it prior to returning. 
- - This function may also be used to parse the index key to be used for - multiple lookup calls to turn it into a more convenient form. FS-Cache - will call the lookup_complete() method to allow the cache to release the - form once lookup is complete or aborted. - - - (*) Look up and create object [mandatory]: - - void (*lookup_object)(struct fscache_object *object) - - This method is used to look up an object, given that the object is already - allocated and attached to the cookie. This should instantiate that object - in the cache if it can. - - The method should call fscache_object_lookup_negative() as soon as - possible if it determines the object doesn't exist in the cache. If the - object is found to exist and the netfs indicates that it is valid then - fscache_obtained_object() should be called once the object is in a - position to have data stored in it. Similarly, fscache_obtained_object() - should also be called once a non-present object has been created. - - If a lookup error occurs, fscache_object_lookup_error() should be called - to abort the lookup of that object. - - - (*) Release lookup data [mandatory]: - - void (*lookup_complete)(struct fscache_object *object) - - This method is called to ask the cache to release any resources it was - using to perform a lookup. - - - (*) Increment object refcount [mandatory]: - - struct fscache_object *(*grab_object)(struct fscache_object *object) - - This method is called to increment the reference count on an object. It - may fail (for instance if the cache is being withdrawn) by returning NULL. - It should return the object pointer if successful. - - - (*) Lock/Unlock object [mandatory]: - - void (*lock_object)(struct fscache_object *object) - void (*unlock_object)(struct fscache_object *object) - - These methods are used to exclusively lock an object. It must be possible - to schedule with the lock held, so a spinlock isn't sufficient. 
- - - (*) Pin/Unpin object [optional]: - - int (*pin_object)(struct fscache_object *object) - void (*unpin_object)(struct fscache_object *object) - - These methods are used to pin an object into the cache. Once pinned an - object cannot be reclaimed to make space. Return -ENOSPC if there's not - enough space in the cache to permit this. - - - (*) Check coherency state of an object [mandatory]: - - int (*check_consistency)(struct fscache_object *object) - - This method is called to have the cache check the saved auxiliary data of - the object against the netfs's idea of the state. 0 should be returned - if they're consistent and -ESTALE otherwise. -ENOMEM and -ERESTARTSYS - may also be returned. - - (*) Update object [mandatory]: - - int (*update_object)(struct fscache_object *object) - - This is called to update the index entry for the specified object. The - new information should be in object->cookie->netfs_data. This can be - obtained by calling object->cookie->def->get_aux()/get_attr(). - - - (*) Invalidate data object [mandatory]: - - int (*invalidate_object)(struct fscache_operation *op) - - This is called to invalidate a data object (as pointed to by op->object). - All the data stored for this object should be discarded and an - attr_changed operation should be performed. The caller will follow up - with an object update operation. - - fscache_op_complete() must be called on op before returning. - - - (*) Discard object [mandatory]: - - void (*drop_object)(struct fscache_object *object) - - This method is called to indicate that an object has been unbound from its - cookie, and that the cache should release the object's resources and - retire it if it's in state FSCACHE_OBJECT_RECYCLING. - - This method should not attempt to release any references held by the - caller. The caller will invoke the put_object() method as appropriate. 
- - - (*) Release object reference [mandatory]: - - void (*put_object)(struct fscache_object *object) - - This method is used to discard a reference to an object. The object may - be freed when all the references to it are released. - - - (*) Synchronise a cache [mandatory]: - - void (*sync)(struct fscache_cache *cache) - - This is called to ask the backend to synchronise a cache with its backing - device. - - - (*) Dissociate a cache [mandatory]: - - void (*dissociate_pages)(struct fscache_cache *cache) - - This is called to ask a cache to perform any page dissociations as part of - cache withdrawal. - - - (*) Notification that the attributes on a netfs file changed [mandatory]: - - int (*attr_changed)(struct fscache_object *object); - - This is called to indicate to the cache that certain attributes on a netfs - file have changed (for example the maximum size a file may reach). The - cache can read these from the netfs by calling the cookie's get_attr() - method. - - The cache may use the file size information to reserve space on the cache. - It should also call fscache_set_store_limit() to indicate to FS-Cache the - highest byte it's willing to store for an object. - - This method may return -ve if an error occurred or the cache object cannot - be expanded. In such a case, the object will be withdrawn from service. - - This operation is run asynchronously from FS-Cache's thread pool, and - storage and retrieval operations from the netfs are excluded during the - execution of this operation. - - - (*) Reserve cache space for an object's data [optional]: - - int (*reserve_space)(struct fscache_object *object, loff_t size); - - This is called to request that cache space be reserved to hold the data - for an object and the metadata used to track it. Zero size should be - taken as request to cancel a reservation. - - This should return 0 if successful, -ENOSPC if there isn't enough space - available, or -ENOMEM or -EIO on other errors. 
- - The reservation may exceed the current size of the object, thus permitting - future expansion. If the amount of space consumed by an object would - exceed the reservation, it's permitted to refuse requests to allocate - pages, but not required. An object may be pruned down to its reservation - size if larger than that already. - - - (*) Request page be read from cache [mandatory]: - - int (*read_or_alloc_page)(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) - - This is called to attempt to read a netfs page from the cache, or to - reserve a backing block if not. FS-Cache will have done as much checking - as it can before calling, but most of the work belongs to the backend. - - If there's no page in the cache, then -ENODATA should be returned if the - backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it - didn't. - - If there is suitable data in the cache, then a read operation should be - queued and 0 returned. When the read finishes, fscache_end_io() should be - called. - - The fscache_mark_pages_cached() should be called for the page if any cache - metadata is retained. This will indicate to the netfs that the page needs - explicit uncaching. This operation takes a pagevec, thus allowing several - pages to be marked at once. - - The retrieval record pointed to by op should be retained for each page - queued and released when I/O on the page has been formally ended. - fscache_get/put_retrieval() are available for this purpose. - - The retrieval record may be used to get CPU time via the FS-Cache thread - pool. If this is desired, the op->op.processor should be set to point to - the appropriate processing routine, and fscache_enqueue_retrieval() should - be called at an appropriate point to request CPU time. For instance, the - retrieval routine could be enqueued upon the completion of a disk read. - The to_do field in the retrieval record is provided to aid in this. 
- - If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS - returned if possible or fscache_end_io() called with a suitable error - code. - - fscache_put_retrieval() should be called after a page or pages are dealt - with. This will complete the operation when all pages are dealt with. - - - (*) Request pages be read from cache [mandatory]: - - int (*read_or_alloc_pages)(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) - - This is like the read_or_alloc_page() method, except it is handed a list - of pages instead of one page. Any pages on which a read operation is - started must be added to the page cache for the specified mapping and also - to the LRU. Such pages must also be removed from the pages list and - *nr_pages decremented per page. - - If there was an error such as -ENOMEM, then that should be returned; else - if one or more pages couldn't be read or allocated, then -ENOBUFS should - be returned; else if one or more pages couldn't be read, then -ENODATA - should be returned. If all the pages are dispatched then 0 should be - returned. - - - (*) Request page be allocated in the cache [mandatory]: - - int (*allocate_page)(struct fscache_retrieval *op, - struct page *page, - gfp_t gfp) - - This is like the read_or_alloc_page() method, except that it shouldn't - read from the cache, even if there's data there that could be retrieved. - It should, however, set up any internal metadata required such that - the write_page() method can write to the cache. - - If there's no backing block available, then -ENOBUFS should be returned - (or -ENOMEM if there were other problems). If a block is successfully - allocated, then the netfs page should be marked and 0 returned. 
- - - (*) Request pages be allocated in the cache [mandatory]: - - int (*allocate_pages)(struct fscache_retrieval *op, - struct list_head *pages, - unsigned *nr_pages, - gfp_t gfp) - - This is an multiple page version of the allocate_page() method. pages and - nr_pages should be treated as for the read_or_alloc_pages() method. - - - (*) Request page be written to cache [mandatory]: - - int (*write_page)(struct fscache_storage *op, - struct page *page); - - This is called to write from a page on which there was a previously - successful read_or_alloc_page() call or similar. FS-Cache filters out - pages that don't have mappings. - - This method is called asynchronously from the FS-Cache thread pool. It is - not required to actually store anything, provided -ENODATA is then - returned to the next read of this page. - - If an error occurred, then a negative error code should be returned, - otherwise zero should be returned. FS-Cache will take appropriate action - in response to an error, such as withdrawing this object. - - If this method returns success then FS-Cache will inform the netfs - appropriately. - - - (*) Discard retained per-page metadata [mandatory]: - - void (*uncache_page)(struct fscache_object *object, struct page *page) - - This is called when a netfs page is being evicted from the pagecache. The - cache backend should tear down any internal representation or tracking it - maintains for this page. - - -================== -FS-CACHE UTILITIES -================== - -FS-Cache provides some utilities that a cache backend may make use of: - - (*) Note occurrence of an I/O error in a cache: - - void fscache_io_error(struct fscache_cache *cache) - - This tells FS-Cache that an I/O error occurred in the cache. After this - has been called, only resource dissociation operations (object and page - release) will be passed from the netfs to the cache backend for the - specified cache. - - This does not actually withdraw the cache. That must be done separately. 
- - - (*) Invoke the retrieval I/O completion function: - - void fscache_end_io(struct fscache_retrieval *op, struct page *page, - int error); - - This is called to note the end of an attempt to retrieve a page. The - error value should be 0 if successful and an error otherwise. - - - (*) Record that one or more pages being retrieved or allocated have been dealt - with: - - void fscache_retrieval_complete(struct fscache_retrieval *op, - int n_pages); - - This is called to record the fact that one or more pages have been dealt - with and are no longer the concern of this operation. When the number of - pages remaining in the operation reaches 0, the operation will be - completed. - - - (*) Record operation completion: - - void fscache_op_complete(struct fscache_operation *op); - - This is called to record the completion of an operation. This deducts - this operation from the parent object's run state, potentially permitting - one or more pending operations to start running. - - - (*) Set highest store limit: - - void fscache_set_store_limit(struct fscache_object *object, - loff_t i_size); - - This sets the limit FS-Cache imposes on the highest byte it's willing to - try and store for a netfs. Any page over this limit is automatically - rejected by fscache_read_alloc_page() and co with -ENOBUFS. - - - (*) Mark pages as being cached: - - void fscache_mark_pages_cached(struct fscache_retrieval *op, - struct pagevec *pagevec); - - This marks a set of pages as being cached. After this has been called, - the netfs must call fscache_uncache_page() to unmark the pages. - - - (*) Perform coherency check on an object: - - enum fscache_checkaux fscache_check_aux(struct fscache_object *object, - const void *data, - uint16_t datalen); - - This asks the netfs to perform a coherency check on an object that has - just been looked up. The cookie attached to the object will determine the - netfs to use. 
data and datalen should specify where the auxiliary data - retrieved from the cache can be found. - - One of three values will be returned: - - (*) FSCACHE_CHECKAUX_OKAY - - The coherency data indicates the object is valid as is. - - (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - - The coherency data needs updating, but otherwise the object is - valid. - - (*) FSCACHE_CHECKAUX_OBSOLETE - - The coherency data indicates that the object is obsolete and should - be discarded. - - - (*) Initialise a freshly allocated object: - - void fscache_object_init(struct fscache_object *object); - - This initialises all the fields in an object representation. - - - (*) Indicate the destruction of an object: - - void fscache_object_destroyed(struct fscache_cache *cache); - - This must be called to inform FS-Cache that an object that belonged to a - cache has been destroyed and deallocated. This will allow continuation - of the cache withdrawal process when it is stopped pending destruction of - all the objects. - - - (*) Indicate negative lookup on an object: - - void fscache_object_lookup_negative(struct fscache_object *object); - - This is called to indicate to FS-Cache that a lookup process for an object - found a negative result. - - This changes the state of an object to permit reads pending on lookup - completion to go off and start fetching data from the netfs server as it's - known at this point that there can't be any data in the cache. - - This may be called multiple times on an object. Only the first call is - significant - all subsequent calls are ignored. - - - (*) Indicate an object has been obtained: - - void fscache_obtained_object(struct fscache_object *object); - - This is called to indicate to FS-Cache that a lookup process for an object - produced a positive result, or that an object was created. This should - only be called once for any particular object. 
- - This changes the state of an object to indicate: - - (1) if no call to fscache_object_lookup_negative() has been made on - this object, that there may be data available, and that reads can - now go and look for it; and - - (2) that writes may now proceed against this object. - - - (*) Indicate that object lookup failed: - - void fscache_object_lookup_error(struct fscache_object *object); - - This marks an object as having encountered a fatal error (usually EIO) - and causes it to move into a state whereby it will be withdrawn as soon - as possible. - - - (*) Indicate that a stale object was found and discarded: - - void fscache_object_retrying_stale(struct fscache_object *object); - - This is called to indicate that the lookup procedure found an object in - the cache that the netfs decided was stale. The object has been - discarded from the cache and the lookup will be performed again. - - - (*) Indicate that the caching backend killed an object: - - void fscache_object_mark_killed(struct fscache_object *object, - enum fscache_why_object_killed why); - - This is called to indicate that the cache backend preemptively killed an - object. The why parameter should be set to indicate the reason: - - FSCACHE_OBJECT_IS_STALE - the object was stale and needs discarding. - FSCACHE_OBJECT_NO_SPACE - there was insufficient cache space - FSCACHE_OBJECT_WAS_RETIRED - the object was retired when relinquished. - FSCACHE_OBJECT_WAS_CULLED - the object was culled to make space. - - - (*) Get and release references on a retrieval record: - - void fscache_get_retrieval(struct fscache_retrieval *op); - void fscache_put_retrieval(struct fscache_retrieval *op); - - These two functions are used to retain a retrieval record while doing - asynchronous data retrieval and block allocation. - - - (*) Enqueue a retrieval record for processing. - - void fscache_enqueue_retrieval(struct fscache_retrieval *op); - - This enqueues a retrieval record for processing by the FS-Cache thread - pool. 
One of the threads in the pool will invoke the retrieval record's - op->op.processor callback function. This function may be called from - within the callback function. - - - (*) List of object state names: - - const char *fscache_object_states[]; - - For debugging purposes, this may be used to turn the state that an object - is in into a text string for display purposes. diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst index dd1297d884d0..70de86922b6a 100644 --- a/Documentation/filesystems/caching/fscache.rst +++ b/Documentation/filesystems/caching/fscache.rst @@ -187,7 +187,7 @@ The netfs API to FS-Cache can be found in: The cache backend API to FS-Cache can be found in: - Documentation/filesystems/caching/backend-api.txt + Documentation/filesystems/caching/backend-api.rst A description of the internal representations and object state machine can be found in: diff --git a/Documentation/filesystems/caching/index.rst b/Documentation/filesystems/caching/index.rst index a2cf35f89e28..033da7ac7c6e 100644 --- a/Documentation/filesystems/caching/index.rst +++ b/Documentation/filesystems/caching/index.rst @@ -8,6 +8,7 @@ Filesystem Caching fscache object + backend-api cachefiles netfs-api operations diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f78793f3d21e..fcc136361415 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -172,7 +172,7 @@ no_preference: * * Initialise a record of a cache and fill in the name. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_init_cache(struct fscache_cache *cache, @@ -207,7 +207,7 @@ EXPORT_SYMBOL(fscache_init_cache); * * Add a cache to the system, making it available for netfs's to use. 
* - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ int fscache_add_cache(struct fscache_cache *cache, @@ -307,7 +307,7 @@ EXPORT_SYMBOL(fscache_add_cache); * Note that an I/O error occurred in a cache and that it should no longer be * used for anything. This also reports the error into the kernel log. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_io_error(struct fscache_cache *cache) @@ -355,7 +355,7 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, * Withdraw a cache from service, unbinding all its cache objects from the * netfs cookies they're currently representing. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_withdraw_cache(struct fscache_cache *cache) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index efaa003b8323..cb2146e02cd5 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -295,7 +295,7 @@ static void fscache_object_work_func(struct work_struct *work) * * Initialise a cache object description to its basic values. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_object_init(struct fscache_object *object, diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index d5ba431b5d63..ce0b5fbf239d 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -6,7 +6,7 @@ * * NOTE!!! See: * - * Documentation/filesystems/caching/backend-api.txt + * Documentation/filesystems/caching/backend-api.rst * * for a description of the cache backend interface declared here. 
*/ @@ -454,7 +454,7 @@ static inline void fscache_object_lookup_error(struct fscache_object *object) * Set the maximum size an object is permitted to reach, implying the highest * byte that may be written. Intended to be called by the attr_changed() op. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ static inline -- cgit v1.2.3 From 175cc46f4d667a9073270a32f40212d871c18955 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:16:59 +0200 Subject: docs: filesystems: convert cifs/cifsroot.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/28de01ee52283287e4195cf736d7154f122d30d4.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/cifs/cifsroot.rst | 105 ++++++++++++++++++++++++++++ Documentation/filesystems/cifs/cifsroot.txt | 97 ------------------------- Documentation/filesystems/index.rst | 1 + 3 files changed, 106 insertions(+), 97 deletions(-) create mode 100644 Documentation/filesystems/cifs/cifsroot.rst delete mode 100644 Documentation/filesystems/cifs/cifsroot.txt diff --git a/Documentation/filesystems/cifs/cifsroot.rst b/Documentation/filesystems/cifs/cifsroot.rst new file mode 100644 index 000000000000..4930bb443134 --- /dev/null +++ b/Documentation/filesystems/cifs/cifsroot.rst @@ -0,0 +1,105 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================== +Mounting root file system via SMB (cifs.ko) +=========================================== + +Written 2019 by Paulo Alcantara + +Written 2019 by Aurelien Aptel + +The CONFIG_CIFS_ROOT option enables experimental root file system +support over the SMB protocol via cifs.ko. 
+ +It introduces a new kernel command-line option called 'cifsroot=' +which will tell the kernel to mount the root file system over the +network by utilizing SMB or CIFS protocol. + +In order to mount, the network stack will also need to be set up by +using 'ip=' config option. For more details, see +Documentation/admin-guide/nfs/nfsroot.rst. + +A CIFS root mount currently requires the use of SMB1+UNIX Extensions +which is only supported by the Samba server. SMB1 is the older +deprecated version of the protocol but it has been extended to support +POSIX features (See [1]). The equivalent extensions for the newer +recommended version of the protocol (SMB3) have not been fully +implemented yet which means SMB3 doesn't support some required POSIX +file system objects (e.g. block devices, pipes, sockets). + +As a result, a CIFS root will default to SMB1 for now but the version +to use can nonetheless be changed via the 'vers=' mount option. This +default will change once the SMB3 POSIX extensions are fully +implemented. + +Server configuration +==================== + +To enable SMB1+UNIX extensions you will need to set these global +settings in Samba smb.conf:: + + [global] + server min protocol = NT1 + unix extension = yes # default + +Kernel command line +=================== + +:: + + root=/dev/cifs + +This is just a virtual device that basically tells the kernel to mount +the root file system via SMB protocol. + +:: + + cifsroot=///[,options] + +Enables the kernel to mount the root file system via SMB that are +located in the and specified in this option. + +The default mount options are set in fs/cifs/cifsroot.c. + +server-ip + IPv4 address of the server. + +share + Path to SMB share (rootfs). + +options + Optional mount options. For more information, see mount.cifs(8). + +Examples +======== + +Export root file system as a Samba share in smb.conf file:: + + ... 
+ [linux] + path = /path/to/rootfs + read only = no + guest ok = yes + force user = root + force group = root + browseable = yes + writeable = yes + admin users = root + public = yes + create mask = 0777 + directory mask = 0777 + ... + +Restart smb service:: + + # systemctl restart smb + +Test it under QEMU on a kernel built with CONFIG_CIFS_ROOT and +CONFIG_IP_PNP options enabled:: + + # qemu-system-x86_64 -enable-kvm -cpu host -m 1024 \ + -kernel /path/to/linux/arch/x86/boot/bzImage -nographic \ + -append "root=/dev/cifs rw ip=dhcp cifsroot=//10.0.2.2/linux,username=foo,password=bar console=ttyS0 3" + + +1: https://wiki.samba.org/index.php/UNIX_Extensions diff --git a/Documentation/filesystems/cifs/cifsroot.txt b/Documentation/filesystems/cifs/cifsroot.txt deleted file mode 100644 index 947b7ec6ce9e..000000000000 --- a/Documentation/filesystems/cifs/cifsroot.txt +++ /dev/null @@ -1,97 +0,0 @@ -Mounting root file system via SMB (cifs.ko) -=========================================== - -Written 2019 by Paulo Alcantara -Written 2019 by Aurelien Aptel - -The CONFIG_CIFS_ROOT option enables experimental root file system -support over the SMB protocol via cifs.ko. - -It introduces a new kernel command-line option called 'cifsroot=' -which will tell the kernel to mount the root file system over the -network by utilizing SMB or CIFS protocol. - -In order to mount, the network stack will also need to be set up by -using 'ip=' config option. For more details, see -Documentation/admin-guide/nfs/nfsroot.rst. - -A CIFS root mount currently requires the use of SMB1+UNIX Extensions -which is only supported by the Samba server. SMB1 is the older -deprecated version of the protocol but it has been extended to support -POSIX features (See [1]). The equivalent extensions for the newer -recommended version of the protocol (SMB3) have not been fully -implemented yet which means SMB3 doesn't support some required POSIX -file system objects (e.g. block devices, pipes, sockets). 
- -As a result, a CIFS root will default to SMB1 for now but the version -to use can nonetheless be changed via the 'vers=' mount option. This -default will change once the SMB3 POSIX extensions are fully -implemented. - -Server configuration -==================== - -To enable SMB1+UNIX extensions you will need to set these global -settings in Samba smb.conf: - - [global] - server min protocol = NT1 - unix extension = yes # default - -Kernel command line -=================== - -root=/dev/cifs - -This is just a virtual device that basically tells the kernel to mount -the root file system via SMB protocol. - -cifsroot=///[,options] - -Enables the kernel to mount the root file system via SMB that are -located in the and specified in this option. - -The default mount options are set in fs/cifs/cifsroot.c. - -server-ip - IPv4 address of the server. - -share - Path to SMB share (rootfs). - -options - Optional mount options. For more information, see mount.cifs(8). - -Examples -======== - -Export root file system as a Samba share in smb.conf file. - -... -[linux] - path = /path/to/rootfs - read only = no - guest ok = yes - force user = root - force group = root - browseable = yes - writeable = yes - admin users = root - public = yes - create mask = 0777 - directory mask = 0777 -... - -Restart smb service. - -# systemctl restart smb - -Test it under QEMU on a kernel built with CONFIG_CIFS_ROOT and -CONFIG_IP_PNP options enabled. - -# qemu-system-x86_64 -enable-kvm -cpu host -m 1024 \ - -kernel /path/to/linux/arch/x86/boot/bzImage -nographic \ - -append "root=/dev/cifs rw ip=dhcp cifsroot=//10.0.2.2/linux,username=foo,password=bar console=ttyS0 3" - - -1: https://wiki.samba.org/index.php/UNIX_Extensions diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 773af390c5e5..d993b33a48c4 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -59,6 +59,7 @@ Documentation for filesystem implementations. 
befs bfs btrfs + cifs/cifsroot ceph cramfs debugfs -- cgit v1.2.3 From 41defb4d0d587ef3fb9b90f3c320b6c41ef68f16 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:00 +0200 Subject: docs: filesystems: convert automount-support.txt to ReST - Add a SPDX header; - Add a document title; - Adjust section titles; - Mark literal blocks as such; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/ba7e2f2bf9aa2c7096772f5e7e8e609cb5fce07c.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/automount-support.rst | 98 +++++++++++++++++++++++++ Documentation/filesystems/automount-support.txt | 93 ----------------------- Documentation/filesystems/index.rst | 2 + 3 files changed, 100 insertions(+), 93 deletions(-) create mode 100644 Documentation/filesystems/automount-support.rst delete mode 100644 Documentation/filesystems/automount-support.txt diff --git a/Documentation/filesystems/automount-support.rst b/Documentation/filesystems/automount-support.rst new file mode 100644 index 000000000000..430f0b40796b --- /dev/null +++ b/Documentation/filesystems/automount-support.rst @@ -0,0 +1,98 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Automount Support +================= + + +Support is available for filesystems that wish to do automounting +support (such as kAFS which can be found in fs/afs/ and NFS in +fs/nfs/). This facility includes allowing in-kernel mounts to be +performed and mountpoint degradation to be requested. The latter can +also be requested by userspace. + + +In-Kernel Automounting +====================== + +See section "Mount Traps" of Documentation/filesystems/autofs.rst + +Then from userspace, you can just do something like:: + + [root@andromeda root]# mount -t afs \#root.afs. 
/afs + [root@andromeda root]# ls /afs + asd cambridge cambridge.redhat.com grand.central.org + [root@andromeda root]# ls /afs/cambridge + afsdoc + [root@andromeda root]# ls /afs/cambridge/afsdoc/ + ChangeLog html LICENSE pdf RELNOTES-1.2.2 + +And then if you look in the mountpoint catalogue, you'll see something like:: + + [root@andromeda root]# cat /proc/mounts + ... + #root.afs. /afs afs rw 0 0 + #root.cell. /afs/cambridge.redhat.com afs rw 0 0 + #afsdoc. /afs/cambridge.redhat.com/afsdoc afs rw 0 0 + + +Automatic Mountpoint Expiry +=========================== + +Automatic expiration of mountpoints is easy, provided you've mounted the +mountpoint to be expired in the automounting procedure outlined separately. + +To do expiration, you need to follow these steps: + + (1) Create at least one list off which the vfsmounts to be expired can be + hung. + + (2) When a new mountpoint is created in the ->d_automount method, add + the mnt to the list using mnt_set_expiry():: + + mnt_set_expiry(newmnt, &afs_vfsmounts); + + (3) When you want mountpoints to be expired, call mark_mounts_for_expiry() + with a pointer to this list. This will process the list, marking every + vfsmount thereon for potential expiry on the next call. + + If a vfsmount was already flagged for expiry, and if its usage count is 1 + (it's only referenced by its parent vfsmount), then it will be deleted + from the namespace and thrown away (effectively unmounted). + + It may prove simplest to simply call this at regular intervals, using + some sort of timed event to drive it. + +The expiration flag is cleared by calls to mntput. This means that expiration +will only happen on the second expiration request after the last time the +mountpoint was accessed. + +If a mountpoint is moved, it gets removed from the expiration list. If a bind +mount is made on an expirable mount, the new vfsmount will not be on the +expiration list and will not expire. 
+ +If a namespace is copied, all mountpoints contained therein will be copied, +and the copies of those that are on an expiration list will be added to the +same expiration list. + + +Userspace Driven Expiry +======================= + +As an alternative, it is possible for userspace to request expiry of any +mountpoint (though some will be rejected - the current process's idea of the +rootfs for example). It does this by passing the MNT_EXPIRE flag to +umount(). This flag is considered incompatible with MNT_FORCE and MNT_DETACH. + +If the mountpoint in question is in referenced by something other than +umount() or its parent mountpoint, an EBUSY error will be returned and the +mountpoint will not be marked for expiration or unmounted. + +If the mountpoint was not already marked for expiry at that time, an EAGAIN +error will be given and it won't be unmounted. + +Otherwise if it was already marked and it wasn't referenced, unmounting will +take place as usual. + +Again, the expiration flag is cleared every time anything other than umount() +looks at a mountpoint. diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt deleted file mode 100644 index 7d9f82607562..000000000000 --- a/Documentation/filesystems/automount-support.txt +++ /dev/null @@ -1,93 +0,0 @@ -Support is available for filesystems that wish to do automounting -support (such as kAFS which can be found in fs/afs/ and NFS in -fs/nfs/). This facility includes allowing in-kernel mounts to be -performed and mountpoint degradation to be requested. The latter can -also be requested by userspace. - - -====================== -IN-KERNEL AUTOMOUNTING -====================== - -See section "Mount Traps" of Documentation/filesystems/autofs.rst - -Then from userspace, you can just do something like: - - [root@andromeda root]# mount -t afs \#root.afs. 
/afs - [root@andromeda root]# ls /afs - asd cambridge cambridge.redhat.com grand.central.org - [root@andromeda root]# ls /afs/cambridge - afsdoc - [root@andromeda root]# ls /afs/cambridge/afsdoc/ - ChangeLog html LICENSE pdf RELNOTES-1.2.2 - -And then if you look in the mountpoint catalogue, you'll see something like: - - [root@andromeda root]# cat /proc/mounts - ... - #root.afs. /afs afs rw 0 0 - #root.cell. /afs/cambridge.redhat.com afs rw 0 0 - #afsdoc. /afs/cambridge.redhat.com/afsdoc afs rw 0 0 - - -=========================== -AUTOMATIC MOUNTPOINT EXPIRY -=========================== - -Automatic expiration of mountpoints is easy, provided you've mounted the -mountpoint to be expired in the automounting procedure outlined separately. - -To do expiration, you need to follow these steps: - - (1) Create at least one list off which the vfsmounts to be expired can be - hung. - - (2) When a new mountpoint is created in the ->d_automount method, add - the mnt to the list using mnt_set_expiry() - mnt_set_expiry(newmnt, &afs_vfsmounts); - - (3) When you want mountpoints to be expired, call mark_mounts_for_expiry() - with a pointer to this list. This will process the list, marking every - vfsmount thereon for potential expiry on the next call. - - If a vfsmount was already flagged for expiry, and if its usage count is 1 - (it's only referenced by its parent vfsmount), then it will be deleted - from the namespace and thrown away (effectively unmounted). - - It may prove simplest to simply call this at regular intervals, using - some sort of timed event to drive it. - -The expiration flag is cleared by calls to mntput. This means that expiration -will only happen on the second expiration request after the last time the -mountpoint was accessed. - -If a mountpoint is moved, it gets removed from the expiration list. If a bind -mount is made on an expirable mount, the new vfsmount will not be on the -expiration list and will not expire. 
- -If a namespace is copied, all mountpoints contained therein will be copied, -and the copies of those that are on an expiration list will be added to the -same expiration list. - - -======================= -USERSPACE DRIVEN EXPIRY -======================= - -As an alternative, it is possible for userspace to request expiry of any -mountpoint (though some will be rejected - the current process's idea of the -rootfs for example). It does this by passing the MNT_EXPIRE flag to -umount(). This flag is considered incompatible with MNT_FORCE and MNT_DETACH. - -If the mountpoint in question is in referenced by something other than -umount() or its parent mountpoint, an EBUSY error will be returned and the -mountpoint will not be marked for expiration or unmounted. - -If the mountpoint was not already marked for expiry at that time, an EAGAIN -error will be given and it won't be unmounted. - -Otherwise if it was already marked and it wasn't referenced, unmounting will -take place as usual. - -Again, the expiration flag is cleared every time anything other than umount() -looks at a mountpoint. diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index d993b33a48c4..373d695bac65 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -25,6 +25,8 @@ algorithms work. locking directory-locking + automount-support + caching/index porting -- cgit v1.2.3 From f476c6ed17d49553d40467514b4cb2dd2648e30f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:01 +0200 Subject: docs: filesystems: convert coda.txt to ReST This document has its own style. It seems to be print output for the old matrixial printers where backspace were used to do double prints. For the conversion, I used several regex expressions to get rid of some weird stuff. 
The patch also does almost all possible conversions in order to get a nice output document, while keeping it readable/editable as is: - Add a SPDX header; - Add a document title; - Adjust document title; - Adjust section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Adjust list markups; - Mark some unumbered titles with bold font; - Use footnoote markups; - Add table markups; - Use notes markups; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/25c06c40c3d7b947a131c3be124ce0e93cc00ae3.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/coda.rst | 1670 ++++++++++++++++++++++++++++++++++ Documentation/filesystems/coda.txt | 1676 ----------------------------------- Documentation/filesystems/index.rst | 1 + MAINTAINERS | 2 +- fs/coda/Kconfig | 2 +- 5 files changed, 1673 insertions(+), 1678 deletions(-) create mode 100644 Documentation/filesystems/coda.rst delete mode 100644 Documentation/filesystems/coda.txt diff --git a/Documentation/filesystems/coda.rst b/Documentation/filesystems/coda.rst new file mode 100644 index 000000000000..84c860c89887 --- /dev/null +++ b/Documentation/filesystems/coda.rst @@ -0,0 +1,1670 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +Coda Kernel-Venus Interface +=========================== + +.. Note:: + + This is one of the technical documents describing a component of + Coda -- this document describes the client kernel-Venus interface. + +For more information: + + http://www.coda.cs.cmu.edu + +For user level software needed to run Coda: + + ftp://ftp.coda.cs.cmu.edu + +To run Coda you need to get a user level cache manager for the client, +named Venus, as well as tools to manipulate ACLs, to log in, etc. The +client needs to have the Coda filesystem selected in the kernel +configuration. 
+ +The server needs a user level server and at present does not depend on +kernel support. + + The Venus kernel interface + + Peter J. Braam + + v1.0, Nov 9, 1997 + + This document describes the communication between Venus and kernel + level filesystem code needed for the operation of the Coda file sys- + tem. This document version is meant to describe the current interface + (version 1.0) as well as improvements we envisage. + +.. Table of Contents + + 1. Introduction + + 2. Servicing Coda filesystem calls + + 3. The message layer + + 3.1 Implementation details + + 4. The interface at the call level + + 4.1 Data structures shared by the kernel and Venus + 4.2 The pioctl interface + 4.3 root + 4.4 lookup + 4.5 getattr + 4.6 setattr + 4.7 access + 4.8 create + 4.9 mkdir + 4.10 link + 4.11 symlink + 4.12 remove + 4.13 rmdir + 4.14 readlink + 4.15 open + 4.16 close + 4.17 ioctl + 4.18 rename + 4.19 readdir + 4.20 vget + 4.21 fsync + 4.22 inactive + 4.23 rdwr + 4.24 odymount + 4.25 ody_lookup + 4.26 ody_expand + 4.27 prefetch + 4.28 signal + + 5. The minicache and downcalls + + 5.1 INVALIDATE + 5.2 FLUSH + 5.3 PURGEUSER + 5.4 ZAPFILE + 5.5 ZAPDIR + 5.6 ZAPVNODE + 5.7 PURGEFID + 5.8 REPLACE + + 6. Initialization and cleanup + + 6.1 Requirements + +1. Introduction +=============== + + A key component in the Coda Distributed File System is the cache + manager, Venus. + + When processes on a Coda enabled system access files in the Coda + filesystem, requests are directed at the filesystem layer in the + operating system. The operating system will communicate with Venus to + service the request for the process. Venus manages a persistent + client cache and makes remote procedure calls to Coda file servers and + related servers (such as authentication servers) to service these + requests it receives from the operating system. When Venus has + serviced a request it replies to the operating system with appropriate + return codes, and other data related to the request. 
Optionally the + kernel support for Coda may maintain a minicache of recently processed + requests to limit the number of interactions with Venus. Venus + possesses the facility to inform the kernel when elements from its + minicache are no longer valid. + + This document describes precisely this communication between the + kernel and Venus. The definitions of so called upcalls and downcalls + will be given with the format of the data they handle. We shall also + describe the semantic invariants resulting from the calls. + + Historically Coda was implemented in a BSD file system in Mach 2.6. + The interface between the kernel and Venus is very similar to the BSD + VFS interface. Similar functionality is provided, and the format of + the parameters and returned data is very similar to the BSD VFS. This + leads to an almost natural environment for implementing a kernel-level + filesystem driver for Coda in a BSD system. However, other operating + systems such as Linux and Windows 95 and NT have virtual filesystem + with different interfaces. + + To implement Coda on these systems some reverse engineering of the + Venus/Kernel protocol is necessary. Also it came to light that other + systems could profit significantly from certain small optimizations + and modifications to the protocol. To facilitate this work as well as + to make future ports easier, communication between Venus and the + kernel should be documented in great detail. This is the aim of this + document. + +2. Servicing Coda filesystem calls +=================================== + + The service of a request for a Coda file system service originates in + a process P which accessing a Coda file. It makes a system call which + traps to the OS kernel. Examples of such calls trapping to the kernel + are ``read``, ``write``, ``open``, ``close``, ``create``, ``mkdir``, + ``rmdir``, ``chmod`` in a Unix ontext. Similar calls exist in the Win32 + environment, and are named ``CreateFile``. 
+ + Generally the operating system handles the request in a virtual + filesystem (VFS) layer, which is named I/O Manager in NT and IFS + manager in Windows 95. The VFS is responsible for partial processing + of the request and for locating the specific filesystem(s) which will + service parts of the request. Usually the information in the path + assists in locating the correct FS drivers. Sometimes after extensive + pre-processing, the VFS starts invoking exported routines in the FS + driver. This is the point where the FS specific processing of the + request starts, and here the Coda specific kernel code comes into + play. + + The FS layer for Coda must expose and implement several interfaces. + First and foremost the VFS must be able to make all necessary calls to + the Coda FS layer, so the Coda FS driver must expose the VFS interface + as applicable in the operating system. These differ very significantly + among operating systems, but share features such as facilities to + read/write and create and remove objects. The Coda FS layer services + such VFS requests by invoking one or more well defined services + offered by the cache manager Venus. When the replies from Venus have + come back to the FS driver, servicing of the VFS call continues and + finishes with a reply to the kernel's VFS. Finally the VFS layer + returns to the process. + + As a result of this design a basic interface exposed by the FS driver + must allow Venus to manage message traffic. In particular Venus must + be able to retrieve and place messages and to be notified of the + arrival of a new message. The notification must be through a mechanism + which does not block Venus since Venus must attend to other tasks even + when no messages are waiting or being processed. + + **Interfaces of the Coda FS Driver** + + Furthermore the FS layer provides for a special path of communication + between a user process and Venus, called the pioctl interface. 
The + pioctl interface is used for Coda specific services, such as + requesting detailed information about the persistent cache managed by + Venus. Here the involvement of the kernel is minimal. It identifies + the calling process and passes the information on to Venus. When + Venus replies the response is passed back to the caller in unmodified + form. + + Finally Venus allows the kernel FS driver to cache the results from + certain services. This is done to avoid excessive context switches + and results in an efficient system. However, Venus may acquire + information, for example from the network which implies that cached + information must be flushed or replaced. Venus then makes a downcall + to the Coda FS layer to request flushes or updates in the cache. The + kernel FS driver handles such requests synchronously. + + Among these interfaces the VFS interface and the facility to place, + receive and be notified of messages are platform specific. We will + not go into the calls exported to the VFS layer but we will state the + requirements of the message exchange mechanism. + + +3. The message layer +===================== + + At the lowest level the communication between Venus and the FS driver + proceeds through messages. The synchronization between processes + requesting Coda file service and Venus relies on blocking and waking + up processes. The Coda FS driver processes VFS- and pioctl-requests + on behalf of a process P, creates messages for Venus, awaits replies + and finally returns to the caller. The implementation of the exchange + of messages is platform specific, but the semantics have (so far) + appeared to be generally applicable. Data buffers are created by the + FS Driver in kernel memory on behalf of P and copied to user memory in + Venus. + + The FS Driver while servicing P makes upcalls to Venus. Such an + upcall is dispatched to Venus by creating a message structure. 
The + structure contains the identification of P, the message sequence + number, the size of the request and a pointer to the data in kernel + memory for the request. Since the data buffer is re-used to hold the + reply from Venus, there is a field for the size of the reply. A flags + field is used in the message to precisely record the status of the + message. Additional platform dependent structures involve pointers to + determine the position of the message on queues and pointers to + synchronization objects. In the upcall routine the message structure + is filled in, flags are set to 0, and it is placed on the *pending* + queue. The routine calling upcall is responsible for allocating the + data buffer; its structure will be described in the next section. + + A facility must exist to notify Venus that the message has been + created, and implemented using available synchronization objects in + the OS. This notification is done in the upcall context of the process + P. When the message is on the pending queue, process P cannot proceed + in upcall. The (kernel mode) processing of P in the filesystem + request routine must be suspended until Venus has replied. Therefore + the calling thread in P is blocked in upcall. A pointer in the + message structure will locate the synchronization object on which P is + sleeping. + + Venus detects the notification that a message has arrived, and the FS + driver allow Venus to retrieve the message with a getmsg_from_kernel + call. This action finishes in the kernel by putting the message on the + queue of processing messages and setting flags to READ. Venus is + passed the contents of the data buffer. The getmsg_from_kernel call + now returns and Venus processes the request. + + At some later point the FS driver receives a message from Venus, + namely when Venus calls sendmsg_to_kernel. 
At this moment the Coda FS + driver looks at the contents of the message and decides if: + + + * the message is a reply for a suspended thread P. If so it removes + the message from the processing queue and marks the message as + WRITTEN. Finally, the FS driver unblocks P (still in the kernel + mode context of Venus) and the sendmsg_to_kernel call returns to + Venus. The process P will be scheduled at some point and continues + processing its upcall with the data buffer replaced with the reply + from Venus. + + * The message is a ``downcall``. A downcall is a request from Venus to + the FS Driver. The FS driver processes the request immediately + (usually a cache eviction or replacement) and when it finishes + sendmsg_to_kernel returns. + + Now P awakes and continues processing upcall. There are some + subtleties to take account of. First P will determine if it was woken + up in upcall by a signal from some other source (for example an + attempt to terminate P) or as is normally the case by Venus in its + sendmsg_to_kernel call. In the normal case, the upcall routine will + deallocate the message structure and return. The FS routine can proceed + with its processing. + + + **Sleeping and IPC arrangements** + + In case P is woken up by a signal and not by Venus, it will first look + at the flags field. If the message is not yet READ, the process P can + handle its signal without notifying Venus. If Venus has READ, and + the request should not be processed, P can send Venus a signal message + to indicate that it should disregard the previous message. Such + signals are put in the queue at the head, and read first by Venus. If + the message is already marked as WRITTEN it is too late to stop the + processing. The VFS routine will now continue. 
(-- If a VFS request + involves more than one upcall, this can lead to complicated state, an + extra field "handle_signals" could be added in the message structure + to indicate points of no return have been passed.--) + + + +3.1. Implementation details +---------------------------- + + The Unix implementation of this mechanism has been through the + implementation of a character device associated with Coda. Venus + retrieves messages by doing a read on the device, replies are sent + with a write and notification is through the select system call on the + file descriptor for the device. The process P is kept waiting on an + interruptible wait queue object. + + In Windows NT and the DPMI Windows 95 implementation a DeviceIoControl + call is used. The DeviceIoControl call is designed to copy buffers + from user memory to kernel memory with OPCODES. The sendmsg_to_kernel + is issued as a synchronous call, while the getmsg_from_kernel call is + asynchronous. Windows EventObjects are used for notification of + message arrival. The process P is kept waiting on a KernelEvent + object in NT and a semaphore in Windows 95. + + +4. The interface at the call level +=================================== + + + This section describes the upcalls a Coda FS driver can make to Venus. + Each of these upcalls make use of two structures: inputArgs and + outputArgs. In pseudo BNF form the structures take the following + form:: + + + struct inputArgs { + u_long opcode; + u_long unique; /* Keep multiple outstanding msgs distinct */ + u_short pid; /* Common to all */ + u_short pgid; /* Common to all */ + struct CodaCred cred; /* Common to all */ + + + }; + + struct outputArgs { + u_long opcode; + u_long unique; /* Keep multiple outstanding msgs distinct */ + u_long result; + + + }; + + + + Before going on let us elucidate the role of the various fields. The + inputArgs start with the opcode which defines the type of service + requested from Venus. 
There are approximately 30 upcalls at present + which we will discuss. The unique field labels the inputArg with a + unique number which will identify the message uniquely. A process and + process group id are passed. Finally the credentials of the caller + are included. + + Before delving into the specific calls we need to discuss a variety of + data structures shared by the kernel and Venus. + + + + +4.1. Data structures shared by the kernel and Venus +---------------------------------------------------- + + + The CodaCred structure defines a variety of user and group ids as + they are set for the calling process. The vuid_t and vgid_t are 32 bit + unsigned integers. It also defines group membership in an array. On + Unix the CodaCred has proven sufficient to implement good security + semantics for Coda but the structure may have to undergo modification + for the Windows environment when these mature:: + + struct CodaCred { + vuid_t cr_uid, cr_euid, cr_suid, cr_fsuid; /* Real, effective, set, fs uid */ + vgid_t cr_gid, cr_egid, cr_sgid, cr_fsgid; /* same for groups */ + vgid_t cr_groups[NGROUPS]; /* Group membership for caller */ + }; + + + .. Note:: + + It is questionable if we need CodaCreds in Venus. Finally Venus + doesn't know about groups, although it does create files with the + default uid/gid. Perhaps the list of group membership is superfluous. + + + The next item is the fundamental identifier used to identify Coda + files, the ViceFid. A fid of a file uniquely defines a file or + directory in the Coda filesystem within a cell [1]_:: + + typedef struct ViceFid { + VolumeId Volume; + VnodeId Vnode; + Unique_t Unique; + } ViceFid; + + .. [1] A cell is agroup of Coda servers acting under the aegis of a single + system control machine or SCM. See the Coda Administration manual + for a detailed description of the role of the SCM. + + Each of the constituent fields: VolumeId, VnodeId and Unique_t are + unsigned 32 bit integers. 
We envisage that a further field will need + to be prefixed to identify the Coda cell; this will probably take the + form of a Ipv6 size IP address naming the Coda cell through DNS. + + The next important structure shared between Venus and the kernel is + the attributes of the file. The following structure is used to + exchange information. It has room for future extensions such as + support for device files (currently not present in Coda):: + + + struct coda_timespec { + int64_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ + }; + + struct coda_vattr { + enum coda_vtype va_type; /* vnode type (for create) */ + u_short va_mode; /* files access mode and type */ + short va_nlink; /* number of references to file */ + vuid_t va_uid; /* owner user id */ + vgid_t va_gid; /* owner group id */ + long va_fsid; /* file system id (dev for now) */ + long va_fileid; /* file id */ + u_quad_t va_size; /* file size in bytes */ + long va_blocksize; /* blocksize preferred for i/o */ + struct coda_timespec va_atime; /* time of last access */ + struct coda_timespec va_mtime; /* time of last modification */ + struct coda_timespec va_ctime; /* time file changed */ + u_long va_gen; /* generation number of file */ + u_long va_flags; /* flags defined for file */ + dev_t va_rdev; /* device special file represents */ + u_quad_t va_bytes; /* bytes of disk space held by file */ + u_quad_t va_filerev; /* file modification number */ + u_int va_vaflags; /* operations flags, see below */ + long va_spare; /* remain quad aligned */ + }; + + +4.2. The pioctl interface +-------------------------- + + + Coda specific requests can be made by application through the pioctl + interface. The pioctl is implemented as an ordinary ioctl on a + fictitious file /coda/.CONTROL. The pioctl call opens this file, gets + a file handle and makes the ioctl call. Finally it closes the file. 
+ + The kernel involvement in this is limited to providing the facility to + open and close and pass the ioctl message and to verify that a path in + the pioctl data buffers is a file in a Coda filesystem. + + The kernel is handed a data packet of the form:: + + struct { + const char *path; + struct ViceIoctl vidata; + int follow; + } data; + + + + where:: + + + struct ViceIoctl { + caddr_t in, out; /* Data to be transferred in, or out */ + short in_size; /* Size of input buffer <= 2K */ + short out_size; /* Maximum size of output buffer, <= 2K */ + }; + + + + The path must be a Coda file, otherwise the ioctl upcall will not be + made. + + .. Note:: The data structures and code are a mess. We need to clean this up. + + +**We now proceed to document the individual calls**: + + +4.3. root +---------- + + + Arguments + in + + empty + + out:: + + struct cfs_root_out { + ViceFid VFid; + } cfs_root; + + + + Description + This call is made to Venus during the initialization of + the Coda filesystem. If the result is zero, the cfs_root structure + contains the ViceFid of the root of the Coda filesystem. If a non-zero + result is generated, its value is a platform dependent error code + indicating the difficulty Venus encountered in locating the root of + the Coda filesystem. + +4.4. lookup +------------ + + + Summary + Find the ViceFid and type of an object in a directory if it exists. + + Arguments + in:: + + struct cfs_lookup_in { + ViceFid VFid; + char *name; /* Place holder for data. */ + } cfs_lookup; + + + + out:: + + struct cfs_lookup_out { + ViceFid VFid; + int vtype; + } cfs_lookup; + + + + Description + This call is made to determine the ViceFid and filetype of + a directory entry. The directory entry requested carries name name + and Venus will search the directory identified by cfs_lookup_in.VFid. + The result may indicate that the name does not exist, or that + difficulty was encountered in finding it (e.g. due to disconnection). 
+ If the result is zero, the field cfs_lookup_out.VFid contains the + targets ViceFid and cfs_lookup_out.vtype the coda_vtype giving the + type of object the name designates. + + The name of the object is an 8 bit character string of maximum length + CFS_MAXNAMLEN, currently set to 256 (including a 0 terminator.) + + It is extremely important to realize that Venus bitwise ors the field + cfs_lookup.vtype with CFS_NOCACHE to indicate that the object should + not be put in the kernel name cache. + + .. Note:: + + The type of the vtype is currently wrong. It should be + coda_vtype. Linux does not take note of CFS_NOCACHE. It should. + + +4.5. getattr +------------- + + + Summary Get the attributes of a file. + + Arguments + in:: + + struct cfs_getattr_in { + ViceFid VFid; + struct coda_vattr attr; /* XXXXX */ + } cfs_getattr; + + + + out:: + + struct cfs_getattr_out { + struct coda_vattr attr; + } cfs_getattr; + + + + Description + This call returns the attributes of the file identified by fid. + + Errors + Errors can occur if the object with fid does not exist, is + unaccessible or if the caller does not have permission to fetch + attributes. + + .. Note:: + + Many kernel FS drivers (Linux, NT and Windows 95) need to acquire + the attributes as well as the Fid for the instantiation of an internal + "inode" or "FileHandle". A significant improvement in performance on + such systems could be made by combining the lookup and getattr calls + both at the Venus/kernel interaction level and at the RPC level. + + The vattr structure included in the input arguments is superfluous and + should be removed. + + +4.6. setattr +------------- + + + Summary + Set the attributes of a file. + + Arguments + in:: + + struct cfs_setattr_in { + ViceFid VFid; + struct coda_vattr attr; + } cfs_setattr; + + + + + out + + empty + + Description + The structure attr is filled with attributes to be changed + in BSD style. 
Attributes not to be changed are set to -1, apart from + vtype which is set to VNON. Other are set to the value to be assigned. + The only attributes which the FS driver may request to change are the + mode, owner, groupid, atime, mtime and ctime. The return value + indicates success or failure. + + Errors + A variety of errors can occur. The object may not exist, may + be inaccessible, or permission may not be granted by Venus. + + +4.7. access +------------ + + + Arguments + in:: + + struct cfs_access_in { + ViceFid VFid; + int flags; + } cfs_access; + + + + out + + empty + + Description + Verify if access to the object identified by VFid for + operations described by flags is permitted. The result indicates if + access will be granted. It is important to remember that Coda uses + ACLs to enforce protection and that ultimately the servers, not the + clients enforce the security of the system. The result of this call + will depend on whether a token is held by the user. + + Errors + The object may not exist, or the ACL describing the protection + may not be accessible. + + +4.8. create +------------ + + + Summary + Invoked to create a file + + Arguments + in:: + + struct cfs_create_in { + ViceFid VFid; + struct coda_vattr attr; + int excl; + int mode; + char *name; /* Place holder for data. */ + } cfs_create; + + + + + out:: + + struct cfs_create_out { + ViceFid VFid; + struct coda_vattr attr; + } cfs_create; + + + + Description + This upcall is invoked to request creation of a file. + The file will be created in the directory identified by VFid, its name + will be name, and the mode will be mode. If excl is set an error will + be returned if the file already exists. If the size field in attr is + set to zero the file will be truncated. The uid and gid of the file + are set by converting the CodaCred to a uid using a macro CRTOUID + (this macro is platform dependent). Upon success the VFid and + attributes of the file are returned. 
The Coda FS Driver will normally + instantiate a vnode, inode or file handle at kernel level for the new + object. + + + Errors + A variety of errors can occur. Permissions may be insufficient. + If the object exists and is not a file the error EISDIR is returned + under Unix. + + .. Note:: + + The packing of parameters is very inefficient and appears to + indicate confusion between the system call creat and the VFS operation + create. The VFS operation create is only called to create new objects. + This create call differs from the Unix one in that it is not invoked + to return a file descriptor. The truncate and exclusive options, + together with the mode, could simply be part of the mode as it is + under Unix. There should be no flags argument; this is used in open + (2) to return a file descriptor for READ or WRITE mode. + + The attributes of the directory should be returned too, since the size + and mtime changed. + + +4.9. mkdir +----------- + + + Summary + Create a new directory. + + Arguments + in:: + + struct cfs_mkdir_in { + ViceFid VFid; + struct coda_vattr attr; + char *name; /* Place holder for data. */ + } cfs_mkdir; + + + + out:: + + struct cfs_mkdir_out { + ViceFid VFid; + struct coda_vattr attr; + } cfs_mkdir; + + + + + Description + This call is similar to create but creates a directory. + Only the mode field in the input parameters is used for creation. + Upon successful creation, the attr returned contains the attributes of + the new directory. + + Errors + As for create. + + .. Note:: + + The input parameter should be changed to mode instead of + attributes. + + The attributes of the parent should be returned since the size and + mtime changes. + + +4.10. link +----------- + + + Summary + Create a link to an existing file. + + Arguments + in:: + + struct cfs_link_in { + ViceFid sourceFid; /* cnode to link *to* */ + ViceFid destFid; /* Directory in which to place link */ + char *tname; /* Place holder for data. 
*/ + } cfs_link; + + + + out + + empty + + Description + This call creates a link to the sourceFid in the directory + identified by destFid with name tname. The source must reside in the + target's parent, i.e. the source must be have parent destFid, i.e. Coda + does not support cross directory hard links. Only the return value is + relevant. It indicates success or the type of failure. + + Errors + The usual errors can occur. + + +4.11. symlink +-------------- + + + Summary + create a symbolic link + + Arguments + in:: + + struct cfs_symlink_in { + ViceFid VFid; /* Directory to put symlink in */ + char *srcname; + struct coda_vattr attr; + char *tname; + } cfs_symlink; + + + + out + + none + + Description + Create a symbolic link. The link is to be placed in the + directory identified by VFid and named tname. It should point to the + pathname srcname. The attributes of the newly created object are to + be set to attr. + + .. Note:: + + The attributes of the target directory should be returned since + its size changed. + + +4.12. remove +------------- + + + Summary + Remove a file + + Arguments + in:: + + struct cfs_remove_in { + ViceFid VFid; + char *name; /* Place holder for data. */ + } cfs_remove; + + + + out + + none + + Description + Remove file named cfs_remove_in.name in directory + identified by VFid. + + + .. Note:: + + The attributes of the directory should be returned since its + mtime and size may change. + + +4.13. rmdir +------------ + + + Summary + Remove a directory + + Arguments + in:: + + struct cfs_rmdir_in { + ViceFid VFid; + char *name; /* Place holder for data. */ + } cfs_rmdir; + + + + out + + none + + Description + Remove the directory with name name from the directory + identified by VFid. + + .. Note:: The attributes of the parent directory should be returned since + its mtime and size may change. + + +4.14. readlink +--------------- + + + Summary + Read the value of a symbolic link. 
+ + Arguments + in:: + + struct cfs_readlink_in { + ViceFid VFid; + } cfs_readlink; + + + + out:: + + struct cfs_readlink_out { + int count; + caddr_t data; /* Place holder for data. */ + } cfs_readlink; + + + + Description + This routine reads the contents of symbolic link + identified by VFid into the buffer data. The buffer data must be able + to hold any name up to CFS_MAXNAMLEN (PATH or NAM??). + + Errors + No unusual errors. + + +4.15. open +----------- + + + Summary + Open a file. + + Arguments + in:: + + struct cfs_open_in { + ViceFid VFid; + int flags; + } cfs_open; + + + + out:: + + struct cfs_open_out { + dev_t dev; + ino_t inode; + } cfs_open; + + + + Description + This request asks Venus to place the file identified by + VFid in its cache and to note that the calling process wishes to open + it with flags as in open(2). The return value to the kernel differs + for Unix and Windows systems. For Unix systems the Coda FS Driver is + informed of the device and inode number of the container file in the + fields dev and inode. For Windows the path of the container file is + returned to the kernel. + + + .. Note:: + + Currently the cfs_open_out structure is not properly adapted to + deal with the Windows case. It might be best to implement two + upcalls, one to open aiming at a container file name, the other at a + container file inode. + + +4.16. close +------------ + + + Summary + Close a file, update it on the servers. + + Arguments + in:: + + struct cfs_close_in { + ViceFid VFid; + int flags; + } cfs_close; + + + + out + + none + + Description + Close the file identified by VFid. + + .. Note:: + + The flags argument is bogus and not used. However, Venus' code + has room to deal with an execp input field, probably this field should + be used to inform Venus that the file was closed but is still memory + mapped for execution. There are comments about fetching versus not + fetching the data in Venus vproc_vfscalls. This seems silly. 
If a + file is being closed, the data in the container file is to be the new + data. Here again the execp flag might be in play to create confusion: + currently Venus might think a file can be flushed from the cache when + it is still memory mapped. This needs to be understood. + + +4.17. ioctl +------------ + + + Summary + Do an ioctl on a file. This includes the pioctl interface. + + Arguments + in:: + + struct cfs_ioctl_in { + ViceFid VFid; + int cmd; + int len; + int rwflag; + char *data; /* Place holder for data. */ + } cfs_ioctl; + + + + out:: + + + struct cfs_ioctl_out { + int len; + caddr_t data; /* Place holder for data. */ + } cfs_ioctl; + + + + Description + Do an ioctl operation on a file. The command, len and + data arguments are filled as usual. flags is not used by Venus. + + .. Note:: + + Another bogus parameter. flags is not used. What is the + business about PREFETCHING in the Venus code? + + + +4.18. rename +------------- + + + Summary + Rename a fid. + + Arguments + in:: + + struct cfs_rename_in { + ViceFid sourceFid; + char *srcname; + ViceFid destFid; + char *destname; + } cfs_rename; + + + + out + + none + + Description + Rename the object with name srcname in directory + sourceFid to destname in destFid. It is important that the names + srcname and destname are 0 terminated strings. Strings in Unix + kernels are not always null terminated. + + +4.19. readdir +-------------- + + + Summary + Read directory entries. + + Arguments + in:: + + struct cfs_readdir_in { + ViceFid VFid; + int count; + int offset; + } cfs_readdir; + + + + + out:: + + struct cfs_readdir_out { + int size; + caddr_t data; /* Place holder for data. */ + } cfs_readdir; + + + + Description + Read directory entries from VFid starting at offset and + read at most count bytes. Returns the data in data and returns + the size in size. + + + .. Note:: + + This call is not used. Readdir operations exploit container + files. 
We will re-evaluate this during the directory revamp which is + about to take place. + + +4.20. vget +----------- + + + Summary + instructs Venus to do an FSDB->Get. + + Arguments + in:: + + struct cfs_vget_in { + ViceFid VFid; + } cfs_vget; + + + + out:: + + struct cfs_vget_out { + ViceFid VFid; + int vtype; + } cfs_vget; + + + + Description + This upcall asks Venus to do a get operation on an fsobj + labelled by VFid. + + .. Note:: + + This operation is not used. However, it is extremely useful + since it can be used to deal with read/write memory mapped files. + These can be "pinned" in the Venus cache using vget and released with + inactive. + + +4.21. fsync +------------ + + + Summary + Tell Venus to update the RVM attributes of a file. + + Arguments + in:: + + struct cfs_fsync_in { + ViceFid VFid; + } cfs_fsync; + + + + out + + none + + Description + Ask Venus to update RVM attributes of object VFid. This + should be called as part of kernel level fsync type calls. The + result indicates if the syncing was successful. + + .. Note:: Linux does not implement this call. It should. + + +4.22. inactive +--------------- + + + Summary + Tell Venus a vnode is no longer in use. + + Arguments + in:: + + struct cfs_inactive_in { + ViceFid VFid; + } cfs_inactive; + + + + out + + none + + Description + This operation returns EOPNOTSUPP. + + .. Note:: This should perhaps be removed. + + +4.23. rdwr +----------- + + + Summary + Read or write from a file + + Arguments + in:: + + struct cfs_rdwr_in { + ViceFid VFid; + int rwflag; + int count; + int offset; + int ioflag; + caddr_t data; /* Place holder for data. */ + } cfs_rdwr; + + + + + out:: + + struct cfs_rdwr_out { + int rwflag; + int count; + caddr_t data; /* Place holder for data. */ + } cfs_rdwr; + + + + Description + This upcall asks Venus to read or write from a file. + + + .. Note:: + + It should be removed since it is against the Coda philosophy that + read/write operations never reach Venus. 
I have been told the + operation does not work. It is not currently used. + + + +4.24. odymount +--------------- + + + Summary + Allows mounting multiple Coda "filesystems" on one Unix mount point. + + Arguments + in:: + + struct ody_mount_in { + char *name; /* Place holder for data. */ + } ody_mount; + + + + out:: + + struct ody_mount_out { + ViceFid VFid; + } ody_mount; + + + + Description + Asks Venus to return the rootfid of a Coda system named + name. The fid is returned in VFid. + + .. Note:: + + This call was used by David for dynamic sets. It should be + removed since it causes a jungle of pointers in the VFS mounting area. + It is not used by Coda proper. Call is not implemented by Venus. + + +4.25. ody_lookup +----------------- + + + Summary + Looks up something. + + Arguments + in + + irrelevant + + + out + + irrelevant + + + .. Note:: Gut it. Call is not implemented by Venus. + + +4.26. ody_expand +----------------- + + + Summary + expands something in a dynamic set. + + Arguments + in + + irrelevant + + out + + irrelevant + + .. Note:: Gut it. Call is not implemented by Venus. + + +4.27. prefetch +--------------- + + + Summary + Prefetch a dynamic set. + + Arguments + + in + + Not documented. + + out + + Not documented. + + Description + Venus worker.cc has support for this call, although it is + noted that it doesn't work. Not surprising, since the kernel does not + have support for it. (ODY_PREFETCH is not a defined operation). + + + .. Note:: Gut it. It isn't working and isn't used by Coda. + + + +4.28. signal +------------- + + + Summary + Send Venus a signal about an upcall. + + Arguments + in + + none + + out + + not applicable. + + Description + This is an out-of-band upcall to Venus to inform Venus + that the calling process received a signal after Venus read the + message from the input queue. Venus is supposed to clean up the + operation. + + Errors + No reply is given. + + .. 
Note:: + + We need to better understand what Venus needs to clean up and if + it is doing this correctly. Also we need to handle multiple upcall + per system call situations correctly. It would be important to know + what state changes in Venus take place after an upcall for which the + kernel is responsible for notifying Venus to clean up (e.g. open + definitely is such a state change, but many others are maybe not). + + +5. The minicache and downcalls +=============================== + + + The Coda FS Driver can cache results of lookup and access upcalls, to + limit the frequency of upcalls. Upcalls carry a price since a process + context switch needs to take place. The counterpart of caching the + information is that Venus will notify the FS Driver that cached + entries must be flushed or renamed. + + The kernel code generally has to maintain a structure which links the + internal file handles (called vnodes in BSD, inodes in Linux and + FileHandles in Windows) with the ViceFid's which Venus maintains. The + reason is that frequent translations back and forth are needed in + order to make upcalls and use the results of upcalls. Such linking + objects are called cnodes. + + The current minicache implementations have cache entries which record + the following: + + 1. the name of the file + + 2. the cnode of the directory containing the object + + 3. a list of CodaCred's for which the lookup is permitted. + + 4. the cnode of the object + + The lookup call in the Coda FS Driver may request the cnode of the + desired object from the cache, by passing its name, directory and the + CodaCred's of the caller. The cache will return the cnode or indicate + that it cannot be found. The Coda FS Driver must be careful to + invalidate cache entries when it modifies or removes objects. + + When Venus obtains information that indicates that cache entries are + no longer valid, it will make a downcall to the kernel. 
Downcalls are + intercepted by the Coda FS Driver and lead to cache invalidations of + the kind described below. The Coda FS Driver does not return an error + unless the downcall data could not be read into kernel memory. + + +5.1. INVALIDATE +---------------- + + + No information is available on this call. + + +5.2. FLUSH +----------- + + + + Arguments + None + + Summary + Flush the name cache entirely. + + Description + Venus issues this call upon startup and when it dies. This + is to prevent stale cache information being held. Some operating + systems allow the kernel name cache to be switched off dynamically. + When this is done, this downcall is made. + + +5.3. PURGEUSER +--------------- + + + Arguments + :: + + struct cfs_purgeuser_out {/* CFS_PURGEUSER is a venus->kernel call */ + struct CodaCred cred; + } cfs_purgeuser; + + + + Description + Remove all entries in the cache carrying the Cred. This + call is issued when tokens for a user expire or are flushed. + + +5.4. ZAPFILE +------------- + + + Arguments + :: + + struct cfs_zapfile_out { /* CFS_ZAPFILE is a venus->kernel call */ + ViceFid CodaFid; + } cfs_zapfile; + + + + Description + Remove all entries which have the (dir vnode, name) pair. + This is issued as a result of an invalidation of cached attributes of + a vnode. + + .. Note:: + + Call is not named correctly in NetBSD and Mach. The minicache + zapfile routine takes different arguments. Linux does not implement + the invalidation of attributes correctly. + + + +5.5. ZAPDIR +------------ + + + Arguments + :: + + struct cfs_zapdir_out { /* CFS_ZAPDIR is a venus->kernel call */ + ViceFid CodaFid; + } cfs_zapdir; + + + + Description + Remove all entries in the cache lying in a directory + CodaFid, and all children of this directory. This call is issued when + Venus receives a callback on the directory. + + +5.6. 
ZAPVNODE +-------------- + + + + Arguments + :: + + struct cfs_zapvnode_out { /* CFS_ZAPVNODE is a venus->kernel call */ + struct CodaCred cred; + ViceFid VFid; + } cfs_zapvnode; + + + + Description + Remove all entries in the cache carrying the cred and VFid + as in the arguments. This downcall is probably never issued. + + +5.7. PURGEFID +-------------- + + + Arguments + :: + + struct cfs_purgefid_out { /* CFS_PURGEFID is a venus->kernel call */ + ViceFid CodaFid; + } cfs_purgefid; + + + + Description + Flush the attribute for the file. If it is a dir (odd + vnode), purge its children from the namecache and remove the file from the + namecache. + + + +5.8. REPLACE +------------- + + + Summary + Replace the Fid's for a collection of names. + + Arguments + :: + + struct cfs_replace_out { /* cfs_replace is a venus->kernel call */ + ViceFid NewFid; + ViceFid OldFid; + } cfs_replace; + + + + Description + This routine replaces a ViceFid in the name cache with + another. It is added to allow Venus during reintegration to replace + locally allocated temp fids while disconnected with global fids even + when the reference counts on those fids are not zero. + + +6. Initialization and cleanup +============================== + + + This section gives brief hints as to desirable features for the Coda + FS Driver at startup and upon shutdown or Venus failures. Before + entering the discussion it is useful to repeat that the Coda FS Driver + maintains the following data: + + + 1. message queues + + 2. cnodes + + 3. name cache entries + + The name cache entries are entirely private to the driver, so they + can easily be manipulated. The message queues will generally have + clear points of initialization and destruction. The cnodes are + much more delicate. User processes hold reference counts in Coda + filesystems and it can be difficult to clean up the cnodes. + + It can expect requests through: + + 1. the message subsystem + + 2. the VFS layer + + 3. 
pioctl interface + + Currently the pioctl passes through the VFS for Coda so we can + treat these similarly. + + +6.1. Requirements +------------------ + + + The following requirements should be accommodated: + + 1. The message queues should have open and close routines. On Unix + the opening of the character devices are such routines. + + - Before opening, no messages can be placed. + + - Opening will remove any old messages still pending. + + - Close will notify any sleeping processes that their upcall cannot + be completed. + + - Close will free all memory allocated by the message queues. + + + 2. At open the namecache shall be initialized to empty state. + + 3. Before the message queues are open, all VFS operations will fail. + Fortunately this can be achieved by making sure than mounting the + Coda filesystem cannot succeed before opening. + + 4. After closing of the queues, no VFS operations can succeed. Here + one needs to be careful, since a few operations (lookup, + read/write, readdir) can proceed without upcalls. These must be + explicitly blocked. + + 5. Upon closing the namecache shall be flushed and disabled. + + 6. All memory held by cnodes can be freed without relying on upcalls. + + 7. Unmounting the file system can be done without relying on upcalls. + + 8. Mounting the Coda filesystem should fail gracefully if Venus cannot + get the rootfid or the attributes of the rootfid. The latter is + best implemented by Venus fetching these objects before attempting + to mount. + + .. Note:: + + NetBSD in particular but also Linux have not implemented the + above requirements fully. For smooth operation this needs to be + corrected. 
+ + + diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt deleted file mode 100644 index 1711ad48e38a..000000000000 --- a/Documentation/filesystems/coda.txt +++ /dev/null @@ -1,1676 +0,0 @@ -NOTE: -This is one of the technical documents describing a component of -Coda -- this document describes the client kernel-Venus interface. - -For more information: - http://www.coda.cs.cmu.edu -For user level software needed to run Coda: - ftp://ftp.coda.cs.cmu.edu - -To run Coda you need to get a user level cache manager for the client, -named Venus, as well as tools to manipulate ACLs, to log in, etc. The -client needs to have the Coda filesystem selected in the kernel -configuration. - -The server needs a user level server and at present does not depend on -kernel support. - - - - - - - - The Venus kernel interface - Peter J. Braam - v1.0, Nov 9, 1997 - - This document describes the communication between Venus and kernel - level filesystem code needed for the operation of the Coda file sys- - tem. This document version is meant to describe the current interface - (version 1.0) as well as improvements we envisage. - ______________________________________________________________________ - - Table of Contents - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1. Introduction - - 2. Servicing Coda filesystem calls - - 3. The message layer - - 3.1 Implementation details - - 4. The interface at the call level - - 4.1 Data structures shared by the kernel and Venus - 4.2 The pioctl interface - 4.3 root - 4.4 lookup - 4.5 getattr - 4.6 setattr - 4.7 access - 4.8 create - 4.9 mkdir - 4.10 link - 4.11 symlink - 4.12 remove - 4.13 rmdir - 4.14 readlink - 4.15 open - 4.16 close - 4.17 ioctl - 4.18 rename - 4.19 readdir - 4.20 vget - 4.21 fsync - 4.22 inactive - 4.23 rdwr - 4.24 odymount - 4.25 ody_lookup - 4.26 ody_expand - 4.27 prefetch - 4.28 signal - - 5. 
The minicache and downcalls - - 5.1 INVALIDATE - 5.2 FLUSH - 5.3 PURGEUSER - 5.4 ZAPFILE - 5.5 ZAPDIR - 5.6 ZAPVNODE - 5.7 PURGEFID - 5.8 REPLACE - - 6. Initialization and cleanup - - 6.1 Requirements - - - ______________________________________________________________________ - 0wpage - - 11.. IInnttrroodduuccttiioonn - - - - A key component in the Coda Distributed File System is the cache - manager, _V_e_n_u_s. - - - When processes on a Coda enabled system access files in the Coda - filesystem, requests are directed at the filesystem layer in the - operating system. The operating system will communicate with Venus to - service the request for the process. Venus manages a persistent - client cache and makes remote procedure calls to Coda file servers and - related servers (such as authentication servers) to service these - requests it receives from the operating system. When Venus has - serviced a request it replies to the operating system with appropriate - return codes, and other data related to the request. Optionally the - kernel support for Coda may maintain a minicache of recently processed - requests to limit the number of interactions with Venus. Venus - possesses the facility to inform the kernel when elements from its - minicache are no longer valid. - - This document describes precisely this communication between the - kernel and Venus. The definitions of so called upcalls and downcalls - will be given with the format of the data they handle. We shall also - describe the semantic invariants resulting from the calls. - - Historically Coda was implemented in a BSD file system in Mach 2.6. - The interface between the kernel and Venus is very similar to the BSD - VFS interface. Similar functionality is provided, and the format of - the parameters and returned data is very similar to the BSD VFS. This - leads to an almost natural environment for implementing a kernel-level - filesystem driver for Coda in a BSD system. 
However, other operating - systems such as Linux and Windows 95 and NT have virtual filesystem - with different interfaces. - - To implement Coda on these systems some reverse engineering of the - Venus/Kernel protocol is necessary. Also it came to light that other - systems could profit significantly from certain small optimizations - and modifications to the protocol. To facilitate this work as well as - to make future ports easier, communication between Venus and the - kernel should be documented in great detail. This is the aim of this - document. - - 0wpage - - 22.. SSeerrvviicciinngg CCooddaa ffiilleessyysstteemm ccaallllss - - The service of a request for a Coda file system service originates in - a process PP which accessing a Coda file. It makes a system call which - traps to the OS kernel. Examples of such calls trapping to the kernel - are _r_e_a_d_, _w_r_i_t_e_, _o_p_e_n_, _c_l_o_s_e_, _c_r_e_a_t_e_, _m_k_d_i_r_, _r_m_d_i_r_, _c_h_m_o_d in a Unix - context. Similar calls exist in the Win32 environment, and are named - _C_r_e_a_t_e_F_i_l_e_, . - - Generally the operating system handles the request in a virtual - filesystem (VFS) layer, which is named I/O Manager in NT and IFS - manager in Windows 95. The VFS is responsible for partial processing - of the request and for locating the specific filesystem(s) which will - service parts of the request. Usually the information in the path - assists in locating the correct FS drivers. Sometimes after extensive - pre-processing, the VFS starts invoking exported routines in the FS - driver. This is the point where the FS specific processing of the - request starts, and here the Coda specific kernel code comes into - play. - - The FS layer for Coda must expose and implement several interfaces. - First and foremost the VFS must be able to make all necessary calls to - the Coda FS layer, so the Coda FS driver must expose the VFS interface - as applicable in the operating system. 
These differ very significantly - among operating systems, but share features such as facilities to - read/write and create and remove objects. The Coda FS layer services - such VFS requests by invoking one or more well defined services - offered by the cache manager Venus. When the replies from Venus have - come back to the FS driver, servicing of the VFS call continues and - finishes with a reply to the kernel's VFS. Finally the VFS layer - returns to the process. - - As a result of this design a basic interface exposed by the FS driver - must allow Venus to manage message traffic. In particular Venus must - be able to retrieve and place messages and to be notified of the - arrival of a new message. The notification must be through a mechanism - which does not block Venus since Venus must attend to other tasks even - when no messages are waiting or being processed. - - - - - - - Interfaces of the Coda FS Driver - - Furthermore the FS layer provides for a special path of communication - between a user process and Venus, called the pioctl interface. The - pioctl interface is used for Coda specific services, such as - requesting detailed information about the persistent cache managed by - Venus. Here the involvement of the kernel is minimal. It identifies - the calling process and passes the information on to Venus. When - Venus replies the response is passed back to the caller in unmodified - form. - - Finally Venus allows the kernel FS driver to cache the results from - certain services. This is done to avoid excessive context switches - and results in an efficient system. However, Venus may acquire - information, for example from the network which implies that cached - information must be flushed or replaced. Venus then makes a downcall - to the Coda FS layer to request flushes or updates in the cache. The - kernel FS driver handles such requests synchronously. 
- - Among these interfaces the VFS interface and the facility to place, - receive and be notified of messages are platform specific. We will - not go into the calls exported to the VFS layer but we will state the - requirements of the message exchange mechanism. - - 0wpage - - 33.. TThhee mmeessssaaggee llaayyeerr - - - - At the lowest level the communication between Venus and the FS driver - proceeds through messages. The synchronization between processes - requesting Coda file service and Venus relies on blocking and waking - up processes. The Coda FS driver processes VFS- and pioctl-requests - on behalf of a process P, creates messages for Venus, awaits replies - and finally returns to the caller. The implementation of the exchange - of messages is platform specific, but the semantics have (so far) - appeared to be generally applicable. Data buffers are created by the - FS Driver in kernel memory on behalf of P and copied to user memory in - Venus. - - The FS Driver while servicing P makes upcalls to Venus. Such an - upcall is dispatched to Venus by creating a message structure. The - structure contains the identification of P, the message sequence - number, the size of the request and a pointer to the data in kernel - memory for the request. Since the data buffer is re-used to hold the - reply from Venus, there is a field for the size of the reply. A flags - field is used in the message to precisely record the status of the - message. Additional platform dependent structures involve pointers to - determine the position of the message on queues and pointers to - synchronization objects. In the upcall routine the message structure - is filled in, flags are set to 0, and it is placed on the _p_e_n_d_i_n_g - queue. The routine calling upcall is responsible for allocating the - data buffer; its structure will be described in the next section. 
- - A facility must exist to notify Venus that the message has been - created, and implemented using available synchronization objects in - the OS. This notification is done in the upcall context of the process - P. When the message is on the pending queue, process P cannot proceed - in upcall. The (kernel mode) processing of P in the filesystem - request routine must be suspended until Venus has replied. Therefore - the calling thread in P is blocked in upcall. A pointer in the - message structure will locate the synchronization object on which P is - sleeping. - - Venus detects the notification that a message has arrived, and the FS - driver allow Venus to retrieve the message with a getmsg_from_kernel - call. This action finishes in the kernel by putting the message on the - queue of processing messages and setting flags to READ. Venus is - passed the contents of the data buffer. The getmsg_from_kernel call - now returns and Venus processes the request. - - At some later point the FS driver receives a message from Venus, - namely when Venus calls sendmsg_to_kernel. At this moment the Coda FS - driver looks at the contents of the message and decides if: - - - +o the message is a reply for a suspended thread P. If so it removes - the message from the processing queue and marks the message as - WRITTEN. Finally, the FS driver unblocks P (still in the kernel - mode context of Venus) and the sendmsg_to_kernel call returns to - Venus. The process P will be scheduled at some point and continues - processing its upcall with the data buffer replaced with the reply - from Venus. - - +o The message is a _d_o_w_n_c_a_l_l. A downcall is a request from Venus to - the FS Driver. The FS driver processes the request immediately - (usually a cache eviction or replacement) and when it finishes - sendmsg_to_kernel returns. - - Now P awakes and continues processing upcall. There are some - subtleties to take account of. 
First P will determine if it was woken - up in upcall by a signal from some other source (for example an - attempt to terminate P) or as is normally the case by Venus in its - sendmsg_to_kernel call. In the normal case, the upcall routine will - deallocate the message structure and return. The FS routine can proceed - with its processing. - - - - - - - - Sleeping and IPC arrangements - - In case P is woken up by a signal and not by Venus, it will first look - at the flags field. If the message is not yet READ, the process P can - handle its signal without notifying Venus. If Venus has READ, and - the request should not be processed, P can send Venus a signal message - to indicate that it should disregard the previous message. Such - signals are put in the queue at the head, and read first by Venus. If - the message is already marked as WRITTEN it is too late to stop the - processing. The VFS routine will now continue. (-- If a VFS request - involves more than one upcall, this can lead to complicated state, an - extra field "handle_signals" could be added in the message structure - to indicate points of no return have been passed.--) - - - - 33..11.. IImmpplleemmeennttaattiioonn ddeettaaiillss - - The Unix implementation of this mechanism has been through the - implementation of a character device associated with Coda. Venus - retrieves messages by doing a read on the device, replies are sent - with a write and notification is through the select system call on the - file descriptor for the device. The process P is kept waiting on an - interruptible wait queue object. - - In Windows NT and the DPMI Windows 95 implementation a DeviceIoControl - call is used. The DeviceIoControl call is designed to copy buffers - from user memory to kernel memory with OPCODES. The sendmsg_to_kernel - is issued as a synchronous call, while the getmsg_from_kernel call is - asynchronous. Windows EventObjects are used for notification of - message arrival. 
The process P is kept waiting on a KernelEvent - object in NT and a semaphore in Windows 95. - - 0wpage - - 44.. TThhee iinntteerrffaaccee aatt tthhee ccaallll lleevveell - - - This section describes the upcalls a Coda FS driver can make to Venus. - Each of these upcalls make use of two structures: inputArgs and - outputArgs. In pseudo BNF form the structures take the following - form: - - - struct inputArgs { - u_long opcode; - u_long unique; /* Keep multiple outstanding msgs distinct */ - u_short pid; /* Common to all */ - u_short pgid; /* Common to all */ - struct CodaCred cred; /* Common to all */ - - - }; - - struct outputArgs { - u_long opcode; - u_long unique; /* Keep multiple outstanding msgs distinct */ - u_long result; - - - }; - - - - Before going on let us elucidate the role of the various fields. The - inputArgs start with the opcode which defines the type of service - requested from Venus. There are approximately 30 upcalls at present - which we will discuss. The unique field labels the inputArg with a - unique number which will identify the message uniquely. A process and - process group id are passed. Finally the credentials of the caller - are included. - - Before delving into the specific calls we need to discuss a variety of - data structures shared by the kernel and Venus. - - - - - 44..11.. DDaattaa ssttrruuccttuurreess sshhaarreedd bbyy tthhee kkeerrnneell aanndd VVeennuuss - - - The CodaCred structure defines a variety of user and group ids as - they are set for the calling process. The vuid_t and vgid_t are 32 bit - unsigned integers. It also defines group membership in an array. On - Unix the CodaCred has proven sufficient to implement good security - semantics for Coda but the structure may have to undergo modification - for the Windows environment when these mature. 
- - struct CodaCred { - vuid_t cr_uid, cr_euid, cr_suid, cr_fsuid; /* Real, effective, set, fs uid */ - vgid_t cr_gid, cr_egid, cr_sgid, cr_fsgid; /* same for groups */ - vgid_t cr_groups[NGROUPS]; /* Group membership for caller */ - }; - - - - NNOOTTEE It is questionable if we need CodaCreds in Venus. Finally Venus - doesn't know about groups, although it does create files with the - default uid/gid. Perhaps the list of group membership is superfluous. - - - The next item is the fundamental identifier used to identify Coda - files, the ViceFid. A fid of a file uniquely defines a file or - directory in the Coda filesystem within a _c_e_l_l. (-- A _c_e_l_l is a - group of Coda servers acting under the aegis of a single system - control machine or SCM. See the Coda Administration manual for a - detailed description of the role of the SCM.--) - - - typedef struct ViceFid { - VolumeId Volume; - VnodeId Vnode; - Unique_t Unique; - } ViceFid; - - - - Each of the constituent fields: VolumeId, VnodeId and Unique_t are - unsigned 32 bit integers. We envisage that a further field will need - to be prefixed to identify the Coda cell; this will probably take the - form of a Ipv6 size IP address naming the Coda cell through DNS. - - The next important structure shared between Venus and the kernel is - the attributes of the file. The following structure is used to - exchange information. It has room for future extensions such as - support for device files (currently not present in Coda). 
- - - - - - - - - - - - - - - - - struct coda_timespec { - int64_t tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ - }; - - struct coda_vattr { - enum coda_vtype va_type; /* vnode type (for create) */ - u_short va_mode; /* files access mode and type */ - short va_nlink; /* number of references to file */ - vuid_t va_uid; /* owner user id */ - vgid_t va_gid; /* owner group id */ - long va_fsid; /* file system id (dev for now) */ - long va_fileid; /* file id */ - u_quad_t va_size; /* file size in bytes */ - long va_blocksize; /* blocksize preferred for i/o */ - struct coda_timespec va_atime; /* time of last access */ - struct coda_timespec va_mtime; /* time of last modification */ - struct coda_timespec va_ctime; /* time file changed */ - u_long va_gen; /* generation number of file */ - u_long va_flags; /* flags defined for file */ - dev_t va_rdev; /* device special file represents */ - u_quad_t va_bytes; /* bytes of disk space held by file */ - u_quad_t va_filerev; /* file modification number */ - u_int va_vaflags; /* operations flags, see below */ - long va_spare; /* remain quad aligned */ - }; - - - - - 44..22.. TThhee ppiiooccttll iinntteerrffaaccee - - - Coda specific requests can be made by application through the pioctl - interface. The pioctl is implemented as an ordinary ioctl on a - fictitious file /coda/.CONTROL. The pioctl call opens this file, gets - a file handle and makes the ioctl call. Finally it closes the file. - - The kernel involvement in this is limited to providing the facility to - open and close and pass the ioctl message _a_n_d to verify that a path in - the pioctl data buffers is a file in a Coda filesystem. 
- - The kernel is handed a data packet of the form: - - struct { - const char *path; - struct ViceIoctl vidata; - int follow; - } data; - - - - where - - - struct ViceIoctl { - caddr_t in, out; /* Data to be transferred in, or out */ - short in_size; /* Size of input buffer <= 2K */ - short out_size; /* Maximum size of output buffer, <= 2K */ - }; - - - - The path must be a Coda file, otherwise the ioctl upcall will not be - made. - - NNOOTTEE The data structures and code are a mess. We need to clean this - up. - - We now proceed to document the individual calls: - - 0wpage - - 44..33.. rroooott - - - AArrgguummeennttss - - iinn empty - - oouutt - - struct cfs_root_out { - ViceFid VFid; - } cfs_root; - - - - DDeessccrriippttiioonn This call is made to Venus during the initialization of - the Coda filesystem. If the result is zero, the cfs_root structure - contains the ViceFid of the root of the Coda filesystem. If a non-zero - result is generated, its value is a platform dependent error code - indicating the difficulty Venus encountered in locating the root of - the Coda filesystem. - - 0wpage - - 44..44.. llooookkuupp - - - SSuummmmaarryy Find the ViceFid and type of an object in a directory if it - exists. - - AArrgguummeennttss - - iinn - - struct cfs_lookup_in { - ViceFid VFid; - char *name; /* Place holder for data. */ - } cfs_lookup; - - - - oouutt - - struct cfs_lookup_out { - ViceFid VFid; - int vtype; - } cfs_lookup; - - - - DDeessccrriippttiioonn This call is made to determine the ViceFid and filetype of - a directory entry. The directory entry requested carries name name - and Venus will search the directory identified by cfs_lookup_in.VFid. - The result may indicate that the name does not exist, or that - difficulty was encountered in finding it (e.g. due to disconnection). - If the result is zero, the field cfs_lookup_out.VFid contains the - targets ViceFid and cfs_lookup_out.vtype the coda_vtype giving the - type of object the name designates. 
- - The name of the object is an 8 bit character string of maximum length - CFS_MAXNAMLEN, currently set to 256 (including a 0 terminator.) - - It is extremely important to realize that Venus bitwise ors the field - cfs_lookup.vtype with CFS_NOCACHE to indicate that the object should - not be put in the kernel name cache. - - NNOOTTEE The type of the vtype is currently wrong. It should be - coda_vtype. Linux does not take note of CFS_NOCACHE. It should. - - 0wpage - - 44..55.. ggeettaattttrr - - - SSuummmmaarryy Get the attributes of a file. - - AArrgguummeennttss - - iinn - - struct cfs_getattr_in { - ViceFid VFid; - struct coda_vattr attr; /* XXXXX */ - } cfs_getattr; - - - - oouutt - - struct cfs_getattr_out { - struct coda_vattr attr; - } cfs_getattr; - - - - DDeessccrriippttiioonn This call returns the attributes of the file identified by - fid. - - EErrrroorrss Errors can occur if the object with fid does not exist, is - unaccessible or if the caller does not have permission to fetch - attributes. - - NNoottee Many kernel FS drivers (Linux, NT and Windows 95) need to acquire - the attributes as well as the Fid for the instantiation of an internal - "inode" or "FileHandle". A significant improvement in performance on - such systems could be made by combining the _l_o_o_k_u_p and _g_e_t_a_t_t_r calls - both at the Venus/kernel interaction level and at the RPC level. - - The vattr structure included in the input arguments is superfluous and - should be removed. - - 0wpage - - 44..66.. sseettaattttrr - - - SSuummmmaarryy Set the attributes of a file. - - AArrgguummeennttss - - iinn - - struct cfs_setattr_in { - ViceFid VFid; - struct coda_vattr attr; - } cfs_setattr; - - - - - oouutt - empty - - DDeessccrriippttiioonn The structure attr is filled with attributes to be changed - in BSD style. Attributes not to be changed are set to -1, apart from - vtype which is set to VNON. Other are set to the value to be assigned. 
- The only attributes which the FS driver may request to change are the - mode, owner, groupid, atime, mtime and ctime. The return value - indicates success or failure. - - EErrrroorrss A variety of errors can occur. The object may not exist, may - be inaccessible, or permission may not be granted by Venus. - - 0wpage - - 44..77.. aacccceessss - - - SSuummmmaarryy - - AArrgguummeennttss - - iinn - - struct cfs_access_in { - ViceFid VFid; - int flags; - } cfs_access; - - - - oouutt - empty - - DDeessccrriippttiioonn Verify if access to the object identified by VFid for - operations described by flags is permitted. The result indicates if - access will be granted. It is important to remember that Coda uses - ACLs to enforce protection and that ultimately the servers, not the - clients enforce the security of the system. The result of this call - will depend on whether a _t_o_k_e_n is held by the user. - - EErrrroorrss The object may not exist, or the ACL describing the protection - may not be accessible. - - 0wpage - - 44..88.. ccrreeaattee - - - SSuummmmaarryy Invoked to create a file - - AArrgguummeennttss - - iinn - - struct cfs_create_in { - ViceFid VFid; - struct coda_vattr attr; - int excl; - int mode; - char *name; /* Place holder for data. */ - } cfs_create; - - - - - oouutt - - struct cfs_create_out { - ViceFid VFid; - struct coda_vattr attr; - } cfs_create; - - - - DDeessccrriippttiioonn This upcall is invoked to request creation of a file. - The file will be created in the directory identified by VFid, its name - will be name, and the mode will be mode. If excl is set an error will - be returned if the file already exists. If the size field in attr is - set to zero the file will be truncated. The uid and gid of the file - are set by converting the CodaCred to a uid using a macro CRTOUID - (this macro is platform dependent). Upon success the VFid and - attributes of the file are returned. 
The Coda FS Driver will normally - instantiate a vnode, inode or file handle at kernel level for the new - object. - - - EErrrroorrss A variety of errors can occur. Permissions may be insufficient. - If the object exists and is not a file the error EISDIR is returned - under Unix. - - NNOOTTEE The packing of parameters is very inefficient and appears to - indicate confusion between the system call creat and the VFS operation - create. The VFS operation create is only called to create new objects. - This create call differs from the Unix one in that it is not invoked - to return a file descriptor. The truncate and exclusive options, - together with the mode, could simply be part of the mode as it is - under Unix. There should be no flags argument; this is used in open - (2) to return a file descriptor for READ or WRITE mode. - - The attributes of the directory should be returned too, since the size - and mtime changed. - - 0wpage - - 44..99.. mmkkddiirr - - - SSuummmmaarryy Create a new directory. - - AArrgguummeennttss - - iinn - - struct cfs_mkdir_in { - ViceFid VFid; - struct coda_vattr attr; - char *name; /* Place holder for data. */ - } cfs_mkdir; - - - - oouutt - - struct cfs_mkdir_out { - ViceFid VFid; - struct coda_vattr attr; - } cfs_mkdir; - - - - - DDeessccrriippttiioonn This call is similar to create but creates a directory. - Only the mode field in the input parameters is used for creation. - Upon successful creation, the attr returned contains the attributes of - the new directory. - - EErrrroorrss As for create. - - NNOOTTEE The input parameter should be changed to mode instead of - attributes. - - The attributes of the parent should be returned since the size and - mtime changes. - - 0wpage - - 44..1100.. lliinnkk - - - SSuummmmaarryy Create a link to an existing file. 
- - AArrgguummeennttss - - iinn - - struct cfs_link_in { - ViceFid sourceFid; /* cnode to link *to* */ - ViceFid destFid; /* Directory in which to place link */ - char *tname; /* Place holder for data. */ - } cfs_link; - - - - oouutt - empty - - DDeessccrriippttiioonn This call creates a link to the sourceFid in the directory - identified by destFid with name tname. The source must reside in the - target's parent, i.e. the source must be have parent destFid, i.e. Coda - does not support cross directory hard links. Only the return value is - relevant. It indicates success or the type of failure. - - EErrrroorrss The usual errors can occur.0wpage - - 44..1111.. ssyymmlliinnkk - - - SSuummmmaarryy create a symbolic link - - AArrgguummeennttss - - iinn - - struct cfs_symlink_in { - ViceFid VFid; /* Directory to put symlink in */ - char *srcname; - struct coda_vattr attr; - char *tname; - } cfs_symlink; - - - - oouutt - none - - DDeessccrriippttiioonn Create a symbolic link. The link is to be placed in the - directory identified by VFid and named tname. It should point to the - pathname srcname. The attributes of the newly created object are to - be set to attr. - - EErrrroorrss - - NNOOTTEE The attributes of the target directory should be returned since - its size changed. - - 0wpage - - 44..1122.. rreemmoovvee - - - SSuummmmaarryy Remove a file - - AArrgguummeennttss - - iinn - - struct cfs_remove_in { - ViceFid VFid; - char *name; /* Place holder for data. */ - } cfs_remove; - - - - oouutt - none - - DDeessccrriippttiioonn Remove file named cfs_remove_in.name in directory - identified by VFid. - - EErrrroorrss - - NNOOTTEE The attributes of the directory should be returned since its - mtime and size may change. - - 0wpage - - 44..1133.. rrmmddiirr - - - SSuummmmaarryy Remove a directory - - AArrgguummeennttss - - iinn - - struct cfs_rmdir_in { - ViceFid VFid; - char *name; /* Place holder for data. 
*/ - } cfs_rmdir; - - - - oouutt - none - - DDeessccrriippttiioonn Remove the directory with name name from the directory - identified by VFid. - - EErrrroorrss - - NNOOTTEE The attributes of the parent directory should be returned since - its mtime and size may change. - - 0wpage - - 44..1144.. rreeaaddlliinnkk - - - SSuummmmaarryy Read the value of a symbolic link. - - AArrgguummeennttss - - iinn - - struct cfs_readlink_in { - ViceFid VFid; - } cfs_readlink; - - - - oouutt - - struct cfs_readlink_out { - int count; - caddr_t data; /* Place holder for data. */ - } cfs_readlink; - - - - DDeessccrriippttiioonn This routine reads the contents of symbolic link - identified by VFid into the buffer data. The buffer data must be able - to hold any name up to CFS_MAXNAMLEN (PATH or NAM??). - - EErrrroorrss No unusual errors. - - 0wpage - - 44..1155.. ooppeenn - - - SSuummmmaarryy Open a file. - - AArrgguummeennttss - - iinn - - struct cfs_open_in { - ViceFid VFid; - int flags; - } cfs_open; - - - - oouutt - - struct cfs_open_out { - dev_t dev; - ino_t inode; - } cfs_open; - - - - DDeessccrriippttiioonn This request asks Venus to place the file identified by - VFid in its cache and to note that the calling process wishes to open - it with flags as in open(2). The return value to the kernel differs - for Unix and Windows systems. For Unix systems the Coda FS Driver is - informed of the device and inode number of the container file in the - fields dev and inode. For Windows the path of the container file is - returned to the kernel. - EErrrroorrss - - NNOOTTEE Currently the cfs_open_out structure is not properly adapted to - deal with the Windows case. It might be best to implement two - upcalls, one to open aiming at a container file name, the other at a - container file inode. - - 0wpage - - 44..1166.. cclloossee - - - SSuummmmaarryy Close a file, update it on the servers. 
- - AArrgguummeennttss - - iinn - - struct cfs_close_in { - ViceFid VFid; - int flags; - } cfs_close; - - - - oouutt - none - - DDeessccrriippttiioonn Close the file identified by VFid. - - EErrrroorrss - - NNOOTTEE The flags argument is bogus and not used. However, Venus' code - has room to deal with an execp input field, probably this field should - be used to inform Venus that the file was closed but is still memory - mapped for execution. There are comments about fetching versus not - fetching the data in Venus vproc_vfscalls. This seems silly. If a - file is being closed, the data in the container file is to be the new - data. Here again the execp flag might be in play to create confusion: - currently Venus might think a file can be flushed from the cache when - it is still memory mapped. This needs to be understood. - - 0wpage - - 44..1177.. iiooccttll - - - SSuummmmaarryy Do an ioctl on a file. This includes the pioctl interface. - - AArrgguummeennttss - - iinn - - struct cfs_ioctl_in { - ViceFid VFid; - int cmd; - int len; - int rwflag; - char *data; /* Place holder for data. */ - } cfs_ioctl; - - - - oouutt - - - struct cfs_ioctl_out { - int len; - caddr_t data; /* Place holder for data. */ - } cfs_ioctl; - - - - DDeessccrriippttiioonn Do an ioctl operation on a file. The command, len and - data arguments are filled as usual. flags is not used by Venus. - - EErrrroorrss - - NNOOTTEE Another bogus parameter. flags is not used. What is the - business about PREFETCHING in the Venus code? - - - 0wpage - - 44..1188.. rreennaammee - - - SSuummmmaarryy Rename a fid. - - AArrgguummeennttss - - iinn - - struct cfs_rename_in { - ViceFid sourceFid; - char *srcname; - ViceFid destFid; - char *destname; - } cfs_rename; - - - - oouutt - none - - DDeessccrriippttiioonn Rename the object with name srcname in directory - sourceFid to destname in destFid. It is important that the names - srcname and destname are 0 terminated strings. 
Strings in Unix - kernels are not always null terminated. - - EErrrroorrss - - 0wpage - - 44..1199.. rreeaaddddiirr - - - SSuummmmaarryy Read directory entries. - - AArrgguummeennttss - - iinn - - struct cfs_readdir_in { - ViceFid VFid; - int count; - int offset; - } cfs_readdir; - - - - - oouutt - - struct cfs_readdir_out { - int size; - caddr_t data; /* Place holder for data. */ - } cfs_readdir; - - - - DDeessccrriippttiioonn Read directory entries from VFid starting at offset and - read at most count bytes. Returns the data in data and returns - the size in size. - - EErrrroorrss - - NNOOTTEE This call is not used. Readdir operations exploit container - files. We will re-evaluate this during the directory revamp which is - about to take place. - - 0wpage - - 44..2200.. vvggeett - - - SSuummmmaarryy instructs Venus to do an FSDB->Get. - - AArrgguummeennttss - - iinn - - struct cfs_vget_in { - ViceFid VFid; - } cfs_vget; - - - - oouutt - - struct cfs_vget_out { - ViceFid VFid; - int vtype; - } cfs_vget; - - - - DDeessccrriippttiioonn This upcall asks Venus to do a get operation on an fsobj - labelled by VFid. - - EErrrroorrss - - NNOOTTEE This operation is not used. However, it is extremely useful - since it can be used to deal with read/write memory mapped files. - These can be "pinned" in the Venus cache using vget and released with - inactive. - - 0wpage - - 44..2211.. ffssyynncc - - - SSuummmmaarryy Tell Venus to update the RVM attributes of a file. - - AArrgguummeennttss - - iinn - - struct cfs_fsync_in { - ViceFid VFid; - } cfs_fsync; - - - - oouutt - none - - DDeessccrriippttiioonn Ask Venus to update RVM attributes of object VFid. This - should be called as part of kernel level fsync type calls. The - result indicates if the syncing was successful. - - EErrrroorrss - - NNOOTTEE Linux does not implement this call. It should. - - 0wpage - - 44..2222.. iinnaaccttiivvee - - - SSuummmmaarryy Tell Venus a vnode is no longer in use. 
- - AArrgguummeennttss - - iinn - - struct cfs_inactive_in { - ViceFid VFid; - } cfs_inactive; - - - - oouutt - none - - DDeessccrriippttiioonn This operation returns EOPNOTSUPP. - - EErrrroorrss - - NNOOTTEE This should perhaps be removed. - - 0wpage - - 44..2233.. rrddwwrr - - - SSuummmmaarryy Read or write from a file - - AArrgguummeennttss - - iinn - - struct cfs_rdwr_in { - ViceFid VFid; - int rwflag; - int count; - int offset; - int ioflag; - caddr_t data; /* Place holder for data. */ - } cfs_rdwr; - - - - - oouutt - - struct cfs_rdwr_out { - int rwflag; - int count; - caddr_t data; /* Place holder for data. */ - } cfs_rdwr; - - - - DDeessccrriippttiioonn This upcall asks Venus to read or write from a file. - - EErrrroorrss - - NNOOTTEE It should be removed since it is against the Coda philosophy that - read/write operations never reach Venus. I have been told the - operation does not work. It is not currently used. - - - 0wpage - - 44..2244.. ooddyymmoouunntt - - - SSuummmmaarryy Allows mounting multiple Coda "filesystems" on one Unix mount - point. - - AArrgguummeennttss - - iinn - - struct ody_mount_in { - char *name; /* Place holder for data. */ - } ody_mount; - - - - oouutt - - struct ody_mount_out { - ViceFid VFid; - } ody_mount; - - - - DDeessccrriippttiioonn Asks Venus to return the rootfid of a Coda system named - name. The fid is returned in VFid. - - EErrrroorrss - - NNOOTTEE This call was used by David for dynamic sets. It should be - removed since it causes a jungle of pointers in the VFS mounting area. - It is not used by Coda proper. Call is not implemented by Venus. - - 0wpage - - 44..2255.. ooddyy__llooookkuupp - - - SSuummmmaarryy Looks up something. - - AArrgguummeennttss - - iinn irrelevant - - - oouutt - irrelevant - - DDeessccrriippttiioonn - - EErrrroorrss - - NNOOTTEE Gut it. Call is not implemented by Venus. - - 0wpage - - 44..2266.. ooddyy__eexxppaanndd - - - SSuummmmaarryy expands something in a dynamic set. 
- - AArrgguummeennttss - - iinn irrelevant - - oouutt - irrelevant - - DDeessccrriippttiioonn - - EErrrroorrss - - NNOOTTEE Gut it. Call is not implemented by Venus. - - 0wpage - - 44..2277.. pprreeffeettcchh - - - SSuummmmaarryy Prefetch a dynamic set. - - AArrgguummeennttss - - iinn Not documented. - - oouutt - Not documented. - - DDeessccrriippttiioonn Venus worker.cc has support for this call, although it is - noted that it doesn't work. Not surprising, since the kernel does not - have support for it. (ODY_PREFETCH is not a defined operation). - - EErrrroorrss - - NNOOTTEE Gut it. It isn't working and isn't used by Coda. - - - 0wpage - - 44..2288.. ssiiggnnaall - - - SSuummmmaarryy Send Venus a signal about an upcall. - - AArrgguummeennttss - - iinn none - - oouutt - not applicable. - - DDeessccrriippttiioonn This is an out-of-band upcall to Venus to inform Venus - that the calling process received a signal after Venus read the - message from the input queue. Venus is supposed to clean up the - operation. - - EErrrroorrss No reply is given. - - NNOOTTEE We need to better understand what Venus needs to clean up and if - it is doing this correctly. Also we need to handle multiple upcall - per system call situations correctly. It would be important to know - what state changes in Venus take place after an upcall for which the - kernel is responsible for notifying Venus to clean up (e.g. open - definitely is such a state change, but many others are maybe not). - - 0wpage - - 55.. TThhee mmiinniiccaacchhee aanndd ddoowwnnccaallllss - - - The Coda FS Driver can cache results of lookup and access upcalls, to - limit the frequency of upcalls. Upcalls carry a price since a process - context switch needs to take place. The counterpart of caching the - information is that Venus will notify the FS Driver that cached - entries must be flushed or renamed. 
- - The kernel code generally has to maintain a structure which links the - internal file handles (called vnodes in BSD, inodes in Linux and - FileHandles in Windows) with the ViceFid's which Venus maintains. The - reason is that frequent translations back and forth are needed in - order to make upcalls and use the results of upcalls. Such linking - objects are called ccnnooddeess. - - The current minicache implementations have cache entries which record - the following: - - 1. the name of the file - - 2. the cnode of the directory containing the object - - 3. a list of CodaCred's for which the lookup is permitted. - - 4. the cnode of the object - - The lookup call in the Coda FS Driver may request the cnode of the - desired object from the cache, by passing its name, directory and the - CodaCred's of the caller. The cache will return the cnode or indicate - that it cannot be found. The Coda FS Driver must be careful to - invalidate cache entries when it modifies or removes objects. - - When Venus obtains information that indicates that cache entries are - no longer valid, it will make a downcall to the kernel. Downcalls are - intercepted by the Coda FS Driver and lead to cache invalidations of - the kind described below. The Coda FS Driver does not return an error - unless the downcall data could not be read into kernel memory. - - - 55..11.. IINNVVAALLIIDDAATTEE - - - No information is available on this call. - - - 55..22.. FFLLUUSSHH - - - - AArrgguummeennttss None - - SSuummmmaarryy Flush the name cache entirely. - - DDeessccrriippttiioonn Venus issues this call upon startup and when it dies. This - is to prevent stale cache information being held. Some operating - systems allow the kernel name cache to be switched off dynamically. - When this is done, this downcall is made. - - - 55..33.. 
PPUURRGGEEUUSSEERR - - - AArrgguummeennttss - - struct cfs_purgeuser_out {/* CFS_PURGEUSER is a venus->kernel call */ - struct CodaCred cred; - } cfs_purgeuser; - - - - DDeessccrriippttiioonn Remove all entries in the cache carrying the Cred. This - call is issued when tokens for a user expire or are flushed. - - - 55..44.. ZZAAPPFFIILLEE - - - AArrgguummeennttss - - struct cfs_zapfile_out { /* CFS_ZAPFILE is a venus->kernel call */ - ViceFid CodaFid; - } cfs_zapfile; - - - - DDeessccrriippttiioonn Remove all entries which have the (dir vnode, name) pair. - This is issued as a result of an invalidation of cached attributes of - a vnode. - - NNOOTTEE Call is not named correctly in NetBSD and Mach. The minicache - zapfile routine takes different arguments. Linux does not implement - the invalidation of attributes correctly. - - - - 55..55.. ZZAAPPDDIIRR - - - AArrgguummeennttss - - struct cfs_zapdir_out { /* CFS_ZAPDIR is a venus->kernel call */ - ViceFid CodaFid; - } cfs_zapdir; - - - - DDeessccrriippttiioonn Remove all entries in the cache lying in a directory - CodaFid, and all children of this directory. This call is issued when - Venus receives a callback on the directory. - - - 55..66.. ZZAAPPVVNNOODDEE - - - - AArrgguummeennttss - - struct cfs_zapvnode_out { /* CFS_ZAPVNODE is a venus->kernel call */ - struct CodaCred cred; - ViceFid VFid; - } cfs_zapvnode; - - - - DDeessccrriippttiioonn Remove all entries in the cache carrying the cred and VFid - as in the arguments. This downcall is probably never issued. - - - 55..77.. PPUURRGGEEFFIIDD - - - SSuummmmaarryy - - AArrgguummeennttss - - struct cfs_purgefid_out { /* CFS_PURGEFID is a venus->kernel call */ - ViceFid CodaFid; - } cfs_purgefid; - - - - DDeessccrriippttiioonn Flush the attribute for the file. If it is a dir (odd - vnode), purge its children from the namecache and remove the file from the - namecache. - - - - 55..88.. RREEPPLLAACCEE - - - SSuummmmaarryy Replace the Fid's for a collection of names. 
- - AArrgguummeennttss - - struct cfs_replace_out { /* cfs_replace is a venus->kernel call */ - ViceFid NewFid; - ViceFid OldFid; - } cfs_replace; - - - - DDeessccrriippttiioonn This routine replaces a ViceFid in the name cache with - another. It is added to allow Venus during reintegration to replace - locally allocated temp fids while disconnected with global fids even - when the reference counts on those fids are not zero. - - 0wpage - - 66.. IInniittiiaalliizzaattiioonn aanndd cclleeaannuupp - - - This section gives brief hints as to desirable features for the Coda - FS Driver at startup and upon shutdown or Venus failures. Before - entering the discussion it is useful to repeat that the Coda FS Driver - maintains the following data: - - - 1. message queues - - 2. cnodes - - 3. name cache entries - - The name cache entries are entirely private to the driver, so they - can easily be manipulated. The message queues will generally have - clear points of initialization and destruction. The cnodes are - much more delicate. User processes hold reference counts in Coda - filesystems and it can be difficult to clean up the cnodes. - - It can expect requests through: - - 1. the message subsystem - - 2. the VFS layer - - 3. pioctl interface - - Currently the _p_i_o_c_t_l passes through the VFS for Coda so we can - treat these similarly. - - - 66..11.. RReeqquuiirreemmeennttss - - - The following requirements should be accommodated: - - 1. The message queues should have open and close routines. On Unix - the opening of the character devices are such routines. - - +o Before opening, no messages can be placed. - - +o Opening will remove any old messages still pending. - - +o Close will notify any sleeping processes that their upcall cannot - be completed. - - +o Close will free all memory allocated by the message queues. - - - 2. At open the namecache shall be initialized to empty state. - - 3. Before the message queues are open, all VFS operations will fail. 
- Fortunately this can be achieved by making sure than mounting the - Coda filesystem cannot succeed before opening. - - 4. After closing of the queues, no VFS operations can succeed. Here - one needs to be careful, since a few operations (lookup, - read/write, readdir) can proceed without upcalls. These must be - explicitly blocked. - - 5. Upon closing the namecache shall be flushed and disabled. - - 6. All memory held by cnodes can be freed without relying on upcalls. - - 7. Unmounting the file system can be done without relying on upcalls. - - 8. Mounting the Coda filesystem should fail gracefully if Venus cannot - get the rootfid or the attributes of the rootfid. The latter is - best implemented by Venus fetching these objects before attempting - to mount. - - NNOOTTEE NetBSD in particular but also Linux have not implemented the - above requirements fully. For smooth operation this needs to be - corrected. - - - diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 373d695bac65..ec6e308587e1 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -63,6 +63,7 @@ Documentation for filesystem implementations. btrfs cifs/cifsroot ceph + coda cramfs debugfs dlmfs diff --git a/MAINTAINERS b/MAINTAINERS index a103117dec85..7d2fda93f838 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4203,7 +4203,7 @@ M: coda@cs.cmu.edu L: codalist@coda.cs.cmu.edu S: Maintained W: http://www.coda.cs.cmu.edu/ -F: Documentation/filesystems/coda.txt +F: Documentation/filesystems/coda.rst F: fs/coda/ F: include/linux/coda*.h F: include/uapi/linux/coda*.h diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig index ae6759f9594a..c3477eeafb3f 100644 --- a/fs/coda/Kconfig +++ b/fs/coda/Kconfig @@ -15,7 +15,7 @@ config CODA_FS *client*. You will need user level code as well, both for the client and server. Servers are currently user level, i.e. they need no kernel support. 
Please read - and check out the Coda + and check out the Coda home page . To compile the coda client support as a module, choose M here: the -- cgit v1.2.3 From 01478b833176761e188a465fc7a22bd4e07d040e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:02 +0200 Subject: docs: filesystems: convert devpts.txt to ReST - Add a SPDX header; - Add a document title; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/4ac8f3a7edd4d817acf0d173ead7ef74fe010c6c.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/sysctl/kernel.rst | 2 +- Documentation/filesystems/devpts.rst | 36 +++++++++++++++++++++++++++++ Documentation/filesystems/devpts.txt | 26 --------------------- Documentation/filesystems/index.rst | 1 + 4 files changed, 38 insertions(+), 27 deletions(-) create mode 100644 Documentation/filesystems/devpts.rst delete mode 100644 Documentation/filesystems/devpts.txt diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 82bfd5892663..706c2158ad25 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -895,7 +895,7 @@ this sysctl interface anymore. pty === -See Documentation/filesystems/devpts.txt. +See Documentation/filesystems/devpts.rst. randomize_va_space diff --git a/Documentation/filesystems/devpts.rst b/Documentation/filesystems/devpts.rst new file mode 100644 index 000000000000..a03248ddfb4c --- /dev/null +++ b/Documentation/filesystems/devpts.rst @@ -0,0 +1,36 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +The Devpts Filesystem +===================== + +Each mount of the devpts filesystem is now distinct such that ptys +and their indicies allocated in one mount are independent from ptys +and their indicies in all other mounts. 
+ +All mounts of the devpts filesystem now create a ``/dev/pts/ptmx`` node +with permissions ``0000``. + +To retain backwards compatibility the a ptmx device node (aka any node +created with ``mknod name c 5 2``) when opened will look for an instance +of devpts under the name ``pts`` in the same directory as the ptmx device +node. + +As an option instead of placing a ``/dev/ptmx`` device node at ``/dev/ptmx`` +it is possible to place a symlink to ``/dev/pts/ptmx`` at ``/dev/ptmx`` or +to bind mount ``/dev/ptx/ptmx`` to ``/dev/ptmx``. If you opt for using +the devpts filesystem in this manner devpts should be mounted with +the ``ptmxmode=0666``, or ``chmod 0666 /dev/pts/ptmx`` should be called. + +Total count of pty pairs in all instances is limited by sysctls:: + + kernel.pty.max = 4096 - global limit + kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace + kernel.pty.nr - current count of ptys + +Per-instance limit could be set by adding mount option ``max=``. + +This feature was added in kernel 3.4 together with +``sysctl kernel.pty.reserve``. + +In kernels older than 3.4 sysctl ``kernel.pty.max`` works as per-instance limit. diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt deleted file mode 100644 index 9f94fe276dea..000000000000 --- a/Documentation/filesystems/devpts.txt +++ /dev/null @@ -1,26 +0,0 @@ -Each mount of the devpts filesystem is now distinct such that ptys -and their indicies allocated in one mount are independent from ptys -and their indicies in all other mounts. - -All mounts of the devpts filesystem now create a /dev/pts/ptmx node -with permissions 0000. - -To retain backwards compatibility the a ptmx device node (aka any node -created with "mknod name c 5 2") when opened will look for an instance -of devpts under the name "pts" in the same directory as the ptmx device -node. 
- -As an option instead of placing a /dev/ptmx device node at /dev/ptmx -it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or -to bind mount /dev/ptx/ptmx to /dev/ptmx. If you opt for using -the devpts filesystem in this manner devpts should be mounted with -the ptmxmode=0666, or chmod 0666 /dev/pts/ptmx should be called. - -Total count of pty pairs in all instances is limited by sysctls: -kernel.pty.max = 4096 - global limit -kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace -kernel.pty.nr - current count of ptys - -Per-instance limit could be set by adding mount option "max=". -This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve. -In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit. diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index ec6e308587e1..43717f11d10b 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -24,6 +24,7 @@ algorithms work. 
splice locking directory-locking + devpts automount-support -- cgit v1.2.3 From b31763cff488344aaabbee99739774aea1636e27 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:03 +0200 Subject: docs: filesystems: convert dnotify.txt to ReST - Add a SPDX header; - Add a document title; - Some whitespace fixes and new line breaks; - Add table markups; - Add it to filesystems/index.rst Signed-off-by: Mauro Carvalho Chehab Acked-by: Jan Kara Link: https://lore.kernel.org/r/b39d6430d1c28438e833f01cb4597eff78703c75.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/dnotify.rst | 75 +++++++++++++++++++++++++++++++++++ Documentation/filesystems/dnotify.txt | 70 -------------------------------- Documentation/filesystems/index.rst | 1 + MAINTAINERS | 2 +- 4 files changed, 77 insertions(+), 71 deletions(-) create mode 100644 Documentation/filesystems/dnotify.rst delete mode 100644 Documentation/filesystems/dnotify.txt diff --git a/Documentation/filesystems/dnotify.rst b/Documentation/filesystems/dnotify.rst new file mode 100644 index 000000000000..a28a1f9ef79c --- /dev/null +++ b/Documentation/filesystems/dnotify.rst @@ -0,0 +1,75 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Linux Directory Notification +============================ + + Stephen Rothwell + +The intention of directory notification is to allow user applications +to be notified when a directory, or any of the files in it, are changed. +The basic mechanism involves the application registering for notification +on a directory using a fcntl(2) call and the notifications themselves +being delivered using signals. + +The application decides which "events" it wants to be notified about. 
+The currently defined events are: + + ========= ===================================================== + DN_ACCESS A file in the directory was accessed (read) + DN_MODIFY A file in the directory was modified (write,truncate) + DN_CREATE A file was created in the directory + DN_DELETE A file was unlinked from directory + DN_RENAME A file in the directory was renamed + DN_ATTRIB A file in the directory had its attributes + changed (chmod,chown) + ========= ===================================================== + +Usually, the application must reregister after each notification, but +if DN_MULTISHOT is or'ed with the event mask, then the registration will +remain until explicitly removed (by registering for no events). + +By default, SIGIO will be delivered to the process and no other useful +information. However, if the F_SETSIG fcntl(2) call is used to let the +kernel know which signal to deliver, a siginfo structure will be passed to +the signal handler and the si_fd member of that structure will contain the +file descriptor associated with the directory in which the event occurred. + +Preferably the application will choose one of the real time signals +(SIGRTMIN + ) so that the notifications may be queued. This is +especially important if DN_MULTISHOT is specified. Note that SIGRTMIN +is often blocked, so it is better to use (at least) SIGRTMIN + 1. + +Implementation expectations (features and bugs :-)) +--------------------------------------------------- + +The notification should work for any local access to files even if the +actual file system is on a remote server. This implies that remote +access to files served by local user mode servers should be notified. +Also, remote accesses to files served by a local kernel NFS server should +be notified. + +In order to make the impact on the file system code as small as possible, +the problem of hard links to files has been ignored. 
So if a file (x) +exists in two directories (a and b) then a change to the file using the +name "a/x" should be notified to a program expecting notifications on +directory "a", but will not be notified to one expecting notifications on +directory "b". + +Also, files that are unlinked, will still cause notifications in the +last directory that they were linked to. + +Configuration +------------- + +Dnotify is controlled via the CONFIG_DNOTIFY configuration option. When +disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL. + +Example +------- +See tools/testing/selftests/filesystems/dnotify_test.c for an example. + +NOTE +---- +Beginning with Linux 2.6.13, dnotify has been replaced by inotify. +See Documentation/filesystems/inotify.rst for more information on it. diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt deleted file mode 100644 index 08d575ece45d..000000000000 --- a/Documentation/filesystems/dnotify.txt +++ /dev/null @@ -1,70 +0,0 @@ - Linux Directory Notification - ============================ - - Stephen Rothwell - -The intention of directory notification is to allow user applications -to be notified when a directory, or any of the files in it, are changed. -The basic mechanism involves the application registering for notification -on a directory using a fcntl(2) call and the notifications themselves -being delivered using signals. - -The application decides which "events" it wants to be notified about. 
-The currently defined events are: - - DN_ACCESS A file in the directory was accessed (read) - DN_MODIFY A file in the directory was modified (write,truncate) - DN_CREATE A file was created in the directory - DN_DELETE A file was unlinked from directory - DN_RENAME A file in the directory was renamed - DN_ATTRIB A file in the directory had its attributes - changed (chmod,chown) - -Usually, the application must reregister after each notification, but -if DN_MULTISHOT is or'ed with the event mask, then the registration will -remain until explicitly removed (by registering for no events). - -By default, SIGIO will be delivered to the process and no other useful -information. However, if the F_SETSIG fcntl(2) call is used to let the -kernel know which signal to deliver, a siginfo structure will be passed to -the signal handler and the si_fd member of that structure will contain the -file descriptor associated with the directory in which the event occurred. - -Preferably the application will choose one of the real time signals -(SIGRTMIN + ) so that the notifications may be queued. This is -especially important if DN_MULTISHOT is specified. Note that SIGRTMIN -is often blocked, so it is better to use (at least) SIGRTMIN + 1. - -Implementation expectations (features and bugs :-)) ---------------------------- - -The notification should work for any local access to files even if the -actual file system is on a remote server. This implies that remote -access to files served by local user mode servers should be notified. -Also, remote accesses to files served by a local kernel NFS server should -be notified. - -In order to make the impact on the file system code as small as possible, -the problem of hard links to files has been ignored. 
So if a file (x) -exists in two directories (a and b) then a change to the file using the -name "a/x" should be notified to a program expecting notifications on -directory "a", but will not be notified to one expecting notifications on -directory "b". - -Also, files that are unlinked, will still cause notifications in the -last directory that they were linked to. - -Configuration -------------- - -Dnotify is controlled via the CONFIG_DNOTIFY configuration option. When -disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL. - -Example -------- -See tools/testing/selftests/filesystems/dnotify_test.c for an example. - -NOTE ----- -Beginning with Linux 2.6.13, dnotify has been replaced by inotify. -See Documentation/filesystems/inotify.rst for more information on it. diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 43717f11d10b..815101241115 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -25,6 +25,7 @@ algorithms work. locking directory-locking devpts + dnotify automount-support diff --git a/MAINTAINERS b/MAINTAINERS index 7d2fda93f838..46ba68a07a8f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4996,7 +4996,7 @@ M: Jan Kara R: Amir Goldstein L: linux-fsdevel@vger.kernel.org S: Maintained -F: Documentation/filesystems/dnotify.txt +F: Documentation/filesystems/dnotify.rst F: fs/notify/dnotify/ F: include/linux/dnotify.h -- cgit v1.2.3 From e6f7df74ec1ab956430db2dcdcaf217ae5cf4ae0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:04 +0200 Subject: docs: filesystems: convert fiemap.txt to ReST - Add a SPDX header; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9182d49ffca7a0580e32ab24ecf5f8cc8d8924af.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/fiemap.rst | 234 +++++++++++++++++++++++++++++++++++ Documentation/filesystems/fiemap.txt | 231 ---------------------------------- Documentation/filesystems/index.rst | 1 + 3 files changed, 235 insertions(+), 231 deletions(-) create mode 100644 Documentation/filesystems/fiemap.rst delete mode 100644 Documentation/filesystems/fiemap.txt diff --git a/Documentation/filesystems/fiemap.rst b/Documentation/filesystems/fiemap.rst new file mode 100644 index 000000000000..2a572e7edc08 --- /dev/null +++ b/Documentation/filesystems/fiemap.rst @@ -0,0 +1,234 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============ +Fiemap Ioctl +============ + +The fiemap ioctl is an efficient method for userspace to get file +extent mappings. Instead of block-by-block mapping (such as bmap), fiemap +returns a list of extents. + + +Request Basics +-------------- + +A fiemap request is encoded within struct fiemap:: + + struct fiemap { + __u64 fm_start; /* logical offset (inclusive) at + * which to start mapping (in) */ + __u64 fm_length; /* logical length of mapping which + * userspace cares about (in) */ + __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ + __u32 fm_mapped_extents; /* number of extents that were + * mapped (out) */ + __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u32 fm_reserved; + struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ + }; + + +fm_start, and fm_length specify the logical range within the file +which the process would like mappings for. Extents returned mirror +those on disk - that is, the logical offset of the 1st returned extent +may start before fm_start, and the range covered by the last returned +extent may end after fm_length. All offsets and lengths are in bytes. 
+ +Certain flags to modify the way in which mappings are looked up can be +set in fm_flags. If the kernel doesn't understand some particular +flags, it will return EBADR and the contents of fm_flags will contain +the set of flags which caused the error. If the kernel is compatible +with all flags passed, the contents of fm_flags will be unmodified. +It is up to userspace to determine whether rejection of a particular +flag is fatal to its operation. This scheme is intended to allow the +fiemap interface to grow in the future but without losing +compatibility with old software. + +fm_extent_count specifies the number of elements in the fm_extents[] array +that can be used to return extents. If fm_extent_count is zero, then the +fm_extents[] array is ignored (no extents will be returned), and the +fm_mapped_extents count will hold the number of extents needed in +fm_extents[] to hold the file's current mapping. Note that there is +nothing to prevent the file from changing between calls to FIEMAP. + +The following flags can be set in fm_flags: + +FIEMAP_FLAG_SYNC + If this flag is set, the kernel will sync the file before mapping extents. + +FIEMAP_FLAG_XATTR + If this flag is set, the extents returned will describe the inodes + extended attribute lookup tree, instead of its data tree. + + +Extent Mapping +-------------- + +Extent information is returned within the embedded fm_extents array +which userspace must allocate along with the fiemap structure. The +number of elements in the fiemap_extents[] array should be passed via +fm_extent_count. The number of extents mapped by kernel will be +returned via fm_mapped_extents. If the number of fiemap_extents +allocated is less than would be required to map the requested range, +the maximum number of extents that can be mapped in the fm_extent[] +array will be returned and fm_mapped_extents will be equal to +fm_extent_count. 
In that case, the last extent in the array will not +complete the requested range and will not have the FIEMAP_EXTENT_LAST +flag set (see the next section on extent flags). + +Each extent is described by a single fiemap_extent structure as +returned in fm_extents:: + + struct fiemap_extent { + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent */ + __u64 fe_length; /* length in bytes for the extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_reserved[3]; + }; + +All offsets and lengths are in bytes and mirror those on disk. It is valid +for an extents logical offset to start before the request or its logical +length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is +returned, fe_logical, fe_physical, and fe_length will be aligned to the +block size of the file system. With the exception of extents flagged as +FIEMAP_EXTENT_MERGED, adjacent extents will not be merged. + +The fe_flags field contains flags which describe the extent returned. +A special flag, FIEMAP_EXTENT_LAST is always set on the last extent in +the file so that the process making fiemap calls can determine when no +more extents are available, without having to call the ioctl again. + +Some flags are intentionally vague and will always be set in the +presence of other more specific flags. This way a program looking for +a general property does not have to know all existing and future flags +which imply that property. + +For example, if FIEMAP_EXTENT_DATA_INLINE or FIEMAP_EXTENT_DATA_TAIL +are set, FIEMAP_EXTENT_NOT_ALIGNED will also be set. A program looking +for inline or tail-packed data can key on the specific flag. 
Software +which simply cares not to try operating on non-aligned extents +however, can just key on FIEMAP_EXTENT_NOT_ALIGNED, and not have to +worry about all present and future flags which might imply unaligned +data. Note that the opposite is not true - it would be valid for +FIEMAP_EXTENT_NOT_ALIGNED to appear alone. + +FIEMAP_EXTENT_LAST + This is generally the last extent in the file. A mapping attempt past + this extent may return nothing. Some implementations set this flag to + indicate this extent is the last one in the range queried by the user + (via fiemap->fm_length). + +FIEMAP_EXTENT_UNKNOWN + The location of this extent is currently unknown. This may indicate + the data is stored on an inaccessible volume or that no storage has + been allocated for the file yet. + +FIEMAP_EXTENT_DELALLOC + This will also set FIEMAP_EXTENT_UNKNOWN. + + Delayed allocation - while there is data for this extent, its + physical location has not been allocated yet. + +FIEMAP_EXTENT_ENCODED + This extent does not consist of plain filesystem blocks but is + encoded (e.g. encrypted or compressed). Reading the data in this + extent via I/O to the block device will have undefined results. + +Note that it is *always* undefined to try to update the data +in-place by writing to the indicated location without the +assistance of the filesystem, or to access the data using the +information returned by the FIEMAP interface while the filesystem +is mounted. In other words, user applications may only read the +extent data via I/O to the block device while the filesystem is +unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is +clear; user applications must not try reading or writing to the +filesystem via the block device under any other circumstances. + +FIEMAP_EXTENT_DATA_ENCRYPTED + This will also set FIEMAP_EXTENT_ENCODED + The data in this extent has been encrypted by the file system. 
+ +FIEMAP_EXTENT_NOT_ALIGNED + Extent offsets and length are not guaranteed to be block aligned. + +FIEMAP_EXTENT_DATA_INLINE + This will also set FIEMAP_EXTENT_NOT_ALIGNED + Data is located within a meta data block. + +FIEMAP_EXTENT_DATA_TAIL + This will also set FIEMAP_EXTENT_NOT_ALIGNED + Data is packed into a block with data from other files. + +FIEMAP_EXTENT_UNWRITTEN + Unwritten extent - the extent is allocated but its data has not been + initialized. This indicates the extent's data will be all zero if read + through the filesystem but the contents are undefined if read directly from + the device. + +FIEMAP_EXTENT_MERGED + This will be set when a file does not support extents, i.e., it uses a block + based addressing scheme. Since returning an extent for each block back to + userspace would be highly inefficient, the kernel will try to merge most + adjacent blocks into 'extents'. + + +VFS -> File System Implementation +--------------------------------- + +File systems wishing to support fiemap must implement a ->fiemap callback on +their inode_operations structure. The fs ->fiemap call is responsible for +defining its set of supported fiemap flags, and calling a helper function on +each discovered extent:: + + struct inode_operations { + ... + + int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, + u64 len); + +->fiemap is passed struct fiemap_extent_info which describes the +fiemap request:: + + struct fiemap_extent_info { + unsigned int fi_flags; /* Flags as passed from user */ + unsigned int fi_extents_mapped; /* Number of mapped extents */ + unsigned int fi_extents_max; /* Size of fiemap_extent array */ + struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ + }; + +It is intended that the file system should not need to access any of this +structure directly. Filesystem handlers should be tolerant to signals and return +EINTR once fatal signal received. 
+ + +Flag checking should be done at the beginning of the ->fiemap callback via the +fiemap_check_flags() helper:: + + int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); + +The struct fieinfo should be passed in as received from ioctl_fiemap(). The +set of fiemap flags which the fs understands should be passed via fs_flags. If +fiemap_check_flags finds invalid user flags, it will place the bad values in +fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from +fiemap_check_flags(), it should immediately exit, returning that error back to +ioctl_fiemap(). + + +For each extent in the request range, the file system should call +the helper function, fiemap_fill_next_extent():: + + int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, + u64 phys, u64 len, u32 flags, u32 dev); + +fiemap_fill_next_extent() will use the passed values to populate the +next free extent in the fm_extents array. 'General' extent flags will +automatically be set from specific flags on behalf of the calling file +system so that the userspace API is not broken. + +fiemap_fill_next_extent() returns 0 on success, and 1 when the +user-supplied fm_extents array is full. If an error is encountered +while copying the extent to user memory, -EFAULT will be returned. diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt deleted file mode 100644 index ac87e6fda842..000000000000 --- a/Documentation/filesystems/fiemap.txt +++ /dev/null @@ -1,231 +0,0 @@ -============ -Fiemap Ioctl -============ - -The fiemap ioctl is an efficient method for userspace to get file -extent mappings. Instead of block-by-block mapping (such as bmap), fiemap -returns a list of extents. 
- - -Request Basics --------------- - -A fiemap request is encoded within struct fiemap: - -struct fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace cares about (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents; /* number of extents that were - * mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ - __u32 fm_reserved; - struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ -}; - - -fm_start, and fm_length specify the logical range within the file -which the process would like mappings for. Extents returned mirror -those on disk - that is, the logical offset of the 1st returned extent -may start before fm_start, and the range covered by the last returned -extent may end after fm_length. All offsets and lengths are in bytes. - -Certain flags to modify the way in which mappings are looked up can be -set in fm_flags. If the kernel doesn't understand some particular -flags, it will return EBADR and the contents of fm_flags will contain -the set of flags which caused the error. If the kernel is compatible -with all flags passed, the contents of fm_flags will be unmodified. -It is up to userspace to determine whether rejection of a particular -flag is fatal to its operation. This scheme is intended to allow the -fiemap interface to grow in the future but without losing -compatibility with old software. - -fm_extent_count specifies the number of elements in the fm_extents[] array -that can be used to return extents. If fm_extent_count is zero, then the -fm_extents[] array is ignored (no extents will be returned), and the -fm_mapped_extents count will hold the number of extents needed in -fm_extents[] to hold the file's current mapping. Note that there is -nothing to prevent the file from changing between calls to FIEMAP. 
- -The following flags can be set in fm_flags: - -* FIEMAP_FLAG_SYNC -If this flag is set, the kernel will sync the file before mapping extents. - -* FIEMAP_FLAG_XATTR -If this flag is set, the extents returned will describe the inodes -extended attribute lookup tree, instead of its data tree. - - -Extent Mapping --------------- - -Extent information is returned within the embedded fm_extents array -which userspace must allocate along with the fiemap structure. The -number of elements in the fiemap_extents[] array should be passed via -fm_extent_count. The number of extents mapped by kernel will be -returned via fm_mapped_extents. If the number of fiemap_extents -allocated is less than would be required to map the requested range, -the maximum number of extents that can be mapped in the fm_extent[] -array will be returned and fm_mapped_extents will be equal to -fm_extent_count. In that case, the last extent in the array will not -complete the requested range and will not have the FIEMAP_EXTENT_LAST -flag set (see the next section on extent flags). - -Each extent is described by a single fiemap_extent structure as -returned in fm_extents. - -struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent */ - __u64 fe_length; /* length in bytes for the extent */ - __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ - __u32 fe_reserved[3]; -}; - -All offsets and lengths are in bytes and mirror those on disk. It is valid -for an extents logical offset to start before the request or its logical -length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is -returned, fe_logical, fe_physical, and fe_length will be aligned to the -block size of the file system. With the exception of extents flagged as -FIEMAP_EXTENT_MERGED, adjacent extents will not be merged. 
- -The fe_flags field contains flags which describe the extent returned. -A special flag, FIEMAP_EXTENT_LAST is always set on the last extent in -the file so that the process making fiemap calls can determine when no -more extents are available, without having to call the ioctl again. - -Some flags are intentionally vague and will always be set in the -presence of other more specific flags. This way a program looking for -a general property does not have to know all existing and future flags -which imply that property. - -For example, if FIEMAP_EXTENT_DATA_INLINE or FIEMAP_EXTENT_DATA_TAIL -are set, FIEMAP_EXTENT_NOT_ALIGNED will also be set. A program looking -for inline or tail-packed data can key on the specific flag. Software -which simply cares not to try operating on non-aligned extents -however, can just key on FIEMAP_EXTENT_NOT_ALIGNED, and not have to -worry about all present and future flags which might imply unaligned -data. Note that the opposite is not true - it would be valid for -FIEMAP_EXTENT_NOT_ALIGNED to appear alone. - -* FIEMAP_EXTENT_LAST -This is generally the last extent in the file. A mapping attempt past -this extent may return nothing. Some implementations set this flag to -indicate this extent is the last one in the range queried by the user -(via fiemap->fm_length). - -* FIEMAP_EXTENT_UNKNOWN -The location of this extent is currently unknown. This may indicate -the data is stored on an inaccessible volume or that no storage has -been allocated for the file yet. - -* FIEMAP_EXTENT_DELALLOC - - This will also set FIEMAP_EXTENT_UNKNOWN. -Delayed allocation - while there is data for this extent, its -physical location has not been allocated yet. - -* FIEMAP_EXTENT_ENCODED -This extent does not consist of plain filesystem blocks but is -encoded (e.g. encrypted or compressed). Reading the data in this -extent via I/O to the block device will have undefined results. 
- -Note that it is *always* undefined to try to update the data -in-place by writing to the indicated location without the -assistance of the filesystem, or to access the data using the -information returned by the FIEMAP interface while the filesystem -is mounted. In other words, user applications may only read the -extent data via I/O to the block device while the filesystem is -unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is -clear; user applications must not try reading or writing to the -filesystem via the block device under any other circumstances. - -* FIEMAP_EXTENT_DATA_ENCRYPTED - - This will also set FIEMAP_EXTENT_ENCODED -The data in this extent has been encrypted by the file system. - -* FIEMAP_EXTENT_NOT_ALIGNED -Extent offsets and length are not guaranteed to be block aligned. - -* FIEMAP_EXTENT_DATA_INLINE - This will also set FIEMAP_EXTENT_NOT_ALIGNED -Data is located within a meta data block. - -* FIEMAP_EXTENT_DATA_TAIL - This will also set FIEMAP_EXTENT_NOT_ALIGNED -Data is packed into a block with data from other files. - -* FIEMAP_EXTENT_UNWRITTEN -Unwritten extent - the extent is allocated but its data has not been -initialized. This indicates the extent's data will be all zero if read -through the filesystem but the contents are undefined if read directly from -the device. - -* FIEMAP_EXTENT_MERGED -This will be set when a file does not support extents, i.e., it uses a block -based addressing scheme. Since returning an extent for each block back to -userspace would be highly inefficient, the kernel will try to merge most -adjacent blocks into 'extents'. - - -VFS -> File System Implementation ---------------------------------- - -File systems wishing to support fiemap must implement a ->fiemap callback on -their inode_operations structure. The fs ->fiemap call is responsible for -defining its set of supported fiemap flags, and calling a helper function on -each discovered extent: - -struct inode_operations { - ... 
- - int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, - u64 len); - -->fiemap is passed struct fiemap_extent_info which describes the -fiemap request: - -struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ -}; - -It is intended that the file system should not need to access any of this -structure directly. Filesystem handlers should be tolerant to signals and return -EINTR once fatal signal received. - - -Flag checking should be done at the beginning of the ->fiemap callback via the -fiemap_check_flags() helper: - -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); - -The struct fieinfo should be passed in as received from ioctl_fiemap(). The -set of fiemap flags which the fs understands should be passed via fs_flags. If -fiemap_check_flags finds invalid user flags, it will place the bad values in -fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from -fiemap_check_flags(), it should immediately exit, returning that error back to -ioctl_fiemap(). - - -For each extent in the request range, the file system should call -the helper function, fiemap_fill_next_extent(): - -int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, - u64 phys, u64 len, u32 flags, u32 dev); - -fiemap_fill_next_extent() will use the passed values to populate the -next free extent in the fm_extents array. 'General' extent flags will -automatically be set from specific flags on behalf of the calling file -system so that the userspace API is not broken. - -fiemap_fill_next_extent() returns 0 on success, and 1 when the -user-supplied fm_extents array is full. If an error is encountered -while copying the extent to user memory, -EFAULT will be returned. 
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 815101241115..99b450260e4d 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -26,6 +26,7 @@ algorithms work. directory-locking devpts dnotify + fiemap automount-support -- cgit v1.2.3 From e6d42cb19c0367ae6b0edfc09daceaefded4cbdd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:05 +0200 Subject: docs: filesystems: convert files.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/e31b0f6a7ee466a233dc7f9c73f53f07ebb07f0b.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/files.rst | 128 ++++++++++++++++++++++++++++++++++++ Documentation/filesystems/files.txt | 123 ---------------------------------- Documentation/filesystems/index.rst | 1 + 3 files changed, 129 insertions(+), 123 deletions(-) create mode 100644 Documentation/filesystems/files.rst delete mode 100644 Documentation/filesystems/files.txt diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst new file mode 100644 index 000000000000..cbf8e57376bf --- /dev/null +++ b/Documentation/filesystems/files.rst @@ -0,0 +1,128 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +File management in the Linux kernel +=================================== + +This document describes how locking for files (struct file) +and file descriptor table (struct files) works. + +Up until 2.6.12, the file descriptor table has been protected +with a lock (files->file_lock) and reference count (files->count). +->file_lock protected accesses to all the file related fields +of the table. 
->count was used for sharing the file descriptor +table between tasks cloned with CLONE_FILES flag. Typically +this would be the case for posix threads. As with the common +refcounting model in the kernel, the last task doing +a put_files_struct() frees the file descriptor (fd) table. +The files (struct file) themselves are protected using +reference count (->f_count). + +In the new lock-free model of file descriptor management, +the reference counting is similar, but the locking is +based on RCU. The file descriptor table contains multiple +elements - the fd sets (open_fds and close_on_exec, the +array of file pointers, the sizes of the sets and the array +etc.). In order for the updates to appear atomic to +a lock-free reader, all the elements of the file descriptor +table are in a separate structure - struct fdtable. +files_struct contains a pointer to struct fdtable through +which the actual fd table is accessed. Initially the +fdtable is embedded in files_struct itself. On a subsequent +expansion of fdtable, a new fdtable structure is allocated +and files->fdtab points to the new structure. The fdtable +structure is freed with RCU and lock-free readers either +see the old fdtable or the new fdtable making the update +appear atomic. Here are the locking rules for +the fdtable structure - + +1. All references to the fdtable must be done through + the files_fdtable() macro:: + + struct fdtable *fdt; + + rcu_read_lock(); + + fdt = files_fdtable(files); + .... + if (n <= fdt->max_fds) + .... + ... + rcu_read_unlock(); + + files_fdtable() uses rcu_dereference() macro which takes care of + the memory barrier requirements for lock-free dereference. + The fdtable pointer must be read within the read-side + critical section. + +2. Reading of the fdtable as described above must be protected + by rcu_read_lock()/rcu_read_unlock(). + +3. For any update to the fd table, files->file_lock must + be held. + +4. 
To look up the file structure given an fd, a reader + must use either fcheck() or fcheck_files() APIs. These + take care of barrier requirements due to lock-free lookup. + + An example:: + + struct file *file; + + rcu_read_lock(); + file = fcheck(fd); + if (file) { + ... + } + .... + rcu_read_unlock(); + +5. Handling of the file structures is special. Since the look-up + of the fd (fget()/fget_light()) are lock-free, it is possible + that look-up may race with the last put() operation on the + file structure. This is avoided using atomic_long_inc_not_zero() + on ->f_count:: + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + .... + return file; + + atomic_long_inc_not_zero() detects if refcounts is already zero or + goes to zero during increment. If it does, we fail + fget()/fget_light(). + +6. Since both fdtable and file structures can be looked up + lock-free, they must be installed using rcu_assign_pointer() + API. If they are looked up lock-free, rcu_dereference() + must be used. However it is advisable to use files_fdtable() + and fcheck()/fcheck_files() which take care of these issues. + +7. While updating, the fdtable pointer must be looked up while + holding files->file_lock. If ->file_lock is dropped, then + another thread expand the files thereby creating a new + fdtable and making the earlier fdtable pointer stale. + + For example:: + + spin_lock(&files->file_lock); + fd = locate_fd(files, file, start); + if (fd >= 0) { + /* locate_fd() may have expanded fdtable, load the ptr */ + fdt = files_fdtable(files); + __set_open_fd(fd, fdt); + __clear_close_on_exec(fd, fdt); + spin_unlock(&files->file_lock); + ..... + + Since locate_fd() can drop ->file_lock (and reacquire ->file_lock), + the fdtable pointer (fdt) must be loaded after locate_fd(). 
+ diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt deleted file mode 100644 index 46dfc6b038c3..000000000000 --- a/Documentation/filesystems/files.txt +++ /dev/null @@ -1,123 +0,0 @@ -File management in the Linux kernel ------------------------------------ - -This document describes how locking for files (struct file) -and file descriptor table (struct files) works. - -Up until 2.6.12, the file descriptor table has been protected -with a lock (files->file_lock) and reference count (files->count). -->file_lock protected accesses to all the file related fields -of the table. ->count was used for sharing the file descriptor -table between tasks cloned with CLONE_FILES flag. Typically -this would be the case for posix threads. As with the common -refcounting model in the kernel, the last task doing -a put_files_struct() frees the file descriptor (fd) table. -The files (struct file) themselves are protected using -reference count (->f_count). - -In the new lock-free model of file descriptor management, -the reference counting is similar, but the locking is -based on RCU. The file descriptor table contains multiple -elements - the fd sets (open_fds and close_on_exec, the -array of file pointers, the sizes of the sets and the array -etc.). In order for the updates to appear atomic to -a lock-free reader, all the elements of the file descriptor -table are in a separate structure - struct fdtable. -files_struct contains a pointer to struct fdtable through -which the actual fd table is accessed. Initially the -fdtable is embedded in files_struct itself. On a subsequent -expansion of fdtable, a new fdtable structure is allocated -and files->fdtab points to the new structure. The fdtable -structure is freed with RCU and lock-free readers either -see the old fdtable or the new fdtable making the update -appear atomic. Here are the locking rules for -the fdtable structure - - -1. 
All references to the fdtable must be done through - the files_fdtable() macro : - - struct fdtable *fdt; - - rcu_read_lock(); - - fdt = files_fdtable(files); - .... - if (n <= fdt->max_fds) - .... - ... - rcu_read_unlock(); - - files_fdtable() uses rcu_dereference() macro which takes care of - the memory barrier requirements for lock-free dereference. - The fdtable pointer must be read within the read-side - critical section. - -2. Reading of the fdtable as described above must be protected - by rcu_read_lock()/rcu_read_unlock(). - -3. For any update to the fd table, files->file_lock must - be held. - -4. To look up the file structure given an fd, a reader - must use either fcheck() or fcheck_files() APIs. These - take care of barrier requirements due to lock-free lookup. - An example : - - struct file *file; - - rcu_read_lock(); - file = fcheck(fd); - if (file) { - ... - } - .... - rcu_read_unlock(); - -5. Handling of the file structures is special. Since the look-up - of the fd (fget()/fget_light()) are lock-free, it is possible - that look-up may race with the last put() operation on the - file structure. This is avoided using atomic_long_inc_not_zero() - on ->f_count : - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); - .... - return file; - - atomic_long_inc_not_zero() detects if refcounts is already zero or - goes to zero during increment. If it does, we fail - fget()/fget_light(). - -6. Since both fdtable and file structures can be looked up - lock-free, they must be installed using rcu_assign_pointer() - API. If they are looked up lock-free, rcu_dereference() - must be used. However it is advisable to use files_fdtable() - and fcheck()/fcheck_files() which take care of these issues. - -7. While updating, the fdtable pointer must be looked up while - holding files->file_lock. 
If ->file_lock is dropped, then - another thread expand the files thereby creating a new - fdtable and making the earlier fdtable pointer stale. - For example : - - spin_lock(&files->file_lock); - fd = locate_fd(files, file, start); - if (fd >= 0) { - /* locate_fd() may have expanded fdtable, load the ptr */ - fdt = files_fdtable(files); - __set_open_fd(fd, fdt); - __clear_close_on_exec(fd, fdt); - spin_unlock(&files->file_lock); - ..... - - Since locate_fd() can drop ->file_lock (and reacquire ->file_lock), - the fdtable pointer (fdt) must be loaded after locate_fd(). - diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 99b450260e4d..597ea3ed415b 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -27,6 +27,7 @@ algorithms work. devpts dnotify fiemap + files automount-support -- cgit v1.2.3 From ba302d2a8ef0917a6d1c3775d1b4b9f2b26c8ef5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:06 +0200 Subject: docs: filesystems: convert fuse-io.txt to ReST - Add a SPDX header; - Add a document title; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/88ec8025c1c5fc3ac5b65f1151c41ebcc696dc0e.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/fuse-io.rst | 44 +++++++++++++++++++++++++++++++++++ Documentation/filesystems/fuse-io.txt | 38 ------------------------------ Documentation/filesystems/index.rst | 1 + 3 files changed, 45 insertions(+), 38 deletions(-) create mode 100644 Documentation/filesystems/fuse-io.rst delete mode 100644 Documentation/filesystems/fuse-io.txt diff --git a/Documentation/filesystems/fuse-io.rst b/Documentation/filesystems/fuse-io.rst new file mode 100644 index 000000000000..255a368fe534 --- /dev/null +++ b/Documentation/filesystems/fuse-io.rst @@ -0,0 +1,44 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +============== +Fuse I/O Modes +============== + +Fuse supports the following I/O modes: + +- direct-io +- cached + + write-through + + writeback-cache + +The direct-io mode can be selected with the FOPEN_DIRECT_IO flag in the +FUSE_OPEN reply. + +In direct-io mode the page cache is completely bypassed for reads and writes. +No read-ahead takes place. Shared mmap is disabled. + +In cached mode reads may be satisfied from the page cache, and data may be +read-ahead by the kernel to fill the cache. The cache is always kept consistent +after any writes to the file. All mmap modes are supported. + +The cached mode has two sub modes controlling how writes are handled. The +write-through mode is the default and is supported on all kernels. The +writeback-cache mode may be selected by the FUSE_WRITEBACK_CACHE flag in the +FUSE_INIT reply. + +In write-through mode each write is immediately sent to userspace as one or more +WRITE requests, as well as updating any cached pages (and caching previously +uncached, but fully written pages). No READ requests are ever sent for writes, +so when an uncached page is partially written, the page is discarded. + +In writeback-cache mode (enabled by the FUSE_WRITEBACK_CACHE flag) writes go to +the cache only, which means that the write(2) syscall can often complete very +fast. Dirty pages are written back implicitly (background writeback or page +reclaim on memory pressure) or explicitly (invoked by close(2), fsync(2) and +when the last ref to the file is being released on munmap(2)). This mode +assumes that all changes to the filesystem go through the FUSE kernel module +(size and atime/ctime/mtime attributes are kept up-to-date by the kernel), so +it's generally not suitable for network filesystems. If a partial page is +written, then the page needs to be first read from userspace. This means, that +even for files opened for O_WRONLY it is possible that READ requests will be +generated by the kernel. 
diff --git a/Documentation/filesystems/fuse-io.txt b/Documentation/filesystems/fuse-io.txt deleted file mode 100644 index 07b8f73f100f..000000000000 --- a/Documentation/filesystems/fuse-io.txt +++ /dev/null @@ -1,38 +0,0 @@ -Fuse supports the following I/O modes: - -- direct-io -- cached - + write-through - + writeback-cache - -The direct-io mode can be selected with the FOPEN_DIRECT_IO flag in the -FUSE_OPEN reply. - -In direct-io mode the page cache is completely bypassed for reads and writes. -No read-ahead takes place. Shared mmap is disabled. - -In cached mode reads may be satisfied from the page cache, and data may be -read-ahead by the kernel to fill the cache. The cache is always kept consistent -after any writes to the file. All mmap modes are supported. - -The cached mode has two sub modes controlling how writes are handled. The -write-through mode is the default and is supported on all kernels. The -writeback-cache mode may be selected by the FUSE_WRITEBACK_CACHE flag in the -FUSE_INIT reply. - -In write-through mode each write is immediately sent to userspace as one or more -WRITE requests, as well as updating any cached pages (and caching previously -uncached, but fully written pages). No READ requests are ever sent for writes, -so when an uncached page is partially written, the page is discarded. - -In writeback-cache mode (enabled by the FUSE_WRITEBACK_CACHE flag) writes go to -the cache only, which means that the write(2) syscall can often complete very -fast. Dirty pages are written back implicitly (background writeback or page -reclaim on memory pressure) or explicitly (invoked by close(2), fsync(2) and -when the last ref to the file is being released on munmap(2)). This mode -assumes that all changes to the filesystem go through the FUSE kernel module -(size and atime/ctime/mtime attributes are kept up-to-date by the kernel), so -it's generally not suitable for network filesystems. 
If a partial page is -written, then the page needs to be first read from userspace. This means, that -even for files opened for O_WRONLY it is possible that READ requests will be -generated by the kernel. diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 597ea3ed415b..ace804c4c3ab 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -83,6 +83,7 @@ Documentation for filesystem implementations. hfsplus hpfs fuse + fuse-io inotify isofs nilfs2 -- cgit v1.2.3 From 63526525dd7a9c85786ff9f9592b6f4281ad6171 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:07 +0200 Subject: docs: filesystems: convert locks.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/467b3f20e63d2640d22599b99229699b5fb79251.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/locks.rst | 72 +++++++++++++++++++++++++++++++++++++ Documentation/filesystems/locks.txt | 68 ----------------------------------- 3 files changed, 73 insertions(+), 68 deletions(-) create mode 100644 Documentation/filesystems/locks.rst delete mode 100644 Documentation/filesystems/locks.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index ace804c4c3ab..64535f57c3ac 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -28,6 +28,7 @@ algorithms work. dnotify fiemap files + locks automount-support diff --git a/Documentation/filesystems/locks.rst b/Documentation/filesystems/locks.rst new file mode 100644 index 000000000000..10f67fb9ce07 --- /dev/null +++ b/Documentation/filesystems/locks.rst @@ -0,0 +1,72 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +========================== +File Locking Release Notes +========================== + + Andy Walker + + 12 May 1997 + + +1. What's New? +============== + +1.1 Broken Flock Emulation +-------------------------- + +The old flock(2) emulation in the kernel was swapped for proper BSD +compatible flock(2) support in the 1.3.x series of kernels. With the +release of the 2.1.x kernel series, support for the old emulation has +been totally removed, so that we don't need to carry this baggage +forever. + +This should not cause problems for anybody, since everybody using a +2.1.x kernel should have updated their C library to a suitable version +anyway (see the file "Documentation/process/changes.rst".) + +1.2 Allow Mixed Locks Again +--------------------------- + +1.2.1 Typical Problems - Sendmail +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Because sendmail was unable to use the old flock() emulation, many sendmail +installations use fcntl() instead of flock(). This is true of Slackware 3.0 +for example. This gave rise to some other subtle problems if sendmail was +configured to rebuild the alias file. Sendmail tried to lock the aliases.dir +file with fcntl() at the same time as the GDBM routines tried to lock this +file with flock(). With pre 1.3.96 kernels this could result in deadlocks that, +over time, or under a very heavy mail load, would eventually cause the kernel +to lock solid with deadlocked processes. + + +1.2.2 The Solution +^^^^^^^^^^^^^^^^^^ +The solution I have chosen, after much experimentation and discussion, +is to make flock() and fcntl() locks oblivious to each other. Both can +exists, and neither will have any effect on the other. + +I wanted the two lock styles to be cooperative, but there were so many +race and deadlock conditions that the current solution was the only +practical one. It puts us in the same position as, for example, SunOS +4.1.x and several other commercial Unices. 
The only OS's that support +cooperative flock()/fcntl() are those that emulate flock() using +fcntl(), with all the problems that implies. + + +1.3 Mandatory Locking As A Mount Option +--------------------------------------- + +Mandatory locking, as described in +'Documentation/filesystems/mandatory-locking.txt' was prior to this release a +general configuration option that was valid for all mounted filesystems. This +had a number of inherent dangers, not the least of which was the ability to +freeze an NFS server by asking it to read a file for which a mandatory lock +existed. + +From this release of the kernel, mandatory locking can be turned on and off +on a per-filesystem basis, using the mount options 'mand' and 'nomand'. +The default is to disallow mandatory locking. The intention is that +mandatory locking only be enabled on a local filesystem as the specific need +arises. + diff --git a/Documentation/filesystems/locks.txt b/Documentation/filesystems/locks.txt deleted file mode 100644 index 5368690f412e..000000000000 --- a/Documentation/filesystems/locks.txt +++ /dev/null @@ -1,68 +0,0 @@ - File Locking Release Notes - - Andy Walker - - 12 May 1997 - - -1. What's New? --------------- - -1.1 Broken Flock Emulation --------------------------- - -The old flock(2) emulation in the kernel was swapped for proper BSD -compatible flock(2) support in the 1.3.x series of kernels. With the -release of the 2.1.x kernel series, support for the old emulation has -been totally removed, so that we don't need to carry this baggage -forever. - -This should not cause problems for anybody, since everybody using a -2.1.x kernel should have updated their C library to a suitable version -anyway (see the file "Documentation/process/changes.rst".) 
- -1.2 Allow Mixed Locks Again ---------------------------- - -1.2.1 Typical Problems - Sendmail ---------------------------------- -Because sendmail was unable to use the old flock() emulation, many sendmail -installations use fcntl() instead of flock(). This is true of Slackware 3.0 -for example. This gave rise to some other subtle problems if sendmail was -configured to rebuild the alias file. Sendmail tried to lock the aliases.dir -file with fcntl() at the same time as the GDBM routines tried to lock this -file with flock(). With pre 1.3.96 kernels this could result in deadlocks that, -over time, or under a very heavy mail load, would eventually cause the kernel -to lock solid with deadlocked processes. - - -1.2.2 The Solution ------------------- -The solution I have chosen, after much experimentation and discussion, -is to make flock() and fcntl() locks oblivious to each other. Both can -exists, and neither will have any effect on the other. - -I wanted the two lock styles to be cooperative, but there were so many -race and deadlock conditions that the current solution was the only -practical one. It puts us in the same position as, for example, SunOS -4.1.x and several other commercial Unices. The only OS's that support -cooperative flock()/fcntl() are those that emulate flock() using -fcntl(), with all the problems that implies. - - -1.3 Mandatory Locking As A Mount Option ---------------------------------------- - -Mandatory locking, as described in -'Documentation/filesystems/mandatory-locking.txt' was prior to this release a -general configuration option that was valid for all mounted filesystems. This -had a number of inherent dangers, not the least of which was the ability to -freeze an NFS server by asking it to read a file for which a mandatory lock -existed. - -From this release of the kernel, mandatory locking can be turned on and off -on a per-filesystem basis, using the mount options 'mand' and 'nomand'. 
-The default is to disallow mandatory locking. The intention is that -mandatory locking only be enabled on a local filesystem as the specific need -arises. - -- cgit v1.2.3 From a02dcdf65bcfca1068e9c6d2cbafdab724cfbce9 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:08 +0200 Subject: docs: filesystems: convert mandatory-locking.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Use notes markups; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/aecd6259fe9f99b2c2b3440eab6a2b989125e00d.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/locks.rst | 2 +- Documentation/filesystems/mandatory-locking.rst | 188 ++++++++++++++++++++++++ Documentation/filesystems/mandatory-locking.txt | 181 ----------------------- fs/locks.c | 2 +- 5 files changed, 191 insertions(+), 183 deletions(-) create mode 100644 Documentation/filesystems/mandatory-locking.rst delete mode 100644 Documentation/filesystems/mandatory-locking.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 64535f57c3ac..df1a474ad9b9 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -29,6 +29,7 @@ algorithms work. fiemap files locks + mandatory-locking automount-support diff --git a/Documentation/filesystems/locks.rst b/Documentation/filesystems/locks.rst index 10f67fb9ce07..c5ae858b1aac 100644 --- a/Documentation/filesystems/locks.rst +++ b/Documentation/filesystems/locks.rst @@ -58,7 +58,7 @@ fcntl(), with all the problems that implies. 
--------------------------------------- Mandatory locking, as described in -'Documentation/filesystems/mandatory-locking.txt' was prior to this release a +'Documentation/filesystems/mandatory-locking.rst' was prior to this release a general configuration option that was valid for all mounted filesystems. This had a number of inherent dangers, not the least of which was the ability to freeze an NFS server by asking it to read a file for which a mandatory lock diff --git a/Documentation/filesystems/mandatory-locking.rst b/Documentation/filesystems/mandatory-locking.rst new file mode 100644 index 000000000000..9ce73544a8f0 --- /dev/null +++ b/Documentation/filesystems/mandatory-locking.rst @@ -0,0 +1,188 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================================== +Mandatory File Locking For The Linux Operating System +===================================================== + + Andy Walker + + 15 April 1996 + + (Updated September 2007) + +0. Why you should avoid mandatory locking +----------------------------------------- + +The Linux implementation is prey to a number of difficult-to-fix race +conditions which in practice make it not dependable: + + - The write system call checks for a mandatory lock only once + at its start. It is therefore possible for a lock request to + be granted after this check but before the data is modified. + A process may then see file data change even while a mandatory + lock was held. + - Similarly, an exclusive lock may be granted on a file after + the kernel has decided to proceed with a read, but before the + read has actually completed, and the reading process may see + the file data in a state which should not have been visible + to it. + - Similar races make the claimed mutual exclusion between lock + and mmap similarly unreliable. + +1. What is mandatory locking? 
+------------------------------ + +Mandatory locking is kernel enforced file locking, as opposed to the more usual +cooperative file locking used to guarantee sequential access to files among +processes. File locks are applied using the flock() and fcntl() system calls +(and the lockf() library routine which is a wrapper around fcntl().) It is +normally a process' responsibility to check for locks on a file it wishes to +update, before applying its own lock, updating the file and unlocking it again. +The most commonly used example of this (and in the case of sendmail, the most +troublesome) is access to a user's mailbox. The mail user agent and the mail +transfer agent must guard against updating the mailbox at the same time, and +prevent reading the mailbox while it is being updated. + +In a perfect world all processes would use and honour a cooperative, or +"advisory" locking scheme. However, the world isn't perfect, and there's +a lot of poorly written code out there. + +In trying to address this problem, the designers of System V UNIX came up +with a "mandatory" locking scheme, whereby the operating system kernel would +block attempts by a process to write to a file that another process holds a +"read" -or- "shared" lock on, and block attempts to both read and write to a +file that a process holds a "write " -or- "exclusive" lock on. + +The System V mandatory locking scheme was intended to have as little impact as +possible on existing user code. The scheme is based on marking individual files +as candidates for mandatory locking, and using the existing fcntl()/lockf() +interface for applying locks just as if they were normal, advisory locks. + +.. Note:: + + 1. In saying "file" in the paragraphs above I am actually not telling + the whole truth. System V locking is based on fcntl(). 
The granularity of + fcntl() is such that it allows the locking of byte ranges in files, in + addition to entire files, so the mandatory locking rules also have byte + level granularity. + + 2. POSIX.1 does not specify any scheme for mandatory locking, despite + borrowing the fcntl() locking scheme from System V. The mandatory locking + scheme is defined by the System V Interface Definition (SVID) Version 3. + +2. Marking a file for mandatory locking +--------------------------------------- + +A file is marked as a candidate for mandatory locking by setting the group-id +bit in its file mode but removing the group-execute bit. This is an otherwise +meaningless combination, and was chosen by the System V implementors so as not +to break existing user programs. + +Note that the group-id bit is usually automatically cleared by the kernel when +a setgid file is written to. This is a security measure. The kernel has been +modified to recognize the special case of a mandatory lock candidate and to +refrain from clearing this bit. Similarly the kernel has been modified not +to run mandatory lock candidates with setgid privileges. + +3. Available implementations +---------------------------- + +I have considered the implementations of mandatory locking available with +SunOS 4.1.x, Solaris 2.x and HP-UX 9.x. + +Generally I have tried to make the most sense out of the behaviour exhibited +by these three reference systems. There are many anomalies. + +All the reference systems reject all calls to open() for a file on which +another process has outstanding mandatory locks. This is in direct +contravention of SVID 3, which states that only calls to open() with the +O_TRUNC flag set should be rejected. The Linux implementation follows the SVID +definition, which is the "Right Thing", since only calls with O_TRUNC can +modify the contents of the file. + +HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not +just mandatory locks. 
That would appear to contravene POSIX.1. + +mmap() is another interesting case. All the operating systems mentioned +prevent mandatory locks from being applied to an mmap()'ed file, but HP-UX +also disallows advisory locks for such a file. SVID actually specifies the +paranoid HP-UX behaviour. + +In my opinion only MAP_SHARED mappings should be immune from locking, and then +only from mandatory locks - that is what is currently implemented. + +SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for +mandatory locks, so reads and writes to locked files always block when they +should return EAGAIN. + +I'm afraid that this is such an esoteric area that the semantics described +below are just as valid as any others, so long as the main points seem to +agree. + +4. Semantics +------------ + +1. Mandatory locks can only be applied via the fcntl()/lockf() locking + interface - in other words the System V/POSIX interface. BSD style + locks using flock() never result in a mandatory lock. + +2. If a process has locked a region of a file with a mandatory read lock, then + other processes are permitted to read from that region. If any of these + processes attempts to write to the region it will block until the lock is + released, unless the process has opened the file with the O_NONBLOCK + flag in which case the system call will return immediately with the error + status EAGAIN. + +3. If a process has locked a region of a file with a mandatory write lock, all + attempts to read or write to that region block until the lock is released, + unless a process has opened the file with the O_NONBLOCK flag in which case + the system call will return immediately with the error status EAGAIN. + +4. Calls to open() with O_TRUNC, or to creat(), on an existing file that has + any mandatory locks owned by other processes will be rejected with the + error status EAGAIN. + +5. 
Attempts to apply a mandatory lock to a file that is memory mapped and + shared (via mmap() with MAP_SHARED) will be rejected with the error status + EAGAIN. + +6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED) + that has any mandatory locks in effect will be rejected with the error status + EAGAIN. + +5. Which system calls are affected? +----------------------------------- + +Those which modify a file's contents, not just the inode. That gives read(), +write(), readv(), writev(), open(), creat(), mmap(), truncate() and +ftruncate(). truncate() and ftruncate() are considered to be "write" actions +for the purposes of mandatory locking. + +The affected region is usually defined as stretching from the current position +for the total number of bytes read or written. For the truncate calls it is +defined as the bytes of a file removed or added (we must also consider bytes +added, as a lock can specify just "the whole file", rather than a specific +range of bytes.) + +Note 3: I may have overlooked some system calls that need mandatory lock +checking in my eagerness to get this code out the door. Please let me know, or +better still fix the system calls yourself and submit a patch to me or Linus. + +6. Warning! +----------- + +Not even root can override a mandatory lock, so runaway processes can wreak +havoc if they lock crucial files. The way around it is to change the file +permissions (remove the setgid bit) before trying to read or write to it. +Of course, that might be a bit tricky if the system is hung :-( + +7. The "mand" mount option +-------------------------- +Mandatory locking is disabled on all filesystems by default, and must be +administratively enabled by mounting with "-o mand". That mount option +is only allowed if the mounting task has the CAP_SYS_ADMIN capability. + +Since kernel v4.5, it is possible to disable mandatory locking +altogether by setting CONFIG_MANDATORY_FILE_LOCKING to "n". 
A kernel +with this disabled will reject attempts to mount filesystems with the +"mand" mount option with the error status EPERM. diff --git a/Documentation/filesystems/mandatory-locking.txt b/Documentation/filesystems/mandatory-locking.txt deleted file mode 100644 index a251ca33164a..000000000000 --- a/Documentation/filesystems/mandatory-locking.txt +++ /dev/null @@ -1,181 +0,0 @@ - Mandatory File Locking For The Linux Operating System - - Andy Walker - - 15 April 1996 - (Updated September 2007) - -0. Why you should avoid mandatory locking ------------------------------------------ - -The Linux implementation is prey to a number of difficult-to-fix race -conditions which in practice make it not dependable: - - - The write system call checks for a mandatory lock only once - at its start. It is therefore possible for a lock request to - be granted after this check but before the data is modified. - A process may then see file data change even while a mandatory - lock was held. - - Similarly, an exclusive lock may be granted on a file after - the kernel has decided to proceed with a read, but before the - read has actually completed, and the reading process may see - the file data in a state which should not have been visible - to it. - - Similar races make the claimed mutual exclusion between lock - and mmap similarly unreliable. - -1. What is mandatory locking? ------------------------------- - -Mandatory locking is kernel enforced file locking, as opposed to the more usual -cooperative file locking used to guarantee sequential access to files among -processes. File locks are applied using the flock() and fcntl() system calls -(and the lockf() library routine which is a wrapper around fcntl().) It is -normally a process' responsibility to check for locks on a file it wishes to -update, before applying its own lock, updating the file and unlocking it again. 
-The most commonly used example of this (and in the case of sendmail, the most -troublesome) is access to a user's mailbox. The mail user agent and the mail -transfer agent must guard against updating the mailbox at the same time, and -prevent reading the mailbox while it is being updated. - -In a perfect world all processes would use and honour a cooperative, or -"advisory" locking scheme. However, the world isn't perfect, and there's -a lot of poorly written code out there. - -In trying to address this problem, the designers of System V UNIX came up -with a "mandatory" locking scheme, whereby the operating system kernel would -block attempts by a process to write to a file that another process holds a -"read" -or- "shared" lock on, and block attempts to both read and write to a -file that a process holds a "write " -or- "exclusive" lock on. - -The System V mandatory locking scheme was intended to have as little impact as -possible on existing user code. The scheme is based on marking individual files -as candidates for mandatory locking, and using the existing fcntl()/lockf() -interface for applying locks just as if they were normal, advisory locks. - -Note 1: In saying "file" in the paragraphs above I am actually not telling -the whole truth. System V locking is based on fcntl(). The granularity of -fcntl() is such that it allows the locking of byte ranges in files, in addition -to entire files, so the mandatory locking rules also have byte level -granularity. - -Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite -borrowing the fcntl() locking scheme from System V. The mandatory locking -scheme is defined by the System V Interface Definition (SVID) Version 3. - -2. Marking a file for mandatory locking ---------------------------------------- - -A file is marked as a candidate for mandatory locking by setting the group-id -bit in its file mode but removing the group-execute bit. 
This is an otherwise -meaningless combination, and was chosen by the System V implementors so as not -to break existing user programs. - -Note that the group-id bit is usually automatically cleared by the kernel when -a setgid file is written to. This is a security measure. The kernel has been -modified to recognize the special case of a mandatory lock candidate and to -refrain from clearing this bit. Similarly the kernel has been modified not -to run mandatory lock candidates with setgid privileges. - -3. Available implementations ----------------------------- - -I have considered the implementations of mandatory locking available with -SunOS 4.1.x, Solaris 2.x and HP-UX 9.x. - -Generally I have tried to make the most sense out of the behaviour exhibited -by these three reference systems. There are many anomalies. - -All the reference systems reject all calls to open() for a file on which -another process has outstanding mandatory locks. This is in direct -contravention of SVID 3, which states that only calls to open() with the -O_TRUNC flag set should be rejected. The Linux implementation follows the SVID -definition, which is the "Right Thing", since only calls with O_TRUNC can -modify the contents of the file. - -HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not -just mandatory locks. That would appear to contravene POSIX.1. - -mmap() is another interesting case. All the operating systems mentioned -prevent mandatory locks from being applied to an mmap()'ed file, but HP-UX -also disallows advisory locks for such a file. SVID actually specifies the -paranoid HP-UX behaviour. - -In my opinion only MAP_SHARED mappings should be immune from locking, and then -only from mandatory locks - that is what is currently implemented. - -SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for -mandatory locks, so reads and writes to locked files always block when they -should return EAGAIN. 
- -I'm afraid that this is such an esoteric area that the semantics described -below are just as valid as any others, so long as the main points seem to -agree. - -4. Semantics ------------- - -1. Mandatory locks can only be applied via the fcntl()/lockf() locking - interface - in other words the System V/POSIX interface. BSD style - locks using flock() never result in a mandatory lock. - -2. If a process has locked a region of a file with a mandatory read lock, then - other processes are permitted to read from that region. If any of these - processes attempts to write to the region it will block until the lock is - released, unless the process has opened the file with the O_NONBLOCK - flag in which case the system call will return immediately with the error - status EAGAIN. - -3. If a process has locked a region of a file with a mandatory write lock, all - attempts to read or write to that region block until the lock is released, - unless a process has opened the file with the O_NONBLOCK flag in which case - the system call will return immediately with the error status EAGAIN. - -4. Calls to open() with O_TRUNC, or to creat(), on a existing file that has - any mandatory locks owned by other processes will be rejected with the - error status EAGAIN. - -5. Attempts to apply a mandatory lock to a file that is memory mapped and - shared (via mmap() with MAP_SHARED) will be rejected with the error status - EAGAIN. - -6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED) - that has any mandatory locks in effect will be rejected with the error status - EAGAIN. - -5. Which system calls are affected? ------------------------------------ - -Those which modify a file's contents, not just the inode. That gives read(), -write(), readv(), writev(), open(), creat(), mmap(), truncate() and -ftruncate(). truncate() and ftruncate() are considered to be "write" actions -for the purposes of mandatory locking. 
- -The affected region is usually defined as stretching from the current position -for the total number of bytes read or written. For the truncate calls it is -defined as the bytes of a file removed or added (we must also consider bytes -added, as a lock can specify just "the whole file", rather than a specific -range of bytes.) - -Note 3: I may have overlooked some system calls that need mandatory lock -checking in my eagerness to get this code out the door. Please let me know, or -better still fix the system calls yourself and submit a patch to me or Linus. - -6. Warning! ------------ - -Not even root can override a mandatory lock, so runaway processes can wreak -havoc if they lock crucial files. The way around it is to change the file -permissions (remove the setgid bit) before trying to read or write to it. -Of course, that might be a bit tricky if the system is hung :-( - -7. The "mand" mount option --------------------------- -Mandatory locking is disabled on all filesystems by default, and must be -administratively enabled by mounting with "-o mand". That mount option -is only allowed if the mounting task has the CAP_SYS_ADMIN capability. - -Since kernel v4.5, it is possible to disable mandatory locking -altogether by setting CONFIG_MANDATORY_FILE_LOCKING to "n". A kernel -with this disabled will reject attempts to mount filesystems with the -"mand" mount option with the error status EPERM. diff --git a/fs/locks.c b/fs/locks.c index b8a31c1c4fff..1d4f4d5da704 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -61,7 +61,7 @@ * * Initial implementation of mandatory locks. SunOS turned out to be * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/filesystems/mandatory-locking.txt' for details. + * See 'Documentation/filesystems/mandatory-locking.rst' for details. * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. * * Don't allow mandatory locks on mmap()'ed files. 
Added simple functions to -- cgit v1.2.3 From 791a17ee19736fc6d4e35c4bf9f8efd1001be77c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:09 +0200 Subject: docs: filesystems: convert mount_api.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add table markups; - Add lists markups; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/32332c1659a28c22561cb5e64162c959856066b4.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/mount_api.rst | 825 ++++++++++++++++++++++++++++++++ Documentation/filesystems/mount_api.txt | 724 ---------------------------- include/linux/fs_context.h | 2 +- include/linux/lsm_hooks.h | 2 +- 5 files changed, 828 insertions(+), 726 deletions(-) create mode 100644 Documentation/filesystems/mount_api.rst delete mode 100644 Documentation/filesystems/mount_api.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index df1a474ad9b9..4532b4d2631f 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -30,6 +30,7 @@ algorithms work. files locks mandatory-locking + mount_api automount-support diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst new file mode 100644 index 000000000000..dea22d64f060 --- /dev/null +++ b/Documentation/filesystems/mount_api.rst @@ -0,0 +1,825 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Filesystem Mount API +==================== + +.. CONTENTS + + (1) Overview. + + (2) The filesystem context. + + (3) The filesystem context operations. + + (4) Filesystem context security. + + (5) VFS filesystem context API. + + (6) Superblock creation helpers. + + (7) Parameter description. + + (8) Parameter helper functions. 
+ + +Overview +======== + +The creation of new mounts is now to be done in a multistep process: + + (1) Create a filesystem context. + + (2) Parse the parameters and attach them to the context. Parameters are + expected to be passed individually from userspace, though legacy binary + parameters can also be handled. + + (3) Validate and pre-process the context. + + (4) Get or create a superblock and mountable root. + + (5) Perform the mount. + + (6) Return an error message attached to the context. + + (7) Destroy the context. + +To support this, the file_system_type struct gains two new fields:: + + int (*init_fs_context)(struct fs_context *fc); + const struct fs_parameter_description *parameters; + +The first is invoked to set up the filesystem-specific parts of a filesystem +context, including the additional space, and the second points to the +parameter description for validation at registration time and querying by a +future system call. + +Note that security initialisation is done *after* the filesystem is called so +that the namespaces may be adjusted first. + + +The Filesystem context +====================== + +The creation and reconfiguration of a superblock is governed by a filesystem +context. This is represented by the fs_context structure:: + + struct fs_context { + const struct fs_context_operations *ops; + struct file_system_type *fs_type; + void *fs_private; + struct dentry *root; + struct user_namespace *user_ns; + struct net *net_ns; + const struct cred *cred; + char *source; + char *subtype; + void *security; + void *s_fs_info; + unsigned int sb_flags; + unsigned int sb_flags_mask; + unsigned int s_iflags; + unsigned int lsm_flags; + enum fs_context_purpose purpose:8; + ... + }; + +The fs_context fields are as follows: + + * :: + + const struct fs_context_operations *ops + + These are operations that can be done on a filesystem context (see + below). This must be set by the ->init_fs_context() file_system_type + operation. 
+ + * :: + + struct file_system_type *fs_type + + A pointer to the file_system_type of the filesystem that is being + constructed or reconfigured. This retains a reference on the type owner. + + * :: + + void *fs_private + + A pointer to the file system's private data. This is where the filesystem + will need to store any options it parses. + + * :: + + struct dentry *root + + A pointer to the root of the mountable tree (and indirectly, the + superblock thereof). This is filled in by the ->get_tree() op. If this + is set, an active reference on root->d_sb must also be held. + + * :: + + struct user_namespace *user_ns + struct net *net_ns + + There are a subset of the namespaces in use by the invoking process. They + retain references on each namespace. The subscribed namespaces may be + replaced by the filesystem to reflect other sources, such as the parent + mount superblock on an automount. + + * :: + + const struct cred *cred + + The mounter's credentials. This retains a reference on the credentials. + + * :: + + char *source + + This specifies the source. It may be a block device (e.g. /dev/sda1) or + something more exotic, such as the "host:/path" that NFS desires. + + * :: + + char *subtype + + This is a string to be added to the type displayed in /proc/mounts to + qualify it (used by FUSE). This is available for the filesystem to set if + desired. + + * :: + + void *security + + A place for the LSMs to hang their security data for the superblock. The + relevant security operations are described below. + + * :: + + void *s_fs_info + + The proposed s_fs_info for a new superblock, set in the superblock by + sget_fc(). This can be used to distinguish superblocks. + + * :: + + unsigned int sb_flags + unsigned int sb_flags_mask + + Which bits SB_* flags are to be set/cleared in super_block::s_flags. + + * :: + + unsigned int s_iflags + + These will be bitwise-OR'd with s->s_iflags when a superblock is created. 
+ + * :: + + enum fs_context_purpose + + This indicates the purpose for which the context is intended. The + available values are: + + ========================== ====================================== + FS_CONTEXT_FOR_MOUNT New superblock for explicit mount + FS_CONTEXT_FOR_SUBMOUNT New automatic submount of extant mount + FS_CONTEXT_FOR_RECONFIGURE Change an existing mount + ========================== ====================================== + +The mount context is created by calling vfs_new_fs_context() or +vfs_dup_fs_context() and is destroyed with put_fs_context(). Note that the +structure is not refcounted. + +VFS, security and filesystem mount options are set individually with +vfs_parse_mount_option(). Options provided by the old mount(2) system call as +a page of data can be parsed with generic_parse_monolithic(). + +When mounting, the filesystem is allowed to take data from any of the pointers +and attach it to the superblock (or whatever), provided it clears the pointer +in the mount context. + +The filesystem is also allowed to allocate resources and pin them with the +mount context. For instance, NFS might pin the appropriate protocol version +module. + + +The Filesystem Context Operations +================================= + +The filesystem context points to a table of operations:: + + struct fs_context_operations { + void (*free)(struct fs_context *fc); + int (*dup)(struct fs_context *fc, struct fs_context *src_fc); + int (*parse_param)(struct fs_context *fc, + struct fs_parameter *param); + int (*parse_monolithic)(struct fs_context *fc, void *data); + int (*get_tree)(struct fs_context *fc); + int (*reconfigure)(struct fs_context *fc); + }; + +These operations are invoked by the various stages of the mount procedure to +manage the filesystem context. They are as follows: + + * :: + + void (*free)(struct fs_context *fc); + + Called to clean up the filesystem-specific part of the filesystem context + when the context is destroyed. 
It should be aware that parts of the + context may have been removed and NULL'd out by ->get_tree(). + + * :: + + int (*dup)(struct fs_context *fc, struct fs_context *src_fc); + + Called when a filesystem context has been duplicated to duplicate the + filesystem-private data. An error may be returned to indicate failure to + do this. + + .. Warning:: + + Note that even if this fails, put_fs_context() will be called + immediately thereafter, so ->dup() *must* make the + filesystem-private data safe for ->free(). + + * :: + + int (*parse_param)(struct fs_context *fc, + struct fs_parameter *param); + + Called when a parameter is being added to the filesystem context. param + points to the key name and maybe a value object. VFS-specific options + will have been weeded out and fc->sb_flags updated in the context. + Security options will also have been weeded out and fc->security updated. + + The parameter can be parsed with fs_parse() and fs_lookup_param(). Note + that the source(s) are presented as parameters named "source". + + If successful, 0 should be returned or a negative error code otherwise. + + * :: + + int (*parse_monolithic)(struct fs_context *fc, void *data); + + Called when the mount(2) system call is invoked to pass the entire data + page in one go. If this is expected to be just a list of "key[=val]" + items separated by commas, then this may be set to NULL. + + The return value is as for ->parse_param(). + + If the filesystem (e.g. NFS) needs to examine the data first and then + finds it's the standard key-val list then it may pass it off to + generic_parse_monolithic(). + + * :: + + int (*get_tree)(struct fs_context *fc); + + Called to get or create the mountable root and superblock, using the + information stored in the filesystem context (reconfiguration goes via a + different vector). It may detach any resources it desires from the + filesystem context and transfer them to the superblock it creates. 
+ + On success it should set fc->root to the mountable root and return 0. In + the case of an error, it should return a negative error code. + + The phase on a userspace-driven context will be set to only allow this to + be called once on any particular context. + + * :: + + int (*reconfigure)(struct fs_context *fc); + + Called to effect reconfiguration of a superblock using information stored + in the filesystem context. It may detach any resources it desires from + the filesystem context and transfer them to the superblock. The + superblock can be found from fc->root->d_sb. + + On success it should return 0. In the case of an error, it should return + a negative error code. + + .. Note:: reconfigure is intended as a replacement for remount_fs. + + +Filesystem context Security +=========================== + +The filesystem context contains a security pointer that the LSMs can use for +building up a security context for the superblock to be mounted. There are a +number of operations used by the new mount code for this purpose: + + * :: + + int security_fs_context_alloc(struct fs_context *fc, + struct dentry *reference); + + Called to initialise fc->security (which is preset to NULL) and allocate + any resources needed. It should return 0 on success or a negative error + code on failure. + + reference will be non-NULL if the context is being created for superblock + reconfiguration (FS_CONTEXT_FOR_RECONFIGURE) in which case it indicates + the root dentry of the superblock to be reconfigured. It will also be + non-NULL in the case of a submount (FS_CONTEXT_FOR_SUBMOUNT) in which case + it indicates the automount point. + + * :: + + int security_fs_context_dup(struct fs_context *fc, + struct fs_context *src_fc); + + Called to initialise fc->security (which is preset to NULL) and allocate + any resources needed. The original filesystem context is pointed to by + src_fc and may be used for reference. It should return 0 on success or a + negative error code on failure. 
+ + * :: + + void security_fs_context_free(struct fs_context *fc); + + Called to clean up anything attached to fc->security. Note that the + contents may have been transferred to a superblock and the pointer cleared + during get_tree. + + * :: + + int security_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param); + + Called for each mount parameter, including the source. The arguments are + as for the ->parse_param() method. It should return 0 to indicate that + the parameter should be passed on to the filesystem, 1 to indicate that + the parameter should be discarded or an error to indicate that the + parameter should be rejected. + + The value pointed to by param may be modified (if a string) or stolen + (provided the value pointer is NULL'd out). If it is stolen, 1 must be + returned to prevent it being passed to the filesystem. + + * :: + + int security_fs_context_validate(struct fs_context *fc); + + Called after all the options have been parsed to validate the collection + as a whole and to do any necessary allocation so that + security_sb_get_tree() and security_sb_reconfigure() are less likely to + fail. It should return 0 or a negative error code. + + In the case of reconfiguration, the target superblock will be accessible + via fc->root. + + * :: + + int security_sb_get_tree(struct fs_context *fc); + + Called during the mount procedure to verify that the specified superblock + is allowed to be mounted and to transfer the security data there. It + should return 0 or a negative error code. + + * :: + + void security_sb_reconfigure(struct fs_context *fc); + + Called to apply any reconfiguration to an LSM's context. It must not + fail. Error checking and resource allocation must be done in advance by + the parameter parsing and validation hooks. 
+ + * :: + + int security_sb_mountpoint(struct fs_context *fc, + struct path *mountpoint, + unsigned int mnt_flags); + + Called during the mount procedure to verify that the root dentry attached + to the context is permitted to be attached to the specified mountpoint. + It should return 0 on success or a negative error code on failure. + + +VFS Filesystem context API +========================== + +There are four operations for creating a filesystem context and one for +destroying a context: + + * :: + + struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, + unsigned int sb_flags); + + Allocate a filesystem context for the purpose of setting up a new mount, + whether that be with a new superblock or sharing an existing one. This + sets the superblock flags, initialises the security and calls + fs_type->init_fs_context() to initialise the filesystem private data. + + fs_type specifies the filesystem type that will manage the context and + sb_flags presets the superblock flags stored therein. + + * :: + + struct fs_context *fs_context_for_reconfigure( + struct dentry *dentry, + unsigned int sb_flags, + unsigned int sb_flags_mask); + + Allocate a filesystem context for the purpose of reconfiguring an + existing superblock. dentry provides a reference to the superblock to be + configured. sb_flags and sb_flags_mask indicate which superblock flags + need changing and to what. + + * :: + + struct fs_context *fs_context_for_submount( + struct file_system_type *fs_type, + struct dentry *reference); + + Allocate a filesystem context for the purpose of creating a new mount for + an automount point or other derived superblock. fs_type specifies the + filesystem type that will manage the context and the reference dentry + supplies the parameters. Namespaces are propagated from the reference + dentry's superblock also. + + Note that it's not a requirement that the reference dentry be of the same + filesystem type as fs_type. 
+ + * :: + + struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc); + + Duplicate a filesystem context, copying any options noted and duplicating + or additionally referencing any resources held therein. This is available + for use where a filesystem has to get a mount within a mount, such as NFS4 + does by internally mounting the root of the target server and then doing a + private pathwalk to the target directory. + + The purpose in the new context is inherited from the old one. + + * :: + + void put_fs_context(struct fs_context *fc); + + Destroy a filesystem context, releasing any resources it holds. This + calls the ->free() operation. This is intended to be called by anyone who + created a filesystem context. + + .. Warning:: + + filesystem contexts are not refcounted, so this causes unconditional + destruction. + +In all the above operations, apart from the put op, the return is a mount +context pointer or a negative error code. + +For the remaining operations, if an error occurs, a negative error code will be +returned. + + * :: + + int vfs_parse_fs_param(struct fs_context *fc, + struct fs_parameter *param); + + Supply a single mount parameter to the filesystem context. This includes + the specification of the source/device which is specified as the "source" + parameter (which may be specified multiple times if the filesystem + supports that). + + param specifies the parameter key name and the value. The parameter is + first checked to see if it corresponds to a standard mount flag (in which + case it is used to set an SB_xxx flag and consumed) or a security option + (in which case the LSM consumes it) before it is passed on to the + filesystem. 
+ + The parameter value is typed and can be one of: + + ==================== ============================= + fs_value_is_flag Parameter not given a value + fs_value_is_string Value is a string + fs_value_is_blob Value is a binary blob + fs_value_is_filename Value is a filename* + dirfd + fs_value_is_file Value is an open file (file*) + ==================== ============================= + + If there is a value, that value is stored in a union in the struct in one + of param->{string,blob,name,file}. Note that the function may steal and + clear the pointer, but then becomes responsible for disposing of the + object. + + * :: + + int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value, size_t v_size); + + A wrapper around vfs_parse_fs_param() that copies the value string it is + passed. + + * :: + + int generic_parse_monolithic(struct fs_context *fc, void *data); + + Parse a sys_mount() data page, assuming the form to be a text list + consisting of key[=val] options separated by commas. Each item in the + list is passed to vfs_mount_option(). This is the default when the + ->parse_monolithic() method is NULL. + + * :: + + int vfs_get_tree(struct fs_context *fc); + + Get or create the mountable root and superblock, using the parameters in + the filesystem context to select/configure the superblock. This invokes + the ->get_tree() method. + + * :: + + struct vfsmount *vfs_create_mount(struct fs_context *fc); + + Create a mount given the parameters in the specified filesystem context. + Note that this does not attach the mount to anything. + + +Superblock Creation Helpers +=========================== + +A number of VFS helpers are available for use by filesystems for the creation +or looking up of superblocks. + + * :: + + struct super_block * + sget_fc(struct fs_context *fc, + int (*test)(struct super_block *sb, struct fs_context *fc), + int (*set)(struct super_block *sb, struct fs_context *fc)); + + This is the core routine. 
If test is non-NULL, it searches for an + existing superblock matching the criteria held in the fs_context, using + the test function to match them. If no match is found, a new superblock + is created and the set function is called to set it up. + + Prior to the set function being called, fc->s_fs_info will be transferred + to sb->s_fs_info - and fc->s_fs_info will be cleared if set returns + success (ie. 0). + +The following helpers all wrap sget_fc(): + + * :: + + int vfs_get_super(struct fs_context *fc, + enum vfs_get_super_keying keying, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) + + This creates/looks up a deviceless superblock. The keying indicates how + many superblocks of this type may exist and in what manner they may be + shared: + + (1) vfs_get_single_super + + Only one such superblock may exist in the system. Any further + attempt to get a new superblock gets this one (and any parameter + differences are ignored). + + (2) vfs_get_keyed_super + + Multiple superblocks of this type may exist and they're keyed on + their s_fs_info pointer (for example this may refer to a + namespace). + + (3) vfs_get_independent_super + + Multiple independent superblocks of this type may exist. This + function never matches an existing one and always creates a new + one. + + +===================== +PARAMETER DESCRIPTION +===================== + +Parameters are described using structures defined in linux/fs_parser.h. 
+There's a core description struct that links everything together:: + + struct fs_parameter_description { + const struct fs_parameter_spec *specs; + const struct fs_parameter_enum *enums; + }; + +For example:: + + enum { + Opt_autocell, + Opt_bar, + Opt_dyn, + Opt_foo, + Opt_source, + }; + + static const struct fs_parameter_description afs_fs_parameters = { + .specs = afs_param_specs, + .enums = afs_param_enums, + }; + +The members are as follows: + + (1) :: + + const struct fs_parameter_specification *specs; + + Table of parameter specifications, terminated with a null entry, where the + entries are of type:: + + struct fs_parameter_spec { + const char *name; + u8 opt; + enum fs_parameter_type type:8; + unsigned short flags; + }; + + The 'name' field is a string to match exactly to the parameter key (no + wildcards, patterns and no case-independence) and 'opt' is the value that + will be returned by the fs_parser() function in the case of a successful + match. + + The 'type' field indicates the desired value type and must be one of: + + ======================= ======================= ===================== + TYPE NAME EXPECTED VALUE RESULT IN + ======================= ======================= ===================== + fs_param_is_flag No value n/a + fs_param_is_bool Boolean value result->boolean + fs_param_is_u32 32-bit unsigned int result->uint_32 + fs_param_is_u32_octal 32-bit octal int result->uint_32 + fs_param_is_u32_hex 32-bit hex int result->uint_32 + fs_param_is_s32 32-bit signed int result->int_32 + fs_param_is_u64 64-bit unsigned int result->uint_64 + fs_param_is_enum Enum value name result->uint_32 + fs_param_is_string Arbitrary string param->string + fs_param_is_blob Binary blob param->blob + fs_param_is_blockdev Blockdev path * Needs lookup + fs_param_is_path Path * Needs lookup + fs_param_is_fd File descriptor result->int_32 + ======================= ======================= ===================== + + Note that if the value is of fs_param_is_bool type, 
fs_parse() will try + to match any string value against "0", "1", "no", "yes", "false", "true". + + Each parameter can also be qualified with 'flags': + + ======================= ================================================ + fs_param_v_optional The value is optional + fs_param_neg_with_no result->negated set if key is prefixed with "no" + fs_param_neg_with_empty result->negated set if value is "" + fs_param_deprecated The parameter is deprecated. + ======================= ================================================ + + These are wrapped with a number of convenience wrappers: + + ======================= =============================================== + MACRO SPECIFIES + ======================= =============================================== + fsparam_flag() fs_param_is_flag + fsparam_flag_no() fs_param_is_flag, fs_param_neg_with_no + fsparam_bool() fs_param_is_bool + fsparam_u32() fs_param_is_u32 + fsparam_u32oct() fs_param_is_u32_octal + fsparam_u32hex() fs_param_is_u32_hex + fsparam_s32() fs_param_is_s32 + fsparam_u64() fs_param_is_u64 + fsparam_enum() fs_param_is_enum + fsparam_string() fs_param_is_string + fsparam_blob() fs_param_is_blob + fsparam_bdev() fs_param_is_blockdev + fsparam_path() fs_param_is_path + fsparam_fd() fs_param_is_fd + ======================= =============================================== + + all of which take two arguments, name string and option number - for + example:: + + static const struct fs_parameter_spec afs_param_specs[] = { + fsparam_flag ("autocell", Opt_autocell), + fsparam_flag ("dyn", Opt_dyn), + fsparam_string ("source", Opt_source), + fsparam_flag_no ("foo", Opt_foo), + {} + }; + + An additional macro, __fsparam() is provided that takes an additional pair + of arguments to specify the type and the flags for anything that doesn't + match one of the above macros. + + (2) :: + + const struct fs_parameter_enum *enums; + + Table of enum value names to integer mappings, terminated with a null + entry. 
This is of type:: + + struct fs_parameter_enum { + u8 opt; + char name[14]; + u8 value; + }; + + Where the array is an unsorted list of { parameter ID, name }-keyed + elements that indicate the value to map to, e.g.:: + + static const struct fs_parameter_enum afs_param_enums[] = { + { Opt_bar, "x", 1}, + { Opt_bar, "y", 23}, + { Opt_bar, "z", 42}, + }; + + If a parameter of type fs_param_is_enum is encountered, fs_parse() will + try to look the value up in the enum table and the result will be stored + in the parse result. + +The parser should be pointed to by the parser pointer in the file_system_type +struct as this will provide validation on registration (if +CONFIG_VALIDATE_FS_PARSER=y) and will allow the description to be queried from +userspace using the fsinfo() syscall. + + +Parameter Helper Functions +========================== + +A number of helper functions are provided to help a filesystem or an LSM +process the parameters it is given. + + * :: + + int lookup_constant(const struct constant_table tbl[], + const char *name, int not_found); + + Look up a constant by name in a table of name -> integer mappings. The + table is an array of elements of the following type:: + + struct constant_table { + const char *name; + int value; + }; + + If a match is found, the corresponding value is returned. If a match + isn't found, the not_found value is returned instead. + + * :: + + bool validate_constant_table(const struct constant_table *tbl, + size_t tbl_size, + int low, int high, int special); + + Validate a constant table. Checks that all the elements are appropriately + ordered, that there are no duplicates and that the values are between low + and high inclusive, though provision is made for one allowable special + value outside of that range. If no special value is required, special + should just be set to lie inside the low-to-high range. + + If all is good, true is returned. If the table is invalid, errors are + logged to dmesg and false is returned. 
+ + * :: + + bool fs_validate_description(const struct fs_parameter_description *desc); + + This performs some validation checks on a parameter description. It + returns true if the description is good and false if it is not. It will + log errors to dmesg if validation fails. + + * :: + + int fs_parse(struct fs_context *fc, + const struct fs_parameter_description *desc, + struct fs_parameter *param, + struct fs_parse_result *result); + + This is the main interpreter of parameters. It uses the parameter + description to look up a parameter by key name and to convert that to an + option number (which it returns). + + If successful, and if the parameter type indicates the result is a + boolean, integer or enum type, the value is converted by this function and + the result stored in result->{boolean,int_32,uint_32,uint_64}. + + If a match isn't initially made, the key is prefixed with "no" and no + value is present then an attempt will be made to look up the key with the + prefix removed. If this matches a parameter for which the type has flag + fs_param_neg_with_no set, then a match will be made and result->negated + will be set to true. + + If the parameter isn't matched, -ENOPARAM will be returned; if the + parameter is matched, but the value is erroneous, -EINVAL will be + returned; otherwise the parameter's option number will be returned. + + * :: + + int fs_lookup_param(struct fs_context *fc, + struct fs_parameter *value, + bool want_bdev, + struct path *_path); + + This takes a parameter that carries a string or filename type and attempts + to do a path lookup on it. If the parameter expects a blockdev, a check + is made that the inode actually represents one. + + Returns 0 if successful and ``*_path`` will be set; returns a negative + error code if not. 
diff --git a/Documentation/filesystems/mount_api.txt b/Documentation/filesystems/mount_api.txt deleted file mode 100644 index 87c14bbb2b35..000000000000 --- a/Documentation/filesystems/mount_api.txt +++ /dev/null @@ -1,724 +0,0 @@ - ==================== - FILESYSTEM MOUNT API - ==================== - -CONTENTS - - (1) Overview. - - (2) The filesystem context. - - (3) The filesystem context operations. - - (4) Filesystem context security. - - (5) VFS filesystem context API. - - (6) Superblock creation helpers. - - (7) Parameter description. - - (8) Parameter helper functions. - - -======== -OVERVIEW -======== - -The creation of new mounts is now to be done in a multistep process: - - (1) Create a filesystem context. - - (2) Parse the parameters and attach them to the context. Parameters are - expected to be passed individually from userspace, though legacy binary - parameters can also be handled. - - (3) Validate and pre-process the context. - - (4) Get or create a superblock and mountable root. - - (5) Perform the mount. - - (6) Return an error message attached to the context. - - (7) Destroy the context. - -To support this, the file_system_type struct gains two new fields: - - int (*init_fs_context)(struct fs_context *fc); - const struct fs_parameter_description *parameters; - -The first is invoked to set up the filesystem-specific parts of a filesystem -context, including the additional space, and the second points to the -parameter description for validation at registration time and querying by a -future system call. - -Note that security initialisation is done *after* the filesystem is called so -that the namespaces may be adjusted first. - - -====================== -THE FILESYSTEM CONTEXT -====================== - -The creation and reconfiguration of a superblock is governed by a filesystem -context. 
This is represented by the fs_context structure: - - struct fs_context { - const struct fs_context_operations *ops; - struct file_system_type *fs_type; - void *fs_private; - struct dentry *root; - struct user_namespace *user_ns; - struct net *net_ns; - const struct cred *cred; - char *source; - char *subtype; - void *security; - void *s_fs_info; - unsigned int sb_flags; - unsigned int sb_flags_mask; - unsigned int s_iflags; - unsigned int lsm_flags; - enum fs_context_purpose purpose:8; - ... - }; - -The fs_context fields are as follows: - - (*) const struct fs_context_operations *ops - - These are operations that can be done on a filesystem context (see - below). This must be set by the ->init_fs_context() file_system_type - operation. - - (*) struct file_system_type *fs_type - - A pointer to the file_system_type of the filesystem that is being - constructed or reconfigured. This retains a reference on the type owner. - - (*) void *fs_private - - A pointer to the file system's private data. This is where the filesystem - will need to store any options it parses. - - (*) struct dentry *root - - A pointer to the root of the mountable tree (and indirectly, the - superblock thereof). This is filled in by the ->get_tree() op. If this - is set, an active reference on root->d_sb must also be held. - - (*) struct user_namespace *user_ns - (*) struct net *net_ns - - There are a subset of the namespaces in use by the invoking process. They - retain references on each namespace. The subscribed namespaces may be - replaced by the filesystem to reflect other sources, such as the parent - mount superblock on an automount. - - (*) const struct cred *cred - - The mounter's credentials. This retains a reference on the credentials. - - (*) char *source - - This specifies the source. It may be a block device (e.g. /dev/sda1) or - something more exotic, such as the "host:/path" that NFS desires. 
- - (*) char *subtype - - This is a string to be added to the type displayed in /proc/mounts to - qualify it (used by FUSE). This is available for the filesystem to set if - desired. - - (*) void *security - - A place for the LSMs to hang their security data for the superblock. The - relevant security operations are described below. - - (*) void *s_fs_info - - The proposed s_fs_info for a new superblock, set in the superblock by - sget_fc(). This can be used to distinguish superblocks. - - (*) unsigned int sb_flags - (*) unsigned int sb_flags_mask - - Which bits SB_* flags are to be set/cleared in super_block::s_flags. - - (*) unsigned int s_iflags - - These will be bitwise-OR'd with s->s_iflags when a superblock is created. - - (*) enum fs_context_purpose - - This indicates the purpose for which the context is intended. The - available values are: - - FS_CONTEXT_FOR_MOUNT, -- New superblock for explicit mount - FS_CONTEXT_FOR_SUBMOUNT -- New automatic submount of extant mount - FS_CONTEXT_FOR_RECONFIGURE -- Change an existing mount - -The mount context is created by calling vfs_new_fs_context() or -vfs_dup_fs_context() and is destroyed with put_fs_context(). Note that the -structure is not refcounted. - -VFS, security and filesystem mount options are set individually with -vfs_parse_mount_option(). Options provided by the old mount(2) system call as -a page of data can be parsed with generic_parse_monolithic(). - -When mounting, the filesystem is allowed to take data from any of the pointers -and attach it to the superblock (or whatever), provided it clears the pointer -in the mount context. - -The filesystem is also allowed to allocate resources and pin them with the -mount context. For instance, NFS might pin the appropriate protocol version -module. 
- - -================================= -THE FILESYSTEM CONTEXT OPERATIONS -================================= - -The filesystem context points to a table of operations: - - struct fs_context_operations { - void (*free)(struct fs_context *fc); - int (*dup)(struct fs_context *fc, struct fs_context *src_fc); - int (*parse_param)(struct fs_context *fc, - struct struct fs_parameter *param); - int (*parse_monolithic)(struct fs_context *fc, void *data); - int (*get_tree)(struct fs_context *fc); - int (*reconfigure)(struct fs_context *fc); - }; - -These operations are invoked by the various stages of the mount procedure to -manage the filesystem context. They are as follows: - - (*) void (*free)(struct fs_context *fc); - - Called to clean up the filesystem-specific part of the filesystem context - when the context is destroyed. It should be aware that parts of the - context may have been removed and NULL'd out by ->get_tree(). - - (*) int (*dup)(struct fs_context *fc, struct fs_context *src_fc); - - Called when a filesystem context has been duplicated to duplicate the - filesystem-private data. An error may be returned to indicate failure to - do this. - - [!] Note that even if this fails, put_fs_context() will be called - immediately thereafter, so ->dup() *must* make the - filesystem-private data safe for ->free(). - - (*) int (*parse_param)(struct fs_context *fc, - struct struct fs_parameter *param); - - Called when a parameter is being added to the filesystem context. param - points to the key name and maybe a value object. VFS-specific options - will have been weeded out and fc->sb_flags updated in the context. - Security options will also have been weeded out and fc->security updated. - - The parameter can be parsed with fs_parse() and fs_lookup_param(). Note - that the source(s) are presented as parameters named "source". - - If successful, 0 should be returned or a negative error code otherwise. 
- - (*) int (*parse_monolithic)(struct fs_context *fc, void *data); - - Called when the mount(2) system call is invoked to pass the entire data - page in one go. If this is expected to be just a list of "key[=val]" - items separated by commas, then this may be set to NULL. - - The return value is as for ->parse_param(). - - If the filesystem (e.g. NFS) needs to examine the data first and then - finds it's the standard key-val list then it may pass it off to - generic_parse_monolithic(). - - (*) int (*get_tree)(struct fs_context *fc); - - Called to get or create the mountable root and superblock, using the - information stored in the filesystem context (reconfiguration goes via a - different vector). It may detach any resources it desires from the - filesystem context and transfer them to the superblock it creates. - - On success it should set fc->root to the mountable root and return 0. In - the case of an error, it should return a negative error code. - - The phase on a userspace-driven context will be set to only allow this to - be called once on any particular context. - - (*) int (*reconfigure)(struct fs_context *fc); - - Called to effect reconfiguration of a superblock using information stored - in the filesystem context. It may detach any resources it desires from - the filesystem context and transfer them to the superblock. The - superblock can be found from fc->root->d_sb. - - On success it should return 0. In the case of an error, it should return - a negative error code. - - [NOTE] reconfigure is intended as a replacement for remount_fs. - - -=========================== -FILESYSTEM CONTEXT SECURITY -=========================== - -The filesystem context contains a security pointer that the LSMs can use for -building up a security context for the superblock to be mounted. 
There are a -number of operations used by the new mount code for this purpose: - - (*) int security_fs_context_alloc(struct fs_context *fc, - struct dentry *reference); - - Called to initialise fc->security (which is preset to NULL) and allocate - any resources needed. It should return 0 on success or a negative error - code on failure. - - reference will be non-NULL if the context is being created for superblock - reconfiguration (FS_CONTEXT_FOR_RECONFIGURE) in which case it indicates - the root dentry of the superblock to be reconfigured. It will also be - non-NULL in the case of a submount (FS_CONTEXT_FOR_SUBMOUNT) in which case - it indicates the automount point. - - (*) int security_fs_context_dup(struct fs_context *fc, - struct fs_context *src_fc); - - Called to initialise fc->security (which is preset to NULL) and allocate - any resources needed. The original filesystem context is pointed to by - src_fc and may be used for reference. It should return 0 on success or a - negative error code on failure. - - (*) void security_fs_context_free(struct fs_context *fc); - - Called to clean up anything attached to fc->security. Note that the - contents may have been transferred to a superblock and the pointer cleared - during get_tree. - - (*) int security_fs_context_parse_param(struct fs_context *fc, - struct fs_parameter *param); - - Called for each mount parameter, including the source. The arguments are - as for the ->parse_param() method. It should return 0 to indicate that - the parameter should be passed on to the filesystem, 1 to indicate that - the parameter should be discarded or an error to indicate that the - parameter should be rejected. - - The value pointed to by param may be modified (if a string) or stolen - (provided the value pointer is NULL'd out). If it is stolen, 1 must be - returned to prevent it being passed to the filesystem. 
- - (*) int security_fs_context_validate(struct fs_context *fc); - - Called after all the options have been parsed to validate the collection - as a whole and to do any necessary allocation so that - security_sb_get_tree() and security_sb_reconfigure() are less likely to - fail. It should return 0 or a negative error code. - - In the case of reconfiguration, the target superblock will be accessible - via fc->root. - - (*) int security_sb_get_tree(struct fs_context *fc); - - Called during the mount procedure to verify that the specified superblock - is allowed to be mounted and to transfer the security data there. It - should return 0 or a negative error code. - - (*) void security_sb_reconfigure(struct fs_context *fc); - - Called to apply any reconfiguration to an LSM's context. It must not - fail. Error checking and resource allocation must be done in advance by - the parameter parsing and validation hooks. - - (*) int security_sb_mountpoint(struct fs_context *fc, struct path *mountpoint, - unsigned int mnt_flags); - - Called during the mount procedure to verify that the root dentry attached - to the context is permitted to be attached to the specified mountpoint. - It should return 0 on success or a negative error code on failure. - - -========================== -VFS FILESYSTEM CONTEXT API -========================== - -There are four operations for creating a filesystem context and one for -destroying a context: - - (*) struct fs_context *fs_context_for_mount( - struct file_system_type *fs_type, - unsigned int sb_flags); - - Allocate a filesystem context for the purpose of setting up a new mount, - whether that be with a new superblock or sharing an existing one. This - sets the superblock flags, initialises the security and calls - fs_type->init_fs_context() to initialise the filesystem private data. - - fs_type specifies the filesystem type that will manage the context and - sb_flags presets the superblock flags stored therein. 
- - (*) struct fs_context *fs_context_for_reconfigure( - struct dentry *dentry, - unsigned int sb_flags, - unsigned int sb_flags_mask); - - Allocate a filesystem context for the purpose of reconfiguring an - existing superblock. dentry provides a reference to the superblock to be - configured. sb_flags and sb_flags_mask indicate which superblock flags - need changing and to what. - - (*) struct fs_context *fs_context_for_submount( - struct file_system_type *fs_type, - struct dentry *reference); - - Allocate a filesystem context for the purpose of creating a new mount for - an automount point or other derived superblock. fs_type specifies the - filesystem type that will manage the context and the reference dentry - supplies the parameters. Namespaces are propagated from the reference - dentry's superblock also. - - Note that it's not a requirement that the reference dentry be of the same - filesystem type as fs_type. - - (*) struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc); - - Duplicate a filesystem context, copying any options noted and duplicating - or additionally referencing any resources held therein. This is available - for use where a filesystem has to get a mount within a mount, such as NFS4 - does by internally mounting the root of the target server and then doing a - private pathwalk to the target directory. - - The purpose in the new context is inherited from the old one. - - (*) void put_fs_context(struct fs_context *fc); - - Destroy a filesystem context, releasing any resources it holds. This - calls the ->free() operation. This is intended to be called by anyone who - created a filesystem context. - - [!] filesystem contexts are not refcounted, so this causes unconditional - destruction. - -In all the above operations, apart from the put op, the return is a mount -context pointer or a negative error code. - -For the remaining operations, if an error occurs, a negative error code will be -returned. 
- - (*) int vfs_parse_fs_param(struct fs_context *fc, - struct fs_parameter *param); - - Supply a single mount parameter to the filesystem context. This include - the specification of the source/device which is specified as the "source" - parameter (which may be specified multiple times if the filesystem - supports that). - - param specifies the parameter key name and the value. The parameter is - first checked to see if it corresponds to a standard mount flag (in which - case it is used to set an SB_xxx flag and consumed) or a security option - (in which case the LSM consumes it) before it is passed on to the - filesystem. - - The parameter value is typed and can be one of: - - fs_value_is_flag, Parameter not given a value. - fs_value_is_string, Value is a string - fs_value_is_blob, Value is a binary blob - fs_value_is_filename, Value is a filename* + dirfd - fs_value_is_file, Value is an open file (file*) - - If there is a value, that value is stored in a union in the struct in one - of param->{string,blob,name,file}. Note that the function may steal and - clear the pointer, but then becomes responsible for disposing of the - object. - - (*) int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size); - - A wrapper around vfs_parse_fs_param() that copies the value string it is - passed. - - (*) int generic_parse_monolithic(struct fs_context *fc, void *data); - - Parse a sys_mount() data page, assuming the form to be a text list - consisting of key[=val] options separated by commas. Each item in the - list is passed to vfs_mount_option(). This is the default when the - ->parse_monolithic() method is NULL. - - (*) int vfs_get_tree(struct fs_context *fc); - - Get or create the mountable root and superblock, using the parameters in - the filesystem context to select/configure the superblock. This invokes - the ->get_tree() method. 
- - (*) struct vfsmount *vfs_create_mount(struct fs_context *fc); - - Create a mount given the parameters in the specified filesystem context. - Note that this does not attach the mount to anything. - - -=========================== -SUPERBLOCK CREATION HELPERS -=========================== - -A number of VFS helpers are available for use by filesystems for the creation -or looking up of superblocks. - - (*) struct super_block * - sget_fc(struct fs_context *fc, - int (*test)(struct super_block *sb, struct fs_context *fc), - int (*set)(struct super_block *sb, struct fs_context *fc)); - - This is the core routine. If test is non-NULL, it searches for an - existing superblock matching the criteria held in the fs_context, using - the test function to match them. If no match is found, a new superblock - is created and the set function is called to set it up. - - Prior to the set function being called, fc->s_fs_info will be transferred - to sb->s_fs_info - and fc->s_fs_info will be cleared if set returns - success (ie. 0). - -The following helpers all wrap sget_fc(): - - (*) int vfs_get_super(struct fs_context *fc, - enum vfs_get_super_keying keying, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)) - - This creates/looks up a deviceless superblock. The keying indicates how - many superblocks of this type may exist and in what manner they may be - shared: - - (1) vfs_get_single_super - - Only one such superblock may exist in the system. Any further - attempt to get a new superblock gets this one (and any parameter - differences are ignored). - - (2) vfs_get_keyed_super - - Multiple superblocks of this type may exist and they're keyed on - their s_fs_info pointer (for example this may refer to a - namespace). - - (3) vfs_get_independent_super - - Multiple independent superblocks of this type may exist. This - function never matches an existing one and always creates a new - one. 
- - -===================== -PARAMETER DESCRIPTION -===================== - -Parameters are described using structures defined in linux/fs_parser.h. -There's a core description struct that links everything together: - - struct fs_parameter_description { - const struct fs_parameter_spec *specs; - const struct fs_parameter_enum *enums; - }; - -For example: - - enum { - Opt_autocell, - Opt_bar, - Opt_dyn, - Opt_foo, - Opt_source, - }; - - static const struct fs_parameter_description afs_fs_parameters = { - .specs = afs_param_specs, - .enums = afs_param_enums, - }; - -The members are as follows: - - (1) const struct fs_parameter_specification *specs; - - Table of parameter specifications, terminated with a null entry, where the - entries are of type: - - struct fs_parameter_spec { - const char *name; - u8 opt; - enum fs_parameter_type type:8; - unsigned short flags; - }; - - The 'name' field is a string to match exactly to the parameter key (no - wildcards, patterns and no case-independence) and 'opt' is the value that - will be returned by the fs_parser() function in the case of a successful - match. 
- - The 'type' field indicates the desired value type and must be one of: - - TYPE NAME EXPECTED VALUE RESULT IN - ======================= ======================= ===================== - fs_param_is_flag No value n/a - fs_param_is_bool Boolean value result->boolean - fs_param_is_u32 32-bit unsigned int result->uint_32 - fs_param_is_u32_octal 32-bit octal int result->uint_32 - fs_param_is_u32_hex 32-bit hex int result->uint_32 - fs_param_is_s32 32-bit signed int result->int_32 - fs_param_is_u64 64-bit unsigned int result->uint_64 - fs_param_is_enum Enum value name result->uint_32 - fs_param_is_string Arbitrary string param->string - fs_param_is_blob Binary blob param->blob - fs_param_is_blockdev Blockdev path * Needs lookup - fs_param_is_path Path * Needs lookup - fs_param_is_fd File descriptor result->int_32 - - Note that if the value is of fs_param_is_bool type, fs_parse() will try - to match any string value against "0", "1", "no", "yes", "false", "true". - - Each parameter can also be qualified with 'flags': - - fs_param_v_optional The value is optional - fs_param_neg_with_no result->negated set if key is prefixed with "no" - fs_param_neg_with_empty result->negated set if value is "" - fs_param_deprecated The parameter is deprecated. 
- - These are wrapped with a number of convenience wrappers: - - MACRO SPECIFIES - ======================= =============================================== - fsparam_flag() fs_param_is_flag - fsparam_flag_no() fs_param_is_flag, fs_param_neg_with_no - fsparam_bool() fs_param_is_bool - fsparam_u32() fs_param_is_u32 - fsparam_u32oct() fs_param_is_u32_octal - fsparam_u32hex() fs_param_is_u32_hex - fsparam_s32() fs_param_is_s32 - fsparam_u64() fs_param_is_u64 - fsparam_enum() fs_param_is_enum - fsparam_string() fs_param_is_string - fsparam_blob() fs_param_is_blob - fsparam_bdev() fs_param_is_blockdev - fsparam_path() fs_param_is_path - fsparam_fd() fs_param_is_fd - - all of which take two arguments, name string and option number - for - example: - - static const struct fs_parameter_spec afs_param_specs[] = { - fsparam_flag ("autocell", Opt_autocell), - fsparam_flag ("dyn", Opt_dyn), - fsparam_string ("source", Opt_source), - fsparam_flag_no ("foo", Opt_foo), - {} - }; - - An addition macro, __fsparam() is provided that takes an additional pair - of arguments to specify the type and the flags for anything that doesn't - match one of the above macros. - - (2) const struct fs_parameter_enum *enums; - - Table of enum value names to integer mappings, terminated with a null - entry. This is of type: - - struct fs_parameter_enum { - u8 opt; - char name[14]; - u8 value; - }; - - Where the array is an unsorted list of { parameter ID, name }-keyed - elements that indicate the value to map to, e.g.: - - static const struct fs_parameter_enum afs_param_enums[] = { - { Opt_bar, "x", 1}, - { Opt_bar, "y", 23}, - { Opt_bar, "z", 42}, - }; - - If a parameter of type fs_param_is_enum is encountered, fs_parse() will - try to look the value up in the enum table and the result will be stored - in the parse result. 
- -The parser should be pointed to by the parser pointer in the file_system_type -struct as this will provide validation on registration (if -CONFIG_VALIDATE_FS_PARSER=y) and will allow the description to be queried from -userspace using the fsinfo() syscall. - - -========================== -PARAMETER HELPER FUNCTIONS -========================== - -A number of helper functions are provided to help a filesystem or an LSM -process the parameters it is given. - - (*) int lookup_constant(const struct constant_table tbl[], - const char *name, int not_found); - - Look up a constant by name in a table of name -> integer mappings. The - table is an array of elements of the following type: - - struct constant_table { - const char *name; - int value; - }; - - If a match is found, the corresponding value is returned. If a match - isn't found, the not_found value is returned instead. - - (*) bool validate_constant_table(const struct constant_table *tbl, - size_t tbl_size, - int low, int high, int special); - - Validate a constant table. Checks that all the elements are appropriately - ordered, that there are no duplicates and that the values are between low - and high inclusive, though provision is made for one allowable special - value outside of that range. If no special value is required, special - should just be set to lie inside the low-to-high range. - - If all is good, true is returned. If the table is invalid, errors are - logged to dmesg and false is returned. - - (*) bool fs_validate_description(const struct fs_parameter_description *desc); - - This performs some validation checks on a parameter description. It - returns true if the description is good and false if it is not. It will - log errors to dmesg if validation fails. - - (*) int fs_parse(struct fs_context *fc, - const struct fs_parameter_description *desc, - struct fs_parameter *param, - struct fs_parse_result *result); - - This is the main interpreter of parameters. 
It uses the parameter - description to look up a parameter by key name and to convert that to an - option number (which it returns). - - If successful, and if the parameter type indicates the result is a - boolean, integer or enum type, the value is converted by this function and - the result stored in result->{boolean,int_32,uint_32,uint_64}. - - If a match isn't initially made, the key is prefixed with "no" and no - value is present then an attempt will be made to look up the key with the - prefix removed. If this matches a parameter for which the type has flag - fs_param_neg_with_no set, then a match will be made and result->negated - will be set to true. - - If the parameter isn't matched, -ENOPARAM will be returned; if the - parameter is matched, but the value is erroneous, -EINVAL will be - returned; otherwise the parameter's option number will be returned. - - (*) int fs_lookup_param(struct fs_context *fc, - struct fs_parameter *value, - bool want_bdev, - struct path *_path); - - This takes a parameter that carries a string or filename type and attempts - to do a path lookup on it. If the parameter expects a blockdev, a check - is made that the inode actually represents one. - - Returns 0 if successful and *_path will be set; returns a negative error - code if not. diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index e6c3e4c61dad..5f24fcbfbfb4 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -85,7 +85,7 @@ struct p_log { * Superblock creation fills in ->root whereas reconfiguration begins with this * already set. * - * See Documentation/filesystems/mount_api.txt + * See Documentation/filesystems/mount_api.rst */ struct fs_context { const struct fs_context_operations *ops; diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 988ca0df7824..44d5422c18e4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -77,7 +77,7 @@ * state. 
This is called immediately after commit_creds(). * * Security hooks for mount using fs_context. - * [See also Documentation/filesystems/mount_api.txt] + * [See also Documentation/filesystems/mount_api.rst] * * @fs_context_dup: * Allocate and attach a security structure to sc->security. This pointer -- cgit v1.2.3 From 9b6f151e768682206673de065eaca401e2baaf11 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:10 +0200 Subject: docs: filesystems: convert quota.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Acked-by: Jan Kara Link: https://lore.kernel.org/r/10a707377475bb252f454af2b8f58a038527933f.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/quota.rst | 85 +++++++++++++++++++++++++++++++++++++ Documentation/filesystems/quota.txt | 68 ----------------------------- MAINTAINERS | 2 +- 4 files changed, 87 insertions(+), 69 deletions(-) create mode 100644 Documentation/filesystems/quota.rst delete mode 100644 Documentation/filesystems/quota.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 4532b4d2631f..e2bbccf6d5bc 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -31,6 +31,7 @@ algorithms work. locks mandatory-locking mount_api + quota automount-support diff --git a/Documentation/filesystems/quota.rst b/Documentation/filesystems/quota.rst new file mode 100644 index 000000000000..a30cdd47c652 --- /dev/null +++ b/Documentation/filesystems/quota.rst @@ -0,0 +1,85 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +=============== +Quota subsystem +=============== + +Quota subsystem allows system administrator to set limits on used space and +number of used inodes (inode is a filesystem structure which is associated with +each file or directory) for users and/or groups. For both used space and number +of used inodes there are actually two limits. The first one is called softlimit +and the second one hardlimit. A user can never exceed a hardlimit for any +resource (unless he has CAP_SYS_RESOURCE capability). User is allowed to exceed +softlimit but only for limited period of time. This period is called "grace +period" or "grace time". When grace time is over, user is not able to allocate +more space/inodes until he frees enough of them to get below softlimit. + +Quota limits (and amount of grace time) are set independently for each +filesystem. + +For more details about quota design, see the documentation in quota-tools package +(http://sourceforge.net/projects/linuxquota). + +Quota netlink interface +======================= +When user exceeds a softlimit, runs out of grace time or reaches hardlimit, +quota subsystem traditionally printed a message to the controlling terminal of +the process which caused the excess. This method has the disadvantage that +when user is using a graphical desktop he usually cannot see the message. +Thus quota netlink interface has been designed to pass information about +the above events to userspace. There they can be captured by an application +and processed accordingly. + +The interface uses generic netlink framework (see +http://lwn.net/Articles/208755/ and http://people.suug.ch/~tgr/libnl/ for more +details about this layer). The name of the quota generic netlink interface +is "VFS_DQUOT". Definitions of constants below are in <uapi/linux/quota.h>. +Since the quota netlink protocol is not namespace aware, quota netlink messages +are sent only in initial network namespace. 
+ +Currently, the interface supports only one message type QUOTA_NL_C_WARNING. +This command is used to send a notification about any of the above mentioned +events. Each message has six attributes. These are (type of the argument is +in parentheses): + + QUOTA_NL_A_QTYPE (u32) + - type of quota being exceeded (one of USRQUOTA, GRPQUOTA) + QUOTA_NL_A_EXCESS_ID (u64) + - UID/GID (depends on quota type) of user / group whose limit + is being exceeded. + QUOTA_NL_A_CAUSED_ID (u64) + - UID of a user who caused the event + QUOTA_NL_A_WARNING (u32) + - what kind of limit is exceeded: + + QUOTA_NL_IHARDWARN + inode hardlimit + QUOTA_NL_ISOFTLONGWARN + inode softlimit is exceeded longer + than given grace period + QUOTA_NL_ISOFTWARN + inode softlimit + QUOTA_NL_BHARDWARN + space (block) hardlimit + QUOTA_NL_BSOFTLONGWARN + space (block) softlimit is exceeded + longer than given grace period. + QUOTA_NL_BSOFTWARN + space (block) softlimit + + - four warnings are also defined for the event when user stops + exceeding some limit: + + QUOTA_NL_IHARDBELOW + inode hardlimit + QUOTA_NL_ISOFTBELOW + inode softlimit + QUOTA_NL_BHARDBELOW + space (block) hardlimit + QUOTA_NL_BSOFTBELOW + space (block) softlimit + + QUOTA_NL_A_DEV_MAJOR (u32) + - major number of a device with the affected filesystem + QUOTA_NL_A_DEV_MINOR (u32) + - minor number of a device with the affected filesystem diff --git a/Documentation/filesystems/quota.txt b/Documentation/filesystems/quota.txt deleted file mode 100644 index 32874b06ebe9..000000000000 --- a/Documentation/filesystems/quota.txt +++ /dev/null @@ -1,68 +0,0 @@ - -Quota subsystem -=============== - -Quota subsystem allows system administrator to set limits on used space and -number of used inodes (inode is a filesystem structure which is associated with -each file or directory) for users and/or groups. For both used space and number -of used inodes there are actually two limits. The first one is called softlimit -and the second one hardlimit. 
A user can never exceed a hardlimit for any -resource (unless he has CAP_SYS_RESOURCE capability). User is allowed to exceed -softlimit but only for limited period of time. This period is called "grace -period" or "grace time". When grace time is over, user is not able to allocate -more space/inodes until he frees enough of them to get below softlimit. - -Quota limits (and amount of grace time) are set independently for each -filesystem. - -For more details about quota design, see the documentation in quota-tools package -(http://sourceforge.net/projects/linuxquota). - -Quota netlink interface -======================= -When user exceeds a softlimit, runs out of grace time or reaches hardlimit, -quota subsystem traditionally printed a message to the controlling terminal of -the process which caused the excess. This method has the disadvantage that -when user is using a graphical desktop he usually cannot see the message. -Thus quota netlink interface has been designed to pass information about -the above events to userspace. There they can be captured by an application -and processed accordingly. - -The interface uses generic netlink framework (see -http://lwn.net/Articles/208755/ and http://people.suug.ch/~tgr/libnl/ for more -details about this layer). The name of the quota generic netlink interface -is "VFS_DQUOT". Definitions of constants below are in <uapi/linux/quota.h>. -Since the quota netlink protocol is not namespace aware, quota netlink messages -are sent only in initial network namespace. - -Currently, the interface supports only one message type QUOTA_NL_C_WARNING. -This command is used to send a notification about any of the above mentioned -events. Each message has six attributes. These are (type of the argument is -in parentheses): - QUOTA_NL_A_QTYPE (u32) - - type of quota being exceeded (one of USRQUOTA, GRPQUOTA) - QUOTA_NL_A_EXCESS_ID (u64) - - UID/GID (depends on quota type) of user / group whose limit - is being exceeded. 
- QUOTA_NL_A_CAUSED_ID (u64) - - UID of a user who caused the event - QUOTA_NL_A_WARNING (u32) - - what kind of limit is exceeded: - QUOTA_NL_IHARDWARN - inode hardlimit - QUOTA_NL_ISOFTLONGWARN - inode softlimit is exceeded longer - than given grace period - QUOTA_NL_ISOFTWARN - inode softlimit - QUOTA_NL_BHARDWARN - space (block) hardlimit - QUOTA_NL_BSOFTLONGWARN - space (block) softlimit is exceeded - longer than given grace period. - QUOTA_NL_BSOFTWARN - space (block) softlimit - - four warnings are also defined for the event when user stops - exceeding some limit: - QUOTA_NL_IHARDBELOW - inode hardlimit - QUOTA_NL_ISOFTBELOW - inode softlimit - QUOTA_NL_BHARDBELOW - space (block) hardlimit - QUOTA_NL_BSOFTBELOW - space (block) softlimit - QUOTA_NL_A_DEV_MAJOR (u32) - - major number of a device with the affected filesystem - QUOTA_NL_A_DEV_MINOR (u32) - - minor number of a device with the affected filesystem diff --git a/MAINTAINERS b/MAINTAINERS index 46ba68a07a8f..1c3bd61ad03e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5010,7 +5010,7 @@ W: http://www.win.tue.nl/~aeb/partitions/partition_types-1.html DISKQUOTA M: Jan Kara S: Maintained -F: Documentation/filesystems/quota.txt +F: Documentation/filesystems/quota.rst F: fs/quota/ F: include/linux/quota*.h F: include/uapi/linux/quota*.h -- cgit v1.2.3 From 53a41d3eec863f12fead722f774fc36bd697f550 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:11 +0200 Subject: docs: filesystems: convert seq_file.txt to ReST - Add a SPDX header; - Add a document title; - Adjust section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/f950a0a56178ee05872ae2a2711a04d7af8ebb24.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/seq_file.rst | 372 +++++++++++++++++++++++++++++++++ Documentation/filesystems/seq_file.txt | 359 ------------------------------- 3 files changed, 373 insertions(+), 359 deletions(-) create mode 100644 Documentation/filesystems/seq_file.rst delete mode 100644 Documentation/filesystems/seq_file.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e2bbccf6d5bc..3c8c5b7908e0 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -32,6 +32,7 @@ algorithms work. mandatory-locking mount_api quota + seq_file automount-support diff --git a/Documentation/filesystems/seq_file.rst b/Documentation/filesystems/seq_file.rst new file mode 100644 index 000000000000..fab302046b13 --- /dev/null +++ b/Documentation/filesystems/seq_file.rst @@ -0,0 +1,372 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +The seq_file Interface +====================== + + Copyright 2003 Jonathan Corbet + + This file is originally from the LWN.net Driver Porting series at + http://lwn.net/Articles/driver-porting/ + + +There are numerous ways for a device driver (or other kernel component) to +provide information to the user or system administrator. One useful +technique is the creation of virtual files, in debugfs, /proc or elsewhere. +Virtual files can provide human-readable output that is easy to get at +without any special utility programs; they can also make life easier for +script writers. It is not surprising that the use of virtual files has +grown over the years. + +Creating those files correctly has always been a bit of a challenge, +however. It is not that hard to make a virtual file which returns a +string. 
But life gets trickier if the output is long - anything greater +than an application is likely to read in a single operation. Handling +multiple reads (and seeks) requires careful attention to the reader's +position within the virtual file - that position is, likely as not, in the +middle of a line of output. The kernel has traditionally had a number of +implementations that got this wrong. + +The 2.6 kernel contains a set of functions (implemented by Alexander Viro) +which are designed to make it easy for virtual file creators to get it +right. + +The seq_file interface is available via <linux/seq_file.h>. There are +three aspects to seq_file: + + * An iterator interface which lets a virtual file implementation + step through the objects it is presenting. + + * Some utility functions for formatting objects for output without + needing to worry about things like output buffers. + + * A set of canned file_operations which implement most operations on + the virtual file. + +We'll look at the seq_file interface via an extremely simple example: a +loadable module which creates a file called /proc/sequence. The file, when +read, simply produces a set of increasing integer values, one per line. The +sequence will continue until the user loses patience and finds something +better to do. The file is seekable, in that one can do something like the +following:: + + dd if=/proc/sequence of=out1 count=1 + dd if=/proc/sequence skip=1 of=out2 count=1 + +Then concatenate the output files out1 and out2 and get the right +result. Yes, it is a thoroughly useless module, but the point is to show +how the mechanism works without getting lost in other details. (Those +wanting to see the full source for this module can find it at +http://lwn.net/Articles/22359/). + +Deprecated create_proc_entry +============================ + +Note that the above article uses create_proc_entry which was removed in +kernel 3.10. 
Current versions require the following update:: + + - entry = create_proc_entry("sequence", 0, NULL); + - if (entry) + - entry->proc_fops = &ct_file_ops; + + entry = proc_create("sequence", 0, NULL, &ct_file_ops); + +The iterator interface +====================== + +Modules implementing a virtual file with seq_file must implement an +iterator object that allows stepping through the data of interest +during a "session" (roughly one read() system call). If the iterator +is able to move to a specific position - like the file they implement, +though with freedom to map the position number to a sequence location +in whatever way is convenient - the iterator need only exist +transiently during a session. If the iterator cannot easily find a +numerical position but works well with a first/next interface, the +iterator can be stored in the private data area and continue from one +session to the next. + +A seq_file implementation that is formatting firewall rules from a +table, for example, could provide a simple iterator that interprets +position N as the Nth rule in the chain. A seq_file implementation +that presents the content of a, potentially volatile, linked list +might record a pointer into that list, providing that can be done +without risk of the current location being removed. + +Positioning can thus be done in whatever way makes the most sense for +the generator of the data, which need not be aware of how a position +translates to an offset in the virtual file. The one obvious exception +is that a position of zero should indicate the beginning of the file. + +The /proc/sequence iterator just uses the count of the next number it +will output as its position. + +Four functions must be implemented to make the iterator work. The +first, called start(), starts a session and takes a position as an +argument, returning an iterator which will start reading at that +position. 
The pos passed to start() will always be either zero, or +the most recent pos used in the previous session. + +For our simple sequence example, +the start() function looks like:: + + static void *ct_seq_start(struct seq_file *s, loff_t *pos) + { + loff_t *spos = kmalloc(sizeof(loff_t), GFP_KERNEL); + if (! spos) + return NULL; + *spos = *pos; + return spos; + } + +The entire data structure for this iterator is a single loff_t value +holding the current position. There is no upper bound for the sequence +iterator, but that will not be the case for most other seq_file +implementations; in most cases the start() function should check for a +"past end of file" condition and return NULL if need be. + +For more complicated applications, the private field of the seq_file +structure can be used to hold state from session to session. There is +also a special value which can be returned by the start() function +called SEQ_START_TOKEN; it can be used if you wish to instruct your +show() function (described below) to print a header at the top of the +output. SEQ_START_TOKEN should only be used if the offset is zero, +however. + +The next function to implement is called, amazingly, next(); its job is to +move the iterator forward to the next position in the sequence. The +example module can simply increment the position by one; more useful +modules will do what is needed to step through some data structure. The +next() function returns a new iterator, or NULL if the sequence is +complete. Here's the example version:: + + static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) + { + loff_t *spos = v; + *pos = ++*spos; + return spos; + } + +The stop() function closes a session; its job, of course, is to clean +up. If dynamic memory is allocated for the iterator, stop() is the +place to free it; if a lock was taken by start(), stop() must release +that lock. 
The value that ``*pos`` was set to by the last next() call +before stop() is remembered, and used for the first start() call of +the next session unless lseek() has been called on the file; in that +case next start() will be asked to start at position zero:: + + static void ct_seq_stop(struct seq_file *s, void *v) + { + kfree(v); + } + +Finally, the show() function should format the object currently pointed to +by the iterator for output. The example module's show() function is:: + + static int ct_seq_show(struct seq_file *s, void *v) + { + loff_t *spos = v; + seq_printf(s, "%lld\n", (long long)*spos); + return 0; + } + +If all is well, the show() function should return zero. A negative error +code in the usual manner indicates that something went wrong; it will be +passed back to user space. This function can also return SEQ_SKIP, which +causes the current item to be skipped; if the show() function has already +generated output before returning SEQ_SKIP, that output will be dropped. + +We will look at seq_printf() in a moment. But first, the definition of the +seq_file iterator is finished by creating a seq_operations structure with +the four functions we have just defined:: + + static const struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show + }; + +This structure will be needed to tie our iterator to the /proc file in +a little bit. + +It's worth noting that the iterator value returned by start() and +manipulated by the other functions is considered to be completely opaque by +the seq_file code. It can thus be anything that is useful in stepping +through the data to be output. Counters can be useful, but it could also be +a direct pointer into an array or linked list. Anything goes, as long as +the programmer is aware that things can happen between calls to the +iterator function. 
However, the seq_file code (by design) will not sleep +between the calls to start() and stop(), so holding a lock during that time +is a reasonable thing to do. The seq_file code will also avoid taking any +other locks while the iterator is active. + + +Formatted output +================ + +The seq_file code manages positioning within the output created by the +iterator and getting it into the user's buffer. But, for that to work, that +output must be passed to the seq_file code. Some utility functions have +been defined which make this task easy. + +Most code will simply use seq_printf(), which works pretty much like +printk(), but which requires the seq_file pointer as an argument. + +For straight character output, the following functions may be used:: + + seq_putc(struct seq_file *m, char c); + seq_puts(struct seq_file *m, const char *s); + seq_escape(struct seq_file *m, const char *s, const char *esc); + +The first two output a single character and a string, just like one would +expect. seq_escape() is like seq_puts(), except that any character in s +which is in the string esc will be represented in octal form in the output. + +There are also a pair of functions for printing filenames:: + + int seq_path(struct seq_file *m, const struct path *path, + const char *esc); + int seq_path_root(struct seq_file *m, const struct path *path, + const struct path *root, const char *esc) + +Here, path indicates the file of interest, and esc is a set of characters +which should be escaped in the output. A call to seq_path() will output +the path relative to the current process's filesystem root. If a different +root is desired, it can be used with seq_path_root(). If it turns out that +path cannot be reached from root, seq_path_root() returns SEQ_SKIP. + +A function producing complicated output may want to check:: + + bool seq_has_overflowed(struct seq_file *m); + +and avoid further seq_ calls if true is returned. 
+ +A true return from seq_has_overflowed means that the seq_file buffer will +be discarded and the seq_show function will attempt to allocate a larger +buffer and retry printing. + + +Making it all work +================== + +So far, we have a nice set of functions which can produce output within the +seq_file system, but we have not yet turned them into a file that a user +can see. Creating a file within the kernel requires, of course, the +creation of a set of file_operations which implement the operations on that +file. The seq_file interface provides a set of canned operations which do +most of the work. The virtual file author still must implement the open() +method, however, to hook everything up. The open function is often a single +line, as in the example module:: + + static int ct_open(struct inode *inode, struct file *file) + { + return seq_open(file, &ct_seq_ops); + } + +Here, the call to seq_open() takes the seq_operations structure we created +before, and gets set up to iterate through the virtual file. + +On a successful open, seq_open() stores the struct seq_file pointer in +file->private_data. If you have an application where the same iterator can +be used for more than one file, you can store an arbitrary pointer in the +private field of the seq_file structure; that value can then be retrieved +by the iterator functions. + +There is also a wrapper function to seq_open() called seq_open_private(). It +kmallocs a zero filled block of memory and stores a pointer to it in the +private field of the seq_file structure, returning 0 on success. 
The +block size is specified in a third parameter to the function, e.g.:: + + static int ct_open(struct inode *inode, struct file *file) + { + return seq_open_private(file, &ct_seq_ops, + sizeof(struct mystruct)); + } + +There is also a variant function, __seq_open_private(), which is functionally +identical except that, if successful, it returns the pointer to the allocated +memory block, allowing further initialisation e.g.:: + + static int ct_open(struct inode *inode, struct file *file) + { + struct mystruct *p = + __seq_open_private(file, &ct_seq_ops, sizeof(*p)); + + if (!p) + return -ENOMEM; + + p->foo = bar; /* initialize my stuff */ + ... + p->baz = true; + + return 0; + } + +A corresponding close function, seq_release_private() is available which +frees the memory allocated in the corresponding open. + +The other operations of interest - read(), llseek(), and release() - are +all implemented by the seq_file code itself. So a virtual file's +file_operations structure will look like:: + + static const struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release + }; + +There is also a seq_release_private() which passes the contents of the +seq_file private field to kfree() before releasing the structure. + +The final step is the creation of the /proc file itself. In the example +code, that is done in the initialization code in the usual way:: + + static int ct_init(void) + { + struct proc_dir_entry *entry; + + proc_create("sequence", 0, NULL, &ct_file_ops); + return 0; + } + + module_init(ct_init); + +And that is pretty much it. 
+ + +seq_list +======== + +If your file will be iterating through a linked list, you may find these +routines useful:: + + struct list_head *seq_list_start(struct list_head *head, + loff_t pos); + struct list_head *seq_list_start_head(struct list_head *head, + loff_t pos); + struct list_head *seq_list_next(void *v, struct list_head *head, + loff_t *ppos); + +These helpers will interpret pos as a position within the list and iterate +accordingly. Your start() and next() functions need only invoke the +``seq_list_*`` helpers with a pointer to the appropriate list_head structure. + + +The extra-simple version +======================== + +For extremely simple virtual files, there is an even easier interface. A +module can define only the show() function, which should create all the +output that the virtual file will contain. The file's open() method then +calls:: + + int single_open(struct file *file, + int (*show)(struct seq_file *m, void *p), + void *data); + +When output time comes, the show() function will be called once. The data +value given to single_open() can be found in the private field of the +seq_file structure. When using single_open(), the programmer should use +single_release() instead of seq_release() in the file_operations structure +to avoid a memory leak. diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt deleted file mode 100644 index d412b236a9d6..000000000000 --- a/Documentation/filesystems/seq_file.txt +++ /dev/null @@ -1,359 +0,0 @@ -The seq_file interface - - Copyright 2003 Jonathan Corbet - This file is originally from the LWN.net Driver Porting series at - http://lwn.net/Articles/driver-porting/ - - -There are numerous ways for a device driver (or other kernel component) to -provide information to the user or system administrator. One useful -technique is the creation of virtual files, in debugfs, /proc or elsewhere. 
-Virtual files can provide human-readable output that is easy to get at -without any special utility programs; they can also make life easier for -script writers. It is not surprising that the use of virtual files has -grown over the years. - -Creating those files correctly has always been a bit of a challenge, -however. It is not that hard to make a virtual file which returns a -string. But life gets trickier if the output is long - anything greater -than an application is likely to read in a single operation. Handling -multiple reads (and seeks) requires careful attention to the reader's -position within the virtual file - that position is, likely as not, in the -middle of a line of output. The kernel has traditionally had a number of -implementations that got this wrong. - -The 2.6 kernel contains a set of functions (implemented by Alexander Viro) -which are designed to make it easy for virtual file creators to get it -right. - -The seq_file interface is available via . There are -three aspects to seq_file: - - * An iterator interface which lets a virtual file implementation - step through the objects it is presenting. - - * Some utility functions for formatting objects for output without - needing to worry about things like output buffers. - - * A set of canned file_operations which implement most operations on - the virtual file. - -We'll look at the seq_file interface via an extremely simple example: a -loadable module which creates a file called /proc/sequence. The file, when -read, simply produces a set of increasing integer values, one per line. The -sequence will continue until the user loses patience and finds something -better to do. The file is seekable, in that one can do something like the -following: - - dd if=/proc/sequence of=out1 count=1 - dd if=/proc/sequence skip=1 of=out2 count=1 - -Then concatenate the output files out1 and out2 and get the right -result. 
Yes, it is a thoroughly useless module, but the point is to show -how the mechanism works without getting lost in other details. (Those -wanting to see the full source for this module can find it at -http://lwn.net/Articles/22359/). - -Deprecated create_proc_entry - -Note that the above article uses create_proc_entry which was removed in -kernel 3.10. Current versions require the following update - -- entry = create_proc_entry("sequence", 0, NULL); -- if (entry) -- entry->proc_fops = &ct_file_ops; -+ entry = proc_create("sequence", 0, NULL, &ct_file_ops); - -The iterator interface - -Modules implementing a virtual file with seq_file must implement an -iterator object that allows stepping through the data of interest -during a "session" (roughly one read() system call). If the iterator -is able to move to a specific position - like the file they implement, -though with freedom to map the position number to a sequence location -in whatever way is convenient - the iterator need only exist -transiently during a session. If the iterator cannot easily find a -numerical position but works well with a first/next interface, the -iterator can be stored in the private data area and continue from one -session to the next. - -A seq_file implementation that is formatting firewall rules from a -table, for example, could provide a simple iterator that interprets -position N as the Nth rule in the chain. A seq_file implementation -that presents the content of a, potentially volatile, linked list -might record a pointer into that list, providing that can be done -without risk of the current location being removed. - -Positioning can thus be done in whatever way makes the most sense for -the generator of the data, which need not be aware of how a position -translates to an offset in the virtual file. The one obvious exception -is that a position of zero should indicate the beginning of the file. 
- -The /proc/sequence iterator just uses the count of the next number it -will output as its position. - -Four functions must be implemented to make the iterator work. The -first, called start(), starts a session and takes a position as an -argument, returning an iterator which will start reading at that -position. The pos passed to start() will always be either zero, or -the most recent pos used in the previous session. - -For our simple sequence example, -the start() function looks like: - - static void *ct_seq_start(struct seq_file *s, loff_t *pos) - { - loff_t *spos = kmalloc(sizeof(loff_t), GFP_KERNEL); - if (! spos) - return NULL; - *spos = *pos; - return spos; - } - -The entire data structure for this iterator is a single loff_t value -holding the current position. There is no upper bound for the sequence -iterator, but that will not be the case for most other seq_file -implementations; in most cases the start() function should check for a -"past end of file" condition and return NULL if need be. - -For more complicated applications, the private field of the seq_file -structure can be used to hold state from session to session. There is -also a special value which can be returned by the start() function -called SEQ_START_TOKEN; it can be used if you wish to instruct your -show() function (described below) to print a header at the top of the -output. SEQ_START_TOKEN should only be used if the offset is zero, -however. - -The next function to implement is called, amazingly, next(); its job is to -move the iterator forward to the next position in the sequence. The -example module can simply increment the position by one; more useful -modules will do what is needed to step through some data structure. The -next() function returns a new iterator, or NULL if the sequence is -complete. 
Here's the example version: - - static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) - { - loff_t *spos = v; - *pos = ++*spos; - return spos; - } - -The stop() function closes a session; its job, of course, is to clean -up. If dynamic memory is allocated for the iterator, stop() is the -place to free it; if a lock was taken by start(), stop() must release -that lock. The value that *pos was set to by the last next() call -before stop() is remembered, and used for the first start() call of -the next session unless lseek() has been called on the file; in that -case next start() will be asked to start at position zero. - - static void ct_seq_stop(struct seq_file *s, void *v) - { - kfree(v); - } - -Finally, the show() function should format the object currently pointed to -by the iterator for output. The example module's show() function is: - - static int ct_seq_show(struct seq_file *s, void *v) - { - loff_t *spos = v; - seq_printf(s, "%lld\n", (long long)*spos); - return 0; - } - -If all is well, the show() function should return zero. A negative error -code in the usual manner indicates that something went wrong; it will be -passed back to user space. This function can also return SEQ_SKIP, which -causes the current item to be skipped; if the show() function has already -generated output before returning SEQ_SKIP, that output will be dropped. - -We will look at seq_printf() in a moment. But first, the definition of the -seq_file iterator is finished by creating a seq_operations structure with -the four functions we have just defined: - - static const struct seq_operations ct_seq_ops = { - .start = ct_seq_start, - .next = ct_seq_next, - .stop = ct_seq_stop, - .show = ct_seq_show - }; - -This structure will be needed to tie our iterator to the /proc file in -a little bit. - -It's worth noting that the iterator value returned by start() and -manipulated by the other functions is considered to be completely opaque by -the seq_file code. 
It can thus be anything that is useful in stepping -through the data to be output. Counters can be useful, but it could also be -a direct pointer into an array or linked list. Anything goes, as long as -the programmer is aware that things can happen between calls to the -iterator function. However, the seq_file code (by design) will not sleep -between the calls to start() and stop(), so holding a lock during that time -is a reasonable thing to do. The seq_file code will also avoid taking any -other locks while the iterator is active. - - -Formatted output - -The seq_file code manages positioning within the output created by the -iterator and getting it into the user's buffer. But, for that to work, that -output must be passed to the seq_file code. Some utility functions have -been defined which make this task easy. - -Most code will simply use seq_printf(), which works pretty much like -printk(), but which requires the seq_file pointer as an argument. - -For straight character output, the following functions may be used: - - seq_putc(struct seq_file *m, char c); - seq_puts(struct seq_file *m, const char *s); - seq_escape(struct seq_file *m, const char *s, const char *esc); - -The first two output a single character and a string, just like one would -expect. seq_escape() is like seq_puts(), except that any character in s -which is in the string esc will be represented in octal form in the output. - -There are also a pair of functions for printing filenames: - - int seq_path(struct seq_file *m, const struct path *path, - const char *esc); - int seq_path_root(struct seq_file *m, const struct path *path, - const struct path *root, const char *esc) - -Here, path indicates the file of interest, and esc is a set of characters -which should be escaped in the output. A call to seq_path() will output -the path relative to the current process's filesystem root. If a different -root is desired, it can be used with seq_path_root(). 
If it turns out that -path cannot be reached from root, seq_path_root() returns SEQ_SKIP. - -A function producing complicated output may want to check - bool seq_has_overflowed(struct seq_file *m); -and avoid further seq_ calls if true is returned. - -A true return from seq_has_overflowed means that the seq_file buffer will -be discarded and the seq_show function will attempt to allocate a larger -buffer and retry printing. - - -Making it all work - -So far, we have a nice set of functions which can produce output within the -seq_file system, but we have not yet turned them into a file that a user -can see. Creating a file within the kernel requires, of course, the -creation of a set of file_operations which implement the operations on that -file. The seq_file interface provides a set of canned operations which do -most of the work. The virtual file author still must implement the open() -method, however, to hook everything up. The open function is often a single -line, as in the example module: - - static int ct_open(struct inode *inode, struct file *file) - { - return seq_open(file, &ct_seq_ops); - } - -Here, the call to seq_open() takes the seq_operations structure we created -before, and gets set up to iterate through the virtual file. - -On a successful open, seq_open() stores the struct seq_file pointer in -file->private_data. If you have an application where the same iterator can -be used for more than one file, you can store an arbitrary pointer in the -private field of the seq_file structure; that value can then be retrieved -by the iterator functions. - -There is also a wrapper function to seq_open() called seq_open_private(). It -kmallocs a zero filled block of memory and stores a pointer to it in the -private field of the seq_file structure, returning 0 on success. 
The -block size is specified in a third parameter to the function, e.g.: - - static int ct_open(struct inode *inode, struct file *file) - { - return seq_open_private(file, &ct_seq_ops, - sizeof(struct mystruct)); - } - -There is also a variant function, __seq_open_private(), which is functionally -identical except that, if successful, it returns the pointer to the allocated -memory block, allowing further initialisation e.g.: - - static int ct_open(struct inode *inode, struct file *file) - { - struct mystruct *p = - __seq_open_private(file, &ct_seq_ops, sizeof(*p)); - - if (!p) - return -ENOMEM; - - p->foo = bar; /* initialize my stuff */ - ... - p->baz = true; - - return 0; - } - -A corresponding close function, seq_release_private() is available which -frees the memory allocated in the corresponding open. - -The other operations of interest - read(), llseek(), and release() - are -all implemented by the seq_file code itself. So a virtual file's -file_operations structure will look like: - - static const struct file_operations ct_file_ops = { - .owner = THIS_MODULE, - .open = ct_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release - }; - -There is also a seq_release_private() which passes the contents of the -seq_file private field to kfree() before releasing the structure. - -The final step is the creation of the /proc file itself. In the example -code, that is done in the initialization code in the usual way: - - static int ct_init(void) - { - struct proc_dir_entry *entry; - - proc_create("sequence", 0, NULL, &ct_file_ops); - return 0; - } - - module_init(ct_init); - -And that is pretty much it. 
- - -seq_list - -If your file will be iterating through a linked list, you may find these -routines useful: - - struct list_head *seq_list_start(struct list_head *head, - loff_t pos); - struct list_head *seq_list_start_head(struct list_head *head, - loff_t pos); - struct list_head *seq_list_next(void *v, struct list_head *head, - loff_t *ppos); - -These helpers will interpret pos as a position within the list and iterate -accordingly. Your start() and next() functions need only invoke the -seq_list_* helpers with a pointer to the appropriate list_head structure. - - -The extra-simple version - -For extremely simple virtual files, there is an even easier interface. A -module can define only the show() function, which should create all the -output that the virtual file will contain. The file's open() method then -calls: - - int single_open(struct file *file, - int (*show)(struct seq_file *m, void *p), - void *data); - -When output time comes, the show() function will be called once. The data -value given to single_open() can be found in the private field of the -seq_file structure. When using single_open(), the programmer should use -single_release() instead of seq_release() in the file_operations structure -to avoid a memory leak. 
-- cgit v1.2.3 From cf06612c65e5dc86f42cdad0b9fc24a90fe18a07 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:12 +0200 Subject: docs: filesystems: convert sharedsubtree.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add table markups; - Add it to filesystems/index.rst Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6692b8abc177130e9e53aace94117a2ad076cab5.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/proc.rst | 2 +- Documentation/filesystems/sharedsubtree.rst | 995 ++++++++++++++++++++++++++++ Documentation/filesystems/sharedsubtree.txt | 939 -------------------------- 4 files changed, 997 insertions(+), 940 deletions(-) create mode 100644 Documentation/filesystems/sharedsubtree.rst delete mode 100644 Documentation/filesystems/sharedsubtree.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 3c8c5b7908e0..c658ddee40fa 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -33,6 +33,7 @@ algorithms work. mount_api quota seq_file + sharedsubtree automount-support diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 38b606991065..a567bcccbb02 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1870,7 +1870,7 @@ unbindable mount is unbindable For more information on mount propagation see: - Documentation/filesystems/sharedsubtree.txt + Documentation/filesystems/sharedsubtree.rst 3.6 /proc//comm & /proc//task//comm diff --git a/Documentation/filesystems/sharedsubtree.rst b/Documentation/filesystems/sharedsubtree.rst new file mode 100644 index 000000000000..d83395354250 --- /dev/null +++ b/Documentation/filesystems/sharedsubtree.rst @@ -0,0 +1,995 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +=============== +Shared Subtrees +=============== + +.. Contents: + 1) Overview + 2) Features + 3) Setting mount states + 4) Use-case + 5) Detailed semantics + 6) Quiz + 7) FAQ + 8) Implementation + + +1) Overview +----------- + +Consider the following situation: + +A process wants to clone its own namespace, but still wants to access the CD +that got mounted recently. Shared subtree semantics provide the necessary +mechanism to accomplish the above. + +It provides the necessary building blocks for features like per-user-namespace +and versioned filesystem. + +2) Features +----------- + +Shared subtree provides four different flavors of mounts; struct vfsmount to be +precise + + a. shared mount + b. slave mount + c. private mount + d. unbindable mount + + +2a) A shared mount can be replicated to as many mountpoints and all the +replicas continue to be exactly same. + + Here is an example: + + Let's say /mnt has a mount that is shared:: + + mount --make-shared /mnt + + Note: mount(8) command now supports the --make-shared flag, + so the sample 'smount' program is no longer needed and has been + removed. + + :: + + # mount --bind /mnt /tmp + + The above command replicates the mount at /mnt to the mountpoint /tmp + and the contents of both the mounts remain identical. + + :: + + #ls /mnt + a b c + + #ls /tmp + a b c + + Now let's say we mount a device at /tmp/a:: + + # mount /dev/sd0 /tmp/a + + #ls /tmp/a + t1 t2 t3 + + #ls /mnt/a + t1 t2 t3 + + Note that the mount has propagated to the mount at /mnt as well. + + And the same is true even when /dev/sd0 is mounted on /mnt/a. The + contents will be visible under /tmp/a too. + + +2b) A slave mount is like a shared mount except that mount and umount events + only propagate towards it. + + All slave mounts have a master mount which is a shared. + + Here is an example: + + Let's say /mnt has a mount which is shared. 
+ # mount --make-shared /mnt + + Let's bind mount /mnt to /tmp + # mount --bind /mnt /tmp + + the new mount at /tmp becomes a shared mount and it is a replica of + the mount at /mnt. + + Now let's make the mount at /tmp; a slave of /mnt + # mount --make-slave /tmp + + let's mount /dev/sd0 on /mnt/a + # mount /dev/sd0 /mnt/a + + #ls /mnt/a + t1 t2 t3 + + #ls /tmp/a + t1 t2 t3 + + Note the mount event has propagated to the mount at /tmp + + However let's see what happens if we mount something on the mount at /tmp + + # mount /dev/sd1 /tmp/b + + #ls /tmp/b + s1 s2 s3 + + #ls /mnt/b + + Note how the mount event has not propagated to the mount at + /mnt + + +2c) A private mount does not forward or receive propagation. + + This is the mount we are familiar with. It's the default type. + + +2d) An unbindable mount is an unbindable private mount + + let's say we have a mount at /mnt and we make it unbindable:: + + # mount --make-unbindable /mnt + + Let's try to bind mount this mount somewhere else:: + + # mount --bind /mnt /tmp + mount: wrong fs type, bad option, bad superblock on /mnt, + or too many mounted file systems + + Binding an unbindable mount is an invalid operation. + + +3) Setting mount states + + The mount command (util-linux package) can be used to set mount + states:: + + mount --make-shared mountpoint + mount --make-slave mountpoint + mount --make-private mountpoint + mount --make-unbindable mountpoint + + +4) Use cases +------------ + + A) A process wants to clone its own namespace, but still wants to + access the CD that got mounted recently. + + Solution: + + The system administrator can make the mount at /cdrom shared:: + + mount --bind /cdrom /cdrom + mount --make-shared /cdrom + + Now any process that clones off a new namespace will have a + mount at /cdrom which is a replica of the same mount in the + parent namespace. 
+ + So when a CD is inserted and mounted at /cdrom that mount gets + propagated to the other mount at /cdrom in all the other clone + namespaces. + + B) A process wants its mounts invisible to any other process, but + still be able to see the other system mounts. + + Solution: + + To begin with, the administrator can mark the entire mount tree + as shareable:: + + mount --make-rshared / + + A new process can clone off a new namespace. And mark some part + of its namespace as slave:: + + mount --make-rslave /myprivatetree + + Henceforth any mounts within the /myprivatetree done by the + process will not show up in any other namespace. However mounts + done in the parent namespace under /myprivatetree still show + up in the process's namespace. + + + Apart from the above semantics this feature provides the + building blocks to solve the following problems: + + C) Per-user namespace + + The above semantics allows a way to share mounts across + namespaces. But namespaces are associated with processes. If + namespaces are made first class objects with user API to + associate/disassociate a namespace with userid, then each user + could have his/her own namespace and tailor it to his/her + requirements. This needs to be supported in PAM. + + D) Versioned files + + If the entire mount tree is visible at multiple locations, then + an underlying versioning file system can return different + versions of the file depending on the path used to access that + file. + + An example is:: + + mount --make-shared / + mount --rbind / /view/v1 + mount --rbind / /view/v2 + mount --rbind / /view/v3 + mount --rbind / /view/v4 + + and if /usr has a versioning filesystem mounted, then that + mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and + /view/v4/usr too + + A user can request v3 version of the file /usr/fs/namespace.c + by accessing /view/v3/usr/fs/namespace.c . 
The underlying + versioning filesystem can then decipher that v3 version of the + filesystem is being requested and return the corresponding + inode. + +5) Detailed semantics +--------------------- + The section below explains the detailed semantics of + bind, rbind, move, mount, umount and clone-namespace operations. + + Note: the word 'vfsmount' and the noun 'mount' have been used + to mean the same thing, throughout this document. + +5a) Mount states + + A given mount can be in one of the following states + + 1) shared + 2) slave + 3) shared and slave + 4) private + 5) unbindable + + A 'propagation event' is defined as event generated on a vfsmount + that leads to mount or unmount actions in other vfsmounts. + + A 'peer group' is defined as a group of vfsmounts that propagate + events to each other. + + (1) Shared mounts + + A 'shared mount' is defined as a vfsmount that belongs to a + 'peer group'. + + For example:: + + mount --make-shared /mnt + mount --bind /mnt /tmp + + The mount at /mnt and that at /tmp are both shared and belong + to the same peer group. Anything mounted or unmounted under + /mnt or /tmp reflect in all the other mounts of its peer + group. + + + (2) Slave mounts + + A 'slave mount' is defined as a vfsmount that receives + propagation events and does not forward propagation events. + + A slave mount as the name implies has a master mount from which + mount/unmount events are received. Events do not propagate from + the slave mount to the master. Only a shared mount can be made + a slave by executing the following command:: + + mount --make-slave mount + + A shared mount that is made as a slave is no more shared unless + modified to become shared. + + (3) Shared and Slave + + A vfsmount can be both shared as well as slave. This state + indicates that the mount is a slave of some vfsmount, and + has its own peer group too. 
This vfsmount receives propagation + events from its master vfsmount, and also forwards propagation + events to its 'peer group' and to its slave vfsmounts. + + Strictly speaking, the vfsmount is shared having its own + peer group, and this peer-group is a slave of some other + peer group. + + Only a slave vfsmount can be made as 'shared and slave' by + either executing the following command:: + + mount --make-shared mount + + or by moving the slave vfsmount under a shared vfsmount. + + (4) Private mount + + A 'private mount' is defined as vfsmount that does not + receive or forward any propagation events. + + (5) Unbindable mount + + A 'unbindable mount' is defined as vfsmount that does not + receive or forward any propagation events and cannot + be bind mounted. + + + State diagram: + + The state diagram below explains the state transition of a mount, + in response to various commands:: + + ----------------------------------------------------------------------- + | |make-shared | make-slave | make-private |make-unbindab| + --------------|------------|--------------|--------------|-------------| + |shared |shared |*slave/private| private | unbindable | + | | | | | | + |-------------|------------|--------------|--------------|-------------| + |slave |shared | **slave | private | unbindable | + | |and slave | | | | + |-------------|------------|--------------|--------------|-------------| + |shared |shared | slave | private | unbindable | + |and slave |and slave | | | | + |-------------|------------|--------------|--------------|-------------| + |private |shared | **private | private | unbindable | + |-------------|------------|--------------|--------------|-------------| + |unbindable |shared |**unbindable | private | unbindable | + ------------------------------------------------------------------------ + + * if the shared mount is the only mount in its peer group, making it + slave, makes it private automatically. 
Note that there is no master to + which it can be slaved to. + + ** slaving a non-shared mount has no effect on the mount. + + Apart from the commands listed below, the 'move' operation also changes + the state of a mount depending on type of the destination mount. Its + explained in section 5d. + +5b) Bind semantics + + Consider the following command:: + + mount --bind A/a B/b + + where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B' + is the destination mount and 'b' is the dentry in the destination mount. + + The outcome depends on the type of mount of 'A' and 'B'. The table + below contains quick reference:: + + -------------------------------------------------------------------------- + | BIND MOUNT OPERATION | + |************************************************************************| + |source(A)->| shared | private | slave | unbindable | + | dest(B) | | | | | + | | | | | | | + | v | | | | | + |************************************************************************| + | shared | shared | shared | shared & slave | invalid | + | | | | | | + |non-shared| shared | private | slave | invalid | + ************************************************************************** + + Details: + + 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C' + which is clone of 'A', is created. Its root dentry is 'a' . 'C' is + mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... + are created and mounted at the dentry 'b' on all mounts where 'B' + propagates to. A new propagation tree containing 'C1',..,'Cn' is + created. This propagation tree is identical to the propagation tree of + 'B'. And finally the peer-group of 'C' is merged with the peer group + of 'A'. + + 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C' + which is clone of 'A', is created. Its root dentry is 'a'. 'C' is + mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... 
+ are created and mounted at the dentry 'b' on all mounts where 'B' + propagates to. A new propagation tree is set containing all new mounts + 'C', 'C1', .., 'Cn' with exactly the same configuration as the + propagation tree for 'B'. + + 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new + mount 'C' which is clone of 'A', is created. Its root dentry is 'a' . + 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', + 'C3' ... are created and mounted at the dentry 'b' on all mounts where + 'B' propagates to. A new propagation tree containing the new mounts + 'C','C1',.. 'Cn' is created. This propagation tree is identical to the + propagation tree for 'B'. And finally the mount 'C' and its peer group + is made the slave of mount 'Z'. In other words, mount 'C' is in the + state 'slave and shared'. + + 4. 'A' is a unbindable mount and 'B' is a shared mount. This is a + invalid operation. + + 5. 'A' is a private mount and 'B' is a non-shared(private or slave or + unbindable) mount. A new mount 'C' which is clone of 'A', is created. + Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. + + 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C' + which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is + mounted on mount 'B' at dentry 'b'. 'C' is made a member of the + peer-group of 'A'. + + 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A + new mount 'C' which is a clone of 'A' is created. Its root dentry is + 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a + slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of + 'Z'. All mount/unmount events on 'Z' propagates to 'A' and 'C'. But + mount/unmount on 'A' do not propagate anywhere else. Similarly + mount/unmount on 'C' do not propagate anywhere else. + + 8. 'A' is a unbindable mount and 'B' is a non-shared mount. This is a + invalid operation. A unbindable mount cannot be bind mounted. 
+ +5c) Rbind semantics + + rbind is same as bind. Bind replicates the specified mount. Rbind + replicates all the mounts in the tree belonging to the specified mount. + Rbind mount is bind mount applied to all the mounts in the tree. + + If the source tree that is rbind has some unbindable mounts, + then the subtree under the unbindable mount is pruned in the new + location. + + eg: + + let's say we have the following mount tree:: + + A + / \ + B C + / \ / \ + D E F G + + Let's say all the mount except the mount C in the tree are + of a type other than unbindable. + + If this tree is rbound to say Z + + We will have the following tree at the new location:: + + Z + | + A' + / + B' Note how the tree under C is pruned + / \ in the new location. + D' E' + + + +5d) Move semantics + + Consider the following command + + mount --move A B/b + + where 'A' is the source mount, 'B' is the destination mount and 'b' is + the dentry in the destination mount. + + The outcome depends on the type of the mount of 'A' and 'B'. The table + below is a quick reference:: + + --------------------------------------------------------------------------- + | MOVE MOUNT OPERATION | + |************************************************************************** + | source(A)->| shared | private | slave | unbindable | + | dest(B) | | | | | + | | | | | | | + | v | | | | | + |************************************************************************** + | shared | shared | shared |shared and slave| invalid | + | | | | | | + |non-shared| shared | private | slave | unbindable | + *************************************************************************** + + .. Note:: moving a mount residing under a shared mount is invalid. + + Details follow: + + 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is + mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An' + are created and mounted at dentry 'b' on all mounts that receive + propagation from mount 'B'. 
A new propagation tree is created in the + exact same configuration as that of 'B'. This new propagation tree + contains all the new mounts 'A1', 'A2'... 'An'. And this new + propagation tree is appended to the already existing propagation tree + of 'A'. + + 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is + mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 'An' + are created and mounted at dentry 'b' on all mounts that receive + propagation from mount 'B'. The mount 'A' becomes a shared mount and a + propagation tree is created which is identical to that of + 'B'. This new propagation tree contains all the new mounts 'A1', + 'A2'... 'An'. + + 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The + mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', + 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that + receive propagation from mount 'B'. A new propagation tree is created + in the exact same configuration as that of 'B'. This new propagation + tree contains all the new mounts 'A1', 'A2'... 'An'. And this new + propagation tree is appended to the already existing propagation tree of + 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also + becomes 'shared'. + + 4. 'A' is a unbindable mount and 'B' is a shared mount. The operation + is invalid. Because mounting anything on the shared mount 'B' can + create new mounts that get mounted on the mounts that receive + propagation from 'B'. And since the mount 'A' is unbindable, cloning + it to mount at other mountpoints is not possible. + + 5. 'A' is a private mount and 'B' is a non-shared(private or slave or + unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. + + 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A' + is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a + shared mount. + + 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. 
+ The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' + continues to be a slave mount of mount 'Z'. + + 8. 'A' is a unbindable mount and 'B' is a non-shared mount. The mount + 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a + unbindable mount. + +5e) Mount semantics + + Consider the following command:: + + mount device B/b + + 'B' is the destination mount and 'b' is the dentry in the destination + mount. + + The above operation is the same as bind operation with the exception + that the source mount is always a private mount. + + +5f) Unmount semantics + + Consider the following command:: + + umount A + + where 'A' is a mount mounted on mount 'B' at dentry 'b'. + + If mount 'B' is shared, then all most-recently-mounted mounts at dentry + 'b' on mounts that receive propagation from mount 'B' and does not have + sub-mounts within them are unmounted. + + Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to + each other. + + let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount + 'B1', 'B2' and 'B3' respectively. + + let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on + mount 'B1', 'B2' and 'B3' respectively. + + if 'C1' is unmounted, all the mounts that are most-recently-mounted on + 'B1' and on the mounts that 'B1' propagates-to are unmounted. + + 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount + on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'. + + So all 'C1', 'C2' and 'C3' should be unmounted. + + If any of 'C2' or 'C3' has some child mounts, then that mount is not + unmounted, but all other mounts are unmounted. However if 'C1' is told + to be unmounted and 'C1' has some sub-mounts, the umount operation is + failed entirely. + +5g) Clone Namespace + + A cloned namespace contains all the mounts as that of the parent + namespace. + + Let's say 'A' and 'B' are the corresponding mounts in the parent and the + child namespace. 
+ + If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to + each other. + + If 'A' is a slave mount of 'Z', then 'B' is also the slave mount of + 'Z'. + + If 'A' is a private mount, then 'B' is a private mount too. + + If 'A' is unbindable mount, then 'B' is a unbindable mount too. + + +6) Quiz + + A. What is the result of the following command sequence? + + :: + + mount --bind /mnt /mnt + mount --make-shared /mnt + mount --bind /mnt /tmp + mount --move /tmp /mnt/1 + + what should the contents of /mnt /mnt/1 /mnt/1/1 be? + Should they all be identical? or should /mnt and /mnt/1 be + identical only? + + + B. What is the result of the following command sequence? + + :: + + mount --make-rshared / + mkdir -p /v/1 + mount --rbind / /v/1 + + what should the content of /v/1/v/1 be? + + + C. What is the result of the following command sequence? + + :: + + mount --bind /mnt /mnt + mount --make-shared /mnt + mkdir -p /mnt/1/2/3 /mnt/1/test + mount --bind /mnt/1 /tmp + mount --make-slave /mnt + mount --make-shared /mnt + mount --bind /mnt/1/2 /tmp1 + mount --make-slave /mnt + + At this point we have the first mount at /tmp and + its root dentry is 1. Let's call this mount 'A' + And then we have a second mount at /tmp1 with root + dentry 2. Let's call this mount 'B' + Next we have a third mount at /mnt with root dentry + mnt. Let's call this mount 'C' + + 'B' is the slave of 'A' and 'C' is a slave of 'B' + A -> B -> C + + at this point if we execute the following command + + mount --bind /bin /tmp/test + + The mount is attempted on 'A' + + will the mount propagate to 'B' and 'C' ? + + what would the contents of + /mnt/1/test be? + +7) FAQ + + Q1. Why is bind mount needed? How is it different from symbolic links? + symbolic links can get stale if the destination mount gets + unmounted or moved. Bind mounts continue to exist even if the + other mount is unmounted or moved. + + Q2. Why can't the shared subtree be implemented using exportfs? 
+ + exportfs is a heavyweight way of accomplishing part of what + shared subtree can do. I cannot imagine a way to implement the + semantics of slave mount using exportfs? + + Q3 Why is unbindable mount needed? + + Let's say we want to replicate the mount tree at multiple + locations within the same subtree. + + if one rbind mounts a tree within the same subtree 'n' times + the number of mounts created is an exponential function of 'n'. + Having unbindable mount can help prune the unneeded bind + mounts. Here is an example. + + step 1: + let's say the root tree has just two directories with + one vfsmount:: + + root + / \ + tmp usr + + And we want to replicate the tree at multiple + mountpoints under /root/tmp + + step 2: + :: + + + mount --make-shared /root + + mkdir -p /tmp/m1 + + mount --rbind /root /tmp/m1 + + the new tree now looks like this:: + + root + / \ + tmp usr + / + m1 + / \ + tmp usr + / + m1 + + it has two vfsmounts + + step 3: + :: + + mkdir -p /tmp/m2 + mount --rbind /root /tmp/m2 + + the new tree now looks like this:: + + root + / \ + tmp usr + / \ + m1 m2 + / \ / \ + tmp usr tmp usr + / \ / + m1 m2 m1 + / \ / \ + tmp usr tmp usr + / / \ + m1 m1 m2 + / \ + tmp usr + / \ + m1 m2 + + it has 6 vfsmounts + + step 4: + :: + mkdir -p /tmp/m3 + mount --rbind /root /tmp/m3 + + I won't draw the tree..but it has 24 vfsmounts + + + at step i the number of vfsmounts is V[i] = i*V[i-1]. + This is an exponential function. And this tree has way more + mounts than what we really needed in the first place. + + One could use a series of umount at each step to prune + out the unneeded mounts. But there is a better solution. + Unclonable mounts come in handy here. 
+ + step 1: + let's say the root tree has just two directories with + one vfsmount:: + + root + / \ + tmp usr + + How do we set up the same tree at multiple locations under + /root/tmp + + step 2: + :: + + + mount --bind /root/tmp /root/tmp + + mount --make-rshared /root + mount --make-unbindable /root/tmp + + mkdir -p /tmp/m1 + + mount --rbind /root /tmp/m1 + + the new tree now looks like this:: + + root + / \ + tmp usr + / + m1 + / \ + tmp usr + + step 3: + :: + + mkdir -p /tmp/m2 + mount --rbind /root /tmp/m2 + + the new tree now looks like this:: + + root + / \ + tmp usr + / \ + m1 m2 + / \ / \ + tmp usr tmp usr + + step 4: + :: + + mkdir -p /tmp/m3 + mount --rbind /root /tmp/m3 + + the new tree now looks like this:: + + root + / \ + tmp usr + / \ \ + m1 m2 m3 + / \ / \ / \ + tmp usr tmp usr tmp usr + +8) Implementation + +8A) Datastructure + + 4 new fields are introduced to struct vfsmount: + + * ->mnt_share + * ->mnt_slave_list + * ->mnt_slave + * ->mnt_master + + ->mnt_share + links together all the mount to/from which this vfsmount + send/receives propagation events. + + ->mnt_slave_list + links all the mounts to which this vfsmount propagates + to. + + ->mnt_slave + links together all the slaves that its master vfsmount + propagates to. + + ->mnt_master + points to the master vfsmount from which this vfsmount + receives propagation. + + ->mnt_flags + takes two more flags to indicate the propagation status of + the vfsmount. MNT_SHARE indicates that the vfsmount is a shared + vfsmount. MNT_UNCLONABLE indicates that the vfsmount cannot be + replicated. + + All the shared vfsmounts in a peer group form a cyclic list through + ->mnt_share. + + All vfsmounts with the same ->mnt_master form on a cyclic list anchored + in ->mnt_master->mnt_slave_list and going through ->mnt_slave. + + ->mnt_master can point to arbitrary (and possibly different) members + of master peer group. 
To find all immediate slaves of a peer group + you need to go through _all_ ->mnt_slave_list of its members. + Conceptually it's just a single set - distribution among the + individual lists does not affect propagation or the way propagation + tree is modified by operations. + + All vfsmounts in a peer group have the same ->mnt_master. If it is + non-NULL, they form a contiguous (ordered) segment of slave list. + + A example propagation tree looks as shown in the figure below. + [ NOTE: Though it looks like a forest, if we consider all the shared + mounts as a conceptual entity called 'pnode', it becomes a tree]:: + + + A <--> B <--> C <---> D + /|\ /| |\ + / F G J K H I + / + E<-->K + /|\ + M L N + + In the above figure A,B,C and D all are shared and propagate to each + other. 'A' has got 3 slave mounts 'E' 'F' and 'G' 'C' has got 2 slave + mounts 'J' and 'K' and 'D' has got two slave mounts 'H' and 'I'. + 'E' is also shared with 'K' and they propagate to each other. And + 'K' has 3 slaves 'M', 'L' and 'N' + + A's ->mnt_share links with the ->mnt_share of 'B' 'C' and 'D' + + A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' + + E's ->mnt_share links with ->mnt_share of K + + 'E', 'K', 'F', 'G' have their ->mnt_master point to struct vfsmount of 'A' + + 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' + + K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' + + C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' + + J and K's ->mnt_master points to struct vfsmount of C + + and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' + + 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. + + + NOTE: The propagation tree is orthogonal to the mount tree. + +8B Locking: + + ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected + by namespace_sem (exclusive for modifications, shared for reading). 
+ + Normally we have ->mnt_flags modifications serialized by vfsmount_lock. + There are two exceptions: do_add_mount() and clone_mnt(). + The former modifies a vfsmount that has not been visible in any shared + data structures yet. + The latter holds namespace_sem and the only references to vfsmount + are in lists that can't be traversed without namespace_sem. + +8C Algorithm: + + The crux of the implementation resides in rbind/move operation. + + The overall algorithm breaks the operation into 3 phases: (look at + attach_recursive_mnt() and propagate_mnt()) + + 1. prepare phase. + 2. commit phases. + 3. abort phases. + + Prepare phase: + + for each mount in the source tree: + + a) Create the necessary number of mount trees to + be attached to each of the mounts that receive + propagation from the destination mount. + b) Do not attach any of the trees to its destination. + However note down its ->mnt_parent and ->mnt_mountpoint + c) Link all the new mounts to form a propagation tree that + is identical to the propagation tree of the destination + mount. + + If this phase is successful, there should be 'n' new + propagation trees; where 'n' is the number of mounts in the + source tree. Go to the commit phase + + Also there should be 'm' new mount trees, where 'm' is + the number of mounts to which the destination mount + propagates to. + + if any memory allocations fail, go to the abort phase. + + Commit phase + attach each of the mount trees to their corresponding + destination mounts. + + Abort phase + delete all the newly created trees. + + .. 
Note:: + all the propagation related functionality resides in the file pnode.c + + +------------------------------------------------------------------------ + +version 0.1 (created the initial document, Ram Pai linuxram@us.ibm.com) + +version 0.2 (Incorporated comments from Al Viro) diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt deleted file mode 100644 index 8ccfbd55244b..000000000000 --- a/Documentation/filesystems/sharedsubtree.txt +++ /dev/null @@ -1,939 +0,0 @@ -Shared Subtrees ---------------- - -Contents: - 1) Overview - 2) Features - 3) Setting mount states - 4) Use-case - 5) Detailed semantics - 6) Quiz - 7) FAQ - 8) Implementation - - -1) Overview ------------ - -Consider the following situation: - -A process wants to clone its own namespace, but still wants to access the CD -that got mounted recently. Shared subtree semantics provide the necessary -mechanism to accomplish the above. - -It provides the necessary building blocks for features like per-user-namespace -and versioned filesystem. - -2) Features ------------ - -Shared subtree provides four different flavors of mounts; struct vfsmount to be -precise - - a. shared mount - b. slave mount - c. private mount - d. unbindable mount - - -2a) A shared mount can be replicated to as many mountpoints and all the -replicas continue to be exactly same. - - Here is an example: - - Let's say /mnt has a mount that is shared. - mount --make-shared /mnt - - Note: mount(8) command now supports the --make-shared flag, - so the sample 'smount' program is no longer needed and has been - removed. - - # mount --bind /mnt /tmp - The above command replicates the mount at /mnt to the mountpoint /tmp - and the contents of both the mounts remain identical. 
- - #ls /mnt - a b c - - #ls /tmp - a b c - - Now let's say we mount a device at /tmp/a - # mount /dev/sd0 /tmp/a - - #ls /tmp/a - t1 t2 t3 - - #ls /mnt/a - t1 t2 t3 - - Note that the mount has propagated to the mount at /mnt as well. - - And the same is true even when /dev/sd0 is mounted on /mnt/a. The - contents will be visible under /tmp/a too. - - -2b) A slave mount is like a shared mount except that mount and umount events - only propagate towards it. - - All slave mounts have a master mount which is a shared. - - Here is an example: - - Let's say /mnt has a mount which is shared. - # mount --make-shared /mnt - - Let's bind mount /mnt to /tmp - # mount --bind /mnt /tmp - - the new mount at /tmp becomes a shared mount and it is a replica of - the mount at /mnt. - - Now let's make the mount at /tmp; a slave of /mnt - # mount --make-slave /tmp - - let's mount /dev/sd0 on /mnt/a - # mount /dev/sd0 /mnt/a - - #ls /mnt/a - t1 t2 t3 - - #ls /tmp/a - t1 t2 t3 - - Note the mount event has propagated to the mount at /tmp - - However let's see what happens if we mount something on the mount at /tmp - - # mount /dev/sd1 /tmp/b - - #ls /tmp/b - s1 s2 s3 - - #ls /mnt/b - - Note how the mount event has not propagated to the mount at - /mnt - - -2c) A private mount does not forward or receive propagation. - - This is the mount we are familiar with. Its the default type. - - -2d) A unbindable mount is a unbindable private mount - - let's say we have a mount at /mnt and we make it unbindable - - # mount --make-unbindable /mnt - - Let's try to bind mount this mount somewhere else. - # mount --bind /mnt /tmp - mount: wrong fs type, bad option, bad superblock on /mnt, - or too many mounted file systems - - Binding a unbindable mount is a invalid operation. 
- - -3) Setting mount states - - The mount command (util-linux package) can be used to set mount - states: - - mount --make-shared mountpoint - mount --make-slave mountpoint - mount --make-private mountpoint - mount --make-unbindable mountpoint - - -4) Use cases ------------- - - A) A process wants to clone its own namespace, but still wants to - access the CD that got mounted recently. - - Solution: - - The system administrator can make the mount at /cdrom shared - mount --bind /cdrom /cdrom - mount --make-shared /cdrom - - Now any process that clones off a new namespace will have a - mount at /cdrom which is a replica of the same mount in the - parent namespace. - - So when a CD is inserted and mounted at /cdrom that mount gets - propagated to the other mount at /cdrom in all the other clone - namespaces. - - B) A process wants its mounts invisible to any other process, but - still be able to see the other system mounts. - - Solution: - - To begin with, the administrator can mark the entire mount tree - as shareable. - - mount --make-rshared / - - A new process can clone off a new namespace. And mark some part - of its namespace as slave - - mount --make-rslave /myprivatetree - - Hence forth any mounts within the /myprivatetree done by the - process will not show up in any other namespace. However mounts - done in the parent namespace under /myprivatetree still shows - up in the process's namespace. - - - Apart from the above semantics this feature provides the - building blocks to solve the following problems: - - C) Per-user namespace - - The above semantics allows a way to share mounts across - namespaces. But namespaces are associated with processes. If - namespaces are made first class objects with user API to - associate/disassociate a namespace with userid, then each user - could have his/her own namespace and tailor it to his/her - requirements. This needs to be supported in PAM. 
- - D) Versioned files - - If the entire mount tree is visible at multiple locations, then - an underlying versioning file system can return different - versions of the file depending on the path used to access that - file. - - An example is: - - mount --make-shared / - mount --rbind / /view/v1 - mount --rbind / /view/v2 - mount --rbind / /view/v3 - mount --rbind / /view/v4 - - and if /usr has a versioning filesystem mounted, then that - mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and - /view/v4/usr too - - A user can request v3 version of the file /usr/fs/namespace.c - by accessing /view/v3/usr/fs/namespace.c . The underlying - versioning filesystem can then decipher that v3 version of the - filesystem is being requested and return the corresponding - inode. - -5) Detailed semantics: -------------------- - The section below explains the detailed semantics of - bind, rbind, move, mount, umount and clone-namespace operations. - - Note: the word 'vfsmount' and the noun 'mount' have been used - to mean the same thing, throughout this document. - -5a) Mount states - - A given mount can be in one of the following states - 1) shared - 2) slave - 3) shared and slave - 4) private - 5) unbindable - - A 'propagation event' is defined as event generated on a vfsmount - that leads to mount or unmount actions in other vfsmounts. - - A 'peer group' is defined as a group of vfsmounts that propagate - events to each other. - - (1) Shared mounts - - A 'shared mount' is defined as a vfsmount that belongs to a - 'peer group'. - - For example: - mount --make-shared /mnt - mount --bind /mnt /tmp - - The mount at /mnt and that at /tmp are both shared and belong - to the same peer group. Anything mounted or unmounted under - /mnt or /tmp reflect in all the other mounts of its peer - group. - - - (2) Slave mounts - - A 'slave mount' is defined as a vfsmount that receives - propagation events and does not forward propagation events. 
- - A slave mount as the name implies has a master mount from which - mount/unmount events are received. Events do not propagate from - the slave mount to the master. Only a shared mount can be made - a slave by executing the following command - - mount --make-slave mount - - A shared mount that is made as a slave is no more shared unless - modified to become shared. - - (3) Shared and Slave - - A vfsmount can be both shared as well as slave. This state - indicates that the mount is a slave of some vfsmount, and - has its own peer group too. This vfsmount receives propagation - events from its master vfsmount, and also forwards propagation - events to its 'peer group' and to its slave vfsmounts. - - Strictly speaking, the vfsmount is shared having its own - peer group, and this peer-group is a slave of some other - peer group. - - Only a slave vfsmount can be made as 'shared and slave' by - either executing the following command - mount --make-shared mount - or by moving the slave vfsmount under a shared vfsmount. - - (4) Private mount - - A 'private mount' is defined as vfsmount that does not - receive or forward any propagation events. - - (5) Unbindable mount - - A 'unbindable mount' is defined as vfsmount that does not - receive or forward any propagation events and cannot - be bind mounted. - - - State diagram: - The state diagram below explains the state transition of a mount, - in response to various commands. 
- ------------------------------------------------------------------------ - | |make-shared | make-slave | make-private |make-unbindab| - --------------|------------|--------------|--------------|-------------| - |shared |shared |*slave/private| private | unbindable | - | | | | | | - |-------------|------------|--------------|--------------|-------------| - |slave |shared | **slave | private | unbindable | - | |and slave | | | | - |-------------|------------|--------------|--------------|-------------| - |shared |shared | slave | private | unbindable | - |and slave |and slave | | | | - |-------------|------------|--------------|--------------|-------------| - |private |shared | **private | private | unbindable | - |-------------|------------|--------------|--------------|-------------| - |unbindable |shared |**unbindable | private | unbindable | - ------------------------------------------------------------------------ - - * if the shared mount is the only mount in its peer group, making it - slave, makes it private automatically. Note that there is no master to - which it can be slaved to. - - ** slaving a non-shared mount has no effect on the mount. - - Apart from the commands listed below, the 'move' operation also changes - the state of a mount depending on type of the destination mount. Its - explained in section 5d. - -5b) Bind semantics - - Consider the following command - - mount --bind A/a B/b - - where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B' - is the destination mount and 'b' is the dentry in the destination mount. - - The outcome depends on the type of mount of 'A' and 'B'. The table - below contains quick reference. 
- --------------------------------------------------------------------------- - | BIND MOUNT OPERATION | - |************************************************************************** - |source(A)->| shared | private | slave | unbindable | - | dest(B) | | | | | - | | | | | | | - | v | | | | | - |************************************************************************** - | shared | shared | shared | shared & slave | invalid | - | | | | | | - |non-shared| shared | private | slave | invalid | - *************************************************************************** - - Details: - - 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C' - which is clone of 'A', is created. Its root dentry is 'a' . 'C' is - mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... - are created and mounted at the dentry 'b' on all mounts where 'B' - propagates to. A new propagation tree containing 'C1',..,'Cn' is - created. This propagation tree is identical to the propagation tree of - 'B'. And finally the peer-group of 'C' is merged with the peer group - of 'A'. - - 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C' - which is clone of 'A', is created. Its root dentry is 'a'. 'C' is - mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ... - are created and mounted at the dentry 'b' on all mounts where 'B' - propagates to. A new propagation tree is set containing all new mounts - 'C', 'C1', .., 'Cn' with exactly the same configuration as the - propagation tree for 'B'. - - 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new - mount 'C' which is clone of 'A', is created. Its root dentry is 'a' . - 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', - 'C3' ... are created and mounted at the dentry 'b' on all mounts where - 'B' propagates to. A new propagation tree containing the new mounts - 'C','C1',.. 'Cn' is created. 
This propagation tree is identical to the - propagation tree for 'B'. And finally the mount 'C' and its peer group - is made the slave of mount 'Z'. In other words, mount 'C' is in the - state 'slave and shared'. - - 4. 'A' is a unbindable mount and 'B' is a shared mount. This is a - invalid operation. - - 5. 'A' is a private mount and 'B' is a non-shared(private or slave or - unbindable) mount. A new mount 'C' which is clone of 'A', is created. - Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'. - - 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C' - which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is - mounted on mount 'B' at dentry 'b'. 'C' is made a member of the - peer-group of 'A'. - - 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A - new mount 'C' which is a clone of 'A' is created. Its root dentry is - 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a - slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of - 'Z'. All mount/unmount events on 'Z' propagates to 'A' and 'C'. But - mount/unmount on 'A' do not propagate anywhere else. Similarly - mount/unmount on 'C' do not propagate anywhere else. - - 8. 'A' is a unbindable mount and 'B' is a non-shared mount. This is a - invalid operation. A unbindable mount cannot be bind mounted. - -5c) Rbind semantics - - rbind is same as bind. Bind replicates the specified mount. Rbind - replicates all the mounts in the tree belonging to the specified mount. - Rbind mount is bind mount applied to all the mounts in the tree. - - If the source tree that is rbind has some unbindable mounts, - then the subtree under the unbindable mount is pruned in the new - location. - - eg: let's say we have the following mount tree. - - A - / \ - B C - / \ / \ - D E F G - - Let's say all the mount except the mount C in the tree are - of a type other than unbindable. 
- - If this tree is rbound to say Z - - We will have the following tree at the new location. - - Z - | - A' - / - B' Note how the tree under C is pruned - / \ in the new location. - D' E' - - - -5d) Move semantics - - Consider the following command - - mount --move A B/b - - where 'A' is the source mount, 'B' is the destination mount and 'b' is - the dentry in the destination mount. - - The outcome depends on the type of the mount of 'A' and 'B'. The table - below is a quick reference. - --------------------------------------------------------------------------- - | MOVE MOUNT OPERATION | - |************************************************************************** - | source(A)->| shared | private | slave | unbindable | - | dest(B) | | | | | - | | | | | | | - | v | | | | | - |************************************************************************** - | shared | shared | shared |shared and slave| invalid | - | | | | | | - |non-shared| shared | private | slave | unbindable | - *************************************************************************** - NOTE: moving a mount residing under a shared mount is invalid. - - Details follow: - - 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is - mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An' - are created and mounted at dentry 'b' on all mounts that receive - propagation from mount 'B'. A new propagation tree is created in the - exact same configuration as that of 'B'. This new propagation tree - contains all the new mounts 'A1', 'A2'... 'An'. And this new - propagation tree is appended to the already existing propagation tree - of 'A'. - - 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is - mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 'An' - are created and mounted at dentry 'b' on all mounts that receive - propagation from mount 'B'. 
The mount 'A' becomes a shared mount and a - propagation tree is created which is identical to that of - 'B'. This new propagation tree contains all the new mounts 'A1', - 'A2'... 'An'. - - 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The - mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', - 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that - receive propagation from mount 'B'. A new propagation tree is created - in the exact same configuration as that of 'B'. This new propagation - tree contains all the new mounts 'A1', 'A2'... 'An'. And this new - propagation tree is appended to the already existing propagation tree of - 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also - becomes 'shared'. - - 4. 'A' is a unbindable mount and 'B' is a shared mount. The operation - is invalid. Because mounting anything on the shared mount 'B' can - create new mounts that get mounted on the mounts that receive - propagation from 'B'. And since the mount 'A' is unbindable, cloning - it to mount at other mountpoints is not possible. - - 5. 'A' is a private mount and 'B' is a non-shared(private or slave or - unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'. - - 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A' - is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a - shared mount. - - 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. - The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' - continues to be a slave mount of mount 'Z'. - - 8. 'A' is a unbindable mount and 'B' is a non-shared mount. The mount - 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a - unbindable mount. - -5e) Mount semantics - - Consider the following command - - mount device B/b - - 'B' is the destination mount and 'b' is the dentry in the destination - mount. 
- - The above operation is the same as bind operation with the exception - that the source mount is always a private mount. - - -5f) Unmount semantics - - Consider the following command - - umount A - - where 'A' is a mount mounted on mount 'B' at dentry 'b'. - - If mount 'B' is shared, then all most-recently-mounted mounts at dentry - 'b' on mounts that receive propagation from mount 'B' and does not have - sub-mounts within them are unmounted. - - Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to - each other. - - let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount - 'B1', 'B2' and 'B3' respectively. - - let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on - mount 'B1', 'B2' and 'B3' respectively. - - if 'C1' is unmounted, all the mounts that are most-recently-mounted on - 'B1' and on the mounts that 'B1' propagates-to are unmounted. - - 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount - on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'. - - So all 'C1', 'C2' and 'C3' should be unmounted. - - If any of 'C2' or 'C3' has some child mounts, then that mount is not - unmounted, but all other mounts are unmounted. However if 'C1' is told - to be unmounted and 'C1' has some sub-mounts, the umount operation is - failed entirely. - -5g) Clone Namespace - - A cloned namespace contains all the mounts as that of the parent - namespace. - - Let's say 'A' and 'B' are the corresponding mounts in the parent and the - child namespace. - - If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to - each other. - - If 'A' is a slave mount of 'Z', then 'B' is also the slave mount of - 'Z'. - - If 'A' is a private mount, then 'B' is a private mount too. - - If 'A' is unbindable mount, then 'B' is a unbindable mount too. - - -6) Quiz - - A. What is the result of the following command sequence? 
- - mount --bind /mnt /mnt - mount --make-shared /mnt - mount --bind /mnt /tmp - mount --move /tmp /mnt/1 - - what should be the contents of /mnt /mnt/1 /mnt/1/1 should be? - Should they all be identical? or should /mnt and /mnt/1 be - identical only? - - - B. What is the result of the following command sequence? - - mount --make-rshared / - mkdir -p /v/1 - mount --rbind / /v/1 - - what should be the content of /v/1/v/1 be? - - - C. What is the result of the following command sequence? - - mount --bind /mnt /mnt - mount --make-shared /mnt - mkdir -p /mnt/1/2/3 /mnt/1/test - mount --bind /mnt/1 /tmp - mount --make-slave /mnt - mount --make-shared /mnt - mount --bind /mnt/1/2 /tmp1 - mount --make-slave /mnt - - At this point we have the first mount at /tmp and - its root dentry is 1. Let's call this mount 'A' - And then we have a second mount at /tmp1 with root - dentry 2. Let's call this mount 'B' - Next we have a third mount at /mnt with root dentry - mnt. Let's call this mount 'C' - - 'B' is the slave of 'A' and 'C' is a slave of 'B' - A -> B -> C - - at this point if we execute the following command - - mount --bind /bin /tmp/test - - The mount is attempted on 'A' - - will the mount propagate to 'B' and 'C' ? - - what would be the contents of - /mnt/1/test be? - -7) FAQ - - Q1. Why is bind mount needed? How is it different from symbolic links? - symbolic links can get stale if the destination mount gets - unmounted or moved. Bind mounts continue to exist even if the - other mount is unmounted or moved. - - Q2. Why can't the shared subtree be implemented using exportfs? - - exportfs is a heavyweight way of accomplishing part of what - shared subtree can do. I cannot imagine a way to implement the - semantics of slave mount using exportfs? - - Q3 Why is unbindable mount needed? - - Let's say we want to replicate the mount tree at multiple - locations within the same subtree. 
- - if one rbind mounts a tree within the same subtree 'n' times - the number of mounts created is an exponential function of 'n'. - Having unbindable mount can help prune the unneeded bind - mounts. Here is an example. - - step 1: - let's say the root tree has just two directories with - one vfsmount. - root - / \ - tmp usr - - And we want to replicate the tree at multiple - mountpoints under /root/tmp - - step2: - mount --make-shared /root - - mkdir -p /tmp/m1 - - mount --rbind /root /tmp/m1 - - the new tree now looks like this: - - root - / \ - tmp usr - / - m1 - / \ - tmp usr - / - m1 - - it has two vfsmounts - - step3: - mkdir -p /tmp/m2 - mount --rbind /root /tmp/m2 - - the new tree now looks like this: - - root - / \ - tmp usr - / \ - m1 m2 - / \ / \ - tmp usr tmp usr - / \ / - m1 m2 m1 - / \ / \ - tmp usr tmp usr - / / \ - m1 m1 m2 - / \ - tmp usr - / \ - m1 m2 - - it has 6 vfsmounts - - step 4: - mkdir -p /tmp/m3 - mount --rbind /root /tmp/m3 - - I won't draw the tree..but it has 24 vfsmounts - - - at step i the number of vfsmounts is V[i] = i*V[i-1]. - This is an exponential function. And this tree has way more - mounts than what we really needed in the first place. - - One could use a series of umount at each step to prune - out the unneeded mounts. But there is a better solution. - Unclonable mounts come in handy here. - - step 1: - let's say the root tree has just two directories with - one vfsmount. 
- root - / \ - tmp usr - - How do we set up the same tree at multiple locations under - /root/tmp - - step2: - mount --bind /root/tmp /root/tmp - - mount --make-rshared /root - mount --make-unbindable /root/tmp - - mkdir -p /tmp/m1 - - mount --rbind /root /tmp/m1 - - the new tree now looks like this: - - root - / \ - tmp usr - / - m1 - / \ - tmp usr - - step3: - mkdir -p /tmp/m2 - mount --rbind /root /tmp/m2 - - the new tree now looks like this: - - root - / \ - tmp usr - / \ - m1 m2 - / \ / \ - tmp usr tmp usr - - step4: - - mkdir -p /tmp/m3 - mount --rbind /root /tmp/m3 - - the new tree now looks like this: - - root - / \ - tmp usr - / \ \ - m1 m2 m3 - / \ / \ / \ - tmp usr tmp usr tmp usr - -8) Implementation - -8A) Datastructure - - 4 new fields are introduced to struct vfsmount - ->mnt_share - ->mnt_slave_list - ->mnt_slave - ->mnt_master - - ->mnt_share links together all the mount to/from which this vfsmount - send/receives propagation events. - - ->mnt_slave_list links all the mounts to which this vfsmount propagates - to. - - ->mnt_slave links together all the slaves that its master vfsmount - propagates to. - - ->mnt_master points to the master vfsmount from which this vfsmount - receives propagation. - - ->mnt_flags takes two more flags to indicate the propagation status of - the vfsmount. MNT_SHARE indicates that the vfsmount is a shared - vfsmount. MNT_UNCLONABLE indicates that the vfsmount cannot be - replicated. - - All the shared vfsmounts in a peer group form a cyclic list through - ->mnt_share. - - All vfsmounts with the same ->mnt_master form on a cyclic list anchored - in ->mnt_master->mnt_slave_list and going through ->mnt_slave. - - ->mnt_master can point to arbitrary (and possibly different) members - of master peer group. To find all immediate slaves of a peer group - you need to go through _all_ ->mnt_slave_list of its members. 
- Conceptually it's just a single set - distribution among the - individual lists does not affect propagation or the way propagation - tree is modified by operations. - - All vfsmounts in a peer group have the same ->mnt_master. If it is - non-NULL, they form a contiguous (ordered) segment of slave list. - - A example propagation tree looks as shown in the figure below. - [ NOTE: Though it looks like a forest, if we consider all the shared - mounts as a conceptual entity called 'pnode', it becomes a tree] - - - A <--> B <--> C <---> D - /|\ /| |\ - / F G J K H I - / - E<-->K - /|\ - M L N - - In the above figure A,B,C and D all are shared and propagate to each - other. 'A' has got 3 slave mounts 'E' 'F' and 'G' 'C' has got 2 slave - mounts 'J' and 'K' and 'D' has got two slave mounts 'H' and 'I'. - 'E' is also shared with 'K' and they propagate to each other. And - 'K' has 3 slaves 'M', 'L' and 'N' - - A's ->mnt_share links with the ->mnt_share of 'B' 'C' and 'D' - - A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' - - E's ->mnt_share links with ->mnt_share of K - 'E', 'K', 'F', 'G' have their ->mnt_master point to struct - vfsmount of 'A' - 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' - K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' - - C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' - J and K's ->mnt_master points to struct vfsmount of C - and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' - 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. - - - NOTE: The propagation tree is orthogonal to the mount tree. - -8B Locking: - - ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected - by namespace_sem (exclusive for modifications, shared for reading). - - Normally we have ->mnt_flags modifications serialized by vfsmount_lock. - There are two exceptions: do_add_mount() and clone_mnt(). 
- The former modifies a vfsmount that has not been visible in any shared - data structures yet. - The latter holds namespace_sem and the only references to vfsmount - are in lists that can't be traversed without namespace_sem. - -8C Algorithm: - - The crux of the implementation resides in rbind/move operation. - - The overall algorithm breaks the operation into 3 phases: (look at - attach_recursive_mnt() and propagate_mnt()) - - 1. prepare phase. - 2. commit phases. - 3. abort phases. - - Prepare phase: - - for each mount in the source tree: - a) Create the necessary number of mount trees to - be attached to each of the mounts that receive - propagation from the destination mount. - b) Do not attach any of the trees to its destination. - However note down its ->mnt_parent and ->mnt_mountpoint - c) Link all the new mounts to form a propagation tree that - is identical to the propagation tree of the destination - mount. - - If this phase is successful, there should be 'n' new - propagation trees; where 'n' is the number of mounts in the - source tree. Go to the commit phase - - Also there should be 'm' new mount trees, where 'm' is - the number of mounts to which the destination mount - propagates to. - - if any memory allocations fail, go to the abort phase. - - Commit phase - attach each of the mount trees to their corresponding - destination mounts. - - Abort phase - delete all the newly created trees. - - NOTE: all the propagation related functionality resides in the file - pnode.c - - ------------------------------------------------------------------------- - -version 0.1 (created the initial document, Ram Pai linuxram@us.ibm.com) -version 0.2 (Incorporated comments from Al Viro) -- cgit v1.2.3 From a5a1c349ac48252e871abaecabc158f761e0aa92 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:13 +0200 Subject: docs: filesystems: split spufs.txt into 3 separate files This file has manpages for 3 different things. 
As we'll be converting it to ReST, let's fist split it into their individual components. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/3753aa73524f4e1cbf0c19e34f7b322420e0b1c6.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/spufs.txt | 521 ------------------------- Documentation/filesystems/spufs/spu_create.txt | 119 ++++++ Documentation/filesystems/spufs/spu_run.txt | 121 ++++++ Documentation/filesystems/spufs/spufs.txt | 276 +++++++++++++ 4 files changed, 516 insertions(+), 521 deletions(-) delete mode 100644 Documentation/filesystems/spufs.txt create mode 100644 Documentation/filesystems/spufs/spu_create.txt create mode 100644 Documentation/filesystems/spufs/spu_run.txt create mode 100644 Documentation/filesystems/spufs/spufs.txt diff --git a/Documentation/filesystems/spufs.txt b/Documentation/filesystems/spufs.txt deleted file mode 100644 index eb9e3aa63026..000000000000 --- a/Documentation/filesystems/spufs.txt +++ /dev/null @@ -1,521 +0,0 @@ -SPUFS(2) Linux Programmer's Manual SPUFS(2) - - - -NAME - spufs - the SPU file system - - -DESCRIPTION - The SPU file system is used on PowerPC machines that implement the Cell - Broadband Engine Architecture in order to access Synergistic Processor - Units (SPUs). - - The file system provides a name space similar to posix shared memory or - message queues. Users that have write permissions on the file system - can use spu_create(2) to establish SPU contexts in the spufs root. - - Every SPU context is represented by a directory containing a predefined - set of files. These files can be used for manipulating the state of the - logical SPU. Users can change permissions on those files, but not actu- - ally add or remove files. - - -MOUNT OPTIONS - uid= - set the user owning the mount point, the default is 0 (root). - - gid= - set the group owning the mount point, the default is 0 (root). 
- - -FILES - The files in spufs mostly follow the standard behavior for regular sys- - tem calls like read(2) or write(2), but often support only a subset of - the operations supported on regular file systems. This list details the - supported operations and the deviations from the behaviour in the - respective man pages. - - All files that support the read(2) operation also support readv(2) and - all files that support the write(2) operation also support writev(2). - All files support the access(2) and stat(2) family of operations, but - only the st_mode, st_nlink, st_uid and st_gid fields of struct stat - contain reliable information. - - All files support the chmod(2)/fchmod(2) and chown(2)/fchown(2) opera- - tions, but will not be able to grant permissions that contradict the - possible operations, e.g. read access on the wbox file. - - The current set of files is: - - - /mem - the contents of the local storage memory of the SPU. This can be - accessed like a regular shared memory file and contains both code and - data in the address space of the SPU. The possible operations on an - open mem file are: - - read(2), pread(2), write(2), pwrite(2), lseek(2) - These operate as documented, with the exception that seek(2), - write(2) and pwrite(2) are not supported beyond the end of the - file. The file size is the size of the local storage of the SPU, - which normally is 256 kilobytes. - - mmap(2) - Mapping mem into the process address space gives access to the - SPU local storage within the process address space. Only - MAP_SHARED mappings are allowed. - - - /mbox - The first SPU to CPU communication mailbox. This file is read-only and - can be read in units of 32 bits. The file can only be used in non- - blocking mode and it even poll() will not block on it. The possible - operations on an open mbox file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. 
If there is no data available in the mail - box, the return value is set to -1 and errno becomes EAGAIN. - When data has been read successfully, four bytes are placed in - the data buffer and the value four is returned. - - - /ibox - The second SPU to CPU communication mailbox. This file is similar to - the first mailbox file, but can be read in blocking I/O mode, and the - poll family of system calls can be used to wait for it. The possible - operations on an open ibox file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. If there is no data available in the mail - box and the file descriptor has been opened with O_NONBLOCK, the - return value is set to -1 and errno becomes EAGAIN. - - If there is no data available in the mail box and the file - descriptor has been opened without O_NONBLOCK, the call will - block until the SPU writes to its interrupt mailbox channel. - When data has been read successfully, four bytes are placed in - the data buffer and the value four is returned. - - poll(2) - Poll on the ibox file returns (POLLIN | POLLRDNORM) whenever - data is available for reading. - - - /wbox - The CPU to SPU communation mailbox. It is write-only and can be written - in units of 32 bits. If the mailbox is full, write() will block and - poll can be used to wait for it becoming empty again. The possible - operations on an open wbox file are: write(2) If a count smaller than - four is requested, write returns -1 and sets errno to EINVAL. If there - is no space available in the mail box and the file descriptor has been - opened with O_NONBLOCK, the return value is set to -1 and errno becomes - EAGAIN. - - If there is no space available in the mail box and the file descriptor - has been opened without O_NONBLOCK, the call will block until the SPU - reads from its PPE mailbox channel. When data has been read success- - fully, four bytes are placed in the data buffer and the value four is - returned. 
- - poll(2) - Poll on the ibox file returns (POLLOUT | POLLWRNORM) whenever - space is available for writing. - - - /mbox_stat - /ibox_stat - /wbox_stat - Read-only files that contain the length of the current queue, i.e. how - many words can be read from mbox or ibox or how many words can be - written to wbox without blocking. The files can be read only in 4-byte - units and return a big-endian binary integer number. The possible - operations on an open *box_stat file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is placed in - the data buffer, containing the number of elements that can be - read from (for mbox_stat and ibox_stat) or written to (for - wbox_stat) the respective mail box without blocking or resulting - in EAGAIN. - - - /npc - /decr - /decr_status - /spu_tag_mask - /event_mask - /srr0 - Internal registers of the SPU. The representation is an ASCII string - with the numeric value of the next instruction to be executed. These - can be used in read/write mode for debugging, but normal operation of - programs should not rely on them because access to any of them except - npc requires an SPU context save and is therefore very inefficient. - - The contents of these files are: - - npc Next Program Counter - - decr SPU Decrementer - - decr_status Decrementer Status - - spu_tag_mask MFC tag mask for SPU DMA - - event_mask Event mask for SPU interrupts - - srr0 Interrupt Return address register - - - The possible operations on an open npc, decr, decr_status, - spu_tag_mask, event_mask or srr0 file are: - - read(2) - When the count supplied to the read call is shorter than the - required length for the pointer value plus a newline character, - subsequent reads from the same file descriptor will result in - completing the string, regardless of changes to the register by - a running SPU task. 
When a complete string has been read, all - subsequent read operations will return zero bytes and a new file - descriptor needs to be opened to read the value again. - - write(2) - A write operation on the file results in setting the register to - the value given in the string. The string is parsed from the - beginning to the first non-numeric character or the end of the - buffer. Subsequent writes to the same file descriptor overwrite - the previous setting. - - - /fpcr - This file gives access to the Floating Point Status and Control Regis- - ter as a four byte long file. The operations on the fpcr file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is placed in - the data buffer, containing the current value of the fpcr regis- - ter. - - write(2) - If a count smaller than four is requested, write returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is copied - from the data buffer, updating the value of the fpcr register. - - - /signal1 - /signal2 - The two signal notification channels of an SPU. These are read-write - files that operate on a 32 bit word. Writing to one of these files - triggers an interrupt on the SPU. The value written to the signal - files can be read from the SPU through a channel read or from host user - space through the file. After the value has been read by the SPU, it - is reset to zero. The possible operations on an open signal1 or sig- - nal2 file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is placed in - the data buffer, containing the current value of the specified - signal notification register. - - write(2) - If a count smaller than four is requested, write returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is copied - from the data buffer, updating the value of the specified signal - notification register. 
The signal notification register will - either be replaced with the input data or will be updated to the - bitwise OR or the old value and the input data, depending on the - contents of the signal1_type, or signal2_type respectively, - file. - - - /signal1_type - /signal2_type - These two files change the behavior of the signal1 and signal2 notifi- - cation files. The contain a numerical ASCII string which is read as - either "1" or "0". In mode 0 (overwrite), the hardware replaces the - contents of the signal channel with the data that is written to it. in - mode 1 (logical OR), the hardware accumulates the bits that are subse- - quently written to it. The possible operations on an open signal1_type - or signal2_type file are: - - read(2) - When the count supplied to the read call is shorter than the - required length for the digit plus a newline character, subse- - quent reads from the same file descriptor will result in com- - pleting the string. When a complete string has been read, all - subsequent read operations will return zero bytes and a new file - descriptor needs to be opened to read the value again. - - write(2) - A write operation on the file results in setting the register to - the value given in the string. The string is parsed from the - beginning to the first non-numeric character or the end of the - buffer. Subsequent writes to the same file descriptor overwrite - the previous setting. 
- - -EXAMPLES - /etc/fstab entry - none /spu spufs gid=spu 0 0 - - -AUTHORS - Arnd Bergmann , Mark Nutter , - Ulrich Weigand - -SEE ALSO - capabilities(7), close(2), spu_create(2), spu_run(2), spufs(7) - - - -Linux 2005-09-28 SPUFS(2) - ------------------------------------------------------------------------------- - -SPU_RUN(2) Linux Programmer's Manual SPU_RUN(2) - - - -NAME - spu_run - execute an spu context - - -SYNOPSIS - #include - - int spu_run(int fd, unsigned int *npc, unsigned int *event); - -DESCRIPTION - The spu_run system call is used on PowerPC machines that implement the - Cell Broadband Engine Architecture in order to access Synergistic Pro- - cessor Units (SPUs). It uses the fd that was returned from spu_cre- - ate(2) to address a specific SPU context. When the context gets sched- - uled to a physical SPU, it starts execution at the instruction pointer - passed in npc. - - Execution of SPU code happens synchronously, meaning that spu_run does - not return while the SPU is still running. If there is a need to exe- - cute SPU code in parallel with other code on either the main CPU or - other SPUs, you need to create a new thread of execution first, e.g. - using the pthread_create(3) call. - - When spu_run returns, the current value of the SPU instruction pointer - is written back to npc, so you can call spu_run again without updating - the pointers. - - event can be a NULL pointer or point to an extended status code that - gets filled when spu_run returns. It can be one of the following con- - stants: - - SPE_EVENT_DMA_ALIGNMENT - A DMA alignment error - - SPE_EVENT_SPE_DATA_SEGMENT - A DMA segmentation error - - SPE_EVENT_SPE_DATA_STORAGE - A DMA storage error - - If NULL is passed as the event argument, these errors will result in a - signal delivered to the calling process. - -RETURN VALUE - spu_run returns the value of the spu_status register or -1 to indicate - an error and set errno to one of the error codes listed below. 
The - spu_status register value contains a bit mask of status codes and - optionally a 14 bit code returned from the stop-and-signal instruction - on the SPU. The bit masks for the status codes are: - - 0x02 SPU was stopped by stop-and-signal. - - 0x04 SPU was stopped by halt. - - 0x08 SPU is waiting for a channel. - - 0x10 SPU is in single-step mode. - - 0x20 SPU has tried to execute an invalid instruction. - - 0x40 SPU has tried to access an invalid channel. - - 0x3fff0000 - The bits masked with this value contain the code returned from - stop-and-signal. - - There are always one or more of the lower eight bits set or an error - code is returned from spu_run. - -ERRORS - EAGAIN or EWOULDBLOCK - fd is in non-blocking mode and spu_run would block. - - EBADF fd is not a valid file descriptor. - - EFAULT npc is not a valid pointer or status is neither NULL nor a valid - pointer. - - EINTR A signal occurred while spu_run was in progress. The npc value - has been updated to the new program counter value if necessary. - - EINVAL fd is not a file descriptor returned from spu_create(2). - - ENOMEM Insufficient memory was available to handle a page fault result- - ing from an MFC direct memory access. - - ENOSYS the functionality is not provided by the current system, because - either the hardware does not provide SPUs or the spufs module is - not loaded. - - -NOTES - spu_run is meant to be used from libraries that implement a more - abstract interface to SPUs, not to be used from regular applications. - See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- - ommended libraries. - - -CONFORMING TO - This call is Linux specific and only implemented by the ppc64 architec- - ture. Programs using this system call are not portable. - - -BUGS - The code does not yet fully implement all features lined out here. 
- - -AUTHOR - Arnd Bergmann - -SEE ALSO - capabilities(7), close(2), spu_create(2), spufs(7) - - - -Linux 2005-09-28 SPU_RUN(2) - ------------------------------------------------------------------------------- - -SPU_CREATE(2) Linux Programmer's Manual SPU_CREATE(2) - - - -NAME - spu_create - create a new spu context - - -SYNOPSIS - #include - #include - - int spu_create(const char *pathname, int flags, mode_t mode); - -DESCRIPTION - The spu_create system call is used on PowerPC machines that implement - the Cell Broadband Engine Architecture in order to access Synergistic - Processor Units (SPUs). It creates a new logical context for an SPU in - pathname and returns a handle to associated with it. pathname must - point to a non-existing directory in the mount point of the SPU file - system (spufs). When spu_create is successful, a directory gets cre- - ated on pathname and it is populated with files. - - The returned file handle can only be passed to spu_run(2) or closed, - other operations are not defined on it. When it is closed, all associ- - ated directory entries in spufs are removed. When the last file handle - pointing either inside of the context directory or to this file - descriptor is closed, the logical SPU context is destroyed. - - The parameter flags can be zero or any bitwise or'd combination of the - following constants: - - SPU_RAWIO - Allow mapping of some of the hardware registers of the SPU into - user space. This flag requires the CAP_SYS_RAWIO capability, see - capabilities(7). - - The mode parameter specifies the permissions used for creating the new - directory in spufs. mode is modified with the user's umask(2) value - and then used for both the directory and the files contained in it. The - file permissions mask out some more bits of mode because they typically - support only read or write access. See stat(2) for a full list of the - possible mode values. - - -RETURN VALUE - spu_create returns a new file descriptor. 
It may return -1 to indicate - an error condition and set errno to one of the error codes listed - below. - - -ERRORS - EACCES - The current user does not have write access on the spufs mount - point. - - EEXIST An SPU context already exists at the given path name. - - EFAULT pathname is not a valid string pointer in the current address - space. - - EINVAL pathname is not a directory in the spufs mount point. - - ELOOP Too many symlinks were found while resolving pathname. - - EMFILE The process has reached its maximum open file limit. - - ENAMETOOLONG - pathname was too long. - - ENFILE The system has reached the global open file limit. - - ENOENT Part of pathname could not be resolved. - - ENOMEM The kernel could not allocate all resources required. - - ENOSPC There are not enough SPU resources available to create a new - context or the user specific limit for the number of SPU con- - texts has been reached. - - ENOSYS the functionality is not provided by the current system, because - either the hardware does not provide SPUs or the spufs module is - not loaded. - - ENOTDIR - A part of pathname is not a directory. - - - -NOTES - spu_create is meant to be used from libraries that implement a more - abstract interface to SPUs, not to be used from regular applications. - See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- - ommended libraries. - - -FILES - pathname must point to a location beneath the mount point of spufs. By - convention, it gets mounted in /spu. - - -CONFORMING TO - This call is Linux specific and only implemented by the ppc64 architec- - ture. Programs using this system call are not portable. - - -BUGS - The code does not yet fully implement all features lined out here. 
- - -AUTHOR - Arnd Bergmann - -SEE ALSO - capabilities(7), close(2), spu_run(2), spufs(7) - - - -Linux 2005-09-28 SPU_CREATE(2) diff --git a/Documentation/filesystems/spufs/spu_create.txt b/Documentation/filesystems/spufs/spu_create.txt new file mode 100644 index 000000000000..8ede5a35340f --- /dev/null +++ b/Documentation/filesystems/spufs/spu_create.txt @@ -0,0 +1,119 @@ +SPU_CREATE(2) Linux Programmer's Manual SPU_CREATE(2) + + + +NAME + spu_create - create a new spu context + + +SYNOPSIS + #include + #include + + int spu_create(const char *pathname, int flags, mode_t mode); + +DESCRIPTION + The spu_create system call is used on PowerPC machines that implement + the Cell Broadband Engine Architecture in order to access Synergistic + Processor Units (SPUs). It creates a new logical context for an SPU in + pathname and returns a handle to associated with it. pathname must + point to a non-existing directory in the mount point of the SPU file + system (spufs). When spu_create is successful, a directory gets cre- + ated on pathname and it is populated with files. + + The returned file handle can only be passed to spu_run(2) or closed, + other operations are not defined on it. When it is closed, all associ- + ated directory entries in spufs are removed. When the last file handle + pointing either inside of the context directory or to this file + descriptor is closed, the logical SPU context is destroyed. + + The parameter flags can be zero or any bitwise or'd combination of the + following constants: + + SPU_RAWIO + Allow mapping of some of the hardware registers of the SPU into + user space. This flag requires the CAP_SYS_RAWIO capability, see + capabilities(7). + + The mode parameter specifies the permissions used for creating the new + directory in spufs. mode is modified with the user's umask(2) value + and then used for both the directory and the files contained in it. 
The + file permissions mask out some more bits of mode because they typically + support only read or write access. See stat(2) for a full list of the + possible mode values. + + +RETURN VALUE + spu_create returns a new file descriptor. It may return -1 to indicate + an error condition and set errno to one of the error codes listed + below. + + +ERRORS + EACCES + The current user does not have write access on the spufs mount + point. + + EEXIST An SPU context already exists at the given path name. + + EFAULT pathname is not a valid string pointer in the current address + space. + + EINVAL pathname is not a directory in the spufs mount point. + + ELOOP Too many symlinks were found while resolving pathname. + + EMFILE The process has reached its maximum open file limit. + + ENAMETOOLONG + pathname was too long. + + ENFILE The system has reached the global open file limit. + + ENOENT Part of pathname could not be resolved. + + ENOMEM The kernel could not allocate all resources required. + + ENOSPC There are not enough SPU resources available to create a new + context or the user specific limit for the number of SPU con- + texts has been reached. + + ENOSYS the functionality is not provided by the current system, because + either the hardware does not provide SPUs or the spufs module is + not loaded. + + ENOTDIR + A part of pathname is not a directory. + + + +NOTES + spu_create is meant to be used from libraries that implement a more + abstract interface to SPUs, not to be used from regular applications. + See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- + ommended libraries. + + +FILES + pathname must point to a location beneath the mount point of spufs. By + convention, it gets mounted in /spu. + + +CONFORMING TO + This call is Linux specific and only implemented by the ppc64 architec- + ture. Programs using this system call are not portable. + + +BUGS + The code does not yet fully implement all features lined out here. 
+ + +AUTHOR + Arnd Bergmann + +SEE ALSO + capabilities(7), close(2), spu_run(2), spufs(7) + + + +Linux 2005-09-28 SPU_CREATE(2) diff --git a/Documentation/filesystems/spufs/spu_run.txt b/Documentation/filesystems/spufs/spu_run.txt new file mode 100644 index 000000000000..d5c6a00d0f97 --- /dev/null +++ b/Documentation/filesystems/spufs/spu_run.txt @@ -0,0 +1,121 @@ +SPU_RUN(2) Linux Programmer's Manual SPU_RUN(2) + + + +NAME + spu_run - execute an spu context + + +SYNOPSIS + #include + + int spu_run(int fd, unsigned int *npc, unsigned int *event); + +DESCRIPTION + The spu_run system call is used on PowerPC machines that implement the + Cell Broadband Engine Architecture in order to access Synergistic Pro- + cessor Units (SPUs). It uses the fd that was returned from spu_cre- + ate(2) to address a specific SPU context. When the context gets sched- + uled to a physical SPU, it starts execution at the instruction pointer + passed in npc. + + Execution of SPU code happens synchronously, meaning that spu_run does + not return while the SPU is still running. If there is a need to exe- + cute SPU code in parallel with other code on either the main CPU or + other SPUs, you need to create a new thread of execution first, e.g. + using the pthread_create(3) call. + + When spu_run returns, the current value of the SPU instruction pointer + is written back to npc, so you can call spu_run again without updating + the pointers. + + event can be a NULL pointer or point to an extended status code that + gets filled when spu_run returns. It can be one of the following con- + stants: + + SPE_EVENT_DMA_ALIGNMENT + A DMA alignment error + + SPE_EVENT_SPE_DATA_SEGMENT + A DMA segmentation error + + SPE_EVENT_SPE_DATA_STORAGE + A DMA storage error + + If NULL is passed as the event argument, these errors will result in a + signal delivered to the calling process. 
+ +RETURN VALUE + spu_run returns the value of the spu_status register or -1 to indicate + an error and set errno to one of the error codes listed below. The + spu_status register value contains a bit mask of status codes and + optionally a 14 bit code returned from the stop-and-signal instruction + on the SPU. The bit masks for the status codes are: + + 0x02 SPU was stopped by stop-and-signal. + + 0x04 SPU was stopped by halt. + + 0x08 SPU is waiting for a channel. + + 0x10 SPU is in single-step mode. + + 0x20 SPU has tried to execute an invalid instruction. + + 0x40 SPU has tried to access an invalid channel. + + 0x3fff0000 + The bits masked with this value contain the code returned from + stop-and-signal. + + There are always one or more of the lower eight bits set or an error + code is returned from spu_run. + +ERRORS + EAGAIN or EWOULDBLOCK + fd is in non-blocking mode and spu_run would block. + + EBADF fd is not a valid file descriptor. + + EFAULT npc is not a valid pointer or status is neither NULL nor a valid + pointer. + + EINTR A signal occurred while spu_run was in progress. The npc value + has been updated to the new program counter value if necessary. + + EINVAL fd is not a file descriptor returned from spu_create(2). + + ENOMEM Insufficient memory was available to handle a page fault result- + ing from an MFC direct memory access. + + ENOSYS the functionality is not provided by the current system, because + either the hardware does not provide SPUs or the spufs module is + not loaded. + + +NOTES + spu_run is meant to be used from libraries that implement a more + abstract interface to SPUs, not to be used from regular applications. + See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- + ommended libraries. + + +CONFORMING TO + This call is Linux specific and only implemented by the ppc64 architec- + ture. Programs using this system call are not portable. + + +BUGS + The code does not yet fully implement all features lined out here. 
+ + +AUTHOR + Arnd Bergmann + +SEE ALSO + capabilities(7), close(2), spu_create(2), spufs(7) + + + + +Linux 2005-09-28 SPU_RUN(2) diff --git a/Documentation/filesystems/spufs/spufs.txt b/Documentation/filesystems/spufs/spufs.txt new file mode 100644 index 000000000000..caf36aaae804 --- /dev/null +++ b/Documentation/filesystems/spufs/spufs.txt @@ -0,0 +1,276 @@ +SPUFS(2) Linux Programmer's Manual SPUFS(2) + + + +NAME + spufs - the SPU file system + + +DESCRIPTION + The SPU file system is used on PowerPC machines that implement the Cell + Broadband Engine Architecture in order to access Synergistic Processor + Units (SPUs). + + The file system provides a name space similar to posix shared memory or + message queues. Users that have write permissions on the file system + can use spu_create(2) to establish SPU contexts in the spufs root. + + Every SPU context is represented by a directory containing a predefined + set of files. These files can be used for manipulating the state of the + logical SPU. Users can change permissions on those files, but not actu- + ally add or remove files. + + +MOUNT OPTIONS + uid= + set the user owning the mount point, the default is 0 (root). + + gid= + set the group owning the mount point, the default is 0 (root). + + +FILES + The files in spufs mostly follow the standard behavior for regular sys- + tem calls like read(2) or write(2), but often support only a subset of + the operations supported on regular file systems. This list details the + supported operations and the deviations from the behaviour in the + respective man pages. + + All files that support the read(2) operation also support readv(2) and + all files that support the write(2) operation also support writev(2). + All files support the access(2) and stat(2) family of operations, but + only the st_mode, st_nlink, st_uid and st_gid fields of struct stat + contain reliable information. 
+ + All files support the chmod(2)/fchmod(2) and chown(2)/fchown(2) opera- + tions, but will not be able to grant permissions that contradict the + possible operations, e.g. read access on the wbox file. + + The current set of files is: + + + /mem + the contents of the local storage memory of the SPU. This can be + accessed like a regular shared memory file and contains both code and + data in the address space of the SPU. The possible operations on an + open mem file are: + + read(2), pread(2), write(2), pwrite(2), lseek(2) + These operate as documented, with the exception that seek(2), + write(2) and pwrite(2) are not supported beyond the end of the + file. The file size is the size of the local storage of the SPU, + which normally is 256 kilobytes. + + mmap(2) + Mapping mem into the process address space gives access to the + SPU local storage within the process address space. Only + MAP_SHARED mappings are allowed. + + + /mbox + The first SPU to CPU communication mailbox. This file is read-only and + can be read in units of 32 bits. The file can only be used in non- + blocking mode and it even poll() will not block on it. The possible + operations on an open mbox file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. If there is no data available in the mail + box, the return value is set to -1 and errno becomes EAGAIN. + When data has been read successfully, four bytes are placed in + the data buffer and the value four is returned. + + + /ibox + The second SPU to CPU communication mailbox. This file is similar to + the first mailbox file, but can be read in blocking I/O mode, and the + poll family of system calls can be used to wait for it. The possible + operations on an open ibox file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. 
If there is no data available in the mail + box and the file descriptor has been opened with O_NONBLOCK, the + return value is set to -1 and errno becomes EAGAIN. + + If there is no data available in the mail box and the file + descriptor has been opened without O_NONBLOCK, the call will + block until the SPU writes to its interrupt mailbox channel. + When data has been read successfully, four bytes are placed in + the data buffer and the value four is returned. + + poll(2) + Poll on the ibox file returns (POLLIN | POLLRDNORM) whenever + data is available for reading. + + + /wbox + The CPU to SPU communation mailbox. It is write-only and can be written + in units of 32 bits. If the mailbox is full, write() will block and + poll can be used to wait for it becoming empty again. The possible + operations on an open wbox file are: write(2) If a count smaller than + four is requested, write returns -1 and sets errno to EINVAL. If there + is no space available in the mail box and the file descriptor has been + opened with O_NONBLOCK, the return value is set to -1 and errno becomes + EAGAIN. + + If there is no space available in the mail box and the file descriptor + has been opened without O_NONBLOCK, the call will block until the SPU + reads from its PPE mailbox channel. When data has been read success- + fully, four bytes are placed in the data buffer and the value four is + returned. + + poll(2) + Poll on the ibox file returns (POLLOUT | POLLWRNORM) whenever + space is available for writing. + + + /mbox_stat + /ibox_stat + /wbox_stat + Read-only files that contain the length of the current queue, i.e. how + many words can be read from mbox or ibox or how many words can be + written to wbox without blocking. The files can be read only in 4-byte + units and return a big-endian binary integer number. The possible + operations on an open *box_stat file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. 
Otherwise, a four byte value is placed in + the data buffer, containing the number of elements that can be + read from (for mbox_stat and ibox_stat) or written to (for + wbox_stat) the respective mail box without blocking or resulting + in EAGAIN. + + + /npc + /decr + /decr_status + /spu_tag_mask + /event_mask + /srr0 + Internal registers of the SPU. The representation is an ASCII string + with the numeric value of the next instruction to be executed. These + can be used in read/write mode for debugging, but normal operation of + programs should not rely on them because access to any of them except + npc requires an SPU context save and is therefore very inefficient. + + The contents of these files are: + + npc Next Program Counter + + decr SPU Decrementer + + decr_status Decrementer Status + + spu_tag_mask MFC tag mask for SPU DMA + + event_mask Event mask for SPU interrupts + + srr0 Interrupt Return address register + + + The possible operations on an open npc, decr, decr_status, + spu_tag_mask, event_mask or srr0 file are: + + read(2) + When the count supplied to the read call is shorter than the + required length for the pointer value plus a newline character, + subsequent reads from the same file descriptor will result in + completing the string, regardless of changes to the register by + a running SPU task. When a complete string has been read, all + subsequent read operations will return zero bytes and a new file + descriptor needs to be opened to read the value again. + + write(2) + A write operation on the file results in setting the register to + the value given in the string. The string is parsed from the + beginning to the first non-numeric character or the end of the + buffer. Subsequent writes to the same file descriptor overwrite + the previous setting. + + + /fpcr + This file gives access to the Floating Point Status and Control Regis- + ter as a four byte long file. 
The operations on the fpcr file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is placed in + the data buffer, containing the current value of the fpcr regis- + ter. + + write(2) + If a count smaller than four is requested, write returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is copied + from the data buffer, updating the value of the fpcr register. + + + /signal1 + /signal2 + The two signal notification channels of an SPU. These are read-write + files that operate on a 32 bit word. Writing to one of these files + triggers an interrupt on the SPU. The value written to the signal + files can be read from the SPU through a channel read or from host user + space through the file. After the value has been read by the SPU, it + is reset to zero. The possible operations on an open signal1 or sig- + nal2 file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is placed in + the data buffer, containing the current value of the specified + signal notification register. + + write(2) + If a count smaller than four is requested, write returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is copied + from the data buffer, updating the value of the specified signal + notification register. The signal notification register will + either be replaced with the input data or will be updated to the + bitwise OR or the old value and the input data, depending on the + contents of the signal1_type, or signal2_type respectively, + file. + + + /signal1_type + /signal2_type + These two files change the behavior of the signal1 and signal2 notifi- + cation files. The contain a numerical ASCII string which is read as + either "1" or "0". In mode 0 (overwrite), the hardware replaces the + contents of the signal channel with the data that is written to it. 
in + mode 1 (logical OR), the hardware accumulates the bits that are subse- + quently written to it. The possible operations on an open signal1_type + or signal2_type file are: + + read(2) + When the count supplied to the read call is shorter than the + required length for the digit plus a newline character, subse- + quent reads from the same file descriptor will result in com- + pleting the string. When a complete string has been read, all + subsequent read operations will return zero bytes and a new file + descriptor needs to be opened to read the value again. + + write(2) + A write operation on the file results in setting the register to + the value given in the string. The string is parsed from the + beginning to the first non-numeric character or the end of the + buffer. Subsequent writes to the same file descriptor overwrite + the previous setting. + + +EXAMPLES + /etc/fstab entry + none /spu spufs gid=spu 0 0 + + +AUTHORS + Arnd Bergmann , Mark Nutter , + Ulrich Weigand + +SEE ALSO + capabilities(7), close(2), spu_create(2), spu_run(2), spufs(7) + + + +Linux 2005-09-28 SPUFS(2) -- cgit v1.2.3 From dc3f043ff0e44862a237d87f37ca04694d884049 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:14 +0200 Subject: docs: filesystems: convert spufs/spu_create.txt to ReST This file is at groff output format. Manually convert it to ReST format, trying to preserve a similar output after parsed. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/4d42e97d5560a79bd5dd443c592be04f9ae9a757.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/spufs/index.rst | 11 +++ Documentation/filesystems/spufs/spu_create.rst | 131 +++++++++++++++++++++++++ Documentation/filesystems/spufs/spu_create.txt | 119 ---------------------- 4 files changed, 143 insertions(+), 119 deletions(-) create mode 100644 Documentation/filesystems/spufs/index.rst create mode 100644 Documentation/filesystems/spufs/spu_create.rst delete mode 100644 Documentation/filesystems/spufs/spu_create.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index c658ddee40fa..5fb2b2166204 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -105,6 +105,7 @@ Documentation for filesystem implementations. ramfs-rootfs-initramfs relay romfs + spufs/index squashfs sysfs sysv-fs diff --git a/Documentation/filesystems/spufs/index.rst b/Documentation/filesystems/spufs/index.rst new file mode 100644 index 000000000000..39553c6ebefd --- /dev/null +++ b/Documentation/filesystems/spufs/index.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +SPU Filesystem +============== + + +.. toctree:: + :maxdepth: 1 + + spu_create diff --git a/Documentation/filesystems/spufs/spu_create.rst b/Documentation/filesystems/spufs/spu_create.rst new file mode 100644 index 000000000000..83108c099696 --- /dev/null +++ b/Documentation/filesystems/spufs/spu_create.rst @@ -0,0 +1,131 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +========== +spu_create +========== + +Name +==== + spu_create - create a new spu context + + +Synopsis +======== + + :: + + #include + #include + + int spu_create(const char *pathname, int flags, mode_t mode); + +Description +=========== + The spu_create system call is used on PowerPC machines that implement + the Cell Broadband Engine Architecture in order to access Synergistic + Processor Units (SPUs). It creates a new logical context for an SPU in + pathname and returns a handle to associated with it. pathname must + point to a non-existing directory in the mount point of the SPU file + system (spufs). When spu_create is successful, a directory gets cre- + ated on pathname and it is populated with files. + + The returned file handle can only be passed to spu_run(2) or closed, + other operations are not defined on it. When it is closed, all associ- + ated directory entries in spufs are removed. When the last file handle + pointing either inside of the context directory or to this file + descriptor is closed, the logical SPU context is destroyed. + + The parameter flags can be zero or any bitwise or'd combination of the + following constants: + + SPU_RAWIO + Allow mapping of some of the hardware registers of the SPU into + user space. This flag requires the CAP_SYS_RAWIO capability, see + capabilities(7). + + The mode parameter specifies the permissions used for creating the new + directory in spufs. mode is modified with the user's umask(2) value + and then used for both the directory and the files contained in it. The + file permissions mask out some more bits of mode because they typically + support only read or write access. See stat(2) for a full list of the + possible mode values. + + +Return Value +============ + spu_create returns a new file descriptor. It may return -1 to indicate + an error condition and set errno to one of the error codes listed + below. 
+ + +Errors +====== + EACCES + The current user does not have write access on the spufs mount + point. + + EEXIST An SPU context already exists at the given path name. + + EFAULT pathname is not a valid string pointer in the current address + space. + + EINVAL pathname is not a directory in the spufs mount point. + + ELOOP Too many symlinks were found while resolving pathname. + + EMFILE The process has reached its maximum open file limit. + + ENAMETOOLONG + pathname was too long. + + ENFILE The system has reached the global open file limit. + + ENOENT Part of pathname could not be resolved. + + ENOMEM The kernel could not allocate all resources required. + + ENOSPC There are not enough SPU resources available to create a new + context or the user specific limit for the number of SPU con- + texts has been reached. + + ENOSYS the functionality is not provided by the current system, because + either the hardware does not provide SPUs or the spufs module is + not loaded. + + ENOTDIR + A part of pathname is not a directory. + + + +Notes +===== + spu_create is meant to be used from libraries that implement a more + abstract interface to SPUs, not to be used from regular applications. + See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- + ommended libraries. + + +Files +===== + pathname must point to a location beneath the mount point of spufs. By + convention, it gets mounted in /spu. + + +Conforming to +============= + This call is Linux specific and only implemented by the ppc64 architec- + ture. Programs using this system call are not portable. + + +Bugs +==== + The code does not yet fully implement all features lined out here. 
+ + +Author +====== + Arnd Bergmann + +See Also +======== + capabilities(7), close(2), spu_run(2), spufs(7) diff --git a/Documentation/filesystems/spufs/spu_create.txt b/Documentation/filesystems/spufs/spu_create.txt deleted file mode 100644 index 8ede5a35340f..000000000000 --- a/Documentation/filesystems/spufs/spu_create.txt +++ /dev/null @@ -1,119 +0,0 @@ -SPU_CREATE(2) Linux Programmer's Manual SPU_CREATE(2) - - - -NAME - spu_create - create a new spu context - - -SYNOPSIS - #include - #include - - int spu_create(const char *pathname, int flags, mode_t mode); - -DESCRIPTION - The spu_create system call is used on PowerPC machines that implement - the Cell Broadband Engine Architecture in order to access Synergistic - Processor Units (SPUs). It creates a new logical context for an SPU in - pathname and returns a handle to associated with it. pathname must - point to a non-existing directory in the mount point of the SPU file - system (spufs). When spu_create is successful, a directory gets cre- - ated on pathname and it is populated with files. - - The returned file handle can only be passed to spu_run(2) or closed, - other operations are not defined on it. When it is closed, all associ- - ated directory entries in spufs are removed. When the last file handle - pointing either inside of the context directory or to this file - descriptor is closed, the logical SPU context is destroyed. - - The parameter flags can be zero or any bitwise or'd combination of the - following constants: - - SPU_RAWIO - Allow mapping of some of the hardware registers of the SPU into - user space. This flag requires the CAP_SYS_RAWIO capability, see - capabilities(7). - - The mode parameter specifies the permissions used for creating the new - directory in spufs. mode is modified with the user's umask(2) value - and then used for both the directory and the files contained in it. 
The - file permissions mask out some more bits of mode because they typically - support only read or write access. See stat(2) for a full list of the - possible mode values. - - -RETURN VALUE - spu_create returns a new file descriptor. It may return -1 to indicate - an error condition and set errno to one of the error codes listed - below. - - -ERRORS - EACCES - The current user does not have write access on the spufs mount - point. - - EEXIST An SPU context already exists at the given path name. - - EFAULT pathname is not a valid string pointer in the current address - space. - - EINVAL pathname is not a directory in the spufs mount point. - - ELOOP Too many symlinks were found while resolving pathname. - - EMFILE The process has reached its maximum open file limit. - - ENAMETOOLONG - pathname was too long. - - ENFILE The system has reached the global open file limit. - - ENOENT Part of pathname could not be resolved. - - ENOMEM The kernel could not allocate all resources required. - - ENOSPC There are not enough SPU resources available to create a new - context or the user specific limit for the number of SPU con- - texts has been reached. - - ENOSYS the functionality is not provided by the current system, because - either the hardware does not provide SPUs or the spufs module is - not loaded. - - ENOTDIR - A part of pathname is not a directory. - - - -NOTES - spu_create is meant to be used from libraries that implement a more - abstract interface to SPUs, not to be used from regular applications. - See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- - ommended libraries. - - -FILES - pathname must point to a location beneath the mount point of spufs. By - convention, it gets mounted in /spu. - - -CONFORMING TO - This call is Linux specific and only implemented by the ppc64 architec- - ture. Programs using this system call are not portable. - - -BUGS - The code does not yet fully implement all features lined out here. 
- - -AUTHOR - Arnd Bergmann - -SEE ALSO - capabilities(7), close(2), spu_run(2), spufs(7) - - - -Linux 2005-09-28 SPU_CREATE(2) -- cgit v1.2.3 From 299cd2747c61009f32943217c79f3c89728eb2ac Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:15 +0200 Subject: docs: filesystems: convert spufs/spufs.txt to ReST This file is at groff output format. Manually convert it to ReST format, trying to preserve a similar output after parsed. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9ca05fad12390931bc7da0fa2502d1a450a4b87f.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/spufs/index.rst | 1 + Documentation/filesystems/spufs/spufs.rst | 273 +++++++++++++++++++++++++++++ Documentation/filesystems/spufs/spufs.txt | 276 ------------------------------ MAINTAINERS | 2 +- 4 files changed, 275 insertions(+), 277 deletions(-) create mode 100644 Documentation/filesystems/spufs/spufs.rst delete mode 100644 Documentation/filesystems/spufs/spufs.txt diff --git a/Documentation/filesystems/spufs/index.rst b/Documentation/filesystems/spufs/index.rst index 39553c6ebefd..939cf59a7d9e 100644 --- a/Documentation/filesystems/spufs/index.rst +++ b/Documentation/filesystems/spufs/index.rst @@ -8,4 +8,5 @@ SPU Filesystem .. toctree:: :maxdepth: 1 + spufs spu_create diff --git a/Documentation/filesystems/spufs/spufs.rst b/Documentation/filesystems/spufs/spufs.rst new file mode 100644 index 000000000000..8a42859bb100 --- /dev/null +++ b/Documentation/filesystems/spufs/spufs.rst @@ -0,0 +1,273 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===== +spufs +===== + +Name +==== + + spufs - the SPU file system + + +Description +=========== + + The SPU file system is used on PowerPC machines that implement the Cell + Broadband Engine Architecture in order to access Synergistic Processor + Units (SPUs). + + The file system provides a name space similar to posix shared memory or + message queues. 
Users that have write permissions on the file system + can use spu_create(2) to establish SPU contexts in the spufs root. + + Every SPU context is represented by a directory containing a predefined + set of files. These files can be used for manipulating the state of the + logical SPU. Users can change permissions on those files, but not actu- + ally add or remove files. + + +Mount Options +============= + + uid= + set the user owning the mount point, the default is 0 (root). + + gid= + set the group owning the mount point, the default is 0 (root). + + +Files +===== + + The files in spufs mostly follow the standard behavior for regular sys- + tem calls like read(2) or write(2), but often support only a subset of + the operations supported on regular file systems. This list details the + supported operations and the deviations from the behaviour in the + respective man pages. + + All files that support the read(2) operation also support readv(2) and + all files that support the write(2) operation also support writev(2). + All files support the access(2) and stat(2) family of operations, but + only the st_mode, st_nlink, st_uid and st_gid fields of struct stat + contain reliable information. + + All files support the chmod(2)/fchmod(2) and chown(2)/fchown(2) opera- + tions, but will not be able to grant permissions that contradict the + possible operations, e.g. read access on the wbox file. + + The current set of files is: + + + /mem + the contents of the local storage memory of the SPU. This can be + accessed like a regular shared memory file and contains both code and + data in the address space of the SPU. The possible operations on an + open mem file are: + + read(2), pread(2), write(2), pwrite(2), lseek(2) + These operate as documented, with the exception that seek(2), + write(2) and pwrite(2) are not supported beyond the end of the + file. The file size is the size of the local storage of the SPU, + which normally is 256 kilobytes. 
+ + mmap(2) + Mapping mem into the process address space gives access to the + SPU local storage within the process address space. Only + MAP_SHARED mappings are allowed. + + + /mbox + The first SPU to CPU communication mailbox. This file is read-only and + can be read in units of 32 bits. The file can only be used in non- + blocking mode and it even poll() will not block on it. The possible + operations on an open mbox file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. If there is no data available in the mail + box, the return value is set to -1 and errno becomes EAGAIN. + When data has been read successfully, four bytes are placed in + the data buffer and the value four is returned. + + + /ibox + The second SPU to CPU communication mailbox. This file is similar to + the first mailbox file, but can be read in blocking I/O mode, and the + poll family of system calls can be used to wait for it. The possible + operations on an open ibox file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. If there is no data available in the mail + box and the file descriptor has been opened with O_NONBLOCK, the + return value is set to -1 and errno becomes EAGAIN. + + If there is no data available in the mail box and the file + descriptor has been opened without O_NONBLOCK, the call will + block until the SPU writes to its interrupt mailbox channel. + When data has been read successfully, four bytes are placed in + the data buffer and the value four is returned. + + poll(2) + Poll on the ibox file returns (POLLIN | POLLRDNORM) whenever + data is available for reading. + + + /wbox + The CPU to SPU communation mailbox. It is write-only and can be written + in units of 32 bits. If the mailbox is full, write() will block and + poll can be used to wait for it becoming empty again. 
The possible + operations on an open wbox file are: write(2) If a count smaller than + four is requested, write returns -1 and sets errno to EINVAL. If there + is no space available in the mail box and the file descriptor has been + opened with O_NONBLOCK, the return value is set to -1 and errno becomes + EAGAIN. + + If there is no space available in the mail box and the file descriptor + has been opened without O_NONBLOCK, the call will block until the SPU + reads from its PPE mailbox channel. When data has been read success- + fully, four bytes are placed in the data buffer and the value four is + returned. + + poll(2) + Poll on the ibox file returns (POLLOUT | POLLWRNORM) whenever + space is available for writing. + + + /mbox_stat, /ibox_stat, /wbox_stat + Read-only files that contain the length of the current queue, i.e. how + many words can be read from mbox or ibox or how many words can be + written to wbox without blocking. The files can be read only in 4-byte + units and return a big-endian binary integer number. The possible + operations on an open ``*box_stat`` file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is placed in + the data buffer, containing the number of elements that can be + read from (for mbox_stat and ibox_stat) or written to (for + wbox_stat) the respective mail box without blocking or resulting + in EAGAIN. + + + /npc, /decr, /decr_status, /spu_tag_mask, /event_mask, /srr0 + Internal registers of the SPU. The representation is an ASCII string + with the numeric value of the next instruction to be executed. These + can be used in read/write mode for debugging, but normal operation of + programs should not rely on them because access to any of them except + npc requires an SPU context save and is therefore very inefficient. 
+ + The contents of these files are: + + =================== =================================== + npc Next Program Counter + decr SPU Decrementer + decr_status Decrementer Status + spu_tag_mask MFC tag mask for SPU DMA + event_mask Event mask for SPU interrupts + srr0 Interrupt Return address register + =================== =================================== + + + The possible operations on an open npc, decr, decr_status, + spu_tag_mask, event_mask or srr0 file are: + + read(2) + When the count supplied to the read call is shorter than the + required length for the pointer value plus a newline character, + subsequent reads from the same file descriptor will result in + completing the string, regardless of changes to the register by + a running SPU task. When a complete string has been read, all + subsequent read operations will return zero bytes and a new file + descriptor needs to be opened to read the value again. + + write(2) + A write operation on the file results in setting the register to + the value given in the string. The string is parsed from the + beginning to the first non-numeric character or the end of the + buffer. Subsequent writes to the same file descriptor overwrite + the previous setting. + + + /fpcr + This file gives access to the Floating Point Status and Control Regis- + ter as a four byte long file. The operations on the fpcr file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is placed in + the data buffer, containing the current value of the fpcr regis- + ter. + + write(2) + If a count smaller than four is requested, write returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is copied + from the data buffer, updating the value of the fpcr register. + + + /signal1, /signal2 + The two signal notification channels of an SPU. These are read-write + files that operate on a 32 bit word. 
Writing to one of these files + triggers an interrupt on the SPU. The value written to the signal + files can be read from the SPU through a channel read or from host user + space through the file. After the value has been read by the SPU, it + is reset to zero. The possible operations on an open signal1 or sig- + nal2 file are: + + read(2) + If a count smaller than four is requested, read returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is placed in + the data buffer, containing the current value of the specified + signal notification register. + + write(2) + If a count smaller than four is requested, write returns -1 and + sets errno to EINVAL. Otherwise, a four byte value is copied + from the data buffer, updating the value of the specified signal + notification register. The signal notification register will + either be replaced with the input data or will be updated to the + bitwise OR or the old value and the input data, depending on the + contents of the signal1_type, or signal2_type respectively, + file. + + + /signal1_type, /signal2_type + These two files change the behavior of the signal1 and signal2 notifi- + cation files. The contain a numerical ASCII string which is read as + either "1" or "0". In mode 0 (overwrite), the hardware replaces the + contents of the signal channel with the data that is written to it. in + mode 1 (logical OR), the hardware accumulates the bits that are subse- + quently written to it. The possible operations on an open signal1_type + or signal2_type file are: + + read(2) + When the count supplied to the read call is shorter than the + required length for the digit plus a newline character, subse- + quent reads from the same file descriptor will result in com- + pleting the string. When a complete string has been read, all + subsequent read operations will return zero bytes and a new file + descriptor needs to be opened to read the value again. 
+ + write(2) + A write operation on the file results in setting the register to + the value given in the string. The string is parsed from the + beginning to the first non-numeric character or the end of the + buffer. Subsequent writes to the same file descriptor overwrite + the previous setting. + + +Examples +======== + /etc/fstab entry + none /spu spufs gid=spu 0 0 + + +Authors +======= + Arnd Bergmann , Mark Nutter , + Ulrich Weigand + +See Also +======== + capabilities(7), close(2), spu_create(2), spu_run(2), spufs(7) diff --git a/Documentation/filesystems/spufs/spufs.txt b/Documentation/filesystems/spufs/spufs.txt deleted file mode 100644 index caf36aaae804..000000000000 --- a/Documentation/filesystems/spufs/spufs.txt +++ /dev/null @@ -1,276 +0,0 @@ -SPUFS(2) Linux Programmer's Manual SPUFS(2) - - - -NAME - spufs - the SPU file system - - -DESCRIPTION - The SPU file system is used on PowerPC machines that implement the Cell - Broadband Engine Architecture in order to access Synergistic Processor - Units (SPUs). - - The file system provides a name space similar to posix shared memory or - message queues. Users that have write permissions on the file system - can use spu_create(2) to establish SPU contexts in the spufs root. - - Every SPU context is represented by a directory containing a predefined - set of files. These files can be used for manipulating the state of the - logical SPU. Users can change permissions on those files, but not actu- - ally add or remove files. - - -MOUNT OPTIONS - uid= - set the user owning the mount point, the default is 0 (root). - - gid= - set the group owning the mount point, the default is 0 (root). - - -FILES - The files in spufs mostly follow the standard behavior for regular sys- - tem calls like read(2) or write(2), but often support only a subset of - the operations supported on regular file systems. This list details the - supported operations and the deviations from the behaviour in the - respective man pages. 
- - All files that support the read(2) operation also support readv(2) and - all files that support the write(2) operation also support writev(2). - All files support the access(2) and stat(2) family of operations, but - only the st_mode, st_nlink, st_uid and st_gid fields of struct stat - contain reliable information. - - All files support the chmod(2)/fchmod(2) and chown(2)/fchown(2) opera- - tions, but will not be able to grant permissions that contradict the - possible operations, e.g. read access on the wbox file. - - The current set of files is: - - - /mem - the contents of the local storage memory of the SPU. This can be - accessed like a regular shared memory file and contains both code and - data in the address space of the SPU. The possible operations on an - open mem file are: - - read(2), pread(2), write(2), pwrite(2), lseek(2) - These operate as documented, with the exception that seek(2), - write(2) and pwrite(2) are not supported beyond the end of the - file. The file size is the size of the local storage of the SPU, - which normally is 256 kilobytes. - - mmap(2) - Mapping mem into the process address space gives access to the - SPU local storage within the process address space. Only - MAP_SHARED mappings are allowed. - - - /mbox - The first SPU to CPU communication mailbox. This file is read-only and - can be read in units of 32 bits. The file can only be used in non- - blocking mode and it even poll() will not block on it. The possible - operations on an open mbox file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. If there is no data available in the mail - box, the return value is set to -1 and errno becomes EAGAIN. - When data has been read successfully, four bytes are placed in - the data buffer and the value four is returned. - - - /ibox - The second SPU to CPU communication mailbox. 
This file is similar to - the first mailbox file, but can be read in blocking I/O mode, and the - poll family of system calls can be used to wait for it. The possible - operations on an open ibox file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. If there is no data available in the mail - box and the file descriptor has been opened with O_NONBLOCK, the - return value is set to -1 and errno becomes EAGAIN. - - If there is no data available in the mail box and the file - descriptor has been opened without O_NONBLOCK, the call will - block until the SPU writes to its interrupt mailbox channel. - When data has been read successfully, four bytes are placed in - the data buffer and the value four is returned. - - poll(2) - Poll on the ibox file returns (POLLIN | POLLRDNORM) whenever - data is available for reading. - - - /wbox - The CPU to SPU communation mailbox. It is write-only and can be written - in units of 32 bits. If the mailbox is full, write() will block and - poll can be used to wait for it becoming empty again. The possible - operations on an open wbox file are: write(2) If a count smaller than - four is requested, write returns -1 and sets errno to EINVAL. If there - is no space available in the mail box and the file descriptor has been - opened with O_NONBLOCK, the return value is set to -1 and errno becomes - EAGAIN. - - If there is no space available in the mail box and the file descriptor - has been opened without O_NONBLOCK, the call will block until the SPU - reads from its PPE mailbox channel. When data has been read success- - fully, four bytes are placed in the data buffer and the value four is - returned. - - poll(2) - Poll on the ibox file returns (POLLOUT | POLLWRNORM) whenever - space is available for writing. - - - /mbox_stat - /ibox_stat - /wbox_stat - Read-only files that contain the length of the current queue, i.e. 
how - many words can be read from mbox or ibox or how many words can be - written to wbox without blocking. The files can be read only in 4-byte - units and return a big-endian binary integer number. The possible - operations on an open *box_stat file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is placed in - the data buffer, containing the number of elements that can be - read from (for mbox_stat and ibox_stat) or written to (for - wbox_stat) the respective mail box without blocking or resulting - in EAGAIN. - - - /npc - /decr - /decr_status - /spu_tag_mask - /event_mask - /srr0 - Internal registers of the SPU. The representation is an ASCII string - with the numeric value of the next instruction to be executed. These - can be used in read/write mode for debugging, but normal operation of - programs should not rely on them because access to any of them except - npc requires an SPU context save and is therefore very inefficient. - - The contents of these files are: - - npc Next Program Counter - - decr SPU Decrementer - - decr_status Decrementer Status - - spu_tag_mask MFC tag mask for SPU DMA - - event_mask Event mask for SPU interrupts - - srr0 Interrupt Return address register - - - The possible operations on an open npc, decr, decr_status, - spu_tag_mask, event_mask or srr0 file are: - - read(2) - When the count supplied to the read call is shorter than the - required length for the pointer value plus a newline character, - subsequent reads from the same file descriptor will result in - completing the string, regardless of changes to the register by - a running SPU task. When a complete string has been read, all - subsequent read operations will return zero bytes and a new file - descriptor needs to be opened to read the value again. - - write(2) - A write operation on the file results in setting the register to - the value given in the string. 
The string is parsed from the - beginning to the first non-numeric character or the end of the - buffer. Subsequent writes to the same file descriptor overwrite - the previous setting. - - - /fpcr - This file gives access to the Floating Point Status and Control Regis- - ter as a four byte long file. The operations on the fpcr file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is placed in - the data buffer, containing the current value of the fpcr regis- - ter. - - write(2) - If a count smaller than four is requested, write returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is copied - from the data buffer, updating the value of the fpcr register. - - - /signal1 - /signal2 - The two signal notification channels of an SPU. These are read-write - files that operate on a 32 bit word. Writing to one of these files - triggers an interrupt on the SPU. The value written to the signal - files can be read from the SPU through a channel read or from host user - space through the file. After the value has been read by the SPU, it - is reset to zero. The possible operations on an open signal1 or sig- - nal2 file are: - - read(2) - If a count smaller than four is requested, read returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is placed in - the data buffer, containing the current value of the specified - signal notification register. - - write(2) - If a count smaller than four is requested, write returns -1 and - sets errno to EINVAL. Otherwise, a four byte value is copied - from the data buffer, updating the value of the specified signal - notification register. The signal notification register will - either be replaced with the input data or will be updated to the - bitwise OR or the old value and the input data, depending on the - contents of the signal1_type, or signal2_type respectively, - file. 
- - - /signal1_type - /signal2_type - These two files change the behavior of the signal1 and signal2 notifi- - cation files. The contain a numerical ASCII string which is read as - either "1" or "0". In mode 0 (overwrite), the hardware replaces the - contents of the signal channel with the data that is written to it. in - mode 1 (logical OR), the hardware accumulates the bits that are subse- - quently written to it. The possible operations on an open signal1_type - or signal2_type file are: - - read(2) - When the count supplied to the read call is shorter than the - required length for the digit plus a newline character, subse- - quent reads from the same file descriptor will result in com- - pleting the string. When a complete string has been read, all - subsequent read operations will return zero bytes and a new file - descriptor needs to be opened to read the value again. - - write(2) - A write operation on the file results in setting the register to - the value given in the string. The string is parsed from the - beginning to the first non-numeric character or the end of the - buffer. Subsequent writes to the same file descriptor overwrite - the previous setting. 
- - -EXAMPLES - /etc/fstab entry - none /spu spufs gid=spu 0 0 - - -AUTHORS - Arnd Bergmann , Mark Nutter , - Ulrich Weigand - -SEE ALSO - capabilities(7), close(2), spu_create(2), spu_run(2), spufs(7) - - - -Linux 2005-09-28 SPUFS(2) diff --git a/MAINTAINERS b/MAINTAINERS index 1c3bd61ad03e..bbf85128ca89 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15882,7 +15882,7 @@ M: Jeremy Kerr L: linuxppc-dev@lists.ozlabs.org S: Supported W: http://www.ibm.com/developerworks/power/cell/ -F: Documentation/filesystems/spufs.txt +F: Documentation/filesystems/spufs/spufs.rst F: arch/powerpc/platforms/cell/spufs/ SQUASHFS FILE SYSTEM -- cgit v1.2.3 From e2975d7ca8df8775409c948310bc0f39678d1612 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:16 +0200 Subject: docs: filesystems: convert spufs/spu_run.txt to ReST This file is at groff output format. Manually convert it to ReST format, trying to preserve a similar output after parsed. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/7d8ee1edf5ef0137009bc65ff0441826ce555895.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/spufs/index.rst | 1 + Documentation/filesystems/spufs/spu_run.rst | 138 ++++++++++++++++++++++++++++ Documentation/filesystems/spufs/spu_run.txt | 121 ------------------------ 3 files changed, 139 insertions(+), 121 deletions(-) create mode 100644 Documentation/filesystems/spufs/spu_run.rst delete mode 100644 Documentation/filesystems/spufs/spu_run.txt diff --git a/Documentation/filesystems/spufs/index.rst b/Documentation/filesystems/spufs/index.rst index 939cf59a7d9e..5ed4a8494967 100644 --- a/Documentation/filesystems/spufs/index.rst +++ b/Documentation/filesystems/spufs/index.rst @@ -10,3 +10,4 @@ SPU Filesystem spufs spu_create + spu_run diff --git a/Documentation/filesystems/spufs/spu_run.rst b/Documentation/filesystems/spufs/spu_run.rst new file mode 100644 index 000000000000..7fdb1c31cb91 --- /dev/null 
+++ b/Documentation/filesystems/spufs/spu_run.rst @@ -0,0 +1,138 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======= +spu_run +======= + + +Name +==== + spu_run - execute an spu context + + +Synopsis +======== + + :: + + #include + + int spu_run(int fd, unsigned int *npc, unsigned int *event); + +Description +=========== + The spu_run system call is used on PowerPC machines that implement the + Cell Broadband Engine Architecture in order to access Synergistic Pro- + cessor Units (SPUs). It uses the fd that was returned from spu_cre- + ate(2) to address a specific SPU context. When the context gets sched- + uled to a physical SPU, it starts execution at the instruction pointer + passed in npc. + + Execution of SPU code happens synchronously, meaning that spu_run does + not return while the SPU is still running. If there is a need to exe- + cute SPU code in parallel with other code on either the main CPU or + other SPUs, you need to create a new thread of execution first, e.g. + using the pthread_create(3) call. + + When spu_run returns, the current value of the SPU instruction pointer + is written back to npc, so you can call spu_run again without updating + the pointers. + + event can be a NULL pointer or point to an extended status code that + gets filled when spu_run returns. It can be one of the following con- + stants: + + SPE_EVENT_DMA_ALIGNMENT + A DMA alignment error + + SPE_EVENT_SPE_DATA_SEGMENT + A DMA segmentation error + + SPE_EVENT_SPE_DATA_STORAGE + A DMA storage error + + If NULL is passed as the event argument, these errors will result in a + signal delivered to the calling process. + +Return Value +============ + spu_run returns the value of the spu_status register or -1 to indicate + an error and set errno to one of the error codes listed below. The + spu_status register value contains a bit mask of status codes and + optionally a 14 bit code returned from the stop-and-signal instruction + on the SPU. 
The bit masks for the status codes are: + + 0x02 + SPU was stopped by stop-and-signal. + + 0x04 + SPU was stopped by halt. + + 0x08 + SPU is waiting for a channel. + + 0x10 + SPU is in single-step mode. + + 0x20 + SPU has tried to execute an invalid instruction. + + 0x40 + SPU has tried to access an invalid channel. + + 0x3fff0000 + The bits masked with this value contain the code returned from + stop-and-signal. + + There are always one or more of the lower eight bits set or an error + code is returned from spu_run. + +Errors +====== + EAGAIN or EWOULDBLOCK + fd is in non-blocking mode and spu_run would block. + + EBADF fd is not a valid file descriptor. + + EFAULT npc is not a valid pointer or status is neither NULL nor a valid + pointer. + + EINTR A signal occurred while spu_run was in progress. The npc value + has been updated to the new program counter value if necessary. + + EINVAL fd is not a file descriptor returned from spu_create(2). + + ENOMEM Insufficient memory was available to handle a page fault result- + ing from an MFC direct memory access. + + ENOSYS the functionality is not provided by the current system, because + either the hardware does not provide SPUs or the spufs module is + not loaded. + + +Notes +===== + spu_run is meant to be used from libraries that implement a more + abstract interface to SPUs, not to be used from regular applications. + See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- + ommended libraries. + + +Conforming to +============= + This call is Linux specific and only implemented by the ppc64 architec- + ture. Programs using this system call are not portable. + + +Bugs +==== + The code does not yet fully implement all features lined out here. 
+ + +Author +====== + Arnd Bergmann + +See Also +======== + capabilities(7), close(2), spu_create(2), spufs(7) diff --git a/Documentation/filesystems/spufs/spu_run.txt b/Documentation/filesystems/spufs/spu_run.txt deleted file mode 100644 index d5c6a00d0f97..000000000000 --- a/Documentation/filesystems/spufs/spu_run.txt +++ /dev/null @@ -1,121 +0,0 @@ -SPU_RUN(2) Linux Programmer's Manual SPU_RUN(2) - - - -NAME - spu_run - execute an spu context - - -SYNOPSIS - #include - - int spu_run(int fd, unsigned int *npc, unsigned int *event); - -DESCRIPTION - The spu_run system call is used on PowerPC machines that implement the - Cell Broadband Engine Architecture in order to access Synergistic Pro- - cessor Units (SPUs). It uses the fd that was returned from spu_cre- - ate(2) to address a specific SPU context. When the context gets sched- - uled to a physical SPU, it starts execution at the instruction pointer - passed in npc. - - Execution of SPU code happens synchronously, meaning that spu_run does - not return while the SPU is still running. If there is a need to exe- - cute SPU code in parallel with other code on either the main CPU or - other SPUs, you need to create a new thread of execution first, e.g. - using the pthread_create(3) call. - - When spu_run returns, the current value of the SPU instruction pointer - is written back to npc, so you can call spu_run again without updating - the pointers. - - event can be a NULL pointer or point to an extended status code that - gets filled when spu_run returns. It can be one of the following con- - stants: - - SPE_EVENT_DMA_ALIGNMENT - A DMA alignment error - - SPE_EVENT_SPE_DATA_SEGMENT - A DMA segmentation error - - SPE_EVENT_SPE_DATA_STORAGE - A DMA storage error - - If NULL is passed as the event argument, these errors will result in a - signal delivered to the calling process. 
- -RETURN VALUE - spu_run returns the value of the spu_status register or -1 to indicate - an error and set errno to one of the error codes listed below. The - spu_status register value contains a bit mask of status codes and - optionally a 14 bit code returned from the stop-and-signal instruction - on the SPU. The bit masks for the status codes are: - - 0x02 SPU was stopped by stop-and-signal. - - 0x04 SPU was stopped by halt. - - 0x08 SPU is waiting for a channel. - - 0x10 SPU is in single-step mode. - - 0x20 SPU has tried to execute an invalid instruction. - - 0x40 SPU has tried to access an invalid channel. - - 0x3fff0000 - The bits masked with this value contain the code returned from - stop-and-signal. - - There are always one or more of the lower eight bits set or an error - code is returned from spu_run. - -ERRORS - EAGAIN or EWOULDBLOCK - fd is in non-blocking mode and spu_run would block. - - EBADF fd is not a valid file descriptor. - - EFAULT npc is not a valid pointer or status is neither NULL nor a valid - pointer. - - EINTR A signal occurred while spu_run was in progress. The npc value - has been updated to the new program counter value if necessary. - - EINVAL fd is not a file descriptor returned from spu_create(2). - - ENOMEM Insufficient memory was available to handle a page fault result- - ing from an MFC direct memory access. - - ENOSYS the functionality is not provided by the current system, because - either the hardware does not provide SPUs or the spufs module is - not loaded. - - -NOTES - spu_run is meant to be used from libraries that implement a more - abstract interface to SPUs, not to be used from regular applications. - See http://www.bsc.es/projects/deepcomputing/linuxoncell/ for the rec- - ommended libraries. - - -CONFORMING TO - This call is Linux specific and only implemented by the ppc64 architec- - ture. Programs using this system call are not portable. - - -BUGS - The code does not yet fully implement all features lined out here. 
- - -AUTHOR - Arnd Bergmann - -SEE ALSO - capabilities(7), close(2), spu_create(2), spufs(7) - - - - -Linux 2005-09-28 SPU_RUN(2) -- cgit v1.2.3 From 28bcadf0ae994fd3584f43c39b6915ee70b59749 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:17 +0200 Subject: docs: filesystems: convert sysfs-pci.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add table markups; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/2a9d307753c97d1a843341a2ef1993d43a407ded.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/sysfs-pci.rst | 138 ++++++++++++++++++++++++++++++++ Documentation/filesystems/sysfs-pci.txt | 131 ------------------------------ 3 files changed, 139 insertions(+), 131 deletions(-) create mode 100644 Documentation/filesystems/sysfs-pci.rst delete mode 100644 Documentation/filesystems/sysfs-pci.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 5fb2b2166204..7afb58dff43b 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -34,6 +34,7 @@ algorithms work. quota seq_file sharedsubtree + sysfs-pci automount-support diff --git a/Documentation/filesystems/sysfs-pci.rst b/Documentation/filesystems/sysfs-pci.rst new file mode 100644 index 000000000000..a265f3e2cc80 --- /dev/null +++ b/Documentation/filesystems/sysfs-pci.rst @@ -0,0 +1,138 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================ +Accessing PCI device resources through sysfs +============================================ + +sysfs, usually mounted at /sys, provides access to PCI resources on platforms +that support it. 
For example, a given bus might look like this:: + + /sys/devices/pci0000:17 + |-- 0000:17:00.0 + | |-- class + | |-- config + | |-- device + | |-- enable + | |-- irq + | |-- local_cpus + | |-- remove + | |-- resource + | |-- resource0 + | |-- resource1 + | |-- resource2 + | |-- revision + | |-- rom + | |-- subsystem_device + | |-- subsystem_vendor + | `-- vendor + `-- ... + +The topmost element describes the PCI domain and bus number. In this case, +the domain number is 0000 and the bus number is 17 (both values are in hex). +This bus contains a single function device in slot 0. The domain and bus +numbers are reproduced for convenience. Under the device directory are several +files, each with their own function. + + =================== ===================================================== + file function + =================== ===================================================== + class PCI class (ascii, ro) + config PCI config space (binary, rw) + device PCI device (ascii, ro) + enable Whether the device is enabled (ascii, rw) + irq IRQ number (ascii, ro) + local_cpus nearby CPU mask (cpumask, ro) + remove remove device from kernel's list (ascii, wo) + resource PCI resource host addresses (ascii, ro) + resource0..N PCI resource N, if present (binary, mmap, rw\ [1]_) + resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) + revision PCI revision (ascii, ro) + rom PCI ROM resource, if present (binary, ro) + subsystem_device PCI subsystem device (ascii, ro) + subsystem_vendor PCI subsystem vendor (ascii, ro) + vendor PCI vendor (ascii, ro) + =================== ===================================================== + +:: + + ro - read only file + rw - file is readable and writable + wo - write only file + mmap - file is mmapable + ascii - file contains ascii text + binary - file contains binary data + cpumask - file contains a cpumask type + +.. 
[1] rw for RESOURCE_IO (I/O port) regions only + +The read only files are informational, writes to them will be ignored, with +the exception of the 'rom' file. Writable files can be used to perform +actions on the device (e.g. changing config space, detaching a device). +mmapable files are available via an mmap of the file at offset 0 and can be +used to do actual device programming from userspace. Note that some platforms +don't support mmapping of certain resources, so be sure to check the return +value from any attempted mmap. The most notable of these are I/O port +resources, which also provide read/write access. + +The 'enable' file provides a counter that indicates how many times the device +has been enabled. If the 'enable' file currently returns '4', and a '1' is +echoed into it, it will then return '5'. Echoing a '0' into it will decrease +the count. Even when it returns to 0, though, some of the initialisation +may not be reversed. + +The 'rom' file is special in that it provides read-only access to the device's +ROM file, if available. It's disabled by default, however, so applications +should write the string "1" to the file to enable it before attempting a read +call, and disable it following the access by writing "0" to the file. Note +that the device must be enabled for a rom read to return data successfully. +In the event a driver is not bound to the device, it can be enabled using the +'enable' file, documented above. + +The 'remove' file is used to remove the PCI device, by writing a non-zero +integer to the file. This does not involve any kind of hot-plug functionality, +e.g. powering off the device. The device is removed from the kernel's list of +PCI devices, the sysfs directory for it is removed, and the device will be +removed from any drivers attached to it. Removal of PCI root buses is +disallowed. 
+ +Accessing legacy resources through sysfs +---------------------------------------- + +Legacy I/O port and ISA memory resources are also provided in sysfs if the +underlying platform supports them. They're located in the PCI class hierarchy, +e.g.:: + + /sys/class/pci_bus/0000:17/ + |-- bridge -> ../../../devices/pci0000:17 + |-- cpuaffinity + |-- legacy_io + `-- legacy_mem + +The legacy_io file is a read/write file that can be used by applications to +do legacy port I/O. The application should open the file, seek to the desired +port (e.g. 0x3e8) and do a read or a write of 1, 2 or 4 bytes. The legacy_mem +file should be mmapped with an offset corresponding to the memory offset +desired, e.g. 0xa0000 for the VGA frame buffer. The application can then +simply dereference the returned pointer (after checking for errors of course) +to access legacy memory space. + +Supporting PCI access on new platforms +-------------------------------------- + +In order to support PCI resource mapping as described above, Linux platform +code should ideally define ARCH_GENERIC_PCI_MMAP_RESOURCE and use the generic +implementation of that functionality. To support the historical interface of +mmap() through files in /proc/bus/pci, platforms may also set HAVE_PCI_MMAP. + +Alternatively, platforms which set HAVE_PCI_MMAP may provide their own +implementation of pci_mmap_page_range() instead of defining +ARCH_GENERIC_PCI_MMAP_RESOURCE. + +Platforms which support write-combining maps of PCI resources must define +arch_can_pci_mmap_wc() which shall evaluate to non-zero at runtime when +write-combining is permitted. Platforms which support maps of I/O resources +define arch_can_pci_mmap_io() similarly. + +Legacy resources are protected by the HAVE_PCI_LEGACY define. Platforms +wishing to support legacy functionality should define it and provide +pci_legacy_read, pci_legacy_write and pci_mmap_legacy_page_range functions. 
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt deleted file mode 100644 index 06f1d64c6f70..000000000000 --- a/Documentation/filesystems/sysfs-pci.txt +++ /dev/null @@ -1,131 +0,0 @@ -Accessing PCI device resources through sysfs --------------------------------------------- - -sysfs, usually mounted at /sys, provides access to PCI resources on platforms -that support it. For example, a given bus might look like this: - - /sys/devices/pci0000:17 - |-- 0000:17:00.0 - | |-- class - | |-- config - | |-- device - | |-- enable - | |-- irq - | |-- local_cpus - | |-- remove - | |-- resource - | |-- resource0 - | |-- resource1 - | |-- resource2 - | |-- revision - | |-- rom - | |-- subsystem_device - | |-- subsystem_vendor - | `-- vendor - `-- ... - -The topmost element describes the PCI domain and bus number. In this case, -the domain number is 0000 and the bus number is 17 (both values are in hex). -This bus contains a single function device in slot 0. The domain and bus -numbers are reproduced for convenience. Under the device directory are several -files, each with their own function. 
- - file function - ---- -------- - class PCI class (ascii, ro) - config PCI config space (binary, rw) - device PCI device (ascii, ro) - enable Whether the device is enabled (ascii, rw) - irq IRQ number (ascii, ro) - local_cpus nearby CPU mask (cpumask, ro) - remove remove device from kernel's list (ascii, wo) - resource PCI resource host addresses (ascii, ro) - resource0..N PCI resource N, if present (binary, mmap, rw[1]) - resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) - revision PCI revision (ascii, ro) - rom PCI ROM resource, if present (binary, ro) - subsystem_device PCI subsystem device (ascii, ro) - subsystem_vendor PCI subsystem vendor (ascii, ro) - vendor PCI vendor (ascii, ro) - - ro - read only file - rw - file is readable and writable - wo - write only file - mmap - file is mmapable - ascii - file contains ascii text - binary - file contains binary data - cpumask - file contains a cpumask type - -[1] rw for RESOURCE_IO (I/O port) regions only - -The read only files are informational, writes to them will be ignored, with -the exception of the 'rom' file. Writable files can be used to perform -actions on the device (e.g. changing config space, detaching a device). -mmapable files are available via an mmap of the file at offset 0 and can be -used to do actual device programming from userspace. Note that some platforms -don't support mmapping of certain resources, so be sure to check the return -value from any attempted mmap. The most notable of these are I/O port -resources, which also provide read/write access. - -The 'enable' file provides a counter that indicates how many times the device -has been enabled. If the 'enable' file currently returns '4', and a '1' is -echoed into it, it will then return '5'. Echoing a '0' into it will decrease -the count. Even when it returns to 0, though, some of the initialisation -may not be reversed. 
- -The 'rom' file is special in that it provides read-only access to the device's -ROM file, if available. It's disabled by default, however, so applications -should write the string "1" to the file to enable it before attempting a read -call, and disable it following the access by writing "0" to the file. Note -that the device must be enabled for a rom read to return data successfully. -In the event a driver is not bound to the device, it can be enabled using the -'enable' file, documented above. - -The 'remove' file is used to remove the PCI device, by writing a non-zero -integer to the file. This does not involve any kind of hot-plug functionality, -e.g. powering off the device. The device is removed from the kernel's list of -PCI devices, the sysfs directory for it is removed, and the device will be -removed from any drivers attached to it. Removal of PCI root buses is -disallowed. - -Accessing legacy resources through sysfs ----------------------------------------- - -Legacy I/O port and ISA memory resources are also provided in sysfs if the -underlying platform supports them. They're located in the PCI class hierarchy, -e.g. - - /sys/class/pci_bus/0000:17/ - |-- bridge -> ../../../devices/pci0000:17 - |-- cpuaffinity - |-- legacy_io - `-- legacy_mem - -The legacy_io file is a read/write file that can be used by applications to -do legacy port I/O. The application should open the file, seek to the desired -port (e.g. 0x3e8) and do a read or a write of 1, 2 or 4 bytes. The legacy_mem -file should be mmapped with an offset corresponding to the memory offset -desired, e.g. 0xa0000 for the VGA frame buffer. The application can then -simply dereference the returned pointer (after checking for errors of course) -to access legacy memory space. 
- -Supporting PCI access on new platforms --------------------------------------- - -In order to support PCI resource mapping as described above, Linux platform -code should ideally define ARCH_GENERIC_PCI_MMAP_RESOURCE and use the generic -implementation of that functionality. To support the historical interface of -mmap() through files in /proc/bus/pci, platforms may also set HAVE_PCI_MMAP. - -Alternatively, platforms which set HAVE_PCI_MMAP may provide their own -implementation of pci_mmap_page_range() instead of defining -ARCH_GENERIC_PCI_MMAP_RESOURCE. - -Platforms which support write-combining maps of PCI resources must define -arch_can_pci_mmap_wc() which shall evaluate to non-zero at runtime when -write-combining is permitted. Platforms which support maps of I/O resources -define arch_can_pci_mmap_io() similarly. - -Legacy resources are protected by the HAVE_PCI_LEGACY define. Platforms -wishing to support legacy functionality should define it and provide -pci_legacy_read, pci_legacy_write and pci_mmap_legacy_page_range functions. -- cgit v1.2.3 From ec4551f45036d23a34d4f13ab3ce2ea345a13de7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:18 +0200 Subject: docs: filesystems: convert sysfs-tagging.txt to ReST - Add a SPDX header; - Adjust document title; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/45a01fa5edd5c6ee8fc0754fc74f7ef65a3e5581.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/sysfs-tagging.rst | 48 +++++++++++++++++++++++++++++ Documentation/filesystems/sysfs-tagging.txt | 42 ------------------------- 3 files changed, 49 insertions(+), 42 deletions(-) create mode 100644 Documentation/filesystems/sysfs-tagging.rst delete mode 100644 Documentation/filesystems/sysfs-tagging.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 7afb58dff43b..14e298d84c8a 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -35,6 +35,7 @@ algorithms work. seq_file sharedsubtree sysfs-pci + sysfs-tagging automount-support diff --git a/Documentation/filesystems/sysfs-tagging.rst b/Documentation/filesystems/sysfs-tagging.rst new file mode 100644 index 000000000000..8888a05c398e --- /dev/null +++ b/Documentation/filesystems/sysfs-tagging.rst @@ -0,0 +1,48 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= +Sysfs tagging +============= + +(Taken almost verbatim from Eric Biederman's netns tagging patch +commit msg) + +The problem. Network devices show up in sysfs and with the network +namespace active multiple devices with the same name can show up in +the same directory, ouch! + +To avoid that problem and allow existing applications in network +namespaces to see the same interface that is currently presented in +sysfs, sysfs now has tagging directory support. + +By using the network namespace pointers as tags to separate out the +the sysfs directory entries we ensure that we don't have conflicts +in the directories and applications only see a limited set of +the network devices. + +Each sysfs directory entry may be tagged with a namespace via the +``void *ns member`` of its ``kernfs_node``. 
If a directory entry is tagged, +then ``kernfs_node->flags`` will have a flag between KOBJ_NS_TYPE_NONE +and KOBJ_NS_TYPES, and ns will point to the namespace to which it +belongs. + +Each sysfs superblock's kernfs_super_info contains an array +``void *ns[KOBJ_NS_TYPES]``. When a task in a tagging namespace +kobj_nstype first mounts sysfs, a new superblock is created. It +will be differentiated from other sysfs mounts by having its +``s_fs_info->ns[kobj_nstype]`` set to the new namespace. Note that +through bind mounting and mounts propagation, a task can easily view +the contents of other namespaces' sysfs mounts. Therefore, when a +namespace exits, it will call kobj_ns_exit() to invalidate any +kernfs_node->ns pointers pointing to it. + +Users of this interface: + +- define a type in the ``kobj_ns_type`` enumeration. +- call kobj_ns_type_register() with its ``kobj_ns_type_operations`` which has + + - current_ns() which returns current's namespace + - netlink_ns() which returns a socket's namespace + - initial_ns() which returns the initial namesapce + +- call kobj_ns_exit() when an individual tag is no longer valid diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.txt deleted file mode 100644 index c7c8e6438958..000000000000 --- a/Documentation/filesystems/sysfs-tagging.txt +++ /dev/null @@ -1,42 +0,0 @@ -Sysfs tagging -------------- - -(Taken almost verbatim from Eric Biederman's netns tagging patch -commit msg) - -The problem. Network devices show up in sysfs and with the network -namespace active multiple devices with the same name can show up in -the same directory, ouch! - -To avoid that problem and allow existing applications in network -namespaces to see the same interface that is currently presented in -sysfs, sysfs now has tagging directory support. 
- -By using the network namespace pointers as tags to separate out the -the sysfs directory entries we ensure that we don't have conflicts -in the directories and applications only see a limited set of -the network devices. - -Each sysfs directory entry may be tagged with a namespace via the -void *ns member of its kernfs_node. If a directory entry is tagged, -then kernfs_node->flags will have a flag between KOBJ_NS_TYPE_NONE -and KOBJ_NS_TYPES, and ns will point to the namespace to which it -belongs. - -Each sysfs superblock's kernfs_super_info contains an array void -*ns[KOBJ_NS_TYPES]. When a task in a tagging namespace -kobj_nstype first mounts sysfs, a new superblock is created. It -will be differentiated from other sysfs mounts by having its -s_fs_info->ns[kobj_nstype] set to the new namespace. Note that -through bind mounting and mounts propagation, a task can easily view -the contents of other namespaces' sysfs mounts. Therefore, when a -namespace exits, it will call kobj_ns_exit() to invalidate any -kernfs_node->ns pointers pointing to it. - -Users of this interface: -- define a type in the kobj_ns_type enumeration. -- call kobj_ns_type_register() with its kobj_ns_type_operations which has - - current_ns() which returns current's namespace - - netlink_ns() which returns a socket's namespace - - initial_ns() which returns the initial namesapce -- call kobj_ns_exit() when an individual tag is no longer valid -- cgit v1.2.3 From c3d2f6cb4c707736e19eee4340357f2d8286b5c8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:19 +0200 Subject: docs: filesystems: convert xfs-delayed-logging-design.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/2233c248f12e7b465cd27ee30a86f96eb632946a.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + .../filesystems/xfs-delayed-logging-design.rst | 804 +++++++++++++++++++++ .../filesystems/xfs-delayed-logging-design.txt | 793 -------------------- MAINTAINERS | 2 +- 4 files changed, 806 insertions(+), 794 deletions(-) create mode 100644 Documentation/filesystems/xfs-delayed-logging-design.rst delete mode 100644 Documentation/filesystems/xfs-delayed-logging-design.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 14e298d84c8a..0fd90f854b33 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -117,4 +117,5 @@ Documentation for filesystem implementations. udf virtiofs vfat + xfs-delayed-logging-design zonefs diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs-delayed-logging-design.rst new file mode 100644 index 000000000000..464405d2801e --- /dev/null +++ b/Documentation/filesystems/xfs-delayed-logging-design.rst @@ -0,0 +1,804 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +XFS Delayed Logging Design +========================== + +Introduction to Re-logging in XFS +================================= + +XFS logging is a combination of logical and physical logging. Some objects, +such as inodes and dquots, are logged in logical format where the details +logged are made up of the changes to in-core structures rather than on-disk +structures. Other objects - typically buffers - have their physical changes +logged. The reason for these differences is to reduce the amount of log space +required for objects that are frequently logged. 
Some parts of inodes are more +frequently logged than others, and inodes are typically more frequently logged +than any other object (except maybe the superblock buffer) so keeping the +amount of metadata logged low is of prime importance. + +The reason that this is such a concern is that XFS allows multiple separate +modifications to a single object to be carried in the log at any given time. +This allows the log to avoid needing to flush each change to disk before +recording a new change to the object. XFS does this via a method called +"re-logging". Conceptually, this is quite simple - all it requires is that any +new change to the object is recorded with a *new copy* of all the existing +changes in the new transaction that is written to the log. + +That is, if we have a sequence of changes A through to F, and the object was +written to disk after change D, we would see in the log the following series +of transactions, their contents and the log sequence number (LSN) of the +transaction:: + + Transaction Contents LSN + A A X + B A+B X+n + C A+B+C X+n+m + D A+B+C+D X+n+m+o + + E E Y (> X+n+m+o) + F E+F Y+p + +In other words, each time an object is relogged, the new transaction contains +the aggregation of all the previous changes currently held only in the log. + +This relogging technique also allows objects to be moved forward in the log so +that an object being relogged does not prevent the tail of the log from ever +moving forward. This can be seen in the table above by the changing +(increasing) LSN of each subsequent transaction - the LSN is effectively a +direct encoding of the location in the log of the transaction. + +This relogging is also used to implement long-running, multiple-commit +transactions. These transaction are known as rolling transactions, and require +a special log reservation known as a permanent transaction reservation. 
A +typical example of a rolling transaction is the removal of extents from an +inode which can only be done at a rate of two extents per transaction because +of reservation size limitations. Hence a rolling extent removal transaction +keeps relogging the inode and btree buffers as they get modified in each +removal operation. This keeps them moving forward in the log as the operation +progresses, ensuring that current operation never gets blocked by itself if the +log wraps around. + +Hence it can be seen that the relogging operation is fundamental to the correct +working of the XFS journalling subsystem. From the above description, most +people should be able to see why the XFS metadata operations writes so much to +the log - repeated operations to the same objects write the same changes to +the log over and over again. Worse is the fact that objects tend to get +dirtier as they get relogged, so each subsequent transaction is writing more +metadata into the log. + +Another feature of the XFS transaction subsystem is that most transactions are +asynchronous. That is, they don't commit to disk until either a log buffer is +filled (a log buffer can hold multiple transactions) or a synchronous operation +forces the log buffers holding the transactions to disk. This means that XFS is +doing aggregation of transactions in memory - batching them, if you like - to +minimise the impact of the log IO on transaction throughput. + +The limitation on asynchronous transaction throughput is the number and size of +log buffers made available by the log manager. By default there are 8 log +buffers available and the size of each is 32kB - the size can be increased up +to 256kB by use of a mount option. + +Effectively, this gives us the maximum bound of outstanding metadata changes +that can be made to the filesystem at any point in time - if all the log +buffers are full and under IO, then no more transactions can be committed until +the current batch completes. 
It is now common for a single current CPU core to +be to able to issue enough transactions to keep the log buffers full and under +IO permanently. Hence the XFS journalling subsystem can be considered to be IO +bound. + +Delayed Logging: Concepts +========================= + +The key thing to note about the asynchronous logging combined with the +relogging technique XFS uses is that we can be relogging changed objects +multiple times before they are committed to disk in the log buffers. If we +return to the previous relogging example, it is entirely possible that +transactions A through D are committed to disk in the same log buffer. + +That is, a single log buffer may contain multiple copies of the same object, +but only one of those copies needs to be there - the last one "D", as it +contains all the changes from the previous changes. In other words, we have one +necessary copy in the log buffer, and three stale copies that are simply +wasting space. When we are doing repeated operations on the same set of +objects, these "stale objects" can be over 90% of the space used in the log +buffers. It is clear that reducing the number of stale objects written to the +log would greatly reduce the amount of metadata we write to the log, and this +is the fundamental goal of delayed logging. + +From a conceptual point of view, XFS is already doing relogging in memory (where +memory == log buffer), only it is doing it extremely inefficiently. It is using +logical to physical formatting to do the relogging because there is no +infrastructure to keep track of logical changes in memory prior to physically +formatting the changes in a transaction to the log buffer. Hence we cannot avoid +accumulating stale objects in the log buffers. + +Delayed logging is the name we've given to keeping and tracking transactional +changes to objects in memory outside the log buffer infrastructure. 
Because of +the relogging concept fundamental to the XFS journalling subsystem, this is +actually relatively easy to do - all the changes to logged items are already +tracked in the current infrastructure. The big problem is how to accumulate +them and get them to the log in a consistent, recoverable manner. +Describing the problems and how they have been solved is the focus of this +document. + +One of the key changes that delayed logging makes to the operation of the +journalling subsystem is that it disassociates the amount of outstanding +metadata changes from the size and number of log buffers available. In other +words, instead of there only being a maximum of 2MB of transaction changes not +written to the log at any point in time, there may be a much greater amount +being accumulated in memory. Hence the potential for loss of metadata on a +crash is much greater than for the existing logging mechanism. + +It should be noted that this does not change the guarantee that log recovery +will result in a consistent filesystem. What it does mean is that as far as the +recovered filesystem is concerned, there may be many thousands of transactions +that simply did not occur as a result of the crash. This makes it even more +important that applications that care about their data use fsync() where they +need to ensure application level data integrity is maintained. + +It should be noted that delayed logging is not an innovative new concept that +warrants rigorous proofs to determine whether it is correct or not. The method +of accumulating changes in memory for some period before writing them to the +log is used effectively in many filesystems including ext3 and ext4. Hence +no time is spent in this document trying to convince the reader that the +concept is sound. Instead it is simply considered a "solved problem" and as +such implementing it in XFS is purely an exercise in software engineering. 
+ +The fundamental requirements for delayed logging in XFS are simple: + + 1. Reduce the amount of metadata written to the log by at least + an order of magnitude. + 2. Supply sufficient statistics to validate Requirement #1. + 3. Supply sufficient new tracing infrastructure to be able to debug + problems with the new code. + 4. No on-disk format change (metadata or log format). + 5. Enable and disable with a mount option. + 6. No performance regressions for synchronous transaction workloads. + +Delayed Logging: Design +======================= + +Storing Changes +--------------- + +The problem with accumulating changes at a logical level (i.e. just using the +existing log item dirty region tracking) is that when it comes to writing the +changes to the log buffers, we need to ensure that the object we are formatting +is not changing while we do this. This requires locking the object to prevent +concurrent modification. Hence flushing the logical changes to the log would +require us to lock every object, format them, and then unlock them again. + +This introduces lots of scope for deadlocks with transactions that are already +running. For example, a transaction has object A locked and modified, but needs +the delayed logging tracking lock to commit the transaction. However, the +flushing thread has the delayed logging tracking lock already held, and is +trying to get the lock on object A to flush it to the log buffer. This appears +to be an unsolvable deadlock condition, and it was solving this problem that +was the barrier to implementing delayed logging for so long. + +The solution is relatively simple - it just took a long time to recognise it. +Put simply, the current logging code formats the changes to each item into an +vector array that points to the changed regions in the item. The log write code +simply copies the memory these vectors point to into the log buffer during +transaction commit while the item is locked in the transaction. 
Instead of +using the log buffer as the destination of the formatting code, we can use an +allocated memory buffer big enough to fit the formatted vector. + +If we then copy the vector into the memory buffer and rewrite the vector to +point to the memory buffer rather than the object itself, we now have a copy of +the changes in a format that is compatible with the log buffer writing code. +that does not require us to lock the item to access. This formatting and +rewriting can all be done while the object is locked during transaction commit, +resulting in a vector that is transactionally consistent and can be accessed +without needing to lock the owning item. + +Hence we avoid the need to lock items when we need to flush outstanding +asynchronous transactions to the log. The differences between the existing +formatting method and the delayed logging formatting can be seen in the +diagram below. + +Current format log vector:: + + Object +---------------------------------------------+ + Vector 1 +----+ + Vector 2 +----+ + Vector 3 +----------+ + +After formatting:: + + Log Buffer +-V1-+-V2-+----V3----+ + +Delayed logging vector:: + + Object +---------------------------------------------+ + Vector 1 +----+ + Vector 2 +----+ + Vector 3 +----------+ + +After formatting:: + + Memory Buffer +-V1-+-V2-+----V3----+ + Vector 1 +----+ + Vector 2 +----+ + Vector 3 +----------+ + +The memory buffer and associated vector need to be passed as a single object, +but still need to be associated with the parent object so if the object is +relogged we can replace the current memory buffer with a new memory buffer that +contains the latest changes. + +The reason for keeping the vector around after we've formatted the memory +buffer is to support splitting vectors across log buffer boundaries correctly. +If we don't keep the vector around, we do not know where the region boundaries +are in the item, so we'd need a new encapsulation method for regions in the log +buffer writing (i.e. 
double encapsulation). This would be an on-disk format +change and as such is not desirable. It also means we'd have to write the log +region headers in the formatting stage, which is problematic as there is per +region state that needs to be placed into the headers during the log write. + +Hence we need to keep the vector, but by attaching the memory buffer to it and +rewriting the vector addresses to point at the memory buffer we end up with a +self-describing object that can be passed to the log buffer write code to be +handled in exactly the same manner as the existing log vectors are handled. +Hence we avoid needing a new on-disk format to handle items that have been +relogged in memory. + + +Tracking Changes +---------------- + +Now that we can record transactional changes in memory in a form that allows +them to be used without limitations, we need to be able to track and accumulate +them so that they can be written to the log at some later point in time. The +log item is the natural place to store this vector and buffer, and also makes sense +to be the object that is used to track committed objects as it will always +exist once the object has been included in a transaction. + +The log item is already used to track the log items that have been written to +the log but not yet written to disk. Such log items are considered "active" +and as such are stored in the Active Item List (AIL) which is a LSN-ordered +double linked list. Items are inserted into this list during log buffer IO +completion, after which they are unpinned and can be written to disk. An object +that is in the AIL can be relogged, which causes the object to be pinned again +and then moved forward in the AIL when the log buffer IO completes for that +transaction. + +Essentially, this shows that an item that is in the AIL can still be modified +and relogged, so any tracking must be separate to the AIL infrastructure. 
As +such, we cannot reuse the AIL list pointers for tracking committed items, nor +can we store state in any field that is protected by the AIL lock. Hence the +committed item tracking needs it's own locks, lists and state fields in the log +item. + +Similar to the AIL, tracking of committed items is done through a new list +called the Committed Item List (CIL). The list tracks log items that have been +committed and have formatted memory buffers attached to them. It tracks objects +in transaction commit order, so when an object is relogged it is removed from +it's place in the list and re-inserted at the tail. This is entirely arbitrary +and done to make it easy for debugging - the last items in the list are the +ones that are most recently modified. Ordering of the CIL is not necessary for +transactional integrity (as discussed in the next section) so the ordering is +done for convenience/sanity of the developers. + + +Delayed Logging: Checkpoints +---------------------------- + +When we have a log synchronisation event, commonly known as a "log force", +all the items in the CIL must be written into the log via the log buffers. +We need to write these items in the order that they exist in the CIL, and they +need to be written as an atomic transaction. The need for all the objects to be +written as an atomic transaction comes from the requirements of relogging and +log replay - all the changes in all the objects in a given transaction must +either be completely replayed during log recovery, or not replayed at all. If +a transaction is not replayed because it is not complete in the log, then +no later transactions should be replayed, either. + +To fulfill this requirement, we need to write the entire CIL in a single log +transaction. Fortunately, the XFS log code has no fixed limit on the size of a +transaction, nor does the log replay code. The only fundamental limit is that +the transaction cannot be larger than just under half the size of the log. 
The +reason for this limit is that to find the head and tail of the log, there must +be at least one complete transaction in the log at any given time. If a +transaction is larger than half the log, then there is the possibility that a +crash during the write of a such a transaction could partially overwrite the +only complete previous transaction in the log. This will result in a recovery +failure and an inconsistent filesystem and hence we must enforce the maximum +size of a checkpoint to be slightly less than a half the log. + +Apart from this size requirement, a checkpoint transaction looks no different +to any other transaction - it contains a transaction header, a series of +formatted log items and a commit record at the tail. From a recovery +perspective, the checkpoint transaction is also no different - just a lot +bigger with a lot more items in it. The worst case effect of this is that we +might need to tune the recovery transaction object hash size. + +Because the checkpoint is just another transaction and all the changes to log +items are stored as log vectors, we can use the existing log buffer writing +code to write the changes into the log. To do this efficiently, we need to +minimise the time we hold the CIL locked while writing the checkpoint +transaction. The current log write code enables us to do this easily with the +way it separates the writing of the transaction contents (the log vectors) from +the transaction commit record, but tracking this requires us to have a +per-checkpoint context that travels through the log write process through to +checkpoint completion. + +Hence a checkpoint has a context that tracks the state of the current +checkpoint from initiation to checkpoint completion. A new context is initiated +at the same time a checkpoint transaction is started. That is, when we remove +all the current items from the CIL during a checkpoint operation, we move all +those changes into the current checkpoint context. 
We then initialise a new +context and attach that to the CIL for aggregation of new transactions. + +This allows us to unlock the CIL immediately after transfer of all the +committed items and effectively allow new transactions to be issued while we +are formatting the checkpoint into the log. It also allows concurrent +checkpoints to be written into the log buffers in the case of log force heavy +workloads, just like the existing transaction commit code does. This, however, +requires that we strictly order the commit records in the log so that +checkpoint sequence order is maintained during log replay. + +To ensure that we can be writing an item into a checkpoint transaction at +the same time another transaction modifies the item and inserts the log item +into the new CIL, then checkpoint transaction commit code cannot use log items +to store the list of log vectors that need to be written into the transaction. +Hence log vectors need to be able to be chained together to allow them to be +detached from the log items. That is, when the CIL is flushed the memory +buffer and log vector attached to each log item needs to be attached to the +checkpoint context so that the log item can be released. In diagrammatic form, +the CIL would look like this before the flush:: + + CIL Head + | + V + Log Item <-> log vector 1 -> memory buffer + | -> vector array + V + Log Item <-> log vector 2 -> memory buffer + | -> vector array + V + ...... + | + V + Log Item <-> log vector N-1 -> memory buffer + | -> vector array + V + Log Item <-> log vector N -> memory buffer + -> vector array + +And after the flush the CIL head is empty, and the checkpoint context log +vector list would look like:: + + Checkpoint Context + | + V + log vector 1 -> memory buffer + | -> vector array + | -> Log Item + V + log vector 2 -> memory buffer + | -> vector array + | -> Log Item + V + ...... 
+ | + V + log vector N-1 -> memory buffer + | -> vector array + | -> Log Item + V + log vector N -> memory buffer + -> vector array + -> Log Item + +Once this transfer is done, the CIL can be unlocked and new transactions can +start, while the checkpoint flush code works over the log vector chain to +commit the checkpoint. + +Once the checkpoint is written into the log buffers, the checkpoint context is +attached to the log buffer that the commit record was written to along with a +completion callback. Log IO completion will call that callback, which can then +run transaction committed processing for the log items (i.e. insert into AIL +and unpin) in the log vector chain and then free the log vector chain and +checkpoint context. + +Discussion Point: I am uncertain as to whether the log item is the most +efficient way to track vectors, even though it seems like the natural way to do +it. The fact that we walk the log items (in the CIL) just to chain the log +vectors and break the link between the log item and the log vector means that +we take a cache line hit for the log item list modification, then another for +the log vector chaining. If we track by the log vectors, then we only need to +break the link between the log item and the log vector, which means we should +dirty only the log item cachelines. Normally I wouldn't be concerned about one +vs two dirty cachelines except for the fact I've seen upwards of 80,000 log +vectors in one checkpoint transaction. I'd guess this is a "measure and +compare" situation that can be done after a working and reviewed implementation +is in the dev tree.... + +Delayed Logging: Checkpoint Sequencing +-------------------------------------- + +One of the key aspects of the XFS transaction subsystem is that it tags +committed transactions with the log sequence number of the transaction commit. 
+This allows transactions to be issued asynchronously even though there may be +future operations that cannot be completed until that transaction is fully +committed to the log. In the rare case that a dependent operation occurs (e.g. +re-using a freed metadata extent for a data extent), a special, optimised log +force can be issued to force the dependent transaction to disk immediately. + +To do this, transactions need to record the LSN of the commit record of the +transaction. This LSN comes directly from the log buffer the transaction is +written into. While this works just fine for the existing transaction +mechanism, it does not work for delayed logging because transactions are not +written directly into the log buffers. Hence some other method of sequencing +transactions is required. + +As discussed in the checkpoint section, delayed logging uses per-checkpoint +contexts, and as such it is simple to assign a sequence number to each +checkpoint. Because the switching of checkpoint contexts must be done +atomically, it is simple to ensure that each new context has a monotonically +increasing sequence number assigned to it without the need for an external +atomic counter - we can just take the current context sequence number and add +one to it for the new context. + +Then, instead of assigning a log buffer LSN to the transaction commit LSN +during the commit, we can assign the current checkpoint sequence. This allows +operations that track transactions that have not yet completed know what +checkpoint sequence needs to be committed before they can continue. As a +result, the code that forces the log to a specific LSN now needs to ensure that +the log forces to a specific checkpoint. + +To ensure that we can do this, we need to track all the checkpoint contexts +that are currently committing to the log. When we flush a checkpoint, the +context gets added to a "committing" list which can be searched. 
When a +checkpoint commit completes, it is removed from the committing list. Because +the checkpoint context records the LSN of the commit record for the checkpoint, +we can also wait on the log buffer that contains the commit record, thereby +using the existing log force mechanisms to execute synchronous forces. + +It should be noted that the synchronous forces may need to be extended with +mitigation algorithms similar to the current log buffer code to allow +aggregation of multiple synchronous transactions if there are already +synchronous transactions being flushed. Investigation of the performance of the +current design is needed before making any decisions here. + +The main concern with log forces is to ensure that all the previous checkpoints +are also committed to disk before the one we need to wait for. Therefore we +need to check that all the prior contexts in the committing list are also +complete before waiting on the one we need to complete. We do this +synchronisation in the log force code so that we don't need to wait anywhere +else for such serialisation - it only matters when we do a log force. + +The only remaining complexity is that a log force now also has to handle the +case where the forcing sequence number is the same as the current context. That +is, we need to flush the CIL and potentially wait for it to complete. This is a +simple addition to the existing log forcing code to check the sequence numbers +and push if required. Indeed, placing the current sequence checkpoint flush in +the log force code enables the current mechanism for issuing synchronous +transactions to remain untouched (i.e. commit an asynchronous transaction, then +force the log at the LSN of that transaction) and so the higher level code +behaves the same regardless of whether delayed logging is being used or not. 
+ +Delayed Logging: Checkpoint Log Space Accounting +------------------------------------------------ + +The big issue for a checkpoint transaction is the log space reservation for the +transaction. We don't know how big a checkpoint transaction is going to be +ahead of time, nor how many log buffers it will take to write out, nor the +number of split log vector regions that are going to be used. We can track the +amount of log space required as we add items to the commit item list, but we +still need to reserve the space in the log for the checkpoint. + +A typical transaction reserves enough space in the log for the worst case space +usage of the transaction. The reservation accounts for log record headers, +transaction and region headers, headers for split regions, buffer tail padding, +etc. as well as the actual space for all the changed metadata in the +transaction. While some of this is fixed overhead, much of it is dependent on +the size of the transaction and the number of regions being logged (the number +of log vectors in the transaction). + +An example of the differences would be logging directory changes versus logging +inode changes. If you modify lots of inode cores (e.g. ``chmod -R g+w *``), then +there are lots of transactions that only contain an inode core and an inode log +format structure. That is, two vectors totaling roughly 150 bytes. If we modify +10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each +vector is 12 bytes, so the total to be logged is approximately 1.75MB. In +comparison, if we are logging full directory buffers, they are typically 4KB +each, so in 1.5MB of directory buffers we'd have roughly 400 buffers and a +buffer format structure for each buffer - roughly 800 vectors or 1.51MB total +space. From this, it should be obvious that a static log space reservation is +not particularly flexible and is difficult to select the "optimal value" for +all workloads. 
+ +Further, if we are going to use a static reservation, which bit of the entire +reservation does it cover? We account for space used by the transaction +reservation by tracking the space currently used by the object in the CIL and +then calculating the increase or decrease in space used as the object is +relogged. This allows for a checkpoint reservation to only have to account for +log buffer metadata used such as log header records. + +However, even using a static reservation for just the log metadata is +problematic. Typically log record headers use at least 16KB of log space per +1MB of log space consumed (512 bytes per 32k) and the reservation needs to be +large enough to handle arbitrary sized checkpoint transactions. This +reservation needs to be made before the checkpoint is started, and we need to +be able to reserve the space without sleeping. For an 8MB checkpoint, we need a +reservation of around 150KB, which is a non-trivial amount of space. + +A static reservation needs to manipulate the log grant counters - we can take a +permanent reservation on the space, but we still need to make sure we refresh +the write reservation (the actual space available to the transaction) after +every checkpoint transaction completion. Unfortunately, if this space is not +available when required, then the regrant code will sleep waiting for it. + +The problem with this is that it can lead to deadlocks as we may need to commit +checkpoints to be able to free up log space (refer back to the description of +rolling transactions for an example of this). Hence we *must* always have +space available in the log if we are to use static reservations, and that is +very difficult and complex to arrange. It is possible to do, but there is a +simpler way. + +The simpler way of doing this is tracking the entire log space used by the +items in the CIL and using this to dynamically calculate the amount of log +space required by the log metadata. 
If this log metadata space changes as a +result of a transaction commit inserting a new memory buffer into the CIL, then +the difference in space required is removed from the transaction that causes +the change. Transactions at this level will *always* have enough space +available in their reservation for this as they have already reserved the +maximal amount of log metadata space they require, and such a delta reservation +will always be less than or equal to the maximal amount in the reservation. + +Hence we can grow the checkpoint transaction reservation dynamically as items +are added to the CIL and avoid the need for reserving and regranting log space +up front. This avoids deadlocks and removes a blocking point from the +checkpoint flush code. + +As mentioned earlier, transactions can't grow to more than half the size of the +log. Hence as part of the reservation growing, we need to also check the size +of the reservation against the maximum allowed transaction size. If we reach +the maximum threshold, we need to push the CIL to the log. This is effectively +a "background flush" and is done on demand. This is identical to +a CIL push triggered by a log force, only that there is no waiting for the +checkpoint commit to complete. This background push is checked and executed by +transaction commit code. + +If the transaction subsystem goes idle while we still have items in the CIL, +they will be flushed by the periodic log force issued by the xfssyncd. This log +force will push the CIL to disk, and if the transaction subsystem stays idle, +allow the idle log to be covered (effectively marked clean) in exactly the same +manner that is done for the existing logging method. A discussion point is +whether this log force needs to be done more frequently than the current rate +which is once every 30s. + + +Delayed Logging: Log Item Pinning +--------------------------------- + +Currently log items are pinned during transaction commit while the items are +still locked. 
This happens just after the items are formatted, though it could +be done any time before the items are unlocked. The result of this mechanism is +that items get pinned once for every transaction that is committed to the log +buffers. Hence items that are relogged in the log buffers will have a pin count +for every outstanding transaction they were dirtied in. When each of these +transactions is completed, they will unpin the item once. As a result, the item +only becomes unpinned when all the transactions complete and there are no +pending transactions. Thus the pinning and unpinning of a log item is symmetric +as there is a 1:1 relationship with transaction commit and log item completion. + +For delayed logging, however, we have an asymmetric transaction commit to +completion relationship. Every time an object is relogged in the CIL it goes +through the commit process without a corresponding completion being registered. +That is, we now have a many-to-one relationship between transaction commit and +log item completion. The result of this is that pinning and unpinning of the +log items becomes unbalanced if we retain the "pin on transaction commit, unpin +on transaction completion" model. + +To keep pin/unpin symmetry, the algorithm needs to change to a "pin on +insertion into the CIL, unpin on checkpoint completion". In other words, the +pinning and unpinning becomes symmetric around a checkpoint context. We have to +pin the object the first time it is inserted into the CIL - if it is already in +the CIL during a transaction commit, then we do not pin it again. Because there +can be multiple outstanding checkpoint contexts, we can still see elevated pin +counts, but as each checkpoint completes the pin count will retain the correct +value according to its context. + +Just to make matters slightly more complex, this checkpoint level context +for the pin count means that the pinning of an item must take place under the +CIL commit/flush lock. 
If we pin the object outside this lock, we cannot +guarantee which context the pin count is associated with. This is because of +the fact that pinning the item is dependent on whether the item is present in the +current CIL or not. If we don't lock the CIL first before we check and pin the +object, we have a race with CIL being flushed between the check and the pin +(or not pinning, as the case may be). Hence we must hold the CIL flush/commit +lock to guarantee that we pin the items correctly. + +Delayed Logging: Concurrent Scalability +--------------------------------------- + +A fundamental requirement for the CIL is that accesses through transaction +commits must scale to many concurrent commits. The current transaction commit +code does not break down even when there are transactions coming from 2048 +processors at once. The current transaction code does not go any faster than if +there was only one CPU using it, but it does not slow down either. + +As a result, the delayed logging transaction commit code needs to be designed +for concurrency from the ground up. It is obvious that there are serialisation +points in the design - the three important ones are: + + 1. Locking out new transaction commits while flushing the CIL + 2. Adding items to the CIL and updating item space accounting + 3. Checkpoint commit ordering + +Looking at the transaction commit and CIL flushing interactions, it is clear +that we have a many-to-one interaction here. That is, the only restriction on +the number of concurrent transactions that can be trying to commit at once is +the amount of space available in the log for their reservations. The practical +limit here is in the order of several hundred concurrent transactions for a +128MB log, which means that it is generally one per CPU in a machine. 
+ +The amount of time a transaction commit needs to hold out a flush is a +relatively long period of time - the pinning of log items needs to be done +while we are holding out a CIL flush, so at the moment that means it is held +across the formatting of the objects into memory buffers (i.e. while memcpy()s +are in progress). Ultimately a two pass algorithm where the formatting is done +separately to the pinning of objects could be used to reduce the hold time of +the transaction commit side. + +Because of the number of potential transaction commit side holders, the lock +really needs to be a sleeping lock - if the CIL flush takes the lock, we do not +want every other CPU in the machine spinning on the CIL lock. Given that +flushing the CIL could involve walking a list of tens of thousands of log +items, it will get held for a significant time and so spin contention is a +significant concern. Preventing lots of CPUs spinning doing nothing is the +main reason for choosing a sleeping lock even though nothing in either the +transaction commit or CIL flush side sleeps with the lock held. + +It should also be noted that CIL flushing is also a relatively rare operation +compared to transaction commit for asynchronous transaction workloads - only +time will tell if using a read-write semaphore for exclusion will limit +transaction commit concurrency due to cache line bouncing of the lock on the +read side. + +The second serialisation point is on the transaction commit side where items +are inserted into the CIL. Because transactions can enter this code +concurrently, the CIL needs to be protected separately from the above +commit/flush exclusion. It also needs to be an exclusive lock but it is only +held for a very short time and so a spin lock is appropriate here. It is +possible that this lock will become a contention point, but given the short +hold time once per transaction I think that contention is unlikely. 
+ +The final serialisation point is the checkpoint commit record ordering code +that is run as part of the checkpoint commit and log force sequencing. The code +path that triggers a CIL flush (i.e. whatever triggers the log force) will enter +an ordering loop after writing all the log vectors into the log buffers but +before writing the commit record. This loop walks the list of committing +checkpoints and needs to block waiting for checkpoints to complete their commit +record write. As a result it needs a lock and a wait variable. Log force +sequencing also requires the same lock, list walk, and blocking mechanism to +ensure completion of checkpoints. + +These two sequencing operations can use the mechanism even though the +events they are waiting for are different. The checkpoint commit record +sequencing needs to wait until checkpoint contexts contain a commit LSN +(obtained through completion of a commit record write) while log force +sequencing needs to wait until previous checkpoint contexts are removed from +the committing list (i.e. they've completed). A simple wait variable and +broadcast wakeups (thundering herds) has been used to implement these two +serialisation queues. They use the same lock as the CIL, too. If we see too +much contention on the CIL lock, or too many context switches as a result of +the broadcast wakeups these operations can be put under a new spinlock and +given separate wait lists to reduce lock contention and the number of processes +woken by the wrong event. + + +Lifecycle Changes +----------------- + +The existing log item life cycle is as follows:: + + 1. Transaction allocate + 2. Transaction reserve + 3. Lock item + 4. Join item to transaction + If not already attached, + Allocate log item + Attach log item to owner item + Attach log item to transaction + 5. Modify item + Record modifications in log item + 6. 
Transaction commit + Pin item in memory + Format item into log buffer + Write commit LSN into transaction + Unlock item + Attach transaction to log buffer + + + + + 7. Transaction completion + Mark log item committed + Insert log item into AIL + Write commit LSN into log item + Unpin log item + 8. AIL traversal + Lock item + Mark log item clean + Flush item to disk + + + + 9. Log item removed from AIL + Moves log tail + Item unlocked + +Essentially, steps 1-6 operate independently from step 7, which is also +independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9 +at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur +at the same time. If the log item is in the AIL or between steps 6 and 7 +and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9 +are entered and completed is the object considered clean. + +With delayed logging, there are new steps inserted into the life cycle:: + + 1. Transaction allocate + 2. Transaction reserve + 3. Lock item + 4. Join item to transaction + If not already attached, + Allocate log item + Attach log item to owner item + Attach log item to transaction + 5. Modify item + Record modifications in log item + 6. Transaction commit + Pin item in memory if not pinned in CIL + Format item into log vector + buffer + Attach log vector and buffer to log item + Insert log item into CIL + Write CIL context sequence into transaction + Unlock item + + + + 7. CIL push + lock CIL flush + Chain log vectors and buffers together + Remove items from CIL + unlock CIL flush + write log vectors into log + sequence commit records + attach checkpoint context to log buffer + + + + + 8. Checkpoint completion + Mark log item committed + Insert item into AIL + Write commit LSN into log item + Unpin log item + 9. AIL traversal + Lock item + Mark log item clean + Flush item to disk + + 10. 
Log item removed from AIL + Moves log tail + Item unlocked + +From this, it can be seen that the only life cycle differences between the two +logging methods are in the middle of the life cycle - they still have the same +beginning and end and execution constraints. The only differences are in the +committing of the log items to the log itself and the completion processing. +Hence delayed logging should not introduce any constraints on log item +behaviour, allocation or freeing that don't already exist. + +As a result of this zero-impact "insertion" of delayed logging infrastructure +and the design of the internal structures to avoid on disk format changes, we +can basically switch between delayed logging and the existing mechanism with a +mount option. Fundamentally, there is no reason why the log manager would not +be able to swap methods automatically and transparently depending on load +characteristics, but this should not be necessary if delayed logging works as +designed. diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt deleted file mode 100644 index 9a6dd289b17b..000000000000 --- a/Documentation/filesystems/xfs-delayed-logging-design.txt +++ /dev/null @@ -1,793 +0,0 @@ -XFS Delayed Logging Design --------------------------- - -Introduction to Re-logging in XFS ---------------------------------- - -XFS logging is a combination of logical and physical logging. Some objects, -such as inodes and dquots, are logged in logical format where the details -logged are made up of the changes to in-core structures rather than on-disk -structures. Other objects - typically buffers - have their physical changes -logged. The reason for these differences is to reduce the amount of log space -required for objects that are frequently logged. 
Some parts of inodes are more -frequently logged than others, and inodes are typically more frequently logged -than any other object (except maybe the superblock buffer) so keeping the -amount of metadata logged low is of prime importance. - -The reason that this is such a concern is that XFS allows multiple separate -modifications to a single object to be carried in the log at any given time. -This allows the log to avoid needing to flush each change to disk before -recording a new change to the object. XFS does this via a method called -"re-logging". Conceptually, this is quite simple - all it requires is that any -new change to the object is recorded with a *new copy* of all the existing -changes in the new transaction that is written to the log. - -That is, if we have a sequence of changes A through to F, and the object was -written to disk after change D, we would see in the log the following series -of transactions, their contents and the log sequence number (LSN) of the -transaction: - - Transaction Contents LSN - A A X - B A+B X+n - C A+B+C X+n+m - D A+B+C+D X+n+m+o - - E E Y (> X+n+m+o) - F E+F Y+p - -In other words, each time an object is relogged, the new transaction contains -the aggregation of all the previous changes currently held only in the log. - -This relogging technique also allows objects to be moved forward in the log so -that an object being relogged does not prevent the tail of the log from ever -moving forward. This can be seen in the table above by the changing -(increasing) LSN of each subsequent transaction - the LSN is effectively a -direct encoding of the location in the log of the transaction. - -This relogging is also used to implement long-running, multiple-commit -transactions. These transaction are known as rolling transactions, and require -a special log reservation known as a permanent transaction reservation. 
A -typical example of a rolling transaction is the removal of extents from an -inode which can only be done at a rate of two extents per transaction because -of reservation size limitations. Hence a rolling extent removal transaction -keeps relogging the inode and btree buffers as they get modified in each -removal operation. This keeps them moving forward in the log as the operation -progresses, ensuring that current operation never gets blocked by itself if the -log wraps around. - -Hence it can be seen that the relogging operation is fundamental to the correct -working of the XFS journalling subsystem. From the above description, most -people should be able to see why the XFS metadata operations writes so much to -the log - repeated operations to the same objects write the same changes to -the log over and over again. Worse is the fact that objects tend to get -dirtier as they get relogged, so each subsequent transaction is writing more -metadata into the log. - -Another feature of the XFS transaction subsystem is that most transactions are -asynchronous. That is, they don't commit to disk until either a log buffer is -filled (a log buffer can hold multiple transactions) or a synchronous operation -forces the log buffers holding the transactions to disk. This means that XFS is -doing aggregation of transactions in memory - batching them, if you like - to -minimise the impact of the log IO on transaction throughput. - -The limitation on asynchronous transaction throughput is the number and size of -log buffers made available by the log manager. By default there are 8 log -buffers available and the size of each is 32kB - the size can be increased up -to 256kB by use of a mount option. - -Effectively, this gives us the maximum bound of outstanding metadata changes -that can be made to the filesystem at any point in time - if all the log -buffers are full and under IO, then no more transactions can be committed until -the current batch completes. 
It is now common for a single current CPU core to -be to able to issue enough transactions to keep the log buffers full and under -IO permanently. Hence the XFS journalling subsystem can be considered to be IO -bound. - -Delayed Logging: Concepts -------------------------- - -The key thing to note about the asynchronous logging combined with the -relogging technique XFS uses is that we can be relogging changed objects -multiple times before they are committed to disk in the log buffers. If we -return to the previous relogging example, it is entirely possible that -transactions A through D are committed to disk in the same log buffer. - -That is, a single log buffer may contain multiple copies of the same object, -but only one of those copies needs to be there - the last one "D", as it -contains all the changes from the previous changes. In other words, we have one -necessary copy in the log buffer, and three stale copies that are simply -wasting space. When we are doing repeated operations on the same set of -objects, these "stale objects" can be over 90% of the space used in the log -buffers. It is clear that reducing the number of stale objects written to the -log would greatly reduce the amount of metadata we write to the log, and this -is the fundamental goal of delayed logging. - -From a conceptual point of view, XFS is already doing relogging in memory (where -memory == log buffer), only it is doing it extremely inefficiently. It is using -logical to physical formatting to do the relogging because there is no -infrastructure to keep track of logical changes in memory prior to physically -formatting the changes in a transaction to the log buffer. Hence we cannot avoid -accumulating stale objects in the log buffers. - -Delayed logging is the name we've given to keeping and tracking transactional -changes to objects in memory outside the log buffer infrastructure. 
Because of -the relogging concept fundamental to the XFS journalling subsystem, this is -actually relatively easy to do - all the changes to logged items are already -tracked in the current infrastructure. The big problem is how to accumulate -them and get them to the log in a consistent, recoverable manner. -Describing the problems and how they have been solved is the focus of this -document. - -One of the key changes that delayed logging makes to the operation of the -journalling subsystem is that it disassociates the amount of outstanding -metadata changes from the size and number of log buffers available. In other -words, instead of there only being a maximum of 2MB of transaction changes not -written to the log at any point in time, there may be a much greater amount -being accumulated in memory. Hence the potential for loss of metadata on a -crash is much greater than for the existing logging mechanism. - -It should be noted that this does not change the guarantee that log recovery -will result in a consistent filesystem. What it does mean is that as far as the -recovered filesystem is concerned, there may be many thousands of transactions -that simply did not occur as a result of the crash. This makes it even more -important that applications that care about their data use fsync() where they -need to ensure application level data integrity is maintained. - -It should be noted that delayed logging is not an innovative new concept that -warrants rigorous proofs to determine whether it is correct or not. The method -of accumulating changes in memory for some period before writing them to the -log is used effectively in many filesystems including ext3 and ext4. Hence -no time is spent in this document trying to convince the reader that the -concept is sound. Instead it is simply considered a "solved problem" and as -such implementing it in XFS is purely an exercise in software engineering. 
- -The fundamental requirements for delayed logging in XFS are simple: - - 1. Reduce the amount of metadata written to the log by at least - an order of magnitude. - 2. Supply sufficient statistics to validate Requirement #1. - 3. Supply sufficient new tracing infrastructure to be able to debug - problems with the new code. - 4. No on-disk format change (metadata or log format). - 5. Enable and disable with a mount option. - 6. No performance regressions for synchronous transaction workloads. - -Delayed Logging: Design ------------------------ - -Storing Changes - -The problem with accumulating changes at a logical level (i.e. just using the -existing log item dirty region tracking) is that when it comes to writing the -changes to the log buffers, we need to ensure that the object we are formatting -is not changing while we do this. This requires locking the object to prevent -concurrent modification. Hence flushing the logical changes to the log would -require us to lock every object, format them, and then unlock them again. - -This introduces lots of scope for deadlocks with transactions that are already -running. For example, a transaction has object A locked and modified, but needs -the delayed logging tracking lock to commit the transaction. However, the -flushing thread has the delayed logging tracking lock already held, and is -trying to get the lock on object A to flush it to the log buffer. This appears -to be an unsolvable deadlock condition, and it was solving this problem that -was the barrier to implementing delayed logging for so long. - -The solution is relatively simple - it just took a long time to recognise it. -Put simply, the current logging code formats the changes to each item into an -vector array that points to the changed regions in the item. The log write code -simply copies the memory these vectors point to into the log buffer during -transaction commit while the item is locked in the transaction. 
Instead of -using the log buffer as the destination of the formatting code, we can use an -allocated memory buffer big enough to fit the formatted vector. - -If we then copy the vector into the memory buffer and rewrite the vector to -point to the memory buffer rather than the object itself, we now have a copy of -the changes in a format that is compatible with the log buffer writing code. -that does not require us to lock the item to access. This formatting and -rewriting can all be done while the object is locked during transaction commit, -resulting in a vector that is transactionally consistent and can be accessed -without needing to lock the owning item. - -Hence we avoid the need to lock items when we need to flush outstanding -asynchronous transactions to the log. The differences between the existing -formatting method and the delayed logging formatting can be seen in the -diagram below. - -Current format log vector: - -Object +---------------------------------------------+ -Vector 1 +----+ -Vector 2 +----+ -Vector 3 +----------+ - -After formatting: - -Log Buffer +-V1-+-V2-+----V3----+ - -Delayed logging vector: - -Object +---------------------------------------------+ -Vector 1 +----+ -Vector 2 +----+ -Vector 3 +----------+ - -After formatting: - -Memory Buffer +-V1-+-V2-+----V3----+ -Vector 1 +----+ -Vector 2 +----+ -Vector 3 +----------+ - -The memory buffer and associated vector need to be passed as a single object, -but still need to be associated with the parent object so if the object is -relogged we can replace the current memory buffer with a new memory buffer that -contains the latest changes. - -The reason for keeping the vector around after we've formatted the memory -buffer is to support splitting vectors across log buffer boundaries correctly. -If we don't keep the vector around, we do not know where the region boundaries -are in the item, so we'd need a new encapsulation method for regions in the log -buffer writing (i.e. 
double encapsulation). This would be an on-disk format -change and as such is not desirable. It also means we'd have to write the log -region headers in the formatting stage, which is problematic as there is per -region state that needs to be placed into the headers during the log write. - -Hence we need to keep the vector, but by attaching the memory buffer to it and -rewriting the vector addresses to point at the memory buffer we end up with a -self-describing object that can be passed to the log buffer write code to be -handled in exactly the same manner as the existing log vectors are handled. -Hence we avoid needing a new on-disk format to handle items that have been -relogged in memory. - - -Tracking Changes - -Now that we can record transactional changes in memory in a form that allows -them to be used without limitations, we need to be able to track and accumulate -them so that they can be written to the log at some later point in time. The -log item is the natural place to store this vector and buffer, and also makes sense -to be the object that is used to track committed objects as it will always -exist once the object has been included in a transaction. - -The log item is already used to track the log items that have been written to -the log but not yet written to disk. Such log items are considered "active" -and as such are stored in the Active Item List (AIL) which is a LSN-ordered -double linked list. Items are inserted into this list during log buffer IO -completion, after which they are unpinned and can be written to disk. An object -that is in the AIL can be relogged, which causes the object to be pinned again -and then moved forward in the AIL when the log buffer IO completes for that -transaction. - -Essentially, this shows that an item that is in the AIL can still be modified -and relogged, so any tracking must be separate to the AIL infrastructure. 
As
-such, we cannot reuse the AIL list pointers for tracking committed items, nor
-can we store state in any field that is protected by the AIL lock. Hence the
-committed item tracking needs its own locks, lists and state fields in the log
-item.
-
-Similar to the AIL, tracking of committed items is done through a new list
-called the Committed Item List (CIL). The list tracks log items that have been
-committed and have formatted memory buffers attached to them. It tracks objects
-in transaction commit order, so when an object is relogged it is removed from
-its place in the list and re-inserted at the tail. This is entirely arbitrary
-and done to make it easy for debugging - the last items in the list are the
-ones that are most recently modified. Ordering of the CIL is not necessary for
-transactional integrity (as discussed in the next section) so the ordering is
-done for convenience/sanity of the developers.
-
-
-Delayed Logging: Checkpoints
-
-When we have a log synchronisation event, commonly known as a "log force",
-all the items in the CIL must be written into the log via the log buffers.
-We need to write these items in the order that they exist in the CIL, and they
-need to be written as an atomic transaction. The need for all the objects to be
-written as an atomic transaction comes from the requirements of relogging and
-log replay - all the changes in all the objects in a given transaction must
-either be completely replayed during log recovery, or not replayed at all. If
-a transaction is not replayed because it is not complete in the log, then
-no later transactions should be replayed, either.
-
-To fulfill this requirement, we need to write the entire CIL in a single log
-transaction. Fortunately, the XFS log code has no fixed limit on the size of a
-transaction, nor does the log replay code. The only fundamental limit is that
-the transaction cannot be larger than just under half the size of the log. 
The -reason for this limit is that to find the head and tail of the log, there must -be at least one complete transaction in the log at any given time. If a -transaction is larger than half the log, then there is the possibility that a -crash during the write of a such a transaction could partially overwrite the -only complete previous transaction in the log. This will result in a recovery -failure and an inconsistent filesystem and hence we must enforce the maximum -size of a checkpoint to be slightly less than a half the log. - -Apart from this size requirement, a checkpoint transaction looks no different -to any other transaction - it contains a transaction header, a series of -formatted log items and a commit record at the tail. From a recovery -perspective, the checkpoint transaction is also no different - just a lot -bigger with a lot more items in it. The worst case effect of this is that we -might need to tune the recovery transaction object hash size. - -Because the checkpoint is just another transaction and all the changes to log -items are stored as log vectors, we can use the existing log buffer writing -code to write the changes into the log. To do this efficiently, we need to -minimise the time we hold the CIL locked while writing the checkpoint -transaction. The current log write code enables us to do this easily with the -way it separates the writing of the transaction contents (the log vectors) from -the transaction commit record, but tracking this requires us to have a -per-checkpoint context that travels through the log write process through to -checkpoint completion. - -Hence a checkpoint has a context that tracks the state of the current -checkpoint from initiation to checkpoint completion. A new context is initiated -at the same time a checkpoint transaction is started. That is, when we remove -all the current items from the CIL during a checkpoint operation, we move all -those changes into the current checkpoint context. 
We then initialise a new -context and attach that to the CIL for aggregation of new transactions. - -This allows us to unlock the CIL immediately after transfer of all the -committed items and effectively allow new transactions to be issued while we -are formatting the checkpoint into the log. It also allows concurrent -checkpoints to be written into the log buffers in the case of log force heavy -workloads, just like the existing transaction commit code does. This, however, -requires that we strictly order the commit records in the log so that -checkpoint sequence order is maintained during log replay. - -To ensure that we can be writing an item into a checkpoint transaction at -the same time another transaction modifies the item and inserts the log item -into the new CIL, then checkpoint transaction commit code cannot use log items -to store the list of log vectors that need to be written into the transaction. -Hence log vectors need to be able to be chained together to allow them to be -detached from the log items. That is, when the CIL is flushed the memory -buffer and log vector attached to each log item needs to be attached to the -checkpoint context so that the log item can be released. In diagrammatic form, -the CIL would look like this before the flush: - - CIL Head - | - V - Log Item <-> log vector 1 -> memory buffer - | -> vector array - V - Log Item <-> log vector 2 -> memory buffer - | -> vector array - V - ...... - | - V - Log Item <-> log vector N-1 -> memory buffer - | -> vector array - V - Log Item <-> log vector N -> memory buffer - -> vector array - -And after the flush the CIL head is empty, and the checkpoint context log -vector list would look like: - - Checkpoint Context - | - V - log vector 1 -> memory buffer - | -> vector array - | -> Log Item - V - log vector 2 -> memory buffer - | -> vector array - | -> Log Item - V - ...... 
- | - V - log vector N-1 -> memory buffer - | -> vector array - | -> Log Item - V - log vector N -> memory buffer - -> vector array - -> Log Item - -Once this transfer is done, the CIL can be unlocked and new transactions can -start, while the checkpoint flush code works over the log vector chain to -commit the checkpoint. - -Once the checkpoint is written into the log buffers, the checkpoint context is -attached to the log buffer that the commit record was written to along with a -completion callback. Log IO completion will call that callback, which can then -run transaction committed processing for the log items (i.e. insert into AIL -and unpin) in the log vector chain and then free the log vector chain and -checkpoint context. - -Discussion Point: I am uncertain as to whether the log item is the most -efficient way to track vectors, even though it seems like the natural way to do -it. The fact that we walk the log items (in the CIL) just to chain the log -vectors and break the link between the log item and the log vector means that -we take a cache line hit for the log item list modification, then another for -the log vector chaining. If we track by the log vectors, then we only need to -break the link between the log item and the log vector, which means we should -dirty only the log item cachelines. Normally I wouldn't be concerned about one -vs two dirty cachelines except for the fact I've seen upwards of 80,000 log -vectors in one checkpoint transaction. I'd guess this is a "measure and -compare" situation that can be done after a working and reviewed implementation -is in the dev tree.... - -Delayed Logging: Checkpoint Sequencing - -One of the key aspects of the XFS transaction subsystem is that it tags -committed transactions with the log sequence number of the transaction commit. -This allows transactions to be issued asynchronously even though there may be -future operations that cannot be completed until that transaction is fully -committed to the log. 
In the rare case that a dependent operation occurs (e.g. -re-using a freed metadata extent for a data extent), a special, optimised log -force can be issued to force the dependent transaction to disk immediately. - -To do this, transactions need to record the LSN of the commit record of the -transaction. This LSN comes directly from the log buffer the transaction is -written into. While this works just fine for the existing transaction -mechanism, it does not work for delayed logging because transactions are not -written directly into the log buffers. Hence some other method of sequencing -transactions is required. - -As discussed in the checkpoint section, delayed logging uses per-checkpoint -contexts, and as such it is simple to assign a sequence number to each -checkpoint. Because the switching of checkpoint contexts must be done -atomically, it is simple to ensure that each new context has a monotonically -increasing sequence number assigned to it without the need for an external -atomic counter - we can just take the current context sequence number and add -one to it for the new context. - -Then, instead of assigning a log buffer LSN to the transaction commit LSN -during the commit, we can assign the current checkpoint sequence. This allows -operations that track transactions that have not yet completed know what -checkpoint sequence needs to be committed before they can continue. As a -result, the code that forces the log to a specific LSN now needs to ensure that -the log forces to a specific checkpoint. - -To ensure that we can do this, we need to track all the checkpoint contexts -that are currently committing to the log. When we flush a checkpoint, the -context gets added to a "committing" list which can be searched. When a -checkpoint commit completes, it is removed from the committing list. 
Because -the checkpoint context records the LSN of the commit record for the checkpoint, -we can also wait on the log buffer that contains the commit record, thereby -using the existing log force mechanisms to execute synchronous forces. - -It should be noted that the synchronous forces may need to be extended with -mitigation algorithms similar to the current log buffer code to allow -aggregation of multiple synchronous transactions if there are already -synchronous transactions being flushed. Investigation of the performance of the -current design is needed before making any decisions here. - -The main concern with log forces is to ensure that all the previous checkpoints -are also committed to disk before the one we need to wait for. Therefore we -need to check that all the prior contexts in the committing list are also -complete before waiting on the one we need to complete. We do this -synchronisation in the log force code so that we don't need to wait anywhere -else for such serialisation - it only matters when we do a log force. - -The only remaining complexity is that a log force now also has to handle the -case where the forcing sequence number is the same as the current context. That -is, we need to flush the CIL and potentially wait for it to complete. This is a -simple addition to the existing log forcing code to check the sequence numbers -and push if required. Indeed, placing the current sequence checkpoint flush in -the log force code enables the current mechanism for issuing synchronous -transactions to remain untouched (i.e. commit an asynchronous transaction, then -force the log at the LSN of that transaction) and so the higher level code -behaves the same regardless of whether delayed logging is being used or not. - -Delayed Logging: Checkpoint Log Space Accounting - -The big issue for a checkpoint transaction is the log space reservation for the -transaction. 
We don't know how big a checkpoint transaction is going to be -ahead of time, nor how many log buffers it will take to write out, nor the -number of split log vector regions are going to be used. We can track the -amount of log space required as we add items to the commit item list, but we -still need to reserve the space in the log for the checkpoint. - -A typical transaction reserves enough space in the log for the worst case space -usage of the transaction. The reservation accounts for log record headers, -transaction and region headers, headers for split regions, buffer tail padding, -etc. as well as the actual space for all the changed metadata in the -transaction. While some of this is fixed overhead, much of it is dependent on -the size of the transaction and the number of regions being logged (the number -of log vectors in the transaction). - -An example of the differences would be logging directory changes versus logging -inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then -there are lots of transactions that only contain an inode core and an inode log -format structure. That is, two vectors totaling roughly 150 bytes. If we modify -10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each -vector is 12 bytes, so the total to be logged is approximately 1.75MB. In -comparison, if we are logging full directory buffers, they are typically 4KB -each, so we in 1.5MB of directory buffers we'd have roughly 400 buffers and a -buffer format structure for each buffer - roughly 800 vectors or 1.51MB total -space. From this, it should be obvious that a static log space reservation is -not particularly flexible and is difficult to select the "optimal value" for -all workloads. - -Further, if we are going to use a static reservation, which bit of the entire -reservation does it cover? 
We account for space used by the transaction -reservation by tracking the space currently used by the object in the CIL and -then calculating the increase or decrease in space used as the object is -relogged. This allows for a checkpoint reservation to only have to account for -log buffer metadata used such as log header records. - -However, even using a static reservation for just the log metadata is -problematic. Typically log record headers use at least 16KB of log space per -1MB of log space consumed (512 bytes per 32k) and the reservation needs to be -large enough to handle arbitrary sized checkpoint transactions. This -reservation needs to be made before the checkpoint is started, and we need to -be able to reserve the space without sleeping. For a 8MB checkpoint, we need a -reservation of around 150KB, which is a non-trivial amount of space. - -A static reservation needs to manipulate the log grant counters - we can take a -permanent reservation on the space, but we still need to make sure we refresh -the write reservation (the actual space available to the transaction) after -every checkpoint transaction completion. Unfortunately, if this space is not -available when required, then the regrant code will sleep waiting for it. - -The problem with this is that it can lead to deadlocks as we may need to commit -checkpoints to be able to free up log space (refer back to the description of -rolling transactions for an example of this). Hence we *must* always have -space available in the log if we are to use static reservations, and that is -very difficult and complex to arrange. It is possible to do, but there is a -simpler way. - -The simpler way of doing this is tracking the entire log space used by the -items in the CIL and using this to dynamically calculate the amount of log -space required by the log metadata. 
If this log metadata space changes as a
-result of a transaction commit inserting a new memory buffer into the CIL, then
-the difference in space required is removed from the transaction that causes
-the change. Transactions at this level will *always* have enough space
-available in their reservation for this as they have already reserved the
-maximal amount of log metadata space they require, and such a delta reservation
-will always be less than or equal to the maximal amount in the reservation.
-
-Hence we can grow the checkpoint transaction reservation dynamically as items
-are added to the CIL and avoid the need for reserving and regranting log space
-up front. This avoids deadlocks and removes a blocking point from the
-checkpoint flush code.
-
-As mentioned earlier, transactions can't grow to more than half the size of the
-log. Hence as part of the reservation growing, we need to also check the size
-of the reservation against the maximum allowed transaction size. If we reach
-the maximum threshold, we need to push the CIL to the log. This is effectively
-a "background flush" and is done on demand. This is identical to
-a CIL push triggered by a log force, only that there is no waiting for the
-checkpoint commit to complete. This background push is checked and executed by
-transaction commit code.
-
-If the transaction subsystem goes idle while we still have items in the CIL,
-they will be flushed by the periodic log force issued by the xfssyncd. This log
-force will push the CIL to disk, and if the transaction subsystem stays idle,
-allow the idle log to be covered (effectively marked clean) in exactly the same
-manner that is done for the existing logging method. A discussion point is
-whether this log force needs to be done more frequently than the current rate
-which is once every 30s.
-
-
-Delayed Logging: Log Item Pinning
-
-Currently log items are pinned during transaction commit while the items are
-still locked. 
This happens just after the items are formatted, though it could
-be done any time before the items are unlocked. The result of this mechanism is
-that items get pinned once for every transaction that is committed to the log
-buffers. Hence items that are relogged in the log buffers will have a pin count
-for every outstanding transaction they were dirtied in. When each of these
-transactions is completed, they will unpin the item once. As a result, the item
-only becomes unpinned when all the transactions complete and there are no
-pending transactions. Thus the pinning and unpinning of a log item is symmetric
-as there is a 1:1 relationship with transaction commit and log item completion.
-
-For delayed logging, however, we have an asymmetric transaction commit to
-completion relationship. Every time an object is relogged in the CIL it goes
-through the commit process without a corresponding completion being registered.
-That is, we now have a many-to-one relationship between transaction commit and
-log item completion. The result of this is that pinning and unpinning of the
-log items becomes unbalanced if we retain the "pin on transaction commit, unpin
-on transaction completion" model.
-
-To keep pin/unpin symmetry, the algorithm needs to change to a "pin on
-insertion into the CIL, unpin on checkpoint completion". In other words, the
-pinning and unpinning becomes symmetric around a checkpoint context. We have to
-pin the object the first time it is inserted into the CIL - if it is already in
-the CIL during a transaction commit, then we do not pin it again. Because there
-can be multiple outstanding checkpoint contexts, we can still see elevated pin
-counts, but as each checkpoint completes the pin count will retain the correct
-value according to its context.
-
-Just to make matters slightly more complex, this checkpoint level context
-for the pin count means that the pinning of an item must take place under the
-CIL commit/flush lock. 
If we pin the object outside this lock, we cannot -guarantee which context the pin count is associated with. This is because of -the fact pinning the item is dependent on whether the item is present in the -current CIL or not. If we don't pin the CIL first before we check and pin the -object, we have a race with CIL being flushed between the check and the pin -(or not pinning, as the case may be). Hence we must hold the CIL flush/commit -lock to guarantee that we pin the items correctly. - -Delayed Logging: Concurrent Scalability - -A fundamental requirement for the CIL is that accesses through transaction -commits must scale to many concurrent commits. The current transaction commit -code does not break down even when there are transactions coming from 2048 -processors at once. The current transaction code does not go any faster than if -there was only one CPU using it, but it does not slow down either. - -As a result, the delayed logging transaction commit code needs to be designed -for concurrency from the ground up. It is obvious that there are serialisation -points in the design - the three important ones are: - - 1. Locking out new transaction commits while flushing the CIL - 2. Adding items to the CIL and updating item space accounting - 3. Checkpoint commit ordering - -Looking at the transaction commit and CIL flushing interactions, it is clear -that we have a many-to-one interaction here. That is, the only restriction on -the number of concurrent transactions that can be trying to commit at once is -the amount of space available in the log for their reservations. The practical -limit here is in the order of several hundred concurrent transactions for a -128MB log, which means that it is generally one per CPU in a machine. 
- -The amount of time a transaction commit needs to hold out a flush is a -relatively long period of time - the pinning of log items needs to be done -while we are holding out a CIL flush, so at the moment that means it is held -across the formatting of the objects into memory buffers (i.e. while memcpy()s -are in progress). Ultimately a two pass algorithm where the formatting is done -separately to the pinning of objects could be used to reduce the hold time of -the transaction commit side. - -Because of the number of potential transaction commit side holders, the lock -really needs to be a sleeping lock - if the CIL flush takes the lock, we do not -want every other CPU in the machine spinning on the CIL lock. Given that -flushing the CIL could involve walking a list of tens of thousands of log -items, it will get held for a significant time and so spin contention is a -significant concern. Preventing lots of CPUs spinning doing nothing is the -main reason for choosing a sleeping lock even though nothing in either the -transaction commit or CIL flush side sleeps with the lock held. - -It should also be noted that CIL flushing is also a relatively rare operation -compared to transaction commit for asynchronous transaction workloads - only -time will tell if using a read-write semaphore for exclusion will limit -transaction commit concurrency due to cache line bouncing of the lock on the -read side. - -The second serialisation point is on the transaction commit side where items -are inserted into the CIL. Because transactions can enter this code -concurrently, the CIL needs to be protected separately from the above -commit/flush exclusion. It also needs to be an exclusive lock but it is only -held for a very short time and so a spin lock is appropriate here. It is -possible that this lock will become a contention point, but given the short -hold time once per transaction I think that contention is unlikely. 
- -The final serialisation point is the checkpoint commit record ordering code -that is run as part of the checkpoint commit and log force sequencing. The code -path that triggers a CIL flush (i.e. whatever triggers the log force) will enter -an ordering loop after writing all the log vectors into the log buffers but -before writing the commit record. This loop walks the list of committing -checkpoints and needs to block waiting for checkpoints to complete their commit -record write. As a result it needs a lock and a wait variable. Log force -sequencing also requires the same lock, list walk, and blocking mechanism to -ensure completion of checkpoints. - -These two sequencing operations can use the mechanism even though the -events they are waiting for are different. The checkpoint commit record -sequencing needs to wait until checkpoint contexts contain a commit LSN -(obtained through completion of a commit record write) while log force -sequencing needs to wait until previous checkpoint contexts are removed from -the committing list (i.e. they've completed). A simple wait variable and -broadcast wakeups (thundering herds) has been used to implement these two -serialisation queues. They use the same lock as the CIL, too. If we see too -much contention on the CIL lock, or too many context switches as a result of -the broadcast wakeups these operations can be put under a new spinlock and -given separate wait lists to reduce lock contention and the number of processes -woken by the wrong event. - - -Lifecycle Changes - -The existing log item life cycle is as follows: - - 1. Transaction allocate - 2. Transaction reserve - 3. Lock item - 4. Join item to transaction - If not already attached, - Allocate log item - Attach log item to owner item - Attach log item to transaction - 5. Modify item - Record modifications in log item - 6. 
Transaction commit - Pin item in memory - Format item into log buffer - Write commit LSN into transaction - Unlock item - Attach transaction to log buffer - - - - - 7. Transaction completion - Mark log item committed - Insert log item into AIL - Write commit LSN into log item - Unpin log item - 8. AIL traversal - Lock item - Mark log item clean - Flush item to disk - - - - 9. Log item removed from AIL - Moves log tail - Item unlocked - -Essentially, steps 1-6 operate independently from step 7, which is also -independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9 -at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur -at the same time. If the log item is in the AIL or between steps 6 and 7 -and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9 -are entered and completed is the object considered clean. - -With delayed logging, there are new steps inserted into the life cycle: - - 1. Transaction allocate - 2. Transaction reserve - 3. Lock item - 4. Join item to transaction - If not already attached, - Allocate log item - Attach log item to owner item - Attach log item to transaction - 5. Modify item - Record modifications in log item - 6. Transaction commit - Pin item in memory if not pinned in CIL - Format item into log vector + buffer - Attach log vector and buffer to log item - Insert log item into CIL - Write CIL context sequence into transaction - Unlock item - - - - 7. CIL push - lock CIL flush - Chain log vectors and buffers together - Remove items from CIL - unlock CIL flush - write log vectors into log - sequence commit records - attach checkpoint context to log buffer - - - - - 8. Checkpoint completion - Mark log item committed - Insert item into AIL - Write commit LSN into log item - Unpin log item - 9. AIL traversal - Lock item - Mark log item clean - Flush item to disk - - 10. 
Log item removed from AIL - Moves log tail - Item unlocked - -From this, it can be seen that the only life cycle differences between the two -logging methods are in the middle of the life cycle - they still have the same -beginning and end and execution constraints. The only differences are in the -committing of the log items to the log itself and the completion processing. -Hence delayed logging should not introduce any constraints on log item -behaviour, allocation or freeing that don't already exist. - -As a result of this zero-impact "insertion" of delayed logging infrastructure -and the design of the internal structures to avoid on disk format changes, we -can basically switch between delayed logging and the existing mechanism with a -mount option. Fundamentally, there is no reason why the log manager would not -be able to swap methods automatically and transparently depending on load -characteristics, but this should not be necessary if delayed logging works as -designed. diff --git a/MAINTAINERS b/MAINTAINERS index bbf85128ca89..d8f11f8134df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18533,7 +18533,7 @@ W: http://xfs.org/ T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-delayed-logging-design.txt +F: Documentation/filesystems/xfs-delayed-logging-design.rst F: Documentation/filesystems/xfs-self-describing-metadata.txt F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h -- cgit v1.2.3 From fc2f6fe745a0fa287f68cb8fba04040aaf73fa28 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:20 +0200 Subject: docs: filesystems: convert xfs-self-describing-metadata.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/7c26b200e12cfc07b9bd379612452d845a8d1474.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/index.rst | 1 + .../filesystems/xfs-self-describing-metadata.rst | 352 +++++++++++++++++++++ .../filesystems/xfs-self-describing-metadata.txt | 350 -------------------- MAINTAINERS | 2 +- 4 files changed, 354 insertions(+), 351 deletions(-) create mode 100644 Documentation/filesystems/xfs-self-describing-metadata.rst delete mode 100644 Documentation/filesystems/xfs-self-describing-metadata.txt diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 0fd90f854b33..e242cb75f9f2 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -118,4 +118,5 @@ Documentation for filesystem implementations. virtiofs vfat xfs-delayed-logging-design + xfs-self-describing-metadata zonefs diff --git a/Documentation/filesystems/xfs-self-describing-metadata.rst b/Documentation/filesystems/xfs-self-describing-metadata.rst new file mode 100644 index 000000000000..51cdafe01ab1 --- /dev/null +++ b/Documentation/filesystems/xfs-self-describing-metadata.rst @@ -0,0 +1,352 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +XFS Self Describing Metadata +============================ + +Introduction +============ + +The largest scalability problem facing XFS is not one of algorithmic +scalability, but of verification of the filesystem structure. Scalabilty of the +structures and indexes on disk and the algorithms for iterating them are +adequate for supporting PB scale filesystems with billions of inodes, however it +is this very scalability that causes the verification problem. + +Almost all metadata on XFS is dynamically allocated. 
The only fixed location +metadata is the allocation group headers (SB, AGF, AGFL and AGI), while all +other metadata structures need to be discovered by walking the filesystem +structure in different ways. While this is already done by userspace tools for +validating and repairing the structure, there are limits to what they can +verify, and this in turn limits the supportable size of an XFS filesystem. + +For example, it is entirely possible to manually use xfs_db and a bit of +scripting to analyse the structure of a 100TB filesystem when trying to +determine the root cause of a corruption problem, but it is still mainly a +manual task of verifying that things like single bit errors or misplaced writes +weren't the ultimate cause of a corruption event. It may take a few hours to a +few days to perform such forensic analysis, so at this scale root cause +analysis is entirely possible. + +However, if we scale the filesystem up to 1PB, we now have 10x as much metadata +to analyse and so that analysis blows out towards weeks/months of forensic work. +Most of the analysis work is slow and tedious, so as the amount of analysis goes +up, the more likely that the cause will be lost in the noise. Hence the primary +concern for supporting PB scale filesystems is minimising the time and effort +required for basic forensic analysis of the filesystem structure. + + +Self Describing Metadata +======================== + +One of the problems with the current metadata format is that apart from the +magic number in the metadata block, we have no other way of identifying what it +is supposed to be. We can't even identify if it is the right place. Put simply, +you can't look at a single metadata block in isolation and say "yes, it is +supposed to be there and the contents are valid". 
+ +Hence most of the time spent on forensic analysis is spent doing basic +verification of metadata values, looking for values that are in range (and hence +not detected by automated verification checks) but are not correct. Finding and +understanding how things like cross linked block lists (e.g. sibling +pointers in a btree end up with loops in them) are the key to understanding what +went wrong, but it is impossible to tell what order the blocks were linked into +each other or written to disk after the fact. + +Hence we need to record more information into the metadata to allow us to +quickly determine if the metadata is intact and can be ignored for the purpose +of analysis. We can't protect against every possible type of error, but we can +ensure that common types of errors are easily detectable. Hence the concept of +self describing metadata. + +The first, fundamental requirement of self describing metadata is that the +metadata object contains some form of unique identifier in a well known +location. This allows us to identify the expected contents of the block and +hence parse and verify the metadata object. IF we can't independently identify +the type of metadata in the object, then the metadata doesn't describe itself +very well at all! + +Luckily, almost all XFS metadata has magic numbers embedded already - only the +AGFL, remote symlinks and remote attribute blocks do not contain identifying +magic numbers. Hence we can change the on-disk format of all these objects to +add more identifying information and detect this simply by changing the magic +numbers in the metadata objects. That is, if it has the current magic number, +the metadata isn't self identifying. If it contains a new magic number, it is +self identifying and we can do much more expansive automated verification of the +metadata object at runtime, during forensic analysis or repair. + +As a primary concern, self describing metadata needs some form of overall +integrity checking. 
We cannot trust the metadata if we cannot verify that it has +not been changed as a result of external influences. Hence we need some form of +integrity check, and this is done by adding CRC32c validation to the metadata +block. If we can verify the block contains the metadata it was intended to +contain, a large amount of the manual verification work can be skipped. + +CRC32c was selected as metadata cannot be more than 64k in length in XFS and +hence a 32 bit CRC is more than sufficient to detect multi-bit errors in +metadata blocks. CRC32c is also now hardware accelerated on common CPUs so it is +fast. So while CRC32c is not the strongest of possible integrity checks that +could be used, it is more than sufficient for our needs and has relatively +little overhead. Adding support for larger integrity fields and/or algorithms +does not really provide any extra value over CRC32c, but it does add a lot of +complexity and so there is no provision for changing the integrity checking +mechanism. + +Self describing metadata needs to contain enough information so that the +metadata block can be verified as being in the correct place without needing to +look at any other metadata. This means it needs to contain location information. +Just adding a block number to the metadata is not sufficient to protect against +mis-directed writes - a write might be misdirected to the wrong LUN and so be +written to the "correct block" of the wrong filesystem. Hence location +information must contain a filesystem identifier as well as a block number. + +Another key information point in forensic analysis is knowing who the metadata +block belongs to. We already know the type, the location, that it is valid +and/or corrupted, and how long ago that it was last modified. Knowing the owner +of the block is important as it allows us to find other related metadata to +determine the scope of the corruption. 
For example, if we have a extent btree +object, we don't know what inode it belongs to and hence have to walk the entire +filesystem to find the owner of the block. Worse, the corruption could mean that +no owner can be found (i.e. it's an orphan block), and so without an owner field +in the metadata we have no idea of the scope of the corruption. If we have an +owner field in the metadata object, we can immediately do top down validation to +determine the scope of the problem. + +Different types of metadata have different owner identifiers. For example, +directory, attribute and extent tree blocks are all owned by an inode, while +freespace btree blocks are owned by an allocation group. Hence the size and +contents of the owner field are determined by the type of metadata object we are +looking at. The owner information can also identify misplaced writes (e.g. +freespace btree block written to the wrong AG). + +Self describing metadata also needs to contain some indication of when it was +written to the filesystem. One of the key information points when doing forensic +analysis is how recently the block was modified. Correlation of set of corrupted +metadata blocks based on modification times is important as it can indicate +whether the corruptions are related, whether there's been multiple corruption +events that lead to the eventual failure, and even whether there are corruptions +present that the run-time verification is not detecting. + +For example, we can determine whether a metadata object is supposed to be free +space or still allocated if it is still referenced by its owner by looking at +when the free space btree block that contains the block was last written +compared to when the metadata object itself was last written. If the free space +block is more recent than the object and the object's owner, then there is a +very good chance that the block should have been removed from the owner. 
+ +To provide this "written timestamp", each metadata block gets the Log Sequence +Number (LSN) of the most recent transaction it was modified on written into it. +This number will always increase over the life of the filesystem, and the only +thing that resets it is running xfs_repair on the filesystem. Further, by use of +the LSN we can tell if the corrupted metadata all belonged to the same log +checkpoint and hence have some idea of how much modification occurred between +the first and last instance of corrupt metadata on disk and, further, how much +modification occurred between the corruption being written and when it was +detected. + +Runtime Validation +================== + +Validation of self-describing metadata takes place at runtime in two places: + + - immediately after a successful read from disk + - immediately prior to write IO submission + +The verification is completely stateless - it is done independently of the +modification process, and seeks only to check that the metadata is what it says +it is and that the metadata fields are within bounds and internally consistent. +As such, we cannot catch all types of corruption that can occur within a block +as there may be certain limitations that operational state enforces of the +metadata, or there may be corruption of interblock relationships (e.g. corrupted +sibling pointer lists). Hence we still need stateful checking in the main code +body, but in general most of the per-field validation is handled by the +verifiers. + +For read verification, the caller needs to specify the expected type of metadata +that it should see, and the IO completion process verifies that the metadata +object matches what was expected. If the verification process fails, then it +marks the object being read as EFSCORRUPTED. The caller needs to catch this +error (same as for IO errors), and if it needs to take special action due to a +verification error it can do so by catching the EFSCORRUPTED error value. 
If we +need more discrimination of error type at higher levels, we can define new +error numbers for different errors as necessary. + +The first step in read verification is checking the magic number and determining +whether CRC validation is necessary. If it is, the CRC32c is calculated and +compared against the value stored in the object itself. Once this is validated, +further checks are made against the location information, followed by extensive +object specific metadata validation. If any of these checks fail, then the +buffer is considered corrupt and the EFSCORRUPTED error is set appropriately. + +Write verification is the opposite of the read verification - first the object +is extensively verified and if it is OK we then update the LSN from the last +modification made to the object. After this, we calculate the CRC and insert it +into the object. Once this is done the write IO is allowed to continue. If any +error occurs during this process, the buffer is again marked with an EFSCORRUPTED +error for the higher layers to catch. + +Structures +========== + +A typical on-disk structure needs to contain the following information:: + + struct xfs_ondisk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC, not logged */ + uuid_t uuid; /* filesystem identifier */ + __be64 owner; /* parent object */ + __be64 blkno; /* location on disk */ + __be64 lsn; /* last modification in log, not logged */ + }; + +Depending on the metadata, this information may be part of a header structure +separate to the metadata contents, or may be distributed through an existing +structure. The latter occurs with metadata that already contains some of this +information, such as the superblock and AG headers. + +Other metadata may have different formats for the information, but the same +level of information is generally provided. For example: + + - short btree blocks have a 32 bit owner (ag number) and a 32 bit block + number for location. 
The two of these combined provide the same + information as @owner and @blkno in eh above structure, but using 8 + bytes less space on disk. + + - directory/attribute node blocks have a 16 bit magic number, and the + header that contains the magic number has other information in it as + well. hence the additional metadata headers change the overall format + of the metadata. + +A typical buffer read verifier is structured as follows:: + + #define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc) + + static void + xfs_foo_read_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_FOO_CRC_OFF)) || + !xfs_foo_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + } + +The code ensures that the CRC is only checked if the filesystem has CRCs enabled +by checking the superblock of the feature bit, and then if the CRC verifies OK +(or is not needed) it verifies the actual contents of the block. + +The verifier function will take a couple of different forms, depending on +whether the magic number can be used to determine the format of the block. 
In +the case it can't, the code is structured as follows:: + + static bool + xfs_foo_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_mount; + struct xfs_ondisk_hdr *hdr = bp->b_addr; + + if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) + return false; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(hdr->blkno)) + return false; + if (hdr->owner == 0) + return false; + } + + /* object specific verification checks here */ + + return true; + } + +If there are different magic numbers for the different formats, the verifier +will look like:: + + static bool + xfs_foo_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_mount; + struct xfs_ondisk_hdr *hdr = bp->b_addr; + + if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) { + if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(hdr->blkno)) + return false; + if (hdr->owner == 0) + return false; + } else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) + return false; + + /* object specific verification checks here */ + + return true; + } + +Write verifiers are very similar to the read verifiers, they just do things in +the opposite order to the read verifiers. 
A typical write verifier:: + + static void + xfs_foo_write_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_foo_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + + if (bip) { + struct xfs_ondisk_hdr *hdr = bp->b_addr; + hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF); + } + +This will verify the internal structure of the metadata before we go any +further, detecting corruptions that have occurred as the metadata has been +modified in memory. If the metadata verifies OK, and CRCs are enabled, we then +update the LSN field (when it was last modified) and calculate the CRC on the +metadata. Once this is done, we can issue the IO. + +Inodes and Dquots +================= + +Inodes and dquots are special snowflakes. They have per-object CRC and +self-identifiers, but they are packed so that there are multiple objects per +buffer. Hence we do not use per-buffer verifiers to do the work of per-object +verification and CRC calculations. The per-buffer verifiers simply perform basic +identification of the buffer - that they contain inodes or dquots, and that +there are magic numbers in all the expected spots. All further CRC and +verification checks are done when each inode is read from or written back to the +buffer. + +The structure of the verifiers and the identifiers checks is very similar to the +buffer code described above. The only difference is where they are called. For +example, inode read verification is done in xfs_iread() when the inode is first +read out of the buffer and the struct xfs_inode is instantiated. 
The inode is +already extensively verified during writeback in xfs_iflush_int, so the only +addition here is to add the LSN and CRC to the inode as it is copied back into +the buffer. + +XXX: inode unlinked list modification doesn't recalculate the inode CRC! None of +the unlinked list modifications check or update CRCs, neither during unlink nor +log recovery. So, it's gone unnoticed until now. This won't matter immediately - +repair will probably complain about it - but it needs to be fixed. diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.txt deleted file mode 100644 index 8db0121d0980..000000000000 --- a/Documentation/filesystems/xfs-self-describing-metadata.txt +++ /dev/null @@ -1,350 +0,0 @@ -XFS Self Describing Metadata ----------------------------- - -Introduction ------------- - -The largest scalability problem facing XFS is not one of algorithmic -scalability, but of verification of the filesystem structure. Scalabilty of the -structures and indexes on disk and the algorithms for iterating them are -adequate for supporting PB scale filesystems with billions of inodes, however it -is this very scalability that causes the verification problem. - -Almost all metadata on XFS is dynamically allocated. The only fixed location -metadata is the allocation group headers (SB, AGF, AGFL and AGI), while all -other metadata structures need to be discovered by walking the filesystem -structure in different ways. While this is already done by userspace tools for -validating and repairing the structure, there are limits to what they can -verify, and this in turn limits the supportable size of an XFS filesystem. 
- -For example, it is entirely possible to manually use xfs_db and a bit of -scripting to analyse the structure of a 100TB filesystem when trying to -determine the root cause of a corruption problem, but it is still mainly a -manual task of verifying that things like single bit errors or misplaced writes -weren't the ultimate cause of a corruption event. It may take a few hours to a -few days to perform such forensic analysis, so for at this scale root cause -analysis is entirely possible. - -However, if we scale the filesystem up to 1PB, we now have 10x as much metadata -to analyse and so that analysis blows out towards weeks/months of forensic work. -Most of the analysis work is slow and tedious, so as the amount of analysis goes -up, the more likely that the cause will be lost in the noise. Hence the primary -concern for supporting PB scale filesystems is minimising the time and effort -required for basic forensic analysis of the filesystem structure. - - -Self Describing Metadata ------------------------- - -One of the problems with the current metadata format is that apart from the -magic number in the metadata block, we have no other way of identifying what it -is supposed to be. We can't even identify if it is the right place. Put simply, -you can't look at a single metadata block in isolation and say "yes, it is -supposed to be there and the contents are valid". - -Hence most of the time spent on forensic analysis is spent doing basic -verification of metadata values, looking for values that are in range (and hence -not detected by automated verification checks) but are not correct. Finding and -understanding how things like cross linked block lists (e.g. sibling -pointers in a btree end up with loops in them) are the key to understanding what -went wrong, but it is impossible to tell what order the blocks were linked into -each other or written to disk after the fact. 
- -Hence we need to record more information into the metadata to allow us to -quickly determine if the metadata is intact and can be ignored for the purpose -of analysis. We can't protect against every possible type of error, but we can -ensure that common types of errors are easily detectable. Hence the concept of -self describing metadata. - -The first, fundamental requirement of self describing metadata is that the -metadata object contains some form of unique identifier in a well known -location. This allows us to identify the expected contents of the block and -hence parse and verify the metadata object. IF we can't independently identify -the type of metadata in the object, then the metadata doesn't describe itself -very well at all! - -Luckily, almost all XFS metadata has magic numbers embedded already - only the -AGFL, remote symlinks and remote attribute blocks do not contain identifying -magic numbers. Hence we can change the on-disk format of all these objects to -add more identifying information and detect this simply by changing the magic -numbers in the metadata objects. That is, if it has the current magic number, -the metadata isn't self identifying. If it contains a new magic number, it is -self identifying and we can do much more expansive automated verification of the -metadata object at runtime, during forensic analysis or repair. - -As a primary concern, self describing metadata needs some form of overall -integrity checking. We cannot trust the metadata if we cannot verify that it has -not been changed as a result of external influences. Hence we need some form of -integrity check, and this is done by adding CRC32c validation to the metadata -block. If we can verify the block contains the metadata it was intended to -contain, a large amount of the manual verification work can be skipped. 
- -CRC32c was selected as metadata cannot be more than 64k in length in XFS and -hence a 32 bit CRC is more than sufficient to detect multi-bit errors in -metadata blocks. CRC32c is also now hardware accelerated on common CPUs so it is -fast. So while CRC32c is not the strongest of possible integrity checks that -could be used, it is more than sufficient for our needs and has relatively -little overhead. Adding support for larger integrity fields and/or algorithms -does really provide any extra value over CRC32c, but it does add a lot of -complexity and so there is no provision for changing the integrity checking -mechanism. - -Self describing metadata needs to contain enough information so that the -metadata block can be verified as being in the correct place without needing to -look at any other metadata. This means it needs to contain location information. -Just adding a block number to the metadata is not sufficient to protect against -mis-directed writes - a write might be misdirected to the wrong LUN and so be -written to the "correct block" of the wrong filesystem. Hence location -information must contain a filesystem identifier as well as a block number. - -Another key information point in forensic analysis is knowing who the metadata -block belongs to. We already know the type, the location, that it is valid -and/or corrupted, and how long ago that it was last modified. Knowing the owner -of the block is important as it allows us to find other related metadata to -determine the scope of the corruption. For example, if we have a extent btree -object, we don't know what inode it belongs to and hence have to walk the entire -filesystem to find the owner of the block. Worse, the corruption could mean that -no owner can be found (i.e. it's an orphan block), and so without an owner field -in the metadata we have no idea of the scope of the corruption. 
If we have an -owner field in the metadata object, we can immediately do top down validation to -determine the scope of the problem. - -Different types of metadata have different owner identifiers. For example, -directory, attribute and extent tree blocks are all owned by an inode, while -freespace btree blocks are owned by an allocation group. Hence the size and -contents of the owner field are determined by the type of metadata object we are -looking at. The owner information can also identify misplaced writes (e.g. -freespace btree block written to the wrong AG). - -Self describing metadata also needs to contain some indication of when it was -written to the filesystem. One of the key information points when doing forensic -analysis is how recently the block was modified. Correlation of set of corrupted -metadata blocks based on modification times is important as it can indicate -whether the corruptions are related, whether there's been multiple corruption -events that lead to the eventual failure, and even whether there are corruptions -present that the run-time verification is not detecting. - -For example, we can determine whether a metadata object is supposed to be free -space or still allocated if it is still referenced by its owner by looking at -when the free space btree block that contains the block was last written -compared to when the metadata object itself was last written. If the free space -block is more recent than the object and the object's owner, then there is a -very good chance that the block should have been removed from the owner. - -To provide this "written timestamp", each metadata block gets the Log Sequence -Number (LSN) of the most recent transaction it was modified on written into it. -This number will always increase over the life of the filesystem, and the only -thing that resets it is running xfs_repair on the filesystem. 
Further, by use of -the LSN we can tell if the corrupted metadata all belonged to the same log -checkpoint and hence have some idea of how much modification occurred between -the first and last instance of corrupt metadata on disk and, further, how much -modification occurred between the corruption being written and when it was -detected. - -Runtime Validation ------------------- - -Validation of self-describing metadata takes place at runtime in two places: - - - immediately after a successful read from disk - - immediately prior to write IO submission - -The verification is completely stateless - it is done independently of the -modification process, and seeks only to check that the metadata is what it says -it is and that the metadata fields are within bounds and internally consistent. -As such, we cannot catch all types of corruption that can occur within a block -as there may be certain limitations that operational state enforces of the -metadata, or there may be corruption of interblock relationships (e.g. corrupted -sibling pointer lists). Hence we still need stateful checking in the main code -body, but in general most of the per-field validation is handled by the -verifiers. - -For read verification, the caller needs to specify the expected type of metadata -that it should see, and the IO completion process verifies that the metadata -object matches what was expected. If the verification process fails, then it -marks the object being read as EFSCORRUPTED. The caller needs to catch this -error (same as for IO errors), and if it needs to take special action due to a -verification error it can do so by catching the EFSCORRUPTED error value. If we -need more discrimination of error type at higher levels, we can define new -error numbers for different errors as necessary. - -The first step in read verification is checking the magic number and determining -whether CRC validating is necessary. 
If it is, the CRC32c is calculated and -compared against the value stored in the object itself. Once this is validated, -further checks are made against the location information, followed by extensive -object specific metadata validation. If any of these checks fail, then the -buffer is considered corrupt and the EFSCORRUPTED error is set appropriately. - -Write verification is the opposite of the read verification - first the object -is extensively verified and if it is OK we then update the LSN from the last -modification made to the object, After this, we calculate the CRC and insert it -into the object. Once this is done the write IO is allowed to continue. If any -error occurs during this process, the buffer is again marked with a EFSCORRUPTED -error for the higher layers to catch. - -Structures ----------- - -A typical on-disk structure needs to contain the following information: - -struct xfs_ondisk_hdr { - __be32 magic; /* magic number */ - __be32 crc; /* CRC, not logged */ - uuid_t uuid; /* filesystem identifier */ - __be64 owner; /* parent object */ - __be64 blkno; /* location on disk */ - __be64 lsn; /* last modification in log, not logged */ -}; - -Depending on the metadata, this information may be part of a header structure -separate to the metadata contents, or may be distributed through an existing -structure. The latter occurs with metadata that already contains some of this -information, such as the superblock and AG headers. - -Other metadata may have different formats for the information, but the same -level of information is generally provided. For example: - - - short btree blocks have a 32 bit owner (ag number) and a 32 bit block - number for location. The two of these combined provide the same - information as @owner and @blkno in eh above structure, but using 8 - bytes less space on disk. - - - directory/attribute node blocks have a 16 bit magic number, and the - header that contains the magic number has other information in it as - well. 
hence the additional metadata headers change the overall format - of the metadata. - -A typical buffer read verifier is structured as follows: - -#define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc) - -static void -xfs_foo_read_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_FOO_CRC_OFF)) || - !xfs_foo_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } -} - -The code ensures that the CRC is only checked if the filesystem has CRCs enabled -by checking the superblock of the feature bit, and then if the CRC verifies OK -(or is not needed) it verifies the actual contents of the block. - -The verifier function will take a couple of different forms, depending on -whether the magic number can be used to determine the format of the block. In -the case it can't, the code is structured as follows: - -static bool -xfs_foo_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_ondisk_hdr *hdr = bp->b_addr; - - if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) - return false; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) { - if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) - return false; - if (bp->b_bn != be64_to_cpu(hdr->blkno)) - return false; - if (hdr->owner == 0) - return false; - } - - /* object specific verification checks here */ - - return true; -} - -If there are different magic numbers for the different formats, the verifier -will look like: - -static bool -xfs_foo_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_ondisk_hdr *hdr = bp->b_addr; - - if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) { - if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) - return false; - if (bp->b_bn != be64_to_cpu(hdr->blkno)) - return false; - if (hdr->owner == 0) - return false; - } else if (hdr->magic != 
cpu_to_be32(XFS_FOO_MAGIC)) - return false; - - /* object specific verification checks here */ - - return true; -} - -Write verifiers are very similar to the read verifiers, they just do things in -the opposite order to the read verifiers. A typical write verifier: - -static void -xfs_foo_write_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; - - if (!xfs_foo_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - - if (bip) { - struct xfs_ondisk_hdr *hdr = bp->b_addr; - hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn); - } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF); -} - -This will verify the internal structure of the metadata before we go any -further, detecting corruptions that have occurred as the metadata has been -modified in memory. If the metadata verifies OK, and CRCs are enabled, we then -update the LSN field (when it was last modified) and calculate the CRC on the -metadata. Once this is done, we can issue the IO. - -Inodes and Dquots ------------------ - -Inodes and dquots are special snowflakes. They have per-object CRC and -self-identifiers, but they are packed so that there are multiple objects per -buffer. Hence we do not use per-buffer verifiers to do the work of per-object -verification and CRC calculations. The per-buffer verifiers simply perform basic -identification of the buffer - that they contain inodes or dquots, and that -there are magic numbers in all the expected spots. All further CRC and -verification checks are done when each inode is read from or written back to the -buffer. - -The structure of the verifiers and the identifiers checks is very similar to the -buffer code described above. The only difference is where they are called. 
For -example, inode read verification is done in xfs_iread() when the inode is first -read out of the buffer and the struct xfs_inode is instantiated. The inode is -already extensively verified during writeback in xfs_iflush_int, so the only -addition here is to add the LSN and CRC to the inode as it is copied back into -the buffer. - -XXX: inode unlinked list modification doesn't recalculate the inode CRC! None of -the unlinked list modifications check or update CRCs, neither during unlink nor -log recovery. So, it's gone unnoticed until now. This won't matter immediately - -repair will probably complain about it - but it needs to be fixed. - diff --git a/MAINTAINERS b/MAINTAINERS index d8f11f8134df..ffd14379f81e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18534,7 +18534,7 @@ T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst F: Documentation/filesystems/xfs-delayed-logging-design.rst -F: Documentation/filesystems/xfs-self-describing-metadata.txt +F: Documentation/filesystems/xfs-self-describing-metadata.rst F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h -- cgit v1.2.3 From 982649915d626bba8753c04e994e5a6650523c64 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 27 Apr 2020 23:17:21 +0200 Subject: docs: filesystems: convert configfs.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Use copyright symbol; - Some whitespace fixes and new line breaks; - Mark literal blocks as such; - Add it to filesystems/index.rst. Also, as this file is alone on its own dir, and it doesn't seem too likely that other documents will follow it, let's move it to the filesystems/ root documentation dir. 
Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/c2424ec2ad4d735751434ff7f52144c44aa02d5a.1588021877.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/filesystems/configfs.rst | 535 ++++++++++++++++++++++++ Documentation/filesystems/configfs/configfs.txt | 508 ---------------------- Documentation/filesystems/index.rst | 1 + Documentation/iio/iio_configfs.rst | 2 +- Documentation/usb/gadget_configfs.rst | 4 +- fs/configfs/inode.c | 2 +- fs/configfs/item.c | 2 +- include/linux/configfs.h | 2 +- 8 files changed, 542 insertions(+), 514 deletions(-) create mode 100644 Documentation/filesystems/configfs.rst delete mode 100644 Documentation/filesystems/configfs/configfs.txt diff --git a/Documentation/filesystems/configfs.rst b/Documentation/filesystems/configfs.rst new file mode 100644 index 000000000000..f8941954c667 --- /dev/null +++ b/Documentation/filesystems/configfs.rst @@ -0,0 +1,535 @@ +======================================================= +Configfs - Userspace-driven Kernel Object Configuration +======================================================= + +Joel Becker + +Updated: 31 March 2005 + +Copyright (c) 2005 Oracle Corporation, + Joel Becker + + +What is configfs? +================= + +configfs is a ram-based filesystem that provides the converse of +sysfs's functionality. Where sysfs is a filesystem-based view of +kernel objects, configfs is a filesystem-based manager of kernel +objects, or config_items. + +With sysfs, an object is created in kernel (for example, when a device +is discovered) and it is registered with sysfs. Its attributes then +appear in sysfs, allowing userspace to read the attributes via +readdir(3)/read(2). It may allow some attributes to be modified via +write(2). The important point is that the object is created and +destroyed in kernel, the kernel controls the lifecycle of the sysfs +representation, and sysfs is merely a window on all this. 
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2). It is destroyed via rmdir(2). The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together. Unlike sysfs, the
+lifetime of the representation is completely driven by userspace. The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system. One is not a replacement for the other.
+
+Using configfs
+==============
+
+configfs can be compiled as a module or into the kernel. You can access
+it by doing::
+
+	mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems. Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config. Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2). The item's attributes will also
+appear at this time. readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values. Don't mix more than one attribute in one attribute file.
+
+There are two types of configfs attributes:
+
+* Normal attributes, which, similar to sysfs attributes, are small ASCII text
+  files, with a maximum size of one page (PAGE_SIZE, 4096 on i386). Preferably
+  only one value per file should be used, and the same caveats from sysfs apply.
+  Configfs expects write(2) to store the entire buffer at once. When writing to
+  normal configfs attributes, userspace processes should first read the entire
+  file, modify the portions they wish to change, and then write the entire
+  buffer back. 
+
+* Binary attributes, which are somewhat similar to sysfs binary attributes,
+  but with a few slight changes to semantics. The PAGE_SIZE limitation does not
+  apply, but the whole binary item must fit in a single kernel vmalloc'ed buffer.
+  The write(2) calls from user space are buffered, and the attributes'
+  write_bin_attribute method will be invoked on the final close, therefore it is
+  imperative for user-space to check the return code of close(2) in order to
+  verify that the operation finished successfully.
+  To avoid a malicious user OOMing the kernel, there's a per-binary attribute
+  maximum buffer value.
+
+When an item needs to be destroyed, remove it with rmdir(2). An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)). Links can be removed via unlink(2).
+
+Configuring FakeNBD: an Example
+===============================
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices. Call it FakeNBD. FakeNBD uses configfs
+for its configuration. Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it. Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine::
+
+	# ls /config
+	fakenbd
+
+A fakenbd connection can be created with mkdir(2). The name is
+arbitrary, but likely the tool will make some use of the name. Perhaps
+it is a uuid or a disk name::
+
+	# mkdir /config/fakenbd/disk1
+	# ls /config/fakenbd/disk1
+	target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to. The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write::
+
+	# echo 10.0.0.1 > /config/fakenbd/disk1/target
+	# echo /dev/sda1 > /config/fakenbd/disk1/device
+	# echo 1 > /config/fakenbd/disk1/rw
+
+That's it. 
That's all there is. Now the device is configured, via the +shell no less. + +Coding With configfs +==================== + +Every object in configfs is a config_item. A config_item reflects an +object in the subsystem. It has attributes that match values on that +object. configfs handles the filesystem representation of that object +and its attributes, allowing the subsystem to ignore all but the +basic show/store interaction. + +Items are created and destroyed inside a config_group. A group is a +collection of items that share the same attributes and operations. +Items are created by mkdir(2) and removed by rmdir(2), but configfs +handles that. The group has a set of operations to perform these tasks + +A subsystem is the top level of a client module. During initialization, +the client module registers the subsystem with configfs, the subsystem +appears as a directory at the top of the configfs filesystem. A +subsystem is also a config_group, and can do everything a config_group +can. + +struct config_item +================== + +:: + + struct config_item { + char *ci_name; + char ci_namebuf[UOBJ_NAME_LEN]; + struct kref ci_kref; + struct list_head ci_entry; + struct config_item *ci_parent; + struct config_group *ci_group; + struct config_item_type *ci_type; + struct dentry *ci_dentry; + }; + + void config_item_init(struct config_item *); + void config_item_init_type_name(struct config_item *, + const char *name, + struct config_item_type *type); + struct config_item *config_item_get(struct config_item *); + void config_item_put(struct config_item *); + +Generally, struct config_item is embedded in a container structure, a +structure that actually represents what the subsystem is doing. The +config_item portion of that structure is how the object interacts with +configfs. + +Whether statically defined in a source file or created by a parent +config_group, a config_item must have one of the _init() functions +called on it. 
This initializes the reference count and sets up the +appropriate fields. + +All users of a config_item should have a reference on it via +config_item_get(), and drop the reference when they are done via +config_item_put(). + +By itself, a config_item cannot do much more than appear in configfs. +Usually a subsystem wants the item to display and/or store attributes, +among other things. For that, it needs a type. + +struct config_item_type +======================= + +:: + + struct configfs_item_operations { + void (*release)(struct config_item *); + int (*allow_link)(struct config_item *src, + struct config_item *target); + void (*drop_link)(struct config_item *src, + struct config_item *target); + }; + + struct config_item_type { + struct module *ct_owner; + struct configfs_item_operations *ct_item_ops; + struct configfs_group_operations *ct_group_ops; + struct configfs_attribute **ct_attrs; + struct configfs_bin_attribute **ct_bin_attrs; + }; + +The most basic function of a config_item_type is to define what +operations can be performed on a config_item. All items that have been +allocated dynamically will need to provide the ct_item_ops->release() +method. This method is called when the config_item's reference count +reaches zero. + +struct configfs_attribute +========================= + +:: + + struct configfs_attribute { + char *ca_name; + struct module *ca_owner; + umode_t ca_mode; + ssize_t (*show)(struct config_item *, char *); + ssize_t (*store)(struct config_item *, const char *, size_t); + }; + +When a config_item wants an attribute to appear as a file in the item's +configfs directory, it must define a configfs_attribute describing it. +It then adds the attribute to the NULL-terminated array +config_item_type->ct_attrs. When the item appears in configfs, the +attribute file will appear with the configfs_attribute->ca_name +filename. configfs_attribute->ca_mode specifies the file permissions. 
+
+If an attribute is readable and provides a ->show method, that method will
+be called whenever userspace asks for a read(2) on the attribute. If an
+attribute is writable and provides a ->store method, that method will
+be called whenever userspace asks for a write(2) on the attribute.
+
+struct configfs_bin_attribute
+=============================
+
+::
+
+  struct configfs_bin_attribute {
+         struct configfs_attribute    cb_attr;
+         void                         *cb_private;
+         size_t                       cb_max_size;
+  };
+
+The binary attribute is used when one needs to use a binary blob to
+appear as the contents of a file in the item's configfs directory.
+To do so add the binary attribute to the NULL-terminated array
+config_item_type->ct_bin_attrs, and when the item appears in configfs, the
+attribute file will appear with the configfs_bin_attribute->cb_attr.ca_name
+filename. configfs_bin_attribute->cb_attr.ca_mode specifies the file
+permissions.
+The cb_private member is provided for use by the driver, while the
+cb_max_size member specifies the maximum amount of vmalloc buffer
+to be used.
+
+If a binary attribute is readable and the config_item provides a
+ct_item_ops->read_bin_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute. The converse
+will happen for write(2). The reads/writes are buffered so only a
+single read/write will occur; the attribute need not concern itself
+with it.
+
+struct config_group
+===================
+
+A config_item cannot live in a vacuum. The only way one can be created
+is via mkdir(2) on a config_group. 
This will trigger creation of a +child item:: + + struct config_group { + struct config_item cg_item; + struct list_head cg_children; + struct configfs_subsystem *cg_subsys; + struct list_head default_groups; + struct list_head group_entry; + }; + + void config_group_init(struct config_group *group); + void config_group_init_type_name(struct config_group *group, + const char *name, + struct config_item_type *type); + + +The config_group structure contains a config_item. Properly configuring +that item means that a group can behave as an item in its own right. +However, it can do more: it can create child items or groups. This is +accomplished via the group operations specified on the group's +config_item_type:: + + struct configfs_group_operations { + struct config_item *(*make_item)(struct config_group *group, + const char *name); + struct config_group *(*make_group)(struct config_group *group, + const char *name); + int (*commit_item)(struct config_item *item); + void (*disconnect_notify)(struct config_group *group, + struct config_item *item); + void (*drop_item)(struct config_group *group, + struct config_item *item); + }; + +A group creates child items by providing the +ct_group_ops->make_item() method. If provided, this method is called from +mkdir(2) in the group's directory. The subsystem allocates a new +config_item (or more likely, its container structure), initializes it, +and returns it to configfs. Configfs will then populate the filesystem +tree to reflect the new item. + +If the subsystem wants the child to be a group itself, the subsystem +provides ct_group_ops->make_group(). Everything else behaves the same, +using the group _init() functions on the group. + +Finally, when userspace calls rmdir(2) on the item or group, +ct_group_ops->drop_item() is called. As a config_group is also a +config_item, it is not necessary for a separate drop_group() method. +The subsystem must config_item_put() the reference that was initialized +upon item allocation. 
If a subsystem has no work to do, it may omit +the ct_group_ops->drop_item() method, and configfs will call +config_item_put() on the item on behalf of the subsystem. + +Important: + drop_item() is void, and as such cannot fail. When rmdir(2) + is called, configfs WILL remove the item from the filesystem tree + (assuming that it has no children to keep it busy). The subsystem is + responsible for responding to this. If the subsystem has references to + the item in other threads, the memory is safe. It may take some time + for the item to actually disappear from the subsystem's usage. But it + is gone from configfs. + +When drop_item() is called, the item's linkage has already been torn +down. It no longer has a reference on its parent and has no place in +the item hierarchy. If a client needs to do some cleanup before this +teardown happens, the subsystem can implement the +ct_group_ops->disconnect_notify() method. The method is called after +configfs has removed the item from the filesystem view but before the +item is removed from its parent group. Like drop_item(), +disconnect_notify() is void and cannot fail. Client subsystems should +not drop any references here, as they still must do it in drop_item(). + +A config_group cannot be removed while it still has child items. This +is implemented in the configfs rmdir(2) code. ->drop_item() will not be +called, as the item has not been dropped. rmdir(2) will fail, as the +directory is not empty. + +struct configfs_subsystem +========================= + +A subsystem must register itself, usually at module_init time. This +tells configfs to make the subsystem appear in the file tree:: + + struct configfs_subsystem { + struct config_group su_group; + struct mutex su_mutex; + }; + + int configfs_register_subsystem(struct configfs_subsystem *subsys); + void configfs_unregister_subsystem(struct configfs_subsystem *subsys); + +A subsystem consists of a toplevel config_group and a mutex. 
+The group is where child config_items are created. For a subsystem, +this group is usually defined statically. Before calling +configfs_register_subsystem(), the subsystem must have initialized the +group via the usual group _init() functions, and it must also have +initialized the mutex. + +When the register call returns, the subsystem is live, and it +will be visible via configfs. At that point, mkdir(2) can be called and +the subsystem must be ready for it. + +An Example +========== + +The best example of these basic concepts is the simple_children +subsystem/group and the simple_child item in +samples/configfs/configfs_sample.c. It shows a trivial object displaying +and storing an attribute, and a simple group creating and destroying +these children. + +Hierarchy Navigation and the Subsystem Mutex +============================================ + +There is an extra bonus that configfs provides. The config_groups and +config_items are arranged in a hierarchy due to the fact that they +appear in a filesystem. A subsystem is NEVER to touch the filesystem +parts, but the subsystem might be interested in this hierarchy. For +this reason, the hierarchy is mirrored via the config_group->cg_children +and config_item->ci_parent structure members. + +A subsystem can navigate the cg_children list and the ci_parent pointer +to see the tree created by the subsystem. This can race with configfs' +management of the hierarchy, so configfs uses the subsystem mutex to +protect modifications. Whenever a subsystem wants to navigate the +hierarchy, it must do so under the protection of the subsystem +mutex. + +A subsystem will be prevented from acquiring the mutex while a newly +allocated item has not been linked into this hierarchy. Similarly, it +will not be able to acquire the mutex while a dropping item has not +yet been unlinked. 
This means that an item's ci_parent pointer will +never be NULL while the item is in configfs, and that an item will only +be in its parent's cg_children list for the same duration. This allows +a subsystem to trust ci_parent and cg_children while they hold the +mutex. + +Item Aggregation Via symlink(2) +=============================== + +configfs provides a simple group via the group->item parent/child +relationship. Often, however, a larger environment requires aggregation +outside of the parent/child connection. This is implemented via +symlink(2). + +A config_item may provide the ct_item_ops->allow_link() and +ct_item_ops->drop_link() methods. If the ->allow_link() method exists, +symlink(2) may be called with the config_item as the source of the link. +These links are only allowed between configfs config_items. Any +symlink(2) attempt outside the configfs filesystem will be denied. + +When symlink(2) is called, the source config_item's ->allow_link() +method is called with itself and a target item. If the source item +allows linking to target item, it returns 0. A source item may wish to +reject a link if it only wants links to a certain type of object (say, +in its own subsystem). + +When unlink(2) is called on the symbolic link, the source item is +notified via the ->drop_link() method. Like the ->drop_item() method, +this is a void function and cannot return failure. The subsystem is +responsible for responding to the change. + +A config_item cannot be removed while it links to any other item, nor +can it be removed while an item links to it. Dangling symlinks are not +allowed in configfs. + +Automatically Created Subgroups +=============================== + +A new config_group may want to have two types of child config_items. +While this could be codified by magic names in ->make_item(), it is much +more explicit to have a method whereby userspace sees this divergence. 
+ +Rather than have a group where some items behave differently than +others, configfs provides a method whereby one or many subgroups are +automatically created inside the parent at its creation. Thus, +mkdir("parent") results in "parent", "parent/subgroup1", up through +"parent/subgroupN". Items of type 1 can now be created in +"parent/subgroup1", and items of type N can be created in +"parent/subgroupN". + +These automatic subgroups, or default groups, do not preclude other +children of the parent group. If ct_group_ops->make_group() exists, +other child groups can be created on the parent group directly. + +A configfs subsystem specifies default groups by adding them using the +configfs_add_default_group() function to the parent config_group +structure. Each added group is populated in the configfs tree at the same +time as the parent group. Similarly, they are removed at the same time +as the parent. No extra notification is provided. When a ->drop_item() +method call notifies the subsystem the parent group is going away, it +also means every default group child associated with that parent group. + +As a consequence of this, default groups cannot be removed directly via +rmdir(2). They also are not considered when rmdir(2) on the parent +group is checking for children. + +Dependent Subsystems +==================== + +Sometimes other drivers depend on particular configfs items. For +example, ocfs2 mounts depend on a heartbeat region item. If that +region item is removed with rmdir(2), the ocfs2 mount must BUG or go +readonly. Not happy. + +configfs provides two additional API calls: configfs_depend_item() and +configfs_undepend_item(). A client driver can call +configfs_depend_item() on an existing item to tell configfs that it is +depended on. configfs will then return -EBUSY from rmdir(2) for that +item. When the item is no longer depended on, the client driver calls +configfs_undepend_item() on it. 
+
+These APIs cannot be called underneath any configfs callbacks, as
+they will conflict. They can block and allocate. A client driver
+probably shouldn't call them of its own gumption. Rather it should
+be providing an API that external subsystems call.
+
+How does this work? Imagine the ocfs2 mount process. When it mounts,
+it asks for a heartbeat region item. This is done via a call into the
+heartbeat code. Inside the heartbeat code, the region item is looked
+up. Here, the heartbeat code calls configfs_depend_item(). If it
+succeeds, then heartbeat knows the region is safe to give to ocfs2.
+If it fails, it was being torn down anyway, and heartbeat can gracefully
+pass up an error.
+
+Committable Items
+=================
+
+Note:
+   Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state. That is, no
+default values can be specified for the item's attributes such that the
+item can do its work. Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above. Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects. This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized. Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go. More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense. configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations. An item is
+committed via rename(2). 
The item is moved from a directory where it +can be modified to a directory where it cannot. + +Any group that provides the ct_group_ops->commit_item() method has +committable items. When this group appears in configfs, mkdir(2) will +not work directly in the group. Instead, the group will have two +subdirectories: "live" and "pending". The "live" directory does not +support mkdir(2) or rmdir(2) either. It only allows rename(2). The +"pending" directory does allow mkdir(2) and rmdir(2). An item is +created in the "pending" directory. Its attributes can be modified at +will. Userspace commits the item by renaming it into the "live" +directory. At this point, the subsystem receives the ->commit_item() +callback. If all required attributes are filled to satisfaction, the +method returns zero and the item is moved to the "live" directory. + +As rmdir(2) does not work in the "live" directory, an item must be +shutdown, or "uncommitted". Again, this is done via rename(2), this +time from the "live" directory back to the "pending" one. The subsystem +is notified by the ct_group_ops->uncommit_object() method. diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt deleted file mode 100644 index 16e606c11f40..000000000000 --- a/Documentation/filesystems/configfs/configfs.txt +++ /dev/null @@ -1,508 +0,0 @@ - -configfs - Userspace-driven kernel object configuration. - -Joel Becker - -Updated: 31 March 2005 - -Copyright (c) 2005 Oracle Corporation, - Joel Becker - - -[What is configfs?] - -configfs is a ram-based filesystem that provides the converse of -sysfs's functionality. Where sysfs is a filesystem-based view of -kernel objects, configfs is a filesystem-based manager of kernel -objects, or config_items. - -With sysfs, an object is created in kernel (for example, when a device -is discovered) and it is registered with sysfs. 
Its attributes then -appear in sysfs, allowing userspace to read the attributes via -readdir(3)/read(2). It may allow some attributes to be modified via -write(2). The important point is that the object is created and -destroyed in kernel, the kernel controls the lifecycle of the sysfs -representation, and sysfs is merely a window on all this. - -A configfs config_item is created via an explicit userspace operation: -mkdir(2). It is destroyed via rmdir(2). The attributes appear at -mkdir(2) time, and can be read or modified via read(2) and write(2). -As with sysfs, readdir(3) queries the list of items and/or attributes. -symlink(2) can be used to group items together. Unlike sysfs, the -lifetime of the representation is completely driven by userspace. The -kernel modules backing the items must respond to this. - -Both sysfs and configfs can and should exist together on the same -system. One is not a replacement for the other. - -[Using configfs] - -configfs can be compiled as a module or into the kernel. You can access -it by doing - - mount -t configfs none /config - -The configfs tree will be empty unless client modules are also loaded. -These are modules that register their item types with configfs as -subsystems. Once a client subsystem is loaded, it will appear as a -subdirectory (or more than one) under /config. Like sysfs, the -configfs tree is always there, whether mounted on /config or not. - -An item is created via mkdir(2). The item's attributes will also -appear at this time. readdir(3) can determine what the attributes are, -read(2) can query their default values, and write(2) can store new -values. Don't mix more than one attribute in one attribute file. - -There are two types of configfs attributes: - -* Normal attributes, which similar to sysfs attributes, are small ASCII text -files, with a maximum size of one page (PAGE_SIZE, 4096 on i386). Preferably -only one value per file should be used, and the same caveats from sysfs apply. 
-Configfs expects write(2) to store the entire buffer at once. When writing to -normal configfs attributes, userspace processes should first read the entire -file, modify the portions they wish to change, and then write the entire -buffer back. - -* Binary attributes, which are somewhat similar to sysfs binary attributes, -but with a few slight changes to semantics. The PAGE_SIZE limitation does not -apply, but the whole binary item must fit in single kernel vmalloc'ed buffer. -The write(2) calls from user space are buffered, and the attributes' -write_bin_attribute method will be invoked on the final close, therefore it is -imperative for user-space to check the return code of close(2) in order to -verify that the operation finished successfully. -To avoid a malicious user OOMing the kernel, there's a per-binary attribute -maximum buffer value. - -When an item needs to be destroyed, remove it with rmdir(2). An -item cannot be destroyed if any other item has a link to it (via -symlink(2)). Links can be removed via unlink(2). - -[Configuring FakeNBD: an Example] - -Imagine there's a Network Block Device (NBD) driver that allows you to -access remote block devices. Call it FakeNBD. FakeNBD uses configfs -for its configuration. Obviously, there will be a nice program that -sysadmins use to configure FakeNBD, but somehow that program has to tell -the driver about it. Here's where configfs comes in. - -When the FakeNBD driver is loaded, it registers itself with configfs. -readdir(3) sees this just fine: - - # ls /config - fakenbd - -A fakenbd connection can be created with mkdir(2). The name is -arbitrary, but likely the tool will make some use of the name. Perhaps -it is a uuid or a disk name: - - # mkdir /config/fakenbd/disk1 - # ls /config/fakenbd/disk1 - target device rw - -The target attribute contains the IP address of the server FakeNBD will -connect to. The device attribute is the device on the server. 
-Predictably, the rw attribute determines whether the connection is -read-only or read-write. - - # echo 10.0.0.1 > /config/fakenbd/disk1/target - # echo /dev/sda1 > /config/fakenbd/disk1/device - # echo 1 > /config/fakenbd/disk1/rw - -That's it. That's all there is. Now the device is configured, via the -shell no less. - -[Coding With configfs] - -Every object in configfs is a config_item. A config_item reflects an -object in the subsystem. It has attributes that match values on that -object. configfs handles the filesystem representation of that object -and its attributes, allowing the subsystem to ignore all but the -basic show/store interaction. - -Items are created and destroyed inside a config_group. A group is a -collection of items that share the same attributes and operations. -Items are created by mkdir(2) and removed by rmdir(2), but configfs -handles that. The group has a set of operations to perform these tasks - -A subsystem is the top level of a client module. During initialization, -the client module registers the subsystem with configfs, the subsystem -appears as a directory at the top of the configfs filesystem. A -subsystem is also a config_group, and can do everything a config_group -can. - -[struct config_item] - - struct config_item { - char *ci_name; - char ci_namebuf[UOBJ_NAME_LEN]; - struct kref ci_kref; - struct list_head ci_entry; - struct config_item *ci_parent; - struct config_group *ci_group; - struct config_item_type *ci_type; - struct dentry *ci_dentry; - }; - - void config_item_init(struct config_item *); - void config_item_init_type_name(struct config_item *, - const char *name, - struct config_item_type *type); - struct config_item *config_item_get(struct config_item *); - void config_item_put(struct config_item *); - -Generally, struct config_item is embedded in a container structure, a -structure that actually represents what the subsystem is doing. 
The -config_item portion of that structure is how the object interacts with -configfs. - -Whether statically defined in a source file or created by a parent -config_group, a config_item must have one of the _init() functions -called on it. This initializes the reference count and sets up the -appropriate fields. - -All users of a config_item should have a reference on it via -config_item_get(), and drop the reference when they are done via -config_item_put(). - -By itself, a config_item cannot do much more than appear in configfs. -Usually a subsystem wants the item to display and/or store attributes, -among other things. For that, it needs a type. - -[struct config_item_type] - - struct configfs_item_operations { - void (*release)(struct config_item *); - int (*allow_link)(struct config_item *src, - struct config_item *target); - void (*drop_link)(struct config_item *src, - struct config_item *target); - }; - - struct config_item_type { - struct module *ct_owner; - struct configfs_item_operations *ct_item_ops; - struct configfs_group_operations *ct_group_ops; - struct configfs_attribute **ct_attrs; - struct configfs_bin_attribute **ct_bin_attrs; - }; - -The most basic function of a config_item_type is to define what -operations can be performed on a config_item. All items that have been -allocated dynamically will need to provide the ct_item_ops->release() -method. This method is called when the config_item's reference count -reaches zero. - -[struct configfs_attribute] - - struct configfs_attribute { - char *ca_name; - struct module *ca_owner; - umode_t ca_mode; - ssize_t (*show)(struct config_item *, char *); - ssize_t (*store)(struct config_item *, const char *, size_t); - }; - -When a config_item wants an attribute to appear as a file in the item's -configfs directory, it must define a configfs_attribute describing it. -It then adds the attribute to the NULL-terminated array -config_item_type->ct_attrs. 
When the item appears in configfs, the -attribute file will appear with the configfs_attribute->ca_name -filename. configfs_attribute->ca_mode specifies the file permissions. - -If an attribute is readable and provides a ->show method, that method will -be called whenever userspace asks for a read(2) on the attribute. If an -attribute is writable and provides a ->store method, that method will -be called whenever userspace asks for a write(2) on the attribute. - -[struct configfs_bin_attribute] - - struct configfs_bin_attribute { - struct configfs_attribute cb_attr; - void *cb_private; - size_t cb_max_size; - }; - -The binary attribute is used when one needs a binary blob to -appear as the contents of a file in the item's configfs directory. -To do so add the binary attribute to the NULL-terminated array -config_item_type->ct_bin_attrs, and when the item appears in configfs, the -attribute file will appear with the configfs_bin_attribute->cb_attr.ca_name -filename. configfs_bin_attribute->cb_attr.ca_mode specifies the file -permissions. -The cb_private member is provided for use by the driver, while the -cb_max_size member specifies the maximum amount of vmalloc buffer -to be used. - -If a binary attribute is readable and the config_item provides a -ct_item_ops->read_bin_attribute() method, that method will be called -whenever userspace asks for a read(2) on the attribute. The converse -will happen for write(2). The reads/writes are buffered so only a -single read/write will occur; the attribute need not concern itself -with it. - -[struct config_group] - -A config_item cannot live in a vacuum. The only way one can be created -is via mkdir(2) on a config_group. This will trigger creation of a -child item. 
- - struct config_group { - struct config_item cg_item; - struct list_head cg_children; - struct configfs_subsystem *cg_subsys; - struct list_head default_groups; - struct list_head group_entry; - }; - - void config_group_init(struct config_group *group); - void config_group_init_type_name(struct config_group *group, - const char *name, - struct config_item_type *type); - - -The config_group structure contains a config_item. Properly configuring -that item means that a group can behave as an item in its own right. -However, it can do more: it can create child items or groups. This is -accomplished via the group operations specified on the group's -config_item_type. - - struct configfs_group_operations { - struct config_item *(*make_item)(struct config_group *group, - const char *name); - struct config_group *(*make_group)(struct config_group *group, - const char *name); - int (*commit_item)(struct config_item *item); - void (*disconnect_notify)(struct config_group *group, - struct config_item *item); - void (*drop_item)(struct config_group *group, - struct config_item *item); - }; - -A group creates child items by providing the -ct_group_ops->make_item() method. If provided, this method is called from mkdir(2) in the group's directory. The subsystem allocates a new -config_item (or more likely, its container structure), initializes it, -and returns it to configfs. Configfs will then populate the filesystem -tree to reflect the new item. - -If the subsystem wants the child to be a group itself, the subsystem -provides ct_group_ops->make_group(). Everything else behaves the same, -using the group _init() functions on the group. - -Finally, when userspace calls rmdir(2) on the item or group, -ct_group_ops->drop_item() is called. As a config_group is also a -config_item, it is not necessary for a separate drop_group() method. -The subsystem must config_item_put() the reference that was initialized -upon item allocation. 
If a subsystem has no work to do, it may omit -the ct_group_ops->drop_item() method, and configfs will call -config_item_put() on the item on behalf of the subsystem. - -IMPORTANT: drop_item() is void, and as such cannot fail. When rmdir(2) -is called, configfs WILL remove the item from the filesystem tree -(assuming that it has no children to keep it busy). The subsystem is -responsible for responding to this. If the subsystem has references to -the item in other threads, the memory is safe. It may take some time -for the item to actually disappear from the subsystem's usage. But it -is gone from configfs. - -When drop_item() is called, the item's linkage has already been torn -down. It no longer has a reference on its parent and has no place in -the item hierarchy. If a client needs to do some cleanup before this -teardown happens, the subsystem can implement the -ct_group_ops->disconnect_notify() method. The method is called after -configfs has removed the item from the filesystem view but before the -item is removed from its parent group. Like drop_item(), -disconnect_notify() is void and cannot fail. Client subsystems should -not drop any references here, as they still must do it in drop_item(). - -A config_group cannot be removed while it still has child items. This -is implemented in the configfs rmdir(2) code. ->drop_item() will not be -called, as the item has not been dropped. rmdir(2) will fail, as the -directory is not empty. - -[struct configfs_subsystem] - -A subsystem must register itself, usually at module_init time. This -tells configfs to make the subsystem appear in the file tree. - - struct configfs_subsystem { - struct config_group su_group; - struct mutex su_mutex; - }; - - int configfs_register_subsystem(struct configfs_subsystem *subsys); - void configfs_unregister_subsystem(struct configfs_subsystem *subsys); - - A subsystem consists of a toplevel config_group and a mutex. -The group is where child config_items are created. 
For a subsystem, -this group is usually defined statically. Before calling -configfs_register_subsystem(), the subsystem must have initialized the -group via the usual group _init() functions, and it must also have -initialized the mutex. - When the register call returns, the subsystem is live, and it -will be visible via configfs. At that point, mkdir(2) can be called and -the subsystem must be ready for it. - -[An Example] - -The best example of these basic concepts is the simple_children -subsystem/group and the simple_child item in -samples/configfs/configfs_sample.c. It shows a trivial object displaying -and storing an attribute, and a simple group creating and destroying -these children. - -[Hierarchy Navigation and the Subsystem Mutex] - -There is an extra bonus that configfs provides. The config_groups and -config_items are arranged in a hierarchy due to the fact that they -appear in a filesystem. A subsystem is NEVER to touch the filesystem -parts, but the subsystem might be interested in this hierarchy. For -this reason, the hierarchy is mirrored via the config_group->cg_children -and config_item->ci_parent structure members. - -A subsystem can navigate the cg_children list and the ci_parent pointer -to see the tree created by the subsystem. This can race with configfs' -management of the hierarchy, so configfs uses the subsystem mutex to -protect modifications. Whenever a subsystem wants to navigate the -hierarchy, it must do so under the protection of the subsystem -mutex. - -A subsystem will be prevented from acquiring the mutex while a newly -allocated item has not been linked into this hierarchy. Similarly, it -will not be able to acquire the mutex while a dropping item has not -yet been unlinked. This means that an item's ci_parent pointer will -never be NULL while the item is in configfs, and that an item will only -be in its parent's cg_children list for the same duration. 
This allows -a subsystem to trust ci_parent and cg_children while they hold the -mutex. - -[Item Aggregation Via symlink(2)] - -configfs provides a simple group via the group->item parent/child -relationship. Often, however, a larger environment requires aggregation -outside of the parent/child connection. This is implemented via -symlink(2). - -A config_item may provide the ct_item_ops->allow_link() and -ct_item_ops->drop_link() methods. If the ->allow_link() method exists, -symlink(2) may be called with the config_item as the source of the link. -These links are only allowed between configfs config_items. Any -symlink(2) attempt outside the configfs filesystem will be denied. - -When symlink(2) is called, the source config_item's ->allow_link() -method is called with itself and a target item. If the source item -allows linking to target item, it returns 0. A source item may wish to -reject a link if it only wants links to a certain type of object (say, -in its own subsystem). - -When unlink(2) is called on the symbolic link, the source item is -notified via the ->drop_link() method. Like the ->drop_item() method, -this is a void function and cannot return failure. The subsystem is -responsible for responding to the change. - -A config_item cannot be removed while it links to any other item, nor -can it be removed while an item links to it. Dangling symlinks are not -allowed in configfs. - -[Automatically Created Subgroups] - -A new config_group may want to have two types of child config_items. -While this could be codified by magic names in ->make_item(), it is much -more explicit to have a method whereby userspace sees this divergence. - -Rather than have a group where some items behave differently than -others, configfs provides a method whereby one or many subgroups are -automatically created inside the parent at its creation. Thus, -mkdir("parent") results in "parent", "parent/subgroup1", up through -"parent/subgroupN". 
Items of type 1 can now be created in -"parent/subgroup1", and items of type N can be created in -"parent/subgroupN". - -These automatic subgroups, or default groups, do not preclude other -children of the parent group. If ct_group_ops->make_group() exists, -other child groups can be created on the parent group directly. - -A configfs subsystem specifies default groups by adding them using the -configfs_add_default_group() function to the parent config_group -structure. Each added group is populated in the configfs tree at the same -time as the parent group. Similarly, they are removed at the same time -as the parent. No extra notification is provided. When a ->drop_item() -method call notifies the subsystem the parent group is going away, it -also means every default group child associated with that parent group. - -As a consequence of this, default groups cannot be removed directly via -rmdir(2). They also are not considered when rmdir(2) on the parent -group is checking for children. - -[Dependent Subsystems] - -Sometimes other drivers depend on particular configfs items. For -example, ocfs2 mounts depend on a heartbeat region item. If that -region item is removed with rmdir(2), the ocfs2 mount must BUG or go -readonly. Not happy. - -configfs provides two additional API calls: configfs_depend_item() and -configfs_undepend_item(). A client driver can call -configfs_depend_item() on an existing item to tell configfs that it is -depended on. configfs will then return -EBUSY from rmdir(2) for that -item. When the item is no longer depended on, the client driver calls -configfs_undepend_item() on it. - -These APIs cannot be called underneath any configfs callbacks, as -they will conflict. They can block and allocate. A client driver -probably shouldn't call them of its own gumption. Rather it should -be providing an API that external subsystems call. - -How does this work? Imagine the ocfs2 mount process. When it mounts, -it asks for a heartbeat region item. 
This is done via a call into the -heartbeat code. Inside the heartbeat code, the region item is looked -up. Here, the heartbeat code calls configfs_depend_item(). If it -succeeds, then heartbeat knows the region is safe to give to ocfs2. -If it fails, it was being torn down anyway, and heartbeat can gracefully -pass up an error. - -[Committable Items] - -NOTE: Committable items are currently unimplemented. - -Some config_items cannot have a valid initial state. That is, no -default values can be specified for the item's attributes such that the -item can do its work. Userspace must configure one or more attributes, -after which the subsystem can start whatever entity this item -represents. - -Consider the FakeNBD device from above. Without a target address *and* -a target device, the subsystem has no idea what block device to import. -The simple example assumes that the subsystem merely waits until all the -appropriate attributes are configured, and then connects. This will, -indeed, work, but now every attribute store must check if the attributes -are initialized. Every attribute store must fire off the connection if -that condition is met. - -Far better would be an explicit action notifying the subsystem that the -config_item is ready to go. More importantly, an explicit action allows -the subsystem to provide feedback as to whether the attributes are -initialized in a way that makes sense. configfs provides this as -committable items. - -configfs still uses only normal filesystem operations. An item is -committed via rename(2). The item is moved from a directory where it -can be modified to a directory where it cannot. - -Any group that provides the ct_group_ops->commit_item() method has -committable items. When this group appears in configfs, mkdir(2) will -not work directly in the group. Instead, the group will have two -subdirectories: "live" and "pending". The "live" directory does not -support mkdir(2) or rmdir(2) either. It only allows rename(2). 
The -"pending" directory does allow mkdir(2) and rmdir(2). An item is -created in the "pending" directory. Its attributes can be modified at -will. Userspace commits the item by renaming it into the "live" -directory. At this point, the subsystem receives the ->commit_item() -callback. If all required attributes are filled to satisfaction, the -method returns zero and the item is moved to the "live" directory. - -As rmdir(2) does not work in the "live" directory, an item must be -shutdown, or "uncommitted". Again, this is done via rename(2), this -time from the "live" directory back to the "pending" one. The subsystem -is notified by the ct_group_ops->uncommit_object() method. - - diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e242cb75f9f2..17795341e0a3 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -76,6 +76,7 @@ Documentation for filesystem implementations. cifs/cifsroot ceph coda + configfs cramfs debugfs dlmfs diff --git a/Documentation/iio/iio_configfs.rst b/Documentation/iio/iio_configfs.rst index ecbfdb3afef7..6e38cbbd2981 100644 --- a/Documentation/iio/iio_configfs.rst +++ b/Documentation/iio/iio_configfs.rst @@ -9,7 +9,7 @@ Configfs is a filesystem-based manager of kernel objects. IIO uses some objects that could be easily configured using configfs (e.g.: devices, triggers). -See Documentation/filesystems/configfs/configfs.txt for more information +See Documentation/filesystems/configfs.rst for more information about how configfs works. 2. Usage diff --git a/Documentation/usb/gadget_configfs.rst b/Documentation/usb/gadget_configfs.rst index 54fb08baae22..158e48dab586 100644 --- a/Documentation/usb/gadget_configfs.rst +++ b/Documentation/usb/gadget_configfs.rst @@ -24,7 +24,7 @@ Linux provides a number of functions for gadgets to use. Creating a gadget means deciding what configurations there will be and which functions each configuration will provide. 
-Configfs (please see `Documentation/filesystems/configfs/*`) lends itself nicely +Configfs (please see `Documentation/filesystems/configfs.rst`) lends itself nicely for the purpose of telling the kernel about the above mentioned decision. This document is about how to do it. @@ -354,7 +354,7 @@ the directories in general can be named at will. A group can have a number of its default sub-groups created automatically. For more information on configfs please see -`Documentation/filesystems/configfs/*`. +`Documentation/filesystems/configfs.rst`. The concepts described above translate to USB gadgets like this: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fd0b5dd68f9e..8bd6a883c94c 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs/configfs.txt for more + * Please see Documentation/filesystems/configfs.rst for more * information. */ diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 6e0f1fcb8a5b..704a4356f137 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see the file Documentation/filesystems/configfs/configfs.txt for + * Please see the file Documentation/filesystems/configfs.rst for * critical information about using the config_item interface. */ diff --git a/include/linux/configfs.h b/include/linux/configfs.h index fa9490a8874c..2e8c69b43c64 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -13,7 +13,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please read Documentation/filesystems/configfs/configfs.txt before using + * Please read Documentation/filesystems/configfs.rst before using * the configfs interface, ESPECIALLY the parts about reference counts and * item destructors. 
*/ -- cgit v1.2.3 From 3eaa3bfa380bf07f778bc2ff57441976803ccaed Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 5 May 2020 14:18:28 +0800 Subject: kobject: documentation: Fix erroneous function example in kobject doc. Update the definitions of some functions listed in the kobject document, since they have been changed. Signed-off-by: Qi Zheng Link: https://lore.kernel.org/r/20200505061828.42952-1-arch0.zheng@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/core-api/kobject.rst | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/Documentation/core-api/kobject.rst b/Documentation/core-api/kobject.rst index 1f62d4d7d966..c3ac2963283b 100644 --- a/Documentation/core-api/kobject.rst +++ b/Documentation/core-api/kobject.rst @@ -80,11 +80,11 @@ what is the pointer to the containing structure? You must avoid tricks (such as assuming that the kobject is at the beginning of the structure) and, instead, use the container_of() macro, found in ````:: - container_of(pointer, type, member) + container_of(ptr, type, member) where: - * ``pointer`` is the pointer to the embedded kobject, + * ``ptr`` is the pointer to the embedded kobject, * ``type`` is the type of the containing structure, and * ``member`` is the name of the structure field to which ``pointer`` points. @@ -140,7 +140,7 @@ the name of the kobject, call kobject_rename():: int kobject_rename(struct kobject *kobj, const char *new_name); -kobject_rename does not perform any locking or have a solid notion of +kobject_rename() does not perform any locking or have a solid notion of what names are valid so the caller must provide their own sanity checking and serialization. @@ -222,17 +222,17 @@ ksets, show and store functions, and other details. This is the one exception where a single kobject should be created. 
To create such an entry, use the function:: - struct kobject *kobject_create_and_add(char *name, struct kobject *parent); + struct kobject *kobject_create_and_add(const char *name, struct kobject *parent); This function will create a kobject and place it in sysfs in the location underneath the specified parent kobject. To create simple attributes associated with this kobject, use:: - int sysfs_create_file(struct kobject *kobj, struct attribute *attr); + int sysfs_create_file(struct kobject *kobj, const struct attribute *attr); or:: - int sysfs_create_group(struct kobject *kobj, struct attribute_group *grp); + int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); Both types of attributes used here, with a kobject that has been created with the kobject_create_and_add(), can be of type kobj_attribute, so no @@ -300,8 +300,10 @@ kobj_type:: void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; struct attribute **default_attrs; + const struct attribute_group **default_groups; const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); const void *(*namespace)(struct kobject *kobj); + void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; This structure is used to describe a particular type of kobject (or, more @@ -352,12 +354,12 @@ created and never declared statically or on the stack. To create a new kset use:: struct kset *kset_create_and_add(const char *name, - struct kset_uevent_ops *u, - struct kobject *parent); + const struct kset_uevent_ops *uevent_ops, + struct kobject *parent_kobj); When you are finished with the kset, call:: - void kset_unregister(struct kset *kset); + void kset_unregister(struct kset *k); to destroy it. This removes the kset from sysfs and decrements its reference count. When the reference count goes to zero, the kset will be released. 
@@ -371,9 +373,9 @@ If a kset wishes to control the uevent operations of the kobjects associated with it, it can use the struct kset_uevent_ops to handle it:: struct kset_uevent_ops { - int (*filter)(struct kset *kset, struct kobject *kobj); - const char *(*name)(struct kset *kset, struct kobject *kobj); - int (*uevent)(struct kset *kset, struct kobject *kobj, + int (* const filter)(struct kset *kset, struct kobject *kobj); + const char *(* const name)(struct kset *kset, struct kobject *kobj); + int (* const uevent)(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env); }; -- cgit v1.2.3 From 094d6dc56245d0c796af9ed3e680effbe66ba0b5 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 2 May 2020 16:30:58 +0200 Subject: watchdog: update email address in conversion doc The old one is defunct. However, I think it makes sense that I am still the primary contact person for updates here. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20200502143103.19473-1-wsa@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/watchdog/convert_drivers_to_kernel_api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/watchdog/convert_drivers_to_kernel_api.rst b/Documentation/watchdog/convert_drivers_to_kernel_api.rst index 51b999b5551a..a1c3f038ce0e 100644 --- a/Documentation/watchdog/convert_drivers_to_kernel_api.rst +++ b/Documentation/watchdog/convert_drivers_to_kernel_api.rst @@ -2,7 +2,7 @@ Converting old watchdog drivers to the watchdog framework ========================================================= -by Wolfram Sang +by Wolfram Sang Before the watchdog framework came into the kernel, every driver had to implement the API on its own. 
Now, as the framework factored out the common -- cgit v1.2.3 From 35c59990058351a1c9efab9e49dd19c2e5579cfb Mon Sep 17 00:00:00 2001 From: Joshua Abraham Date: Fri, 1 May 2020 18:36:24 -0400 Subject: docs: kvm: Fix KVM_KVMCLOCK_CTRL API doc The KVM_KVMCLOCK_CTRL ioctl signals to supported KVM guests that the hypervisor has paused it. Update the documentation to reflect that the guest is notified by this API. Signed-off-by: Joshua Abraham Link: https://lore.kernel.org/r/20200501223624.GA25826@josh-ZenBook Signed-off-by: Jonathan Corbet --- Documentation/virt/kvm/api.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index efbbe570aa9b..d2c1cbce1018 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2572,13 +2572,15 @@ list in 4.68. :Parameters: None :Returns: 0 on success, -1 on error -This signals to the host kernel that the specified guest is being paused by -userspace. The host will set a flag in the pvclock structure that is checked -from the soft lockup watchdog. The flag is part of the pvclock structure that -is shared between guest and host, specifically the second bit of the flags +This ioctl sets a flag accessible to the guest indicating that the specified +vCPU has been paused by the host userspace. + +The host will set a flag in the pvclock structure that is checked from the +soft lockup watchdog. The flag is part of the pvclock structure that is +shared between guest and host, specifically the second bit of the flags field of the pvclock_vcpu_time_info structure. It will be set exclusively by the host and read/cleared exclusively by the guest. The guest operation of -checking and clearing the flag must an atomic operation so +checking and clearing the flag must be an atomic operation so load-link/store-conditional, or equivalent must be used. 
There are two cases where the guest will clear the flag: when the soft lockup watchdog timer resets itself or when a soft lockup is detected. This ioctl can be called any time -- cgit v1.2.3 From 16a398d17649d781773fa8e342a4ce28cd65af97 Mon Sep 17 00:00:00 2001 From: Vitor Massaru Iha Date: Thu, 30 Apr 2020 19:58:28 -0300 Subject: doc: misc-device: add uacce to toctree(index) This fixes: Documentation/misc-devices/uacce.rst: WARNING: document isn't included in any toctree Signed-off-by: Vitor Massaru Iha Link: https://lore.kernel.org/r/20200430225828.114033-1-vitor@massaru.org Signed-off-by: Jonathan Corbet --- Documentation/misc-devices/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/misc-devices/index.rst b/Documentation/misc-devices/index.rst index c1dcd2628911..1ecc05fbe6f4 100644 --- a/Documentation/misc-devices/index.rst +++ b/Documentation/misc-devices/index.rst @@ -21,4 +21,5 @@ fit into other categories. lis3lv02d max6875 mic/index + uacce xilinx_sdfec -- cgit v1.2.3 From b67aa4ef68ed4f2933d06eb65a5721c97cce9934 Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Fri, 1 May 2020 00:20:37 +0200 Subject: doc:it_IT: align Italian translation Translation for the following patches: commit c4f4af4094d6 ("docs: Add documentation for Symbol Namespaces") commit 36bc683dde0a ("kernel-doc: rename the kernel-doc directive 'functions' to 'identifiers'") commit a035d552a93b ("Makefile: Globally enable fall-through warning") commit b9918bdcac1f ("Documentation/process: Add fallthrough pseudo-keyword") commit 58ad30cf91f0 ("docs: fix reference to core-api/namespaces.rst") commit fb0e0ffe7fc8 ("Documentation: bring process docs up to date") commit 7af51678b6d3 ("docs: deprecated.rst: Add BUG()-family") commit 7929b9836ed0 ("docs: Remove :c:func: from process/deprecated.rst") commit 76136e028d3b ("docs: deprecated.rst: Clean up fall-through details") commit d8401f504b49 ("docs: deprecated.rst: Add %p to the list") commit b1735296cef9 ("docs: 
locking: Drop :c:func: throughout") commit 6adb7755996f ("docs: locking: Add 'need' to hardirq section") Signed-off-by: Federico Vaga Link: https://lore.kernel.org/r/20200430222037.4480-1-federico.vaga@vaga.pv.it Signed-off-by: Jonathan Corbet --- .../translations/it_IT/doc-guide/kernel-doc.rst | 25 +-- .../translations/it_IT/kernel-hacking/hacking.rst | 18 +++ .../translations/it_IT/kernel-hacking/locking.rst | 172 ++++++++++----------- .../translations/it_IT/process/2.Process.rst | 95 ++++++------ .../translations/it_IT/process/coding-style.rst | 6 +- .../translations/it_IT/process/deprecated.rst | 130 ++++++++++++++-- 6 files changed, 287 insertions(+), 159 deletions(-) diff --git a/Documentation/translations/it_IT/doc-guide/kernel-doc.rst b/Documentation/translations/it_IT/doc-guide/kernel-doc.rst index a4ecd8f27631..524ad86cadbb 100644 --- a/Documentation/translations/it_IT/doc-guide/kernel-doc.rst +++ b/Documentation/translations/it_IT/doc-guide/kernel-doc.rst @@ -515,6 +515,22 @@ internal: *[source-pattern ...]* .. kernel-doc:: drivers/gpu/drm/i915/intel_audio.c :internal: +identifiers: *[ function/type ...]* + Include la documentazione per ogni *function* e *type* in *source*. + Se non vengono esplicitamente specificate le funzioni da includere, allora + verranno incluse tutte quelle disponibili in *source*. + + Esempi:: + + .. kernel-doc:: lib/bitmap.c + :identifiers: bitmap_parselist bitmap_parselist_user + + .. kernel-doc:: lib/idr.c + :identifiers: + +functions: *[ function ...]* + Questo è uno pseudonimo, deprecato, per la direttiva 'identifiers'. + doc: *title* Include la documentazione del paragrafo ``DOC:`` identificato dal titolo (*title*) all'interno del file sorgente (*source*). Gli spazi in *title* sono @@ -528,15 +544,6 @@ doc: *title* .. 
kernel-doc:: drivers/gpu/drm/i915/intel_audio.c :doc: High Definition Audio over HDMI and Display Port -functions: *function* *[...]* - Dal file sorgente (*source*) include la documentazione per le funzioni - elencate (*function*). - - Esempio:: - - .. kernel-doc:: lib/bitmap.c - :functions: bitmap_parselist bitmap_parselist_user - Senza alcuna opzione, la direttiva kernel-doc include tutti i commenti di documentazione presenti nel file sorgente (*source*). diff --git a/Documentation/translations/it_IT/kernel-hacking/hacking.rst b/Documentation/translations/it_IT/kernel-hacking/hacking.rst index 24c592852bf1..6aab27a8d323 100644 --- a/Documentation/translations/it_IT/kernel-hacking/hacking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/hacking.rst @@ -627,6 +627,24 @@ Alcuni manutentori e sviluppatori potrebbero comunque richiedere :c:func:`EXPORT_SYMBOL_GPL()` quando si aggiungono nuove funzionalità o interfacce. +:c:func:`EXPORT_SYMBOL_NS()` +---------------------------- + +Definita in ``include/linux/export.h`` + +Questa è una variate di `EXPORT_SYMBOL()` che permette di specificare uno +spazio dei nomi. Lo spazio dei nomi è documentato in +:doc:`../core-api/symbol-namespaces` + +:c:func:`EXPORT_SYMBOL_NS_GPL()` +-------------------------------- + +Definita in ``include/linux/export.h`` + +Questa è una variate di `EXPORT_SYMBOL_GPL()` che permette di specificare uno +spazio dei nomi. 
Lo spazio dei nomi è documentato in +:doc:`../core-api/symbol-namespaces` + Procedure e convenzioni ======================= diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst index b9a6be4b8499..4615df5723fb 100644 --- a/Documentation/translations/it_IT/kernel-hacking/locking.rst +++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst @@ -159,17 +159,17 @@ Sincronizzazione in contesto utente Se avete una struttura dati che verrà utilizzata solo dal contesto utente, allora, per proteggerla, potete utilizzare un semplice mutex (``include/linux/mutex.h``). Questo è il caso più semplice: inizializzate il -mutex; invocate :c:func:`mutex_lock_interruptible()` per trattenerlo e -:c:func:`mutex_unlock()` per rilasciarlo. C'è anche :c:func:`mutex_lock()` +mutex; invocate mutex_lock_interruptible() per trattenerlo e +mutex_unlock() per rilasciarlo. C'è anche mutex_lock() ma questa dovrebbe essere evitata perché non ritorna in caso di segnali. Per esempio: ``net/netfilter/nf_sockopt.c`` permette la registrazione -di nuove chiamate per :c:func:`setsockopt()` e :c:func:`getsockopt()` -usando la funzione :c:func:`nf_register_sockopt()`. La registrazione e +di nuove chiamate per setsockopt() e getsockopt() +usando la funzione nf_register_sockopt(). La registrazione e la rimozione vengono eseguite solamente quando il modulo viene caricato o scaricato (e durante l'avvio del sistema, qui non abbiamo concorrenza), e la lista delle funzioni registrate viene consultata solamente quando -:c:func:`setsockopt()` o :c:func:`getsockopt()` sono sconosciute al sistema. +setsockopt() o getsockopt() sono sconosciute al sistema. In questo caso ``nf_sockopt_mutex`` è perfetto allo scopo, in particolar modo visto che setsockopt e getsockopt potrebbero dormire. @@ -179,19 +179,19 @@ Sincronizzazione fra il contesto utente e i softirq Se un softirq condivide dati col contesto utente, avete due problemi. 
Primo, il contesto utente corrente potrebbe essere interroto da un softirq, e secondo, la sezione critica potrebbe essere eseguita da un altro -processore. Questo è quando :c:func:`spin_lock_bh()` +processore. Questo è quando spin_lock_bh() (``include/linux/spinlock.h``) viene utilizzato. Questo disabilita i softirq -sul processore e trattiene il *lock*. Invece, :c:func:`spin_unlock_bh()` fa +sul processore e trattiene il *lock*. Invece, spin_unlock_bh() fa l'opposto. (Il suffisso '_bh' è un residuo storico che fa riferimento al "Bottom Halves", il vecchio nome delle interruzioni software. In un mondo perfetto questa funzione si chiamerebbe 'spin_lock_softirq()'). -Da notare che in questo caso potete utilizzare anche :c:func:`spin_lock_irq()` -o :c:func:`spin_lock_irqsave()`, queste fermano anche le interruzioni hardware: +Da notare che in questo caso potete utilizzare anche spin_lock_irq() +o spin_lock_irqsave(), queste fermano anche le interruzioni hardware: vedere :ref:`Contesto di interruzione hardware `. Questo funziona alla perfezione anche sui sistemi monoprocessore: gli spinlock -svaniscono e questa macro diventa semplicemente :c:func:`local_bh_disable()` +svaniscono e questa macro diventa semplicemente local_bh_disable() (``include/linux/interrupt.h``), la quale impedisce ai softirq d'essere eseguiti. @@ -224,8 +224,8 @@ Differenti tasklet/timer ~~~~~~~~~~~~~~~~~~~~~~~~ Se un altro tasklet/timer vuole condividere dati col vostro tasklet o timer, -allora avrete bisogno entrambe di :c:func:`spin_lock()` e -:c:func:`spin_unlock()`. Qui :c:func:`spin_lock_bh()` è inutile, siete già +allora avrete bisogno entrambe di spin_lock() e +spin_unlock(). Qui spin_lock_bh() è inutile, siete già in un tasklet ed avete la garanzia che nessun altro verrà eseguito sullo stesso processore. @@ -243,13 +243,13 @@ processore (vedere :ref:`Dati per processore `). 
Se siete arrivati fino a questo punto nell'uso dei softirq, probabilmente tenete alla scalabilità delle prestazioni abbastanza da giustificarne la complessità aggiuntiva. -Dovete utilizzare :c:func:`spin_lock()` e :c:func:`spin_unlock()` per +Dovete utilizzare spin_lock() e spin_unlock() per proteggere i dati condivisi. Diversi Softirqs ~~~~~~~~~~~~~~~~ -Dovete utilizzare :c:func:`spin_lock()` e :c:func:`spin_unlock()` per +Dovete utilizzare spin_lock() e spin_unlock() per proteggere i dati condivisi, che siano timer, tasklet, diversi softirq o lo stesso o altri softirq: uno qualsiasi di essi potrebbe essere in esecuzione su un diverso processore. @@ -270,40 +270,40 @@ Se un gestore di interruzioni hardware condivide dati con un softirq, allora avrete due preoccupazioni. Primo, il softirq può essere interrotto da un'interruzione hardware, e secondo, la sezione critica potrebbe essere eseguita da un'interruzione hardware su un processore diverso. Questo è il caso -dove :c:func:`spin_lock_irq()` viene utilizzato. Disabilita le interruzioni -sul processore che l'esegue, poi trattiene il lock. :c:func:`spin_unlock_irq()` +dove spin_lock_irq() viene utilizzato. Disabilita le interruzioni +sul processore che l'esegue, poi trattiene il lock. spin_unlock_irq() fa l'opposto. -Il gestore d'interruzione hardware non usa :c:func:`spin_lock_irq()` perché -i softirq non possono essere eseguiti quando il gestore d'interruzione hardware -è in esecuzione: per questo si può usare :c:func:`spin_lock()`, che è un po' +Il gestore d'interruzione hardware non ha bisogno di usare spin_lock_irq() +perché i softirq non possono essere eseguiti quando il gestore d'interruzione +hardware è in esecuzione: per questo si può usare spin_lock(), che è un po' più veloce. 
L'unica eccezione è quando un altro gestore d'interruzioni -hardware utilizza lo stesso *lock*: :c:func:`spin_lock_irq()` impedirà a questo +hardware utilizza lo stesso *lock*: spin_lock_irq() impedirà a questo secondo gestore di interrompere quello in esecuzione. Questo funziona alla perfezione anche sui sistemi monoprocessore: gli spinlock -svaniscono e questa macro diventa semplicemente :c:func:`local_irq_disable()` +svaniscono e questa macro diventa semplicemente local_irq_disable() (``include/asm/smp.h``), la quale impedisce a softirq/tasklet/BH d'essere eseguiti. -:c:func:`spin_lock_irqsave()` (``include/linux/spinlock.h``) è una variante che +spin_lock_irqsave() (``include/linux/spinlock.h``) è una variante che salva lo stato delle interruzioni in una variabile, questa verrà poi passata -a :c:func:`spin_unlock_irqrestore()`. Questo significa che lo stesso codice +a spin_unlock_irqrestore(). Questo significa che lo stesso codice potrà essere utilizzato in un'interruzione hardware (dove le interruzioni sono già disabilitate) e in un softirq (dove la disabilitazione delle interruzioni è richiesta). Da notare che i softirq (e quindi tasklet e timer) sono eseguiti al ritorno -da un'interruzione hardware, quindi :c:func:`spin_lock_irq()` interrompe +da un'interruzione hardware, quindi spin_lock_irq() interrompe anche questi. Tenuto conto di questo si può dire che -:c:func:`spin_lock_irqsave()` è la funzione di sincronizzazione più generica +spin_lock_irqsave() è la funzione di sincronizzazione più generica e potente. Sincronizzazione fra due gestori d'interruzioni hardware -------------------------------------------------------- Condividere dati fra due gestori di interruzione hardware è molto raro, ma se -succede, dovreste usare :c:func:`spin_lock_irqsave()`: è una specificità +succede, dovreste usare spin_lock_irqsave(): è una specificità dell'architettura il fatto che tutte le interruzioni vengano interrotte quando si eseguono di gestori di interruzioni. 
@@ -317,11 +317,11 @@ Pete Zaitcev ci offre il seguente riassunto: il mutex e dormire (``copy_from_user*(`` o ``kmalloc(x,GFP_KERNEL)``). - Altrimenti (== i dati possono essere manipolati da un'interruzione) usate - :c:func:`spin_lock_irqsave()` e :c:func:`spin_unlock_irqrestore()`. + spin_lock_irqsave() e spin_unlock_irqrestore(). - Evitate di trattenere uno spinlock per più di 5 righe di codice incluse le chiamate a funzione (ad eccezione di quell per l'accesso come - :c:func:`readb()`). + readb()). Tabella dei requisiti minimi ---------------------------- @@ -334,7 +334,7 @@ processore alla volta, ma se deve condividere dati con un altro thread, allora la sincronizzazione è necessaria). Ricordatevi il suggerimento qui sopra: potete sempre usare -:c:func:`spin_lock_irqsave()`, che è un sovrainsieme di tutte le altre funzioni +spin_lock_irqsave(), che è un sovrainsieme di tutte le altre funzioni per spinlock. ============== ============= ============= ========= ========= ========= ========= ======= ======= ============== ============== @@ -378,13 +378,13 @@ protetti dal *lock* quando qualche altro thread lo sta già facendo trattenendo il *lock*. Potrete acquisire il *lock* più tardi se vi serve accedere ai dati protetti da questo *lock*. -La funzione :c:func:`spin_trylock()` non ritenta di acquisire il *lock*, +La funzione spin_trylock() non ritenta di acquisire il *lock*, se ci riesce al primo colpo ritorna un valore diverso da zero, altrimenti se fallisce ritorna 0. Questa funzione può essere utilizzata in un qualunque -contesto, ma come :c:func:`spin_lock()`: dovete disabilitare i contesti che +contesto, ma come spin_lock(): dovete disabilitare i contesti che potrebbero interrompervi e quindi trattenere lo spinlock. 
-La funzione :c:func:`mutex_trylock()` invece di sospendere il vostro processo +La funzione mutex_trylock() invece di sospendere il vostro processo ritorna un valore diverso da zero se è possibile trattenere il lock al primo colpo, altrimenti se fallisce ritorna 0. Nonostante non dorma, questa funzione non può essere usata in modo sicuro in contesti di interruzione hardware o @@ -506,7 +506,7 @@ della memoria che il suo contenuto sono protetti dal *lock*. Questo caso è semplice dato che copiamo i dati dall'utente e non permettiamo mai loro di accedere direttamente agli oggetti. -C'è una piccola ottimizzazione qui: nella funzione :c:func:`cache_add()` +C'è una piccola ottimizzazione qui: nella funzione cache_add() impostiamo i campi dell'oggetto prima di acquisire il *lock*. Questo è sicuro perché nessun altro potrà accedervi finché non lo inseriremo nella memoria. @@ -514,7 +514,7 @@ nella memoria. Accesso dal contesto utente --------------------------- -Ora consideriamo il caso in cui :c:func:`cache_find()` può essere invocata +Ora consideriamo il caso in cui cache_find() può essere invocata dal contesto d'interruzione: sia hardware che software. Un esempio potrebbe essere un timer che elimina oggetti dalla memoria. @@ -583,15 +583,15 @@ sono quelle rimosse, mentre quelle ``+`` sono quelle aggiunte. return ret; } -Da notare che :c:func:`spin_lock_irqsave()` disabiliterà le interruzioni +Da notare che spin_lock_irqsave() disabiliterà le interruzioni se erano attive, altrimenti non farà niente (quando siamo già in un contesto d'interruzione); dunque queste funzioni possono essere chiamante in sicurezza da qualsiasi contesto. -Sfortunatamente, :c:func:`cache_add()` invoca :c:func:`kmalloc()` con +Sfortunatamente, cache_add() invoca kmalloc() con l'opzione ``GFP_KERNEL`` che è permessa solo in contesto utente. Ho supposto -che :c:func:`cache_add()` venga chiamata dal contesto utente, altrimenti -questa opzione deve diventare un parametro di :c:func:`cache_add()`. 
+che cache_add() venga chiamata dal contesto utente, altrimenti +questa opzione deve diventare un parametro di cache_add(). Esporre gli oggetti al di fuori del file ---------------------------------------- @@ -610,7 +610,7 @@ Il secondo problema è il problema del ciclo di vita: se un'altra struttura mantiene un puntatore ad un oggetto, presumibilmente si aspetta che questo puntatore rimanga valido. Sfortunatamente, questo è garantito solo mentre si trattiene il *lock*, altrimenti qualcuno potrebbe chiamare -:c:func:`cache_delete()` o peggio, aggiungere un oggetto che riutilizza lo +cache_delete() o peggio, aggiungere un oggetto che riutilizza lo stesso indirizzo. Dato che c'è un solo *lock*, non potete trattenerlo a vita: altrimenti @@ -710,9 +710,9 @@ Ecco il codice:: } Abbiamo incapsulato il contatore di riferimenti nelle tipiche funzioni -di 'get' e 'put'. Ora possiamo ritornare l'oggetto da :c:func:`cache_find()` +di 'get' e 'put'. Ora possiamo ritornare l'oggetto da cache_find() col vantaggio che l'utente può dormire trattenendo l'oggetto (per esempio, -:c:func:`copy_to_user()` per copiare il nome verso lo spazio utente). +copy_to_user() per copiare il nome verso lo spazio utente). Un altro punto da notare è che ho detto che il contatore dovrebbe incrementarsi per ogni puntatore ad un oggetto: quindi il contatore di riferimenti è 1 @@ -727,8 +727,8 @@ Ci sono un certo numbero di operazioni atomiche definite in ``include/asm/atomic.h``: queste sono garantite come atomiche su qualsiasi processore del sistema, quindi non sono necessari i *lock*. In questo caso è più semplice rispetto all'uso degli spinlock, benché l'uso degli spinlock -sia più elegante per casi non banali. Le funzioni :c:func:`atomic_inc()` e -:c:func:`atomic_dec_and_test()` vengono usate al posto dei tipici operatori di +sia più elegante per casi non banali. 
Le funzioni atomic_inc() e +atomic_dec_and_test() vengono usate al posto dei tipici operatori di incremento e decremento, e i *lock* non sono più necessari per proteggere il contatore stesso. @@ -820,7 +820,7 @@ al nome di cambiare abbiamo tre possibilità: - Si può togliere static da ``cache_lock`` e dire agli utenti che devono trattenere il *lock* prima di modificare il nome di un oggetto. -- Si può fornire una funzione :c:func:`cache_obj_rename()` che prende il +- Si può fornire una funzione cache_obj_rename() che prende il *lock* e cambia il nome per conto del chiamante; si dirà poi agli utenti di usare questa funzione. @@ -878,11 +878,11 @@ Da notare che ho deciso che il contatore di popolarità dovesse essere protetto da ``cache_lock`` piuttosto che dal *lock* dell'oggetto; questo perché è logicamente parte dell'infrastruttura (come :c:type:`struct list_head ` nell'oggetto). In questo modo, -in :c:func:`__cache_add()`, non ho bisogno di trattenere il *lock* di ogni +in __cache_add(), non ho bisogno di trattenere il *lock* di ogni oggetto mentre si cerca il meno popolare. Ho anche deciso che il campo id è immutabile, quindi non ho bisogno di -trattenere il lock dell'oggetto quando si usa :c:func:`__cache_find()` +trattenere il lock dell'oggetto quando si usa __cache_find() per leggere questo campo; il *lock* dell'oggetto è usato solo dal chiamante che vuole leggere o scrivere il campo name. @@ -907,7 +907,7 @@ Questo è facile da diagnosticare: non è uno di quei problemi che ti tengono sveglio 5 notti a parlare da solo. Un caso un pochino più complesso; immaginate d'avere una spazio condiviso -fra un softirq ed il contesto utente. Se usate :c:func:`spin_lock()` per +fra un softirq ed il contesto utente. Se usate spin_lock() per proteggerlo, il contesto utente potrebbe essere interrotto da un softirq mentre trattiene il lock, da qui il softirq rimarrà in attesa attiva provando ad acquisire il *lock* già trattenuto nel contesto utente. 
@@ -1006,12 +1006,12 @@ potreste fare come segue:: spin_unlock_bh(&list_lock); Primo o poi, questo esploderà su un sistema multiprocessore perché un -temporizzatore potrebbe essere già partiro prima di :c:func:`spin_lock_bh()`, -e prenderà il *lock* solo dopo :c:func:`spin_unlock_bh()`, e cercherà +temporizzatore potrebbe essere già partiro prima di spin_lock_bh(), +e prenderà il *lock* solo dopo spin_unlock_bh(), e cercherà di eliminare il suo oggetto (che però è già stato eliminato). Questo può essere evitato controllando il valore di ritorno di -:c:func:`del_timer()`: se ritorna 1, il temporizzatore è stato già +del_timer(): se ritorna 1, il temporizzatore è stato già rimosso. Se 0, significa (in questo caso) che il temporizzatore è in esecuzione, quindi possiamo fare come segue:: @@ -1032,9 +1032,9 @@ esecuzione, quindi possiamo fare come segue:: spin_unlock_bh(&list_lock); Un altro problema è l'eliminazione dei temporizzatori che si riavviano -da soli (chiamando :c:func:`add_timer()` alla fine della loro esecuzione). +da soli (chiamando add_timer() alla fine della loro esecuzione). Dato che questo è un problema abbastanza comune con una propensione -alle corse critiche, dovreste usare :c:func:`del_timer_sync()` +alle corse critiche, dovreste usare del_timer_sync() (``include/linux/timer.h``) per gestire questo caso. Questa ritorna il numero di volte che il temporizzatore è stato interrotto prima che fosse in grado di fermarlo senza che si riavviasse. @@ -1116,7 +1116,7 @@ chiamata ``list``:: wmb(); list->next = new; -La funzione :c:func:`wmb()` è una barriera di sincronizzazione delle +La funzione wmb() è una barriera di sincronizzazione delle scritture. 
Questa garantisce che la prima operazione (impostare l'elemento ``next`` del nuovo elemento) venga completata e vista da tutti i processori prima che venga eseguita la seconda operazione (che sarebbe quella di mettere @@ -1127,7 +1127,7 @@ completamente il nuovo elemento; oppure che lo vedano correttamente e quindi il puntatore ``next`` deve puntare al resto della lista. Fortunatamente, c'è una funzione che fa questa operazione sulle liste -:c:type:`struct list_head `: :c:func:`list_add_rcu()` +:c:type:`struct list_head `: list_add_rcu() (``include/linux/list.h``). Rimuovere un elemento dalla lista è anche più facile: sostituiamo il puntatore @@ -1138,7 +1138,7 @@ l'elemento o lo salteranno. list->next = old->next; -La funzione :c:func:`list_del_rcu()` (``include/linux/list.h``) fa esattamente +La funzione list_del_rcu() (``include/linux/list.h``) fa esattamente questo (la versione normale corrompe il vecchio oggetto, e non vogliamo che accada). @@ -1146,9 +1146,9 @@ Anche i lettori devono stare attenti: alcuni processori potrebbero leggere attraverso il puntatore ``next`` il contenuto dell'elemento successivo troppo presto, ma non accorgersi che il contenuto caricato è sbagliato quando il puntatore ``next`` viene modificato alla loro spalle. Ancora una volta -c'è una funzione che viene in vostro aiuto :c:func:`list_for_each_entry_rcu()` +c'è una funzione che viene in vostro aiuto list_for_each_entry_rcu() (``include/linux/list.h``). Ovviamente, gli scrittori possono usare -:c:func:`list_for_each_entry()` dato che non ci possono essere due scrittori +list_for_each_entry() dato che non ci possono essere due scrittori in contemporanea. Il nostro ultimo dilemma è il seguente: quando possiamo realmente distruggere @@ -1156,15 +1156,15 @@ l'elemento rimosso? Ricordate, un lettore potrebbe aver avuto accesso a questo elemento proprio ora: se eliminiamo questo elemento ed il puntatore ``next`` cambia, il lettore salterà direttamente nella spazzatura e scoppierà. 
Dobbiamo aspettare finché tutti i lettori che stanno attraversando la lista abbiano -finito. Utilizziamo :c:func:`call_rcu()` per registrare una funzione di +finito. Utilizziamo call_rcu() per registrare una funzione di richiamo che distrugga l'oggetto quando tutti i lettori correnti hanno terminato. In alternative, potrebbe essere usata la funzione -:c:func:`synchronize_rcu()` che blocca l'esecuzione finché tutti i lettori +synchronize_rcu() che blocca l'esecuzione finché tutti i lettori non terminano di ispezionare la lista. Ma come fa l'RCU a sapere quando i lettori sono finiti? Il meccanismo è il seguente: innanzi tutto i lettori accedono alla lista solo fra la coppia -:c:func:`rcu_read_lock()`/:c:func:`rcu_read_unlock()` che disabilita la +rcu_read_lock()/rcu_read_unlock() che disabilita la prelazione così che i lettori non vengano sospesi mentre stanno leggendo la lista. @@ -1253,12 +1253,12 @@ codice RCU è un po' più ottimizzato di così, ma questa è l'idea di fondo. } Da notare che i lettori modificano il campo popularity nella funzione -:c:func:`__cache_find()`, e ora non trattiene alcun *lock*. Una soluzione +__cache_find(), e ora non trattiene alcun *lock*. Una soluzione potrebbe essere quella di rendere la variabile ``atomic_t``, ma per l'uso che ne abbiamo fatto qui, non ci interessano queste corse critiche perché un risultato approssimativo è comunque accettabile, quindi non l'ho cambiato. -Il risultato è che la funzione :c:func:`cache_find()` non ha bisogno di alcuna +Il risultato è che la funzione cache_find() non ha bisogno di alcuna sincronizzazione con le altre funzioni, quindi è veloce su un sistema multi-processore tanto quanto lo sarebbe su un sistema mono-processore. @@ -1271,9 +1271,9 @@ riferimenti. 
Ora, dato che il '*lock* di lettura' di un RCU non fa altro che disabilitare la prelazione, un chiamante che ha sempre la prelazione disabilitata fra le -chiamate :c:func:`cache_find()` e :c:func:`object_put()` non necessita +chiamate cache_find() e object_put() non necessita di incrementare e decrementare il contatore di riferimenti. Potremmo -esporre la funzione :c:func:`__cache_find()` dichiarandola non-static, +esporre la funzione __cache_find() dichiarandola non-static, e quel chiamante potrebbe usare direttamente questa funzione. Il beneficio qui sta nel fatto che il contatore di riferimenti no @@ -1293,10 +1293,10 @@ singolo contatore. Facile e pulito. Se questo dovesse essere troppo lento (solitamente non lo è, ma se avete dimostrato che lo è devvero), potreste usare un contatore per ogni processore e quindi non sarebbe più necessaria la mutua esclusione. Vedere -:c:func:`DEFINE_PER_CPU()`, :c:func:`get_cpu_var()` e :c:func:`put_cpu_var()` +DEFINE_PER_CPU(), get_cpu_var() e put_cpu_var() (``include/linux/percpu.h``). -Il tipo di dato ``local_t``, la funzione :c:func:`cpu_local_inc()` e tutte +Il tipo di dato ``local_t``, la funzione cpu_local_inc() e tutte le altre funzioni associate, sono di particolare utilità per semplici contatori per-processore; su alcune architetture sono anche più efficienti (``include/asm/local.h``). @@ -1324,11 +1324,11 @@ da un'interruzione software. Il gestore d'interruzione non utilizza alcun enable_irq(irq); spin_unlock(&lock); -La funzione :c:func:`disable_irq()` impedisce al gestore d'interruzioni +La funzione disable_irq() impedisce al gestore d'interruzioni d'essere eseguito (e aspetta che finisca nel caso fosse in esecuzione su un altro processore). Lo spinlock, invece, previene accessi simultanei. Naturalmente, questo è più lento della semplice chiamata -:c:func:`spin_lock_irq()`, quindi ha senso solo se questo genere di accesso +spin_lock_irq(), quindi ha senso solo se questo genere di accesso è estremamente raro. .. 
_`it_sleeping-things`: @@ -1336,7 +1336,7 @@ Naturalmente, questo è più lento della semplice chiamata Quali funzioni possono essere chiamate in modo sicuro dalle interruzioni? ========================================================================= -Molte funzioni del kernel dormono (in sostanza, chiamano ``schedule()``) +Molte funzioni del kernel dormono (in sostanza, chiamano schedule()) direttamente od indirettamente: non potete chiamarle se trattenere uno spinlock o avete la prelazione disabilitata, mai. Questo significa che dovete necessariamente essere nel contesto utente: chiamarle da un @@ -1354,23 +1354,23 @@ dormire. - Accessi allo spazio utente: - - :c:func:`copy_from_user()` + - copy_from_user() - - :c:func:`copy_to_user()` + - copy_to_user() - - :c:func:`get_user()` + - get_user() - - :c:func:`put_user()` + - put_user() -- :c:func:`kmalloc(GFP_KERNEL) ` +- kmalloc(GFP_KERNEL) ` -- :c:func:`mutex_lock_interruptible()` and - :c:func:`mutex_lock()` +- mutex_lock_interruptible() and + mutex_lock() - C'è anche :c:func:`mutex_trylock()` che però non dorme. + C'è anche mutex_trylock() che però non dorme. Comunque, non deve essere usata in un contesto d'interruzione dato che la sua implementazione non è sicura in quel contesto. - Anche :c:func:`mutex_unlock()` non dorme mai. Non può comunque essere + Anche mutex_unlock() non dorme mai. Non può comunque essere usata in un contesto d'interruzione perché un mutex deve essere rilasciato dallo stesso processo che l'ha acquisito. @@ -1380,11 +1380,11 @@ Alcune funzioni che non dormono Alcune funzioni possono essere chiamate tranquillamente da qualsiasi contesto, o trattenendo un qualsiasi *lock*. 
-- :c:func:`printk()` +- printk() -- :c:func:`kfree()` +- kfree() -- :c:func:`add_timer()` e :c:func:`del_timer()` +- add_timer() e del_timer() Riferimento per l'API dei Mutex =============================== @@ -1444,14 +1444,14 @@ prelazione bh Bottom Half: per ragioni storiche, le funzioni che contengono '_bh' nel loro nome ora si riferiscono a qualsiasi interruzione software; per esempio, - :c:func:`spin_lock_bh()` blocca qualsiasi interuzione software sul processore + spin_lock_bh() blocca qualsiasi interuzione software sul processore corrente. I *Bottom Halves* sono deprecati, e probabilmente verranno sostituiti dai tasklet. In un dato momento potrà esserci solo un *bottom half* in esecuzione. contesto d'interruzione Non è il contesto utente: qui si processano le interruzioni hardware e - software. La macro :c:func:`in_interrupt()` ritorna vero. + software. La macro in_interrupt() ritorna vero. contesto utente Il kernel che esegue qualcosa per conto di un particolare processo (per @@ -1461,12 +1461,12 @@ contesto utente che hardware. interruzione hardware - Richiesta di interruzione hardware. :c:func:`in_irq()` ritorna vero in un + Richiesta di interruzione hardware. in_irq() ritorna vero in un gestore d'interruzioni hardware. interruzione software / softirq - Gestore di interruzioni software: :c:func:`in_irq()` ritorna falso; - :c:func:`in_softirq()` ritorna vero. I tasklet e le softirq sono entrambi + Gestore di interruzioni software: in_irq() ritorna falso; + in_softirq() ritorna vero. I tasklet e le softirq sono entrambi considerati 'interruzioni software'. 
In soldoni, un softirq è uno delle 32 interruzioni software che possono diff --git a/Documentation/translations/it_IT/process/2.Process.rst b/Documentation/translations/it_IT/process/2.Process.rst index 9af4d01617c4..30dc172f06b0 100644 --- a/Documentation/translations/it_IT/process/2.Process.rst +++ b/Documentation/translations/it_IT/process/2.Process.rst @@ -23,18 +23,18 @@ ogni due o tre mesi viene effettuata un rilascio importante del kernel. I rilasci più recenti sono stati: ====== ================= - 4.11 Aprile 30, 2017 - 4.12 Luglio 2, 2017 - 4.13 Settembre 3, 2017 - 4.14 Novembre 12, 2017 - 4.15 Gennaio 28, 2018 - 4.16 Aprile 1, 2018 + 5.0 3 marzo, 2019 + 5.1 5 maggio, 2019 + 5.2 7 luglio, 2019 + 5.3 15 settembre, 2019 + 5.4 24 novembre, 2019 + 5.5 6 gennaio, 2020 ====== ================= -Ciascun rilascio 4.x è un importante rilascio del kernel con nuove +Ciascun rilascio 5.x è un importante rilascio del kernel con nuove funzionalità, modifiche interne dell'API, e molto altro. Un tipico -rilascio 4.x contiene quasi 13,000 gruppi di modifiche con ulteriori -modifiche a parecchie migliaia di linee di codice. La 4.x. è pertanto la +rilascio contiene quasi 13,000 gruppi di modifiche con ulteriori +modifiche a parecchie migliaia di linee di codice. La 5.x. è pertanto la linea di confine nello sviluppo del kernel Linux; il kernel utilizza un sistema di sviluppo continuo che integra costantemente nuove importanti modifiche. @@ -55,8 +55,8 @@ verrà descritto dettagliatamente più avanti). La finestra di inclusione resta attiva approssimativamente per due settimane. Al termine di questo periodo, Linus Torvald dichiarerà che la finestra è chiusa e rilascerà il primo degli "rc" del kernel. -Per il kernel che è destinato ad essere 2.6.40, per esempio, il rilascio -che emerge al termine della finestra d'inclusione si chiamerà 2.6.40-rc1. 
+Per il kernel che è destinato ad essere 5.6, per esempio, il rilascio +che emerge al termine della finestra d'inclusione si chiamerà 5.6-rc1. Questo rilascio indica che il momento di aggiungere nuovi componenti è passato, e che è iniziato il periodo di stabilizzazione del prossimo kernel. @@ -76,22 +76,23 @@ Mentre le correzioni si aprono la loro strada all'interno del ramo principale, il ritmo delle modifiche rallenta col tempo. Linus rilascia un nuovo kernel -rc circa una volta alla settimana; e ne usciranno circa 6 o 9 prima che il kernel venga considerato sufficientemente stabile e che il rilascio -finale 2.6.x venga fatto. A quel punto tutto il processo ricomincerà. +finale venga fatto. A quel punto tutto il processo ricomincerà. -Esempio: ecco com'è andato il ciclo di sviluppo della versione 4.16 +Esempio: ecco com'è andato il ciclo di sviluppo della versione 5.4 (tutte le date si collocano nel 2018) ============== ======================================= - Gennaio 28 4.15 rilascio stabile - Febbraio 11 4.16-rc1, finestra di inclusione chiusa - Febbraio 18 4.16-rc2 - Febbraio 25 4.16-rc3 - Marzo 4 4.16-rc4 - Marzo 11 4.16-rc5 - Marzo 18 4.16-rc6 - Marzo 25 4.16-rc7 - Aprile 1 4.17 rilascio stabile + 15 settembre 5.3 rilascio stabile + 30 settembre 5.4-rc1, finestra di inclusione chiusa + 6 ottobre 5.4-rc2 + 13 ottobre 5.4-rc3 + 20 ottobre 5.4-rc4 + 27 ottobre 5.4-rc5 + 3 novembre 5.4-rc6 + 10 novembre 5.4-rc7 + 17 novembre 5.4-rc8 + 24 novembre 5.4 rilascio stabile ============== ======================================= In che modo gli sviluppatori decidono quando chiudere il ciclo di sviluppo e @@ -108,43 +109,44 @@ tipo di perfezione difficilmente viene raggiunta; esistono troppe variabili in un progetto di questa portata. Arriva un punto dove ritardare il rilascio finale peggiora la situazione; la quantità di modifiche in attesa della prossima finestra di inclusione crescerà enormemente, creando ancor più -regressioni al giro successivo. 
Quindi molti kernel 4.x escono con una +regressioni al giro successivo. Quindi molti kernel 5.x escono con una manciata di regressioni delle quali, si spera, nessuna è grave. Una volta che un rilascio stabile è fatto, il suo costante mantenimento è affidato al "squadra stabilità", attualmente composta da Greg Kroah-Hartman. Questa squadra rilascia occasionalmente degli aggiornamenti relativi al -rilascio stabile usando la numerazione 4.x.y. Per essere presa in +rilascio stabile usando la numerazione 5.x.y. Per essere presa in considerazione per un rilascio d'aggiornamento, una modifica deve: (1) correggere un baco importante (2) essere già inserita nel ramo principale per il prossimo sviluppo del kernel. Solitamente, passato il loro rilascio iniziale, i kernel ricevono aggiornamenti per più di un ciclo di sviluppo. -Quindi, per esempio, la storia del kernel 4.13 appare così: +Quindi, per esempio, la storia del kernel 5.2 appare così (anno 2019): ============== =============================== - Settembre 3 4.13 rilascio stabile - Settembre 13 4.13.1 - Settembre 20 4.13.2 - Settembre 27 4.13.3 - Ottobre 5 4.13.4 - Ottobre 12 4.13.5 + 15 settembre 5.2 rilascio stabile FIXME settembre è sbagliato + 14 luglio 5.2.1 + 21 luglio 5.2.2 + 26 luglio 5.2.3 + 28 luglio 5.2.4 + 31 luglio 5.2.5 ... ... - Novembre 24 4.13.16 + 11 ottobre 5.2.21 ============== =============================== -La 4.13.16 fu l'aggiornamento finale per la versione 4.13. +La 5.2.21 fu l'aggiornamento finale per la versione 5.2. Alcuni kernel sono destinati ad essere kernel a "lungo termine"; questi riceveranno assistenza per un lungo periodo di tempo. 
Al momento in cui scriviamo, i manutentori dei kernel stabili a lungo termine sono: - ====== ====================== ========================================== - 3.16 Ben Hutchings (kernel stabile molto più a lungo termine) - 4.1 Sasha Levin - 4.4 Greg Kroah-Hartman (kernel stabile molto più a lungo termine) - 4.9 Greg Kroah-Hartman - 4.14 Greg Kroah-Hartman - ====== ====================== ========================================== + ====== ================================ ========================================== + 3.16 Ben Hutchings (kernel stabile molto più a lungo termine) + 4.4 Greg Kroah-Hartman e Sasha Levin (kernel stabile molto più a lungo termine) + 4.9 Greg Kroah-Hartman e Sasha Levin + 4.14 Greg Kroah-Hartman e Sasha Levin + 4.19 Greg Kroah-Hartman e Sasha Levin + 5.4 Greg Kroah-Hartman e Sasha Levin + ====== ================================ ========================================== Questa selezione di kernel di lungo periodo sono puramente dovuti ai loro @@ -229,12 +231,13 @@ Come le modifiche finiscono nel Kernel -------------------------------------- Esiste una sola persona che può inserire le patch nel repositorio principale -del kernel: Linus Torvalds. Ma, di tutte le 9500 patch che entrarono nella -versione 2.6.38 del kernel, solo 112 (circa l'1,3%) furono scelte direttamente -da Linus in persona. Il progetto del kernel è cresciuto fino a raggiungere -una dimensione tale per cui un singolo sviluppatore non può controllare e -selezionare indipendentemente ogni modifica senza essere supportato. -La via scelta dagli sviluppatori per indirizzare tale crescita è stata quella +del kernel: Linus Torvalds. Ma, per esempio, di tutte le 9500 patch +che entrarono nella versione 2.6.38 del kernel, solo 112 (circa +l'1,3%) furono scelte direttamente da Linus in persona. 
Il progetto +del kernel è cresciuto fino a raggiungere una dimensione tale per cui +un singolo sviluppatore non può controllare e selezionare +indipendentemente ogni modifica senza essere supportato. La via +scelta dagli sviluppatori per indirizzare tale crescita è stata quella di utilizzare un sistema di "sottotenenti" basato sulla fiducia. Il codice base del kernel è spezzato in una serie si sottosistemi: rete, diff --git a/Documentation/translations/it_IT/process/coding-style.rst b/Documentation/translations/it_IT/process/coding-style.rst index 8725f2b9e960..6f4f85832dee 100644 --- a/Documentation/translations/it_IT/process/coding-style.rst +++ b/Documentation/translations/it_IT/process/coding-style.rst @@ -313,7 +313,7 @@ che conta gli utenti attivi, dovreste chiamarla ``count_active_users()`` o qualcosa di simile, **non** dovreste chiamarla ``cntusr()``. Codificare il tipo di funzione nel suo nome (quella cosa chiamata notazione -ungherese) fa male al cervello - il compilatore conosce comunque il tipo e +ungherese) è stupido - il compilatore conosce comunque il tipo e può verificarli, e inoltre confonde i programmatori. Non c'è da sorprendersi che MicroSoft faccia programmi bacati. @@ -825,8 +825,8 @@ linguaggio assembler. Agli sviluppatori del kernel piace essere visti come dotti. Tenete un occhio di riguardo per l'ortografia e farete una belle figura. In inglese, evitate -l'uso di parole mozzate come ``dont``: usate ``do not`` oppure ``don't``. -Scrivete messaggi concisi, chiari, e inequivocabili. +l'uso incorretto di abbreviazioni come ``dont``: usate ``do not`` oppure +``don't``. Scrivete messaggi concisi, chiari, e inequivocabili. I messaggi del kernel non devono terminare con un punto fermo. 
diff --git a/Documentation/translations/it_IT/process/deprecated.rst b/Documentation/translations/it_IT/process/deprecated.rst index 776f26732a94..e108eaf82cf6 100644 --- a/Documentation/translations/it_IT/process/deprecated.rst +++ b/Documentation/translations/it_IT/process/deprecated.rst @@ -34,6 +34,33 @@ interfaccia come 'vecchia', questa non è una soluzione completa. L'interfaccia deve essere rimossa dal kernel, o aggiunta a questo documento per scoraggiarne l'uso. +BUG() e BUG_ON() +---------------- +Al loro posto usate WARN() e WARN_ON() per gestire le +condizioni "impossibili" e gestitele come se fosse possibile farlo. +Nonostante le funzioni della famiglia BUG() siano state progettate +per asserire "situazioni impossibili" e interrompere in sicurezza un +thread del kernel, queste si sono rivelate essere troppo rischiose +(per esempio, in quale ordine rilasciare i *lock*? Ci sono stati che +sono stati ripristinati?). Molto spesso l'uso di BUG() +destabilizza il sistema o lo corrompe del tutto, il che rende +impossibile un'attività di debug o anche solo leggere un rapporto +circa l'errore. Linus ha un'opinione molto critica al riguardo: +`email 1 +`_, +`email 2 +`_ + +Tenete presente che la famiglia di funzioni WARN() dovrebbe essere +usato solo per situazioni che si suppone siano "impossibili". Se +volete avvisare gli utenti riguardo a qualcosa di possibile anche se +indesiderato, usare le funzioni della famiglia pr_warn(). Chi +amministra il sistema potrebbe aver attivato l'opzione sysctl +*panic_on_warn* per essere sicuri che il sistema smetta di funzionare +in caso si verifichino delle condizioni "inaspettate". 
(per esempio, +date un'occhiata al questo `commit +`_) + Calcoli codificati negli argomenti di un allocatore ---------------------------------------------------- Il calcolo dinamico delle dimensioni (specialmente le moltiplicazioni) non @@ -68,52 +95,81 @@ Invece, usate la seguente funzione:: header = kzalloc(struct_size(header, item, count), GFP_KERNEL); -Per maggiori dettagli fate riferimento a :c:func:`array_size`, -:c:func:`array3_size`, e :c:func:`struct_size`, così come la famiglia di -funzioni :c:func:`check_add_overflow` e :c:func:`check_mul_overflow`. +Per maggiori dettagli fate riferimento a array_size(), +array3_size(), e struct_size(), così come la famiglia di +funzioni check_add_overflow() e check_mul_overflow(). simple_strtol(), simple_strtoll(), simple_strtoul(), simple_strtoull() ---------------------------------------------------------------------- -Le funzioni :c:func:`simple_strtol`, :c:func:`simple_strtoll`, -:c:func:`simple_strtoul`, e :c:func:`simple_strtoull` ignorano volutamente +Le funzioni simple_strtol(), simple_strtoll(), +simple_strtoul(), e simple_strtoull() ignorano volutamente i possibili overflow, e questo può portare il chiamante a generare risultati -inaspettati. Le rispettive funzioni :c:func:`kstrtol`, :c:func:`kstrtoll`, -:c:func:`kstrtoul`, e :c:func:`kstrtoull` sono da considerarsi le corrette +inaspettati. Le rispettive funzioni kstrtol(), kstrtoll(), +kstrtoul(), e kstrtoull() sono da considerarsi le corrette sostitute; tuttavia va notato che queste richiedono che la stringa sia terminata con il carattere NUL o quello di nuova riga. strcpy() -------- -La funzione :c:func:`strcpy` non fa controlli agli estremi del buffer +La funzione strcpy() non fa controlli agli estremi del buffer di destinazione. Questo può portare ad un overflow oltre i limiti del buffer e generare svariati tipi di malfunzionamenti. 
Nonostante l'opzione `CONFIG_FORTIFY_SOURCE=y` e svariate opzioni del compilatore aiutano a ridurne il rischio, non c'è alcuna buona ragione per continuare ad usare -questa funzione. La versione sicura da usare è :c:func:`strscpy`. +questa funzione. La versione sicura da usare è strscpy(). strncpy() su stringe terminate con NUL -------------------------------------- -L'utilizzo di :c:func:`strncpy` non fornisce alcuna garanzia sul fatto che +L'utilizzo di strncpy() non fornisce alcuna garanzia sul fatto che il buffer di destinazione verrà terminato con il carattere NUL. Questo potrebbe portare a diversi overflow di lettura o altri malfunzionamenti causati, appunto, dalla mancanza del terminatore. Questa estende la terminazione nel buffer di destinazione quando la stringa d'origine è più corta; questo potrebbe portare ad una penalizzazione delle prestazioni per chi usa solo stringe terminate. La versione sicura da usare è -:c:func:`strscpy`. (chi usa :c:func:`strscpy` e necessita di estendere la -terminazione con NUL deve aggiungere una chiamata a :c:func:`memset`) +strscpy(). (chi usa strscpy() e necessita di estendere la +terminazione con NUL deve aggiungere una chiamata a memset()) -Se il chiamate no usa stringhe terminate con NUL, allore :c:func:`strncpy()` +Se il chiamate no usa stringhe terminate con NUL, allore strncpy()() può continuare ad essere usata, ma i buffer di destinazione devono essere marchiati con l'attributo `__nonstring `_ per evitare avvisi durante la compilazione. strlcpy() --------- -La funzione :c:func:`strlcpy`, per prima cosa, legge interamente il buffer di +La funzione strlcpy(), per prima cosa, legge interamente il buffer di origine, magari leggendo più di quanto verrà effettivamente copiato. Questo è inefficiente e può portare a overflow di lettura quando la stringa non è -terminata con NUL. La versione sicura da usare è :c:func:`strscpy`. +terminata con NUL. La versione sicura da usare è strscpy(). 
+ +Segnaposto %p nella stringa di formato +-------------------------------------- + +Tradizionalmente, l'uso del segnaposto "%p" nella stringa di formato +esponne un indirizzo di memoria in dmesg, proc, sysfs, eccetera. Per +evitare che questi indirizzi vengano sfruttati da malintenzionati, +tutto gli usi di "%p" nel kernel rappresentano l'hash dell'indirizzo, +rendendolo di fatto inutilizzabile. Nuovi usi di "%p" non dovrebbero +essere aggiunti al kernel. Per una rappresentazione testuale di un +indirizzo usate "%pS", l'output è migliore perché mostrerà il nome del +simbolo. Per tutto il resto, semplicemente non usate "%p". + +Parafrasando la `guida +`_ +di Linus: + +- Se il valore hash di "%p" è inutile, chiediti se il puntatore stesso + è importante. Forse dovrebbe essere rimosso del tutto? +- Se credi davvero che il vero valore del puntatore sia importante, + perché alcuni stati del sistema o i livelli di privilegi di un + utente sono considerati "special"? Se pensi di poterlo giustificare + (in un commento e nel messaggio del commit) abbastanza bene da + affrontare il giudizio di Linus, allora forse potrai usare "%px", + assicurandosi anche di averne il permesso. + +Infine, sappi che un cambio in favore di "%p" con hash `non verrà +accettato +`_. Vettori a dimensione variabile (VLA) ------------------------------------ @@ -127,3 +183,47 @@ Questo può portare a dei malfunzionamenti, potrebbe sovrascrivere dati importanti alla fine dello stack (quando il kernel è compilato senza `CONFIG_THREAD_INFO_IN_TASK=y`), o sovrascrivere un pezzo di memoria adiacente allo stack (quando il kernel è compilato senza `CONFIG_VMAP_STACK=y`). + +Salto implicito nell'istruzione switch-case +------------------------------------------- + +Il linguaggio C permette ai casi di un'istruzione `switch` di saltare al +prossimo caso quando l'istruzione "break" viene omessa alla fine del caso +corrente. 
Tuttavia questo rende il codice ambiguo perché non è sempre ovvio se +l'istruzione "break" viene omessa intenzionalmente o è un baco. Per esempio, +osservando il seguente pezzo di codice non è chiaro se lo stato +`STATE_ONE` è stato progettato apposta per eseguire anche `STATE_TWO`:: + + switch (value) { + case STATE_ONE: + do_something(); + case STATE_TWO: + do_other(); + break; + default: + WARN("unknown state"); + } + +Dato che c'è stata una lunga lista di problemi `dovuti alla mancanza dell'istruzione +"break" `_, oggigiorno non +permettiamo più che vi sia un "salto implicito" (*fall-through*). Per +identificare un salto implicito intenzionale abbiamo adottato la pseudo +parola chiave 'fallthrough' che viene espansa nell'estensione di gcc +`__attribute__((fallthrough))` `Statement Attributes +`_. +(Quando la sintassi C17/C18 `[[fallthrough]]` sarà più comunemente +supportata dai compilatori C, analizzatori statici, e dagli IDE, +allora potremo usare quella sintassi per la pseudo parola chiave) + +Quando la sintassi [[fallthrough]] sarà più comunemente supportata dai +compilatori, analizzatori statici, e ambienti di sviluppo IDE, +allora potremo usarla anche noi. + +Ne consegue che tutti i blocchi switch/case devono finire in uno dei seguenti +modi: + +* ``break;`` +* ``fallthrough;`` +* ``continue;`` +* ``goto