diff options
author | Ingo Molnar <mingo@kernel.org> | 2016-01-06 11:41:48 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2016-01-06 11:41:48 +0100 |
commit | 3104fb3dd45bb47ff1382d1c079c251710ddcae3 (patch) | |
tree | 0c562ad5b3a960b3299edd5abdbe12b9f0a50a6b | |
parent | ee9a7d2cb0cf1a1498478bc923d911f3d9c910ac (diff) | |
parent | 984cf355aeaa8f2eda3861b50d0e8d3e3f77e83b (diff) | |
download | linux-3104fb3dd45bb47ff1382d1c079c251710ddcae3.tar.bz2 |
Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu
Pull RCU changes from Paul E. McKenney:
- Adding transitivity uniformly to rcu_node structure ->lock
acquisitions. (This is implemented by the first two commits
on top of v4.4-rc2 due to the pervasive nature of this change.)
- Documentation updates, including RCU requirements.
- Expedited grace-period changes.
- Miscellaneous fixes.
- Linked-list fixes, courtesy of KTSAN.
- Torture-test updates.
- Late-breaking fix to sysrq-generated crash.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
34 files changed, 7552 insertions, 287 deletions
diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png Binary files differnew file mode 100644 index 000000000000..7496a55e4e7b --- /dev/null +++ b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg new file mode 100644 index 000000000000..4b4014fda770 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg @@ -0,0 +1,374 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="447.99197" + height="428.19299" + id="svg2" + version="1.1" + inkscape:version="0.48.3.1 r9886" + sodipodi:docname="GPpartitionReaders1.svg"> + <defs + id="defs4"> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0" + refX="0" + id="Arrow2Lend" + style="overflow:visible"> + <path + id="path3792" + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + transform="matrix(-1.1,0,0,-1.1,-1.1,0)" + inkscape:connector-curvature="0" /> + </marker> + <marker + inkscape:stockid="Arrow2Lstart" + orient="auto" + refY="0" + refX="0" + id="Arrow2Lstart" + style="overflow:visible"> + <path + id="path3789" + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + transform="matrix(1.1,0,0,1.1,1.1,0)" + inkscape:connector-curvature="0" /> + </marker> + </defs> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="1.6184291" + inkscape:cx="223.99599" + inkscape:cy="214.0965" + inkscape:document-units="px" + inkscape:current-layer="layer1" + showgrid="false" + inkscape:window-width="979" + inkscape:window-height="836" + inkscape:window-x="571" + inkscape:window-y="335" + inkscape:window-maximized="0" + fit-margin-top="5" + fit-margin-left="5" + fit-margin-right="5" + fit-margin-bottom="5" /> + <metadata + id="metadata7"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-28.441125,-185.60612)"> + <flowRoot + xml:space="preserve" + id="flowRoot2985" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion + id="flowRegion2987"><rect + id="rect2989" + width="82.85714" + height="11.428572" + x="240" + y="492.36218" /></flowRegion><flowPara + id="flowPara2991"></flowPara></flowRoot> <g + id="g4433" + transform="translate(2,0)"> + <text + sodipodi:linespacing="125%" + id="text2993" + y="-261.66608" + x="412.12299" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + xml:space="preserve" + transform="matrix(0,1,-1,0,0,0)"><tspan + y="-261.66608" + x="412.12299" + id="tspan2995" + sodipodi:role="line">synchronize_rcu()</tspan></text> + <g + id="g4417" + transform="matrix(0,1,-1,0,730.90257,222.4928)"> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" + d="m 97.580736,477.4048 183.140664,0" + id="path2997" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + d="m 96.752718,465.38398 0,22.62742" + id="path4397" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + d="m 281.54942,465.38397 0,22.62742" + id="path4397-5" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + </g> + </g> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="112.04738" + y="268.18076" + id="text4429" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431" + x="112.04738" + y="268.18076">WRITE_ONCE(a, 1);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="112.04738" + y="439.13766" + id="text4441" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4443" + x="112.04738" + y="439.13766">WRITE_ONCE(b, 1);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="255.60869" + y="309.29346" + id="text4445" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4447" + x="255.60869" + y="309.29346">r1 = READ_ONCE(a);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="255.14423" + y="520.61786" + id="text4449" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4451" + x="255.14423" + y="520.61786">WRITE_ONCE(c, 1);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.10254" + y="384.71124" + id="text4453" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4455" + x="396.10254" + y="384.71124">r2 = READ_ONCE(b);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.10254" + y="582.13617" + id="text4457" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4459" + x="396.10254" + y="582.13617">r3 = READ_ONCE(c);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="112.08231" + y="213.91006" + id="text4461" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4463" + x="112.08231" + y="213.91006">thread0()</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="252.34512" + y="213.91006" + id="text4461-6" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4463-0" + x="252.34512" + y="213.91006">thread1()</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.42557" + y="213.91006" + id="text4461-2" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4463-2" + x="396.42557" + y="213.91006">thread2()</tspan></text> + <rect + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="rect4495" + width="436.28488" + height="416.4859" + x="34.648232" + y="191.10612" /> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + d="m 183.14066,191.10612 0,417.193 -0.70711,0" + id="path4497" + inkscape:connector-curvature="0" /> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + d="m 325.13867,191.10612 0,417.193 -0.70711,0" + id="path4497-5" + inkscape:connector-curvature="0" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="111.75929" + y="251.53981" + id="text4429-8" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9" + x="111.75929" + y="251.53981">rcu_read_lock();</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.10254" + y="367.91556" + id="text4429-8-9" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9-4" + x="396.10254" + y="367.91556">rcu_read_lock();</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.10254" + y="597.40289" + id="text4429-8-9-3" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9-4-4" + x="396.10254" + y="597.40289">rcu_read_unlock();</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="111.75929" + y="453.15311" + id="text4429-8-9-3-1" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9-4-4-6" + x="111.75929" + y="453.15311">rcu_read_unlock();</tspan></text> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + d="m 33.941125,227.87568 436.284885,0 0,0.7071" + id="path4608" + inkscape:connector-curvature="0" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="394.94427" + y="345.66351" + id="text4648" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650" + x="394.94427" + y="345.66351">QS</tspan></text> + <path + sodipodi:type="arc" + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(36.441125,199.60612)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="112.11968" + y="475.77856" + id="text4648-4" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650-4" + x="112.11968" + y="475.77856">QS</tspan></text> + <path + sodipodi:type="arc" + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652-7" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(-246.38346,329.72117)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <path + sodipodi:type="arc" + style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652-7-7" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(-103.65246,202.90878)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="254.85066" + y="348.96619" + id="text4648-4-3" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650-4-5" + x="254.85066" + y="348.96619">QS</tspan></text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg new file mode 100644 index 000000000000..ebcbeee391ed --- /dev/null +++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg @@ -0,0 +1,237 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5d --> + +<!-- CreationDate: Tue Mar 4 18:34:25 2014 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="1089.1382" + height="668.21368" + viewBox="-2121 -36 14554.634 8876.4061" + id="svg2" + version="1.1" + inkscape:version="0.48.3.1 r9886" + sodipodi:docname="RCUApplicability.svg"> + <metadata + id="metadata40"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title /> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs38" /> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="849" + inkscape:window-height="639" + id="namedview36" + showgrid="false" + inkscape:zoom="0.51326165" + inkscape:cx="544.56912" + inkscape:cy="334.10686" + inkscape:window-x="149" + inkscape:window-y="448" + inkscape:window-maximized="0" + inkscape:current-layer="g4" + fit-margin-top="5" + fit-margin-left="5" + fit-margin-right="5" + fit-margin-bottom="5" /> + <g + style="fill:none;stroke-width:0.025in" + id="g4" + transform="translate(-2043.6828,14.791398)"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="14400" + height="8775" + rx="0" + style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" + id="rect6" /> + <!-- Line: box --> + <rect + x="1350" + y="0" + width="11700" + height="6075" + rx="0" + style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" + id="rect8" /> + <!-- Line: box --> + <rect + x="2700" + y="0" + width="9000" + height="4275" + rx="0" + style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" + id="rect10" /> + <!-- Line: box --> + <rect + x="4050" + y="0" + width="6300" + height="2475" + rx="0" + style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" + id="rect12" /> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="900" + font-style="normal" + font-weight="normal" + font-size="324" + id="text14" + sodipodi:linespacing="125%" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + id="tspan3017">Read-Mostly, Stale &</tspan></text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="1350" + font-style="normal" + font-weight="normal" + font-size="324" + id="text16" + sodipodi:linespacing="125%" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + id="tspan3019">Inconsistent Data OK</tspan></text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="1800" + font-style="normal" + font-weight="normal" + font-size="324" + id="text18" + sodipodi:linespacing="125%" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + id="tspan3021">(RCU Works Great!!!)</tspan></text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="3825" + font-style="normal" + font-weight="normal" + font-size="324" + id="text20" + sodipodi:linespacing="125%" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + id="tspan3023">(RCU Works Well)</tspan></text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="3375" + font-style="normal" + font-weight="normal" + font-size="324" + id="text22" + sodipodi:linespacing="125%" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="5175" + font-style="normal" + font-weight="normal" + font-size="324" + id="text24" + sodipodi:linespacing="125%" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + id="tspan3027">Read-Write, Need Consistent Data</tspan></text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="6975" + font-style="normal" + font-weight="normal" + font-size="324" + id="text26" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="5625" + font-style="normal" + font-weight="normal" + font-size="324" + id="text28" + sodipodi:linespacing="125%" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + id="tspan3029">(RCU Might Be OK...)</tspan></text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="7875" + font-style="normal" + font-weight="normal" + font-size="324" + id="text30" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="8325" + font-style="normal" + font-weight="normal" + font-size="324" + id="text32" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text> + <!-- Text --> + <text + xml:space="preserve" + x="7200" + y="7425" + font-style="normal" + font-weight="normal" + font-size="324" + id="text34" + style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" + sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg new file mode 100644 index 000000000000..48cd1623d4d4 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg @@ -0,0 +1,639 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Created with Inkscape (http://www.inkscape.org/) --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="735.25" + height="516.21875" + id="svg2" + version="1.1" + inkscape:version="0.48.3.1 r9886" + sodipodi:docname="ReadersPartitionGP1.svg"> + <defs + id="defs4"> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0" + refX="0" + id="Arrow2Lend" + style="overflow:visible"> + <path + id="path3792" + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + transform="matrix(-1.1,0,0,-1.1,-1.1,0)" + inkscape:connector-curvature="0" /> + </marker> + <marker + inkscape:stockid="Arrow2Lstart" + orient="auto" + refY="0" + refX="0" + id="Arrow2Lstart" + style="overflow:visible"> + <path + id="path3789" + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + transform="matrix(1.1,0,0,1.1,1.1,0)" + inkscape:connector-curvature="0" /> + </marker> + <marker + inkscape:stockid="Arrow2Lstart" + orient="auto" + refY="0" + refX="0" + id="Arrow2Lstart-4" + style="overflow:visible"> + <path + id="path3789-9" + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + transform="matrix(1.1,0,0,1.1,1.1,0)" + inkscape:connector-curvature="0" /> + </marker> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0" + refX="0" + id="Arrow2Lend-4" + style="overflow:visible"> + <path + id="path3792-4" + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + transform="matrix(-1.1,0,0,-1.1,-1.1,0)" + inkscape:connector-curvature="0" /> + </marker> + </defs> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="1.3670394" + inkscape:cx="367.26465" + inkscape:cy="258.46182" + inkscape:document-units="px" + inkscape:current-layer="g4433-6" + showgrid="false" + inkscape:window-width="1351" + inkscape:window-height="836" + inkscape:window-x="438" + inkscape:window-y="335" + inkscape:window-maximized="0" + fit-margin-top="5" + fit-margin-left="5" + fit-margin-right="5" + fit-margin-bottom="5" /> + <metadata + id="metadata7"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title /> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Layer 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-29.15625,-185.59375)"> + <flowRoot + xml:space="preserve" + id="flowRoot2985" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion + id="flowRegion2987"><rect + id="rect2989" + width="82.85714" + height="11.428572" + x="240" + y="492.36218" /></flowRegion><flowPara + id="flowPara2991" /></flowRoot> <g + id="g4433" + transform="translate(2,-12)"> + <text + sodipodi:linespacing="125%" + id="text2993" + y="-261.66608" + x="436.12299" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + xml:space="preserve" + transform="matrix(0,1,-1,0,0,0)"><tspan + y="-261.66608" + x="436.12299" + id="tspan2995" + sodipodi:role="line">synchronize_rcu()</tspan></text> + <g + id="g4417" + transform="matrix(0,1,-1,0,730.90257,222.4928)"> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" + d="M 97.580736,477.4048 327.57913,476.09759" + id="path2997" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + d="m 96.752718,465.38398 0,22.62742" + id="path4397" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + d="m 328.40703,465.38397 0,22.62742" + id="path4397-5" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + </g> + </g> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="112.04738" + y="268.18076" + id="text4429" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431" + x="112.04738" + y="268.18076">WRITE_ONCE(a, 1);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="112.04738" + y="487.13766" + id="text4441" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4443" + x="112.04738" + y="487.13766">WRITE_ONCE(b, 1);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="255.60869" + y="297.29346" + id="text4445" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4447" + x="255.60869" + y="297.29346">r1 = READ_ONCE(a);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="255.14423" + y="554.61786" + id="text4449" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4451" + x="255.14423" + y="554.61786">WRITE_ONCE(c, 1);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.10254" + y="370.71124" + id="text4453" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4455" + x="396.10254" + y="370.71124">WRITE_ONCE(d, 1);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.10254" + y="572.13617" + id="text4457" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4459" + x="396.10254" + y="572.13617">r2 = READ_ONCE(c);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="112.08231" + y="213.91006" + id="text4461" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4463" + x="112.08231" + y="213.91006">thread0()</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="252.34512" + y="213.91006" + id="text4461-6" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4463-0" + x="252.34512" + y="213.91006">thread1()</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.42557" + y="213.91006" + id="text4461-2" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4463-2" + x="396.42557" + y="213.91006">thread2()</tspan></text> + <rect + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="rect4495" + width="724.25244" + height="505.21201" + x="34.648232" + y="191.10612" /> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + d="m 183.14066,191.10612 0,504.24243" + id="path4497" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + d="m 325.13867,191.10612 0,504.24243" + id="path4497-5" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="111.75929" + y="251.53981" + id="text4429-8" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9" + x="111.75929" + y="251.53981">rcu_read_lock();</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.10254" + y="353.91556" + id="text4429-8-9" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9-4" + x="396.10254" + y="353.91556">rcu_read_lock();</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="396.10254" + y="587.40289" + id="text4429-8-9-3" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9-4-4" + x="396.10254" + y="587.40289">rcu_read_unlock();</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="111.75929" + y="501.15311" + id="text4429-8-9-3-1" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9-4-4-6" + x="111.75929" + y="501.15311">rcu_read_unlock();</tspan></text> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + d="m 33.941125,227.87568 724.941765,0" + id="path4608" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="394.94427" + y="331.66351" + id="text4648" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650" + x="394.94427" + y="331.66351">QS</tspan></text> + <path + sodipodi:type="arc" + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(36.441125,185.60612)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="112.11968" + y="523.77856" + id="text4648-4" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650-4" + x="112.11968" + y="523.77856">QS</tspan></text> + <path + sodipodi:type="arc" + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652-7" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(-246.38346,377.72117)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <path + sodipodi:type="arc" + style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652-7-7" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(-103.65246,190.90878)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="254.85066" + y="336.96619" + id="text4648-4-3" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650-4-5" + x="254.85066" + y="336.96619">QS</tspan></text> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + d="m 470.93311,190.39903 0,504.24243" + id="path4497-5-6" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + d="m 616.22755,190.38323 0,504.24243" + id="path4497-5-2" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <g + id="g4433-6" + transform="translate(288.0964,78.32827)"> + <text + sodipodi:linespacing="125%" + id="text2993-7" + y="-261.66608" + x="440.12299" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + xml:space="preserve" + transform="matrix(0,1,-1,0,0,0)"><tspan + y="-261.66608" + x="440.12299" + id="tspan2995-1" + sodipodi:role="line">synchronize_rcu()</tspan></text> + <g + id="g4417-1" + transform="matrix(0,1,-1,0,730.90257,222.4928)"> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)" + d="M 97.580736,477.4048 328.5624,477.07246" + id="path2997-2" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + d="m 96.752718,465.38398 0,22.62742" + id="path4397-3" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + d="m 329.39039,465.38397 0,22.62742" + id="path4397-5-4" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> + </g> + </g> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="541.70508" + y="387.6217" + id="text4445-0" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4447-5" + x="541.70508" + y="387.6217">r3 = READ_ONCE(d);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="541.2406" + y="646.94611" + id="text4449-6" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4451-6" + x="541.2406" + y="646.94611">WRITE_ONCE(e, 1);</tspan></text> + <path + sodipodi:type="arc" + style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652-7-7-5" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(182.44393,281.23704)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="540.94702" + y="427.29443" + id="text4648-4-3-1" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650-4-5-7" + x="540.94702" + y="427.29443">QS</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="686.27747" + y="461.83929" + id="text4453-7" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4455-1" + x="686.27747" + y="461.83929">r4 = READ_ONCE(b);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="686.27747" + y="669.26422" + id="text4457-9" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4459-2" + x="686.27747" + y="669.26422">r5 = READ_ONCE(e);</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="686.27747" + y="445.04358" + id="text4429-8-9-33" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9-4-2" + x="686.27747" + y="445.04358">rcu_read_lock();</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="686.27747" + y="684.53094" + id="text4429-8-9-3-8" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4431-9-4-4-5" + x="686.27747" + y="684.53094">rcu_read_unlock();</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="685.11914" + y="422.79153" + id="text4648-9" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650-7" + x="685.11914" + y="422.79153">QS</tspan></text> + <path + sodipodi:type="arc" + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652-8" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(326.61602,276.73415)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="397.85934" + y="609.59003" + id="text4648-5" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650-77" + x="397.85934" + y="609.59003">QS</tspan></text> + <path + sodipodi:type="arc" + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652-80" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(39.356201,463.53264)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="256.75986" + y="586.99133" + id="text4648-5-2" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4650-77-7" + x="256.75986" + y="586.99133">QS</tspan></text> + <path + sodipodi:type="arc" + style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + id="path4652-80-5" + sodipodi:cx="358.85669" + sodipodi:cy="142.87541" + sodipodi:rx="10.960155" + sodipodi:ry="10.253048" + d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0" + transform="translate(-101.74328,440.93395)" + sodipodi:start="4.7135481" + sodipodi:end="10.994651" + sodipodi:open="true" /> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="546.22791" + y="213.91006" + id="text4461-2-5" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4463-2-6" + x="546.22791" + y="213.91006">thread3()</tspan></text> + <text + xml:space="preserve" + style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol" + x="684.00067" + y="213.91006" + id="text4461-2-1" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan4463-2-0" + x="684.00067" + y="213.91006">thread4()</tspan></text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html new file mode 100644 index 000000000000..a725f9900ec8 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -0,0 +1,2897 @@ +<!-- DO NOT HAND EDIT. --> +<!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' --> +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"> + <html> + <head><title>A Tour Through RCU's Requirements [LWN.net]</title> + <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> + +<h1>A Tour Through RCU's Requirements</h1> + +<p>Copyright IBM Corporation, 2015</p> +<p>Author: Paul E. McKenney</p> +<p><i>The initial version of this document appeared in the +<a href="https://lwn.net/">LWN</a> articles +<a href="https://lwn.net/Articles/652156/">here</a>, +<a href="https://lwn.net/Articles/652677/">here</a>, and +<a href="https://lwn.net/Articles/653326/">here</a>.</i></p> + +<h2>Introduction</h2> + +<p> +Read-copy update (RCU) is a synchronization mechanism that is often +used as a replacement for reader-writer locking. +RCU is unusual in that updaters do not block readers, +which means that RCU's read-side primitives can be exceedingly fast +and scalable. +In addition, updaters can make useful forward progress concurrently +with readers. +However, all this concurrency between RCU readers and updaters does raise +the question of exactly what RCU readers are doing, which in turn +raises the question of exactly what RCU's requirements are. + +<p> +This document therefore summarizes RCU's requirements, and can be thought +of as an informal, high-level specification for RCU. +It is important to understand that RCU's specification is primarily +empirical in nature; +in fact, I learned about many of these requirements the hard way. +This situation might cause some consternation, however, not only +has this learning process been a lot of fun, but it has also been +a great privilege to work with so many people willing to apply +technologies in interesting new ways. + +<p> +All that aside, here are the categories of currently known RCU requirements: +</p> + +<ol> +<li> <a href="#Fundamental Requirements"> + Fundamental Requirements</a> +<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> +<li> <a href="#Parallelism Facts of Life"> + Parallelism Facts of Life</a> +<li> <a href="#Quality-of-Implementation Requirements"> + Quality-of-Implementation Requirements</a> +<li> <a href="#Linux Kernel Complications"> + Linux Kernel Complications</a> +<li> <a href="#Software-Engineering Requirements"> + Software-Engineering Requirements</a> +<li> <a href="#Other RCU Flavors"> + Other RCU Flavors</a> +<li> <a href="#Possible Future Changes"> + Possible Future Changes</a> +</ol> + +<p> +This is followed by a <a href="#Summary">summary</a>, +which is in turn followed by the inevitable +<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. + +<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> + +<p> +RCU's fundamental requirements are the closest thing RCU has to hard +mathematical requirements. +These are: + +<ol> +<li> <a href="#Grace-Period Guarantee"> + Grace-Period Guarantee</a> +<li> <a href="#Publish-Subscribe Guarantee"> + Publish-Subscribe Guarantee</a> +<li> <a href="#Memory-Barrier Guarantees"> + Memory-Barrier Guarantees</a> +<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> + RCU Primitives Guaranteed to Execute Unconditionally</a> +<li> <a href="#Guaranteed Read-to-Write Upgrade"> + Guaranteed Read-to-Write Upgrade</a> +</ol> + +<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> + +<p> +RCU's grace-period guarantee is unusual in being premeditated: +Jack Slingwine and I had this guarantee firmly in mind when we started +work on RCU (then called “rclock”) in the early 1990s. +That said, the past two decades of experience with RCU have produced +a much more detailed understanding of this guarantee. + +<p> +RCU's grace-period guarantee allows updaters to wait for the completion +of all pre-existing RCU read-side critical sections. +An RCU read-side critical section +begins with the marker <tt>rcu_read_lock()</tt> and ends with +the marker <tt>rcu_read_unlock()</tt>. +These markers may be nested, and RCU treats a nested set as one +big RCU read-side critical section. +Production-quality implementations of <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> are extremely lightweight, and in +fact have exactly zero overhead in Linux kernels built for production +use with <tt>CONFIG_PREEMPT=n</tt>. + +<p> +This guarantee allows ordering to be enforced with extremely low +overhead to readers, for example: + +<blockquote> +<pre> + 1 int x, y; + 2 + 3 void thread0(void) + 4 { + 5 rcu_read_lock(); + 6 r1 = READ_ONCE(x); + 7 r2 = READ_ONCE(y); + 8 rcu_read_unlock(); + 9 } +10 +11 void thread1(void) +12 { +13 WRITE_ONCE(x, 1); +14 synchronize_rcu(); +15 WRITE_ONCE(y, 1); +16 } +</pre> +</blockquote> + +<p> +Because the <tt>synchronize_rcu()</tt> on line 14 waits for +all pre-existing readers, any instance of <tt>thread0()</tt> that +loads a value of zero from <tt>x</tt> must complete before +<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must +also load a value of zero from <tt>y</tt>. +Similarly, any instance of <tt>thread0()</tt> that loads a value of +one from <tt>y</tt> must have started after the +<tt>synchronize_rcu()</tt> started, and must therefore also load +a value of one from <tt>x</tt>. +Therefore, the outcome: +<blockquote> +<pre> +(r1 == 0 && r2 == 1) +</pre> +</blockquote> +cannot happen. + +<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a> +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +<tt>synchronize_rcu()</tt>!!! +Just who are you trying to fool??? +<br><a href="#qq1answer">Answer</a> + +<p> +This scenario resembles one of the first uses of RCU in +<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, +which managed a distributed lock manager's transition into +a state suitable for handling recovery from node failure, +more or less as follows: + +<blockquote> +<pre> + 1 #define STATE_NORMAL 0 + 2 #define STATE_WANT_RECOVERY 1 + 3 #define STATE_RECOVERING 2 + 4 #define STATE_WANT_NORMAL 3 + 5 + 6 int state = STATE_NORMAL; + 7 + 8 void do_something_dlm(void) + 9 { +10 int state_snap; +11 +12 rcu_read_lock(); +13 state_snap = READ_ONCE(state); +14 if (state_snap == STATE_NORMAL) +15 do_something(); +16 else +17 do_something_carefully(); +18 rcu_read_unlock(); +19 } +20 +21 void start_recovery(void) +22 { +23 WRITE_ONCE(state, STATE_WANT_RECOVERY); +24 synchronize_rcu(); +25 WRITE_ONCE(state, STATE_RECOVERING); +26 recovery(); +27 WRITE_ONCE(state, STATE_WANT_NORMAL); +28 synchronize_rcu(); +29 WRITE_ONCE(state, STATE_NORMAL); +30 } +</pre> +</blockquote> + +<p> +The RCU read-side critical section in <tt>do_something_dlm()</tt> +works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> +to guarantee that <tt>do_something()</tt> never runs concurrently +with <tt>recovery()</tt>, but with little or no synchronization +overhead in <tt>do_something_dlm()</tt>. + +<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a> +Why is the <tt>synchronize_rcu()</tt> on line 28 needed? +<br><a href="#qq2answer">Answer</a> + +<p> +In order to avoid fatal problems such as deadlocks, +an RCU read-side critical section must not contain calls to +<tt>synchronize_rcu()</tt>. +Similarly, an RCU read-side critical section must not +contain anything that waits, directly or indirectly, on completion of +an invocation of <tt>synchronize_rcu()</tt>. + +<p> +Although RCU's grace-period guarantee is useful in and of itself, with +<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, +it would be good to be able to use RCU to coordinate read-side +access to linked data structures. +For this, the grace-period guarantee is not sufficient, as can +be seen in function <tt>add_gp_buggy()</tt> below. +We will look at the reader's code later, but in the meantime, just think of +the reader as locklessly picking up the <tt>gp</tt> pointer, +and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the +<tt>->a</tt> and <tt>->b</tt> fields. + +<blockquote> +<pre> + 1 bool add_gp_buggy(int a, int b) + 2 { + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); + 4 if (!p) + 5 return -ENOMEM; + 6 spin_lock(&gp_lock); + 7 if (rcu_access_pointer(gp)) { + 8 spin_unlock(&gp_lock); + 9 return false; +10 } +11 p->a = a; +12 p->b = a; +13 gp = p; /* ORDERING BUG */ +14 spin_unlock(&gp_lock); +15 return true; +16 } +</pre> +</blockquote> + +<p> +The problem is that both the compiler and weakly ordered CPUs are within +their rights to reorder this code as follows: + +<blockquote> +<pre> + 1 bool add_gp_buggy_optimized(int a, int b) + 2 { + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); + 4 if (!p) + 5 return -ENOMEM; + 6 spin_lock(&gp_lock); + 7 if (rcu_access_pointer(gp)) { + 8 spin_unlock(&gp_lock); + 9 return false; +10 } +<b>11 gp = p; /* ORDERING BUG */ +12 p->a = a; +13 p->b = a;</b> +14 spin_unlock(&gp_lock); +15 return true; +16 } +</pre> +</blockquote> + +<p> +If an RCU reader fetches <tt>gp</tt> just after +<tt>add_gp_buggy_optimized</tt> executes line 11, +it will see garbage in the <tt>->a</tt> and <tt>->b</tt> +fields. +And this is but one of many ways in which compiler and hardware optimizations +could cause trouble. +Therefore, we clearly need some way to prevent the compiler and the CPU from +reordering in this manner, which brings us to the publish-subscribe +guarantee discussed in the next section. + +<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> + +<p> +RCU's publish-subscribe guarantee allows data to be inserted +into a linked data structure without disrupting RCU readers. +The updater uses <tt>rcu_assign_pointer()</tt> to insert the +new data, and readers use <tt>rcu_dereference()</tt> to +access data, whether new or old. +The following shows an example of insertion: + +<blockquote> +<pre> + 1 bool add_gp(int a, int b) + 2 { + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); + 4 if (!p) + 5 return -ENOMEM; + 6 spin_lock(&gp_lock); + 7 if (rcu_access_pointer(gp)) { + 8 spin_unlock(&gp_lock); + 9 return false; +10 } +11 p->a = a; +12 p->b = a; +13 rcu_assign_pointer(gp, p); +14 spin_unlock(&gp_lock); +15 return true; +16 } +</pre> +</blockquote> + +<p> +The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually +equivalent to a simple assignment statement, but also guarantees +that its assignment will +happen after the two assignments in lines 11 and 12, +similar to the C11 <tt>memory_order_release</tt> store operation. +It also prevents any number of “interesting” compiler +optimizations, for example, the use of <tt>gp</tt> as a scratch +location immediately preceding the assignment. + +<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a> +But <tt>rcu_assign_pointer()</tt> does nothing to prevent the +two assignments to <tt>p->a</tt> and <tt>p->b</tt> +from being reordered. +Can't that also cause problems? +<br><a href="#qq3answer">Answer</a> + +<p> +It is tempting to assume that the reader need not do anything special +to control its accesses to the RCU-protected data, +as shown in <tt>do_something_gp_buggy()</tt> below: + +<blockquote> +<pre> + 1 bool do_something_gp_buggy(void) + 2 { + 3 rcu_read_lock(); + 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ + 5 if (p) { + 6 do_something(p->a, p->b); + 7 rcu_read_unlock(); + 8 return true; + 9 } +10 rcu_read_unlock(); +11 return false; +12 } +</pre> +</blockquote> + +<p> +However, this temptation must be resisted because there are a +surprisingly large number of ways that the compiler +(to say nothing of +<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) +can trip this code up. +For but one example, if the compiler were short of registers, it +might choose to refetch from <tt>gp</tt> rather than keeping +a separate copy in <tt>p</tt> as follows: + +<blockquote> +<pre> + 1 bool do_something_gp_buggy_optimized(void) + 2 { + 3 rcu_read_lock(); + 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ +<b> 5 do_something(gp->a, gp->b);</b> + 6 rcu_read_unlock(); + 7 return true; + 8 } + 9 rcu_read_unlock(); +10 return false; +11 } +</pre> +</blockquote> + +<p> +If this function ran concurrently with a series of updates that +replaced the current structure with a new one, +the fetches of <tt>gp->a</tt> +and <tt>gp->b</tt> might well come from two different structures, +which could cause serious confusion. +To prevent this (and much else besides), <tt>do_something_gp()</tt> uses +<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: + +<blockquote> +<pre> + 1 bool do_something_gp(void) + 2 { + 3 rcu_read_lock(); + 4 p = rcu_dereference(gp); + 5 if (p) { + 6 do_something(p->a, p->b); + 7 rcu_read_unlock(); + 8 return true; + 9 } +10 rcu_read_unlock(); +11 return false; +12 } +</pre> +</blockquote> + +<p> +The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) +memory barriers in the Linux kernel. +Should a +<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> +ever appear, then <tt>rcu_dereference()</tt> could be implemented +as a <tt>memory_order_consume</tt> load. +Regardless of the exact implementation, a pointer fetched by +<tt>rcu_dereference()</tt> may not be used outside of the +outermost RCU read-side critical section containing that +<tt>rcu_dereference()</tt>, unless protection of +the corresponding data element has been passed from RCU to some +other synchronization mechanism, most commonly locking or +<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. + +<p> +In short, updaters use <tt>rcu_assign_pointer()</tt> and readers +use <tt>rcu_dereference()</tt>, and these two RCU API elements +work together to ensure that readers have a consistent view of +newly added data elements. + +<p> +Of course, it is also necessary to remove elements from RCU-protected +data structures, for example, using the following process: + +<ol> +<li> Remove the data element from the enclosing structure. +<li> Wait for all pre-existing RCU read-side critical sections + to complete (because only pre-existing readers can possibly have + a reference to the newly removed data element). +<li> At this point, only the updater has a reference to the + newly removed data element, so it can safely reclaim + the data element, for example, by passing it to <tt>kfree()</tt>. +</ol> + +This process is implemented by <tt>remove_gp_synchronous()</tt>: + +<blockquote> +<pre> + 1 bool remove_gp_synchronous(void) + 2 { + 3 struct foo *p; + 4 + 5 spin_lock(&gp_lock); + 6 p = rcu_access_pointer(gp); + 7 if (!p) { + 8 spin_unlock(&gp_lock); + 9 return false; +10 } +11 rcu_assign_pointer(gp, NULL); +12 spin_unlock(&gp_lock); +13 synchronize_rcu(); +14 kfree(p); +15 return true; +16 } +</pre> +</blockquote> + +<p> +This function is straightforward, with line 13 waiting for a grace +period before line 14 frees the old data element. +This waiting ensures that readers will reach line 7 of +<tt>do_something_gp()</tt> before the data element referenced by +<tt>p</tt> is freed. +The <tt>rcu_access_pointer()</tt> on line 6 is similar to +<tt>rcu_dereference()</tt>, except that: + +<ol> +<li> The value returned by <tt>rcu_access_pointer()</tt> + cannot be dereferenced. + If you want to access the value pointed to as well as + the pointer itself, use <tt>rcu_dereference()</tt> + instead of <tt>rcu_access_pointer()</tt>. +<li> The call to <tt>rcu_access_pointer()</tt> need not be + protected. + In contrast, <tt>rcu_dereference()</tt> must either be + within an RCU read-side critical section or in a code + segment where the pointer cannot change, for example, in + code protected by the corresponding update-side lock. +</ol> + +<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a> +Without the <tt>rcu_dereference()</tt> or the +<tt>rcu_access_pointer()</tt>, what destructive optimizations +might the compiler make use of? +<br><a href="#qq4answer">Answer</a> + +<p> +In short, RCU's publish-subscribe guarantee is provided by the combination +of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +<p> +This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling <tt>rcu_dereference()</tt> for subscription, nor did it +have anything resembling the <tt>smp_read_barrier_depends()</tt> +that was later subsumed into <tt>rcu_dereference()</tt>. +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good <i>two</i> hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees have provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting <tt>rcu_dereference()</tt>! + +<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3> + +<p> +The previous section's simple linked-data-structure scenario clearly +demonstrates the need for RCU's stringent memory-ordering guarantees on +systems with more than one CPU: + +<ol> +<li> Each CPU that has an RCU read-side critical section that + begins before <tt>synchronize_rcu()</tt> starts is + guaranteed to execute a full memory barrier between the time + that the RCU read-side critical section ends and the time that + <tt>synchronize_rcu()</tt> returns. + Without this guarantee, a pre-existing RCU read-side critical section + might hold a reference to the newly removed <tt>struct foo</tt> + after the <tt>kfree()</tt> on line 14 of + <tt>remove_gp_synchronous()</tt>. +<li> Each CPU that has an RCU read-side critical section that ends + after <tt>synchronize_rcu()</tt> returns is guaranteed + to execute a full memory barrier between the time that + <tt>synchronize_rcu()</tt> begins and the time that the RCU + read-side critical section begins. + Without this guarantee, a later RCU read-side critical section + running after the <tt>kfree()</tt> on line 14 of + <tt>remove_gp_synchronous()</tt> might + later run <tt>do_something_gp()</tt> and find the + newly deleted <tt>struct foo</tt>. +<li> If the task invoking <tt>synchronize_rcu()</tt> remains + on a given CPU, then that CPU is guaranteed to execute a full + memory barrier sometime during the execution of + <tt>synchronize_rcu()</tt>. + This guarantee ensures that the <tt>kfree()</tt> on + line 14 of <tt>remove_gp_synchronous()</tt> really does + execute after the removal on line 11. +<li> If the task invoking <tt>synchronize_rcu()</tt> migrates + among a group of CPUs during that invocation, then each of the + CPUs in that group is guaranteed to execute a full memory barrier + sometime during the execution of <tt>synchronize_rcu()</tt>. + This guarantee also ensures that the <tt>kfree()</tt> on + line 14 of <tt>remove_gp_synchronous()</tt> really does + execute after the removal on + line 11, but also in the case where the thread executing the + <tt>synchronize_rcu()</tt> migrates in the meantime. +</ol> + +<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a> +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of <tt>synchronize_rcu()</tt>? +<br><a href="#qq5answer">Answer</a> + +<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a> +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers <i> really</i> required? +<br><a href="#qq6answer">Answer</a> + +<p> +Note that these memory-barrier requirements do not replace the fundamental +RCU requirement that a grace period wait for all pre-existing readers. +On the contrary, the memory barriers called out in this section must operate in +such a way as to <i>enforce</i> this fundamental requirement. +Of course, different implementations enforce this requirement in different +ways, but enforce it they must. + +<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> + +<p> +The common-case RCU primitives are unconditional. +They are invoked, they do their job, and they return, with no possibility +of error, and no need to retry. +This is a key RCU design philosophy. + +<p> +However, this philosophy is pragmatic rather than pigheaded. +If someone comes up with a good justification for a particular conditional +RCU primitive, it might well be implemented and added. +After all, this guarantee was reverse-engineered, not premeditated. +The unconditional nature of the RCU primitives was initially an +accident of implementation, and later experience with synchronization +primitives with conditional primitives caused me to elevate this +accident to a guarantee. +Therefore, the justification for adding a conditional primitive to +RCU would need to be based on detailed and compelling use cases. + +<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> + +<p> +As far as RCU is concerned, it is always possible to carry out an +update within an RCU read-side critical section. +For example, that RCU read-side critical section might search for +a given data element, and then might acquire the update-side +spinlock in order to update that element, all while remaining +in that RCU read-side critical section. +Of course, it is necessary to exit the RCU read-side critical section +before invoking <tt>synchronize_rcu()</tt>, however, this +inconvenience can be avoided through use of the +<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members +described later in this document. + +<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a> +But how does the upgrade-to-write operation exclude other readers? +<br><a href="#qq7answer">Answer</a> + +<p> +This guarantee allows lookup code to be shared between read-side +and update-side code, and was premeditated, appearing in the earliest +DYNIX/ptx RCU documentation. + +<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> + +<p> +RCU provides extremely lightweight readers, and its read-side guarantees, +though quite useful, are correspondingly lightweight. +It is therefore all too easy to assume that RCU is guaranteeing more +than it really is. +Of course, the list of things that RCU does not guarantee is infinitely +long, however, the following sections list a few non-guarantees that +have caused confusion. +Except where otherwise noted, these non-guarantees were premeditated. + +<ol> +<li> <a href="#Readers Impose Minimal Ordering"> + Readers Impose Minimal Ordering</a> +<li> <a href="#Readers Do Not Exclude Updaters"> + Readers Do Not Exclude Updaters</a> +<li> <a href="#Updaters Only Wait For Old Readers"> + Updaters Only Wait For Old Readers</a> +<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> + Grace Periods Don't Partition Read-Side Critical Sections</a> +<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> + Read-Side Critical Sections Don't Partition Grace Periods</a> +<li> <a href="#Disabling Preemption Does Not Block Grace Periods"> + Disabling Preemption Does Not Block Grace Periods</a> +</ol> + +<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> + +<p> +Reader-side markers such as <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees +except through their interaction with the grace-period APIs such as +<tt>synchronize_rcu()</tt>. +To see this, consider the following pair of threads: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 WRITE_ONCE(x, 1); + 5 rcu_read_unlock(); + 6 rcu_read_lock(); + 7 WRITE_ONCE(y, 1); + 8 rcu_read_unlock(); + 9 } +10 +11 void thread1(void) +12 { +13 rcu_read_lock(); +14 r1 = READ_ONCE(y); +15 rcu_read_unlock(); +16 rcu_read_lock(); +17 r2 = READ_ONCE(x); +18 rcu_read_unlock(); +19 } +</pre> +</blockquote> + +<p> +After <tt>thread0()</tt> and <tt>thread1()</tt> execute +concurrently, it is quite possible to have + +<blockquote> +<pre> +(r1 == 1 && r2 == 0) +</pre> +</blockquote> + +(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), +which would not be possible if <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> had much in the way of ordering +properties. +But they do not, so the CPU is within its rights +to do significant reordering. +This is by design: Any significant ordering constraints would slow down +these fast-path APIs. + +<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a> +Can't the compiler also reorder this code? +<br><a href="#qq8answer">Answer</a> + +<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> + +<p> +Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> +exclude updates. +All they do is to prevent grace periods from ending. +The following example illustrates this: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 r1 = READ_ONCE(y); + 5 if (r1) { + 6 do_something_with_nonzero_x(); + 7 r2 = READ_ONCE(x); + 8 WARN_ON(!r2); /* BUG!!! */ + 9 } +10 rcu_read_unlock(); +11 } +12 +13 void thread1(void) +14 { +15 spin_lock(&my_lock); +16 WRITE_ONCE(x, 1); +17 WRITE_ONCE(y, 1); +18 spin_unlock(&my_lock); +19 } +</pre> +</blockquote> + +<p> +If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> +excluded the <tt>thread1()</tt> function's update, +the <tt>WARN_ON()</tt> could never fire. +But the fact is that <tt>rcu_read_lock()</tt> does not exclude +much of anything aside from subsequent grace periods, of which +<tt>thread1()</tt> has none, so the +<tt>WARN_ON()</tt> can and does fire. + +<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> + +<p> +It might be tempting to assume that after <tt>synchronize_rcu()</tt> +completes, there are no readers executing. +This temptation must be avoided because +new readers can start immediately after <tt>synchronize_rcu()</tt> +starts, and <tt>synchronize_rcu()</tt> is under no +obligation to wait for these new readers. + +<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a> +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? +<br><a href="#qq9answer">Answer</a> + +<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> +Grace Periods Don't Partition Read-Side Critical Sections</a></h3> + +<p> +It is tempting to assume that if any part of one RCU read-side critical +section precedes a given grace period, and if any part of another RCU +read-side critical section follows that same grace period, then all of +the first RCU read-side critical section must precede all of the second. +However, this just isn't the case: A single grace period does not +partition the set of RCU read-side critical sections. +An example of this situation can be illustrated as follows, where +<tt>x</tt>, <tt>y</tt>, and <tt>z</tt> are initially all zero: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 WRITE_ONCE(a, 1); + 5 WRITE_ONCE(b, 1); + 6 rcu_read_unlock(); + 7 } + 8 + 9 void thread1(void) +10 { +11 r1 = READ_ONCE(a); +12 synchronize_rcu(); +13 WRITE_ONCE(c, 1); +14 } +15 +16 void thread2(void) +17 { +18 rcu_read_lock(); +19 r2 = READ_ONCE(b); +20 r3 = READ_ONCE(c); +21 rcu_read_unlock(); +22 } +</pre> +</blockquote> + +<p> +It turns out that the outcome: + +<blockquote> +<pre> +(r1 == 1 && r2 == 0 && r3 == 1) +</pre> +</blockquote> + +is entirely possible. +The following figure show how this can happen, with each circled +<tt>QS</tt> indicating the point at which RCU recorded a +<i>quiescent state</i> for each thread, that is, a state in which +RCU knows that the thread cannot be in the midst of an RCU read-side +critical section that started before the current grace period: + +<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> + +<p> +If it is necessary to partition RCU read-side critical sections in this +manner, it is necessary to use two grace periods, where the first +grace period is known to end before the second grace period starts: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 WRITE_ONCE(a, 1); + 5 WRITE_ONCE(b, 1); + 6 rcu_read_unlock(); + 7 } + 8 + 9 void thread1(void) +10 { +11 r1 = READ_ONCE(a); +12 synchronize_rcu(); +13 WRITE_ONCE(c, 1); +14 } +15 +16 void thread2(void) +17 { +18 r2 = READ_ONCE(c); +19 synchronize_rcu(); +20 WRITE_ONCE(d, 1); +21 } +22 +23 void thread3(void) +24 { +25 rcu_read_lock(); +26 r3 = READ_ONCE(b); +27 r4 = READ_ONCE(d); +28 rcu_read_unlock(); +29 } +</pre> +</blockquote> + +<p> +Here, if <tt>(r1 == 1)</tt>, then +<tt>thread0()</tt>'s write to <tt>b</tt> must happen +before the end of <tt>thread1()</tt>'s grace period. +If in addition <tt>(r4 == 1)</tt>, then +<tt>thread3()</tt>'s read from <tt>b</tt> must happen +after the beginning of <tt>thread2()</tt>'s grace period. +If it is also the case that <tt>(r2 == 1)</tt>, then the +end of <tt>thread1()</tt>'s grace period must precede the +beginning of <tt>thread2()</tt>'s grace period. +This mean that the two RCU read-side critical sections cannot overlap, +guaranteeing that <tt>(r3 == 1)</tt>. +As a result, the outcome: + +<blockquote> +<pre> +(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) +</pre> +</blockquote> + +cannot happen. + +<p> +This non-requirement was also non-premeditated, but became apparent +when studying RCU's interaction with memory ordering. + +<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> +Read-Side Critical Sections Don't Partition Grace Periods</a></h3> + +<p> +It is also tempting to assume that if an RCU read-side critical section +happens between a pair of grace periods, then those grace periods cannot +overlap. +However, this temptation leads nowhere good, as can be illustrated by +the following, with all variables initially zero: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 WRITE_ONCE(a, 1); + 5 WRITE_ONCE(b, 1); + 6 rcu_read_unlock(); + 7 } + 8 + 9 void thread1(void) +10 { +11 r1 = READ_ONCE(a); +12 synchronize_rcu(); +13 WRITE_ONCE(c, 1); +14 } +15 +16 void thread2(void) +17 { +18 rcu_read_lock(); +19 WRITE_ONCE(d, 1); +20 r2 = READ_ONCE(c); +21 rcu_read_unlock(); +22 } +23 +24 void thread3(void) +25 { +26 r3 = READ_ONCE(d); +27 synchronize_rcu(); +28 WRITE_ONCE(e, 1); +29 } +30 +31 void thread4(void) +32 { +33 rcu_read_lock(); +34 r4 = READ_ONCE(b); +35 r5 = READ_ONCE(e); +36 rcu_read_unlock(); +37 } +</pre> +</blockquote> + +<p> +In this case, the outcome: + +<blockquote> +<pre> +(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) +</pre> +</blockquote> + +is entirely possible, as illustrated below: + +<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> + +<p> +Again, an RCU read-side critical section can overlap almost all of a +given grace period, just so long as it does not overlap the entire +grace period. +As a result, an RCU read-side critical section cannot partition a pair +of RCU grace periods. + +<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a> +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? +<br><a href="#qq10answer">Answer</a> + +<h3><a name="Disabling Preemption Does Not Block Grace Periods"> +Disabling Preemption Does Not Block Grace Periods</a></h3> + +<p> +There was a time when disabling preemption on any given CPU would block +subsequent grace periods. +However, this was an accident of implementation and is not a requirement. +And in the current Linux-kernel implementation, disabling preemption +on a given CPU in fact does not block grace periods, as Oleg Nesterov +<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. + +<p> +If you need a preempt-disable region to block grace periods, you need to add +<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example +as follows: + +<blockquote> +<pre> + 1 preempt_disable(); + 2 rcu_read_lock(); + 3 do_something(); + 4 rcu_read_unlock(); + 5 preempt_enable(); + 6 + 7 /* Spinlocks implicitly disable preemption. */ + 8 spin_lock(&mylock); + 9 rcu_read_lock(); +10 do_something(); +11 rcu_read_unlock(); +12 spin_unlock(&mylock); +</pre> +</blockquote> + +<p> +In theory, you could enter the RCU read-side critical section first, +but it is more efficient to keep the entire RCU read-side critical +section contained in the preempt-disable region as shown above. +Of course, RCU read-side critical sections that extend outside of +preempt-disable regions will work correctly, but such critical sections +can be preempted, which forces <tt>rcu_read_unlock()</tt> to do +more work. +And no, this is <i>not</i> an invitation to enclose all of your RCU +read-side critical sections within preempt-disable regions, because +doing so would degrade real-time response. + +<p> +This non-requirement appeared with preemptible RCU. +If you need a grace period that waits on non-preemptible code regions, use +<a href="#Sched Flavor">RCU-sched</a>. + +<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> + +<p> +These parallelism facts of life are by no means specific to RCU, but +the RCU implementation must abide by them. +They therefore bear repeating: + +<ol> +<li> Any CPU or task may be delayed at any time, + and any attempts to avoid these delays by disabling + preemption, interrupts, or whatever are completely futile. + This is most obvious in preemptible user-level + environments and in virtualized environments (where + a given guest OS's VCPUs can be preempted at any time by + the underlying hypervisor), but can also happen in bare-metal + environments due to ECC errors, NMIs, and other hardware + events. + Although a delay of more than about 20 seconds can result + in splats, the RCU implementation is obligated to use + algorithms that can tolerate extremely long delays, but where + “extremely long” is not long enough to allow + wrap-around when incrementing a 64-bit counter. +<li> Both the compiler and the CPU can reorder memory accesses. + Where it matters, RCU must use compiler directives and + memory-barrier instructions to preserve ordering. +<li> Conflicting writes to memory locations in any given cache line + will result in expensive cache misses. + Greater numbers of concurrent writes and more-frequent + concurrent writes will result in more dramatic slowdowns. + RCU is therefore obligated to use algorithms that have + sufficient locality to avoid significant performance and + scalability problems. +<li> As a rough rule of thumb, only one CPU's worth of processing + may be carried out under the protection of any given exclusive + lock. + RCU must therefore use scalable locking designs. +<li> Counters are finite, especially on 32-bit systems. + RCU's use of counters must therefore tolerate counter wrap, + or be designed such that counter wrap would take way more + time than a single system is likely to run. + An uptime of ten years is quite possible, a runtime + of a century much less so. + As an example of the latter, RCU's dyntick-idle nesting counter + allows 54 bits for interrupt nesting level (this counter + is 64 bits even on a 32-bit system). + Overflowing this counter requires 2<sup>54</sup> + half-interrupts on a given CPU without that CPU ever going idle. + If a half-interrupt happened every microsecond, it would take + 570 years of runtime to overflow this counter, which is currently + believed to be an acceptably long time. +<li> Linux systems can have thousands of CPUs running a single + Linux kernel in a single shared-memory environment. + RCU must therefore pay close attention to high-end scalability. +</ol> + +<p> +This last parallelism fact of life means that RCU must pay special +attention to the preceding facts of life. +The idea that Linux might scale to systems with thousands of CPUs would +have been met with some skepticism in the 1990s, but these requirements +would have otherwise have been unsurprising, even in the early 1990s. + +<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> + +<p> +These sections list quality-of-implementation requirements. +Although an RCU implementation that ignores these requirements could +still be used, it would likely be subject to limitations that would +make it inappropriate for industrial-strength production use. +Classes of quality-of-implementation requirements are as follows: + +<ol> +<li> <a href="#Specialization">Specialization</a> +<li> <a href="#Performance and Scalability">Performance and Scalability</a> +<li> <a href="#Composability">Composability</a> +<li> <a href="#Corner Cases">Corner Cases</a> +</ol> + +<p> +These classes is covered in the following sections. + +<h3><a name="Specialization">Specialization</a></h3> + +<p> +RCU is and always has been intended primarily for read-mostly situations, as +illustrated by the following figure. +This means that RCU's read-side primitives are optimized, often at the +expense of its update-side primitives. + +<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> + +<p> +This focus on read-mostly situations means that RCU must interoperate +with other synchronization primitives. +For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> +examples discussed earlier use RCU to protect readers and locking to +coordinate updaters. +However, the need extends much farther, requiring that a variety of +synchronization primitives be legal within RCU read-side critical sections, +including spinlocks, sequence locks, atomic operations, reference +counters, and memory barriers. + +<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a> +What about sleeping locks? +<br><a href="#qq11answer">Answer</a> + +<p> +It often comes as a surprise that many algorithms do not require a +consistent view of data, but many can function in that mode, +with network routing being the poster child. +Internet routing algorithms take significant time to propagate +updates, so that by the time an update arrives at a given system, +that system has been sending network traffic the wrong way for +a considerable length of time. +Having a few threads continue to send traffic the wrong way for a +few more milliseconds is clearly not a problem: In the worst case, +TCP retransmissions will eventually get the data where it needs to go. +In general, when tracking the state of the universe outside of the +computer, some level of inconsistency must be tolerated due to +speed-of-light delays if nothing else. + +<p> +Furthermore, uncertainty about external state is inherent in many cases. +For example, a pair of veternarians might use heartbeat to determine +whether or not a given cat was alive. +But how long should they wait after the last heartbeat to decide that +the cat is in fact dead? +Waiting less than 400 milliseconds makes no sense because this would +mean that a relaxed cat would be considered to cycle between death +and life more than 100 times per minute. +Moreover, just as with human beings, a cat's heart might stop for +some period of time, so the exact wait period is a judgment call. +One of our pair of veternarians might wait 30 seconds before pronouncing +the cat dead, while the other might insist on waiting a full minute. +The two veternarians would then disagree on the state of the cat during +the final 30 seconds of the minute following the last heartbeat, as +fancifully illustrated below: + +<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> + +<p> +Interestingly enough, this same situation applies to hardware. +When push comes to shove, how do we tell whether or not some +external server has failed? +We send messages to it periodically, and declare it failed if we +don't receive a response within a given period of time. +Policy decisions can usually tolerate short +periods of inconsistency. +The policy was decided some time ago, and is only now being put into +effect, so a few milliseconds of delay is normally inconsequential. + +<p> +However, there are algorithms that absolutely must see consistent data. +For example, the translation between a user-level SystemV semaphore +ID to the corresponding in-kernel data structure is protected by RCU, +but it is absolutely forbidden to update a semaphore that has just been +removed. +In the Linux kernel, this need for consistency is accommodated by acquiring +spinlocks located in the in-kernel data structure from within +the RCU read-side critical section, and this is indicated by the +green box in the figure above. +Many other techniques may be used, and are in fact used within the +Linux kernel. + +<p> +In short, RCU is not required to maintain consistency, and other +mechanisms may be used in concert with RCU when consistency is required. +RCU's specialization allows it to do its job extremely well, and its +ability to interoperate with other synchronization mechanisms allows +the right mix of synchronization tools to be used for a given job. + +<h3><a name="Performance and Scalability">Performance and Scalability</a></h3> + +<p> +Energy efficiency is a critical component of performance today, +and Linux-kernel RCU implementations must therefore avoid unnecessarily +awakening idle CPUs. +I cannot claim that this requirement was premeditated. +In fact, I learned of it during a telephone conversation in which I +was given “frank and open” feedback on the importance +of energy efficiency in battery-powered systems and on specific +energy-efficiency shortcomings of the Linux-kernel RCU implementation. +In my experience, the battery-powered embedded community will consider +any unnecessary wakeups to be extremely unfriendly acts. +So much so that mere Linux-kernel-mailing-list posts are +insufficient to vent their ire. + +<p> +Memory consumption is not particularly important for in most +situations, and has become decreasingly +so as memory sizes have expanded and memory +costs have plummeted. +However, as I learned from Matt Mackall's +<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> +efforts, memory footprint is critically important on single-CPU systems with +non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus +<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> +was born. +Josh Triplett has since taken over the small-memory banner with his +<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> +project, which resulted in +<a href="#Sleepable RCU">SRCU</a> +becoming optional for those kernels not needing it. + +<p> +The remaining performance requirements are, for the most part, +unsurprising. +For example, in keeping with RCU's read-side specialization, +<tt>rcu_dereference()</tt> should have negligible overhead (for +example, suppression of a few minor compiler optimizations). +Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> should have exactly zero overhead. + +<p> +In preemptible environments, in the case where the RCU read-side +critical section was not preempted (as will be the case for the +highest-priority real-time process), <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> should have minimal overhead. +In particular, they should not contain atomic read-modify-write +operations, memory-barrier instructions, preemption disabling, +interrupt disabling, or backwards branches. +However, in the case where the RCU read-side critical section was preempted, +<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. +This is why it is better to nest an RCU read-side critical section +within a preempt-disable region than vice versa, at least in cases +where that critical section is short enough to avoid unduly degrading +real-time latencies. + +<p> +The <tt>synchronize_rcu()</tt> grace-period-wait primitive is +optimized for throughput. +It may therefore incur several milliseconds of latency in addition to +the duration of the longest RCU read-side critical section. +On the other hand, multiple concurrent invocations of +<tt>synchronize_rcu()</tt> are required to use batching optimizations +so that they can be satisfied by a single underlying grace-period-wait +operation. +For example, in the Linux kernel, it is not unusual for a single +grace-period-wait operation to serve more than +<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> +of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation +overhead down to nearly zero. +However, the grace-period optimization is also required to avoid +measurable degradation of real-time scheduling and interrupt latencies. + +<p> +In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> +latencies are unacceptable. +In these cases, <tt>synchronize_rcu_expedited()</tt> may be used +instead, reducing the grace-period latency down to a few tens of +microseconds on small systems, at least in cases where the RCU read-side +critical sections are short. +There are currently no special latency requirements for +<tt>synchronize_rcu_expedited()</tt> on large systems, but, +consistent with the empirical nature of the RCU specification, +that is subject to change. +However, there most definitely are scalability requirements: +A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 +CPUs should at least make reasonable forward progress. +In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> +is permitted to impose modest degradation of real-time latency +on non-idle online CPUs. +That said, it will likely be necessary to take further steps to reduce this +degradation, hopefully to roughly that of a scheduling-clock interrupt. + +<p> +There are a number of situations where even +<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period +latency is unacceptable. +In these situations, the asynchronous <tt>call_rcu()</tt> can be +used in place of <tt>synchronize_rcu()</tt> as follows: + +<blockquote> +<pre> + 1 struct foo { + 2 int a; + 3 int b; + 4 struct rcu_head rh; + 5 }; + 6 + 7 static void remove_gp_cb(struct rcu_head *rhp) + 8 { + 9 struct foo *p = container_of(rhp, struct foo, rh); +10 +11 kfree(p); +12 } +13 +14 bool remove_gp_asynchronous(void) +15 { +16 struct foo *p; +17 +18 spin_lock(&gp_lock); +19 p = rcu_dereference(gp); +20 if (!p) { +21 spin_unlock(&gp_lock); +22 return false; +23 } +24 rcu_assign_pointer(gp, NULL); +25 call_rcu(&p->rh, remove_gp_cb); +26 spin_unlock(&gp_lock); +27 return true; +28 } +</pre> +</blockquote> + +<p> +A definition of <tt>struct foo</tt> is finally needed, and appears +on lines 1-5. +The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> +on line 25, and will be invoked after the end of a subsequent +grace period. +This gets the same effect as <tt>remove_gp_synchronous()</tt>, +but without forcing the updater to wait for a grace period to elapse. +The <tt>call_rcu()</tt> function may be used in a number of +situations where neither <tt>synchronize_rcu()</tt> nor +<tt>synchronize_rcu_expedited()</tt> would be legal, +including within preempt-disable code, <tt>local_bh_disable()</tt> code, +interrupt-disable code, and interrupt handlers. +However, even <tt>call_rcu()</tt> is illegal within NMI handlers. +The callback function (<tt>remove_gp_cb()</tt> in this case) will be +executed within softirq (software interrupt) environment within the +Linux kernel, +either within a real softirq handler or under the protection +of <tt>local_bh_disable()</tt>. +In both the Linux kernel and in userspace, it is bad practice to +write an RCU callback function that takes too long. +Long-running operations should be relegated to separate threads or +(in the Linux kernel) workqueues. + +<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a> +Why does line 19 use <tt>rcu_access_pointer()</tt>? +After all, <tt>call_rcu()</tt> on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that <tt>rcu_dereference()</tt> is required? +<br><a href="#qq12answer">Answer</a> + +<p> +However, all that <tt>remove_gp_cb()</tt> is doing is +invoking <tt>kfree()</tt> on the data element. +This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, +which allows “fire and forget” operation as shown below: + +<blockquote> +<pre> + 1 struct foo { + 2 int a; + 3 int b; + 4 struct rcu_head rh; + 5 }; + 6 + 7 bool remove_gp_faf(void) + 8 { + 9 struct foo *p; +10 +11 spin_lock(&gp_lock); +12 p = rcu_dereference(gp); +13 if (!p) { +14 spin_unlock(&gp_lock); +15 return false; +16 } +17 rcu_assign_pointer(gp, NULL); +18 kfree_rcu(p, rh); +19 spin_unlock(&gp_lock); +20 return true; +21 } +</pre> +</blockquote> + +<p> +Note that <tt>remove_gp_faf()</tt> simply invokes +<tt>kfree_rcu()</tt> and proceeds, without any need to pay any +further attention to the subsequent grace period and <tt>kfree()</tt>. +It is permissible to invoke <tt>kfree_rcu()</tt> from the same +environments as for <tt>call_rcu()</tt>. +Interestingly enough, DYNIX/ptx had the equivalents of +<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not +<tt>synchronize_rcu()</tt>. +This was due to the fact that RCU was not heavily used within DYNIX/ptx, +so the very few places that needed something like +<tt>synchronize_rcu()</tt> simply open-coded it. + +<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a> +Earlier it was claimed that <tt>call_rcu()</tt> and +<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? +<br><a href="#qq13answer">Answer</a> + +<p> +But what if the updater must wait for the completion of code to be +executed after the end of the grace period, but has other tasks +that can be carried out in the meantime? +The polling-style <tt>get_state_synchronize_rcu()</tt> and +<tt>cond_synchronize_rcu()</tt> functions may be used for this +purpose, as shown below: + +<blockquote> +<pre> + 1 bool remove_gp_poll(void) + 2 { + 3 struct foo *p; + 4 unsigned long s; + 5 + 6 spin_lock(&gp_lock); + 7 p = rcu_access_pointer(gp); + 8 if (!p) { + 9 spin_unlock(&gp_lock); +10 return false; +11 } +12 rcu_assign_pointer(gp, NULL); +13 spin_unlock(&gp_lock); +14 s = get_state_synchronize_rcu(); +15 do_something_while_waiting(); +16 cond_synchronize_rcu(s); +17 kfree(p); +18 return true; +19 } +</pre> +</blockquote> + +<p> +On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a +“cookie” from RCU, +then line 15 carries out other tasks, +and finally, line 16 returns immediately if a grace period has +elapsed in the meantime, but otherwise waits as required. +The need for <tt>get_state_synchronize_rcu</tt> and +<tt>cond_synchronize_rcu()</tt> has appeared quite recently, +so it is too early to tell whether they will stand the test of time. + +<p> +RCU thus provides a range of tools to allow updaters to strike the +required tradeoff between latency, flexibility and CPU overhead. + +<h3><a name="Composability">Composability</a></h3> + +<p> +Composability has received much attention in recent years, perhaps in part +due to the collision of multicore hardware with object-oriented techniques +designed in single-threaded environments for single-threaded use. +And in theory, RCU read-side critical sections may be composed, and in +fact may be nested arbitrarily deeply. +In practice, as with all real-world implementations of composable +constructs, there are limitations. + +<p> +Implementations of RCU for which <tt>rcu_read_lock()</tt> +and <tt>rcu_read_unlock()</tt> generate no code, such as +Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be +nested arbitrarily deeply. +After all, there is no overhead. +Except that if all these instances of <tt>rcu_read_lock()</tt> +and <tt>rcu_read_unlock()</tt> are visible to the compiler, +compilation will eventually fail due to exhausting memory, +mass storage, or user patience, whichever comes first. +If the nesting is not visible to the compiler, as is the case with +mutually recursive functions each in its own translation unit, +stack overflow will result. +If the nesting takes the form of loops, either the control variable +will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. +Nevertheless, this class of RCU implementations is one +of the most composable constructs in existence. + +<p> +RCU implementations that explicitly track nesting depth +are limited by the nesting-depth counter. +For example, the Linux kernel's preemptible RCU limits nesting to +<tt>INT_MAX</tt>. +This should suffice for almost all practical purposes. +That said, a consecutive pair of RCU read-side critical sections +between which there is an operation that waits for a grace period +cannot be enclosed in another RCU read-side critical section. +This is because it is not legal to wait for a grace period within +an RCU read-side critical section: To do so would result either +in deadlock or +in RCU implicitly splitting the enclosing RCU read-side critical +section, neither of which is conducive to a long-lived and prosperous +kernel. + +<p> +It is worth noting that RCU is not alone in limiting composability. +For example, many transactional-memory implementations prohibit +composing a pair of transactions separated by an irrevocable +operation (for example, a network receive operation). +For another example, lock-based critical sections can be composed +surprisingly freely, but only if deadlock is avoided. + +<p> +In short, although RCU read-side critical sections are highly composable, +care is required in some situations, just as is the case for any other +composable synchronization mechanism. + +<h3><a name="Corner Cases">Corner Cases</a></h3> + +<p> +A given RCU workload might have an endless and intense stream of +RCU read-side critical sections, perhaps even so intense that there +was never a point in time during which there was not at least one +RCU read-side critical section in flight. +RCU cannot allow this situation to block grace periods: As long as +all the RCU read-side critical sections are finite, grace periods +must also be finite. + +<p> +That said, preemptible RCU implementations could potentially result +in RCU read-side critical sections being preempted for long durations, +which has the effect of creating a long-duration RCU read-side +critical section. +This situation can arise only in heavily loaded systems, but systems using +real-time priorities are of course more vulnerable. +Therefore, RCU priority boosting is provided to help deal with this +case. +That said, the exact requirements on RCU priority boosting will likely +evolve as more experience accumulates. + +<p> +Other workloads might have very high update rates. +Although one can argue that such workloads should instead use +something other than RCU, the fact remains that RCU must +handle such workloads gracefully. +This requirement is another factor driving batching of grace periods, +but it is also the driving force behind the checks for large numbers +of queued RCU callbacks in the <tt>call_rcu()</tt> code path. +Finally, high update rates should not delay RCU read-side critical +sections, although some read-side delays can occur when using +<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use +of <tt>try_stop_cpus()</tt>. +(In the future, <tt>synchronize_rcu_expedited()</tt> will be +converted to use lighter-weight inter-processor interrupts (IPIs), +but this will still disturb readers, though to a much smaller degree.) + +<p> +Although all three of these corner cases were understood in the early +1990s, a simple user-level test consisting of <tt>close(open(path))</tt> +in a tight loop +in the early 2000s suddenly provided a much deeper appreciation of the +high-update-rate corner case. +This test also motivated addition of some RCU code to react to high update +rates, for example, if a given CPU finds itself with more than 10,000 +RCU callbacks queued, it will cause RCU to take evasive action by +more aggressively starting grace periods and more aggressively forcing +completion of grace-period processing. +This evasive action causes the grace period to complete more quickly, +but at the cost of restricting RCU's batching optimizations, thus +increasing the CPU overhead incurred by that grace period. + +<h2><a name="Software-Engineering Requirements"> +Software-Engineering Requirements</a></h2> + +<p> +Between Murphy's Law and “To err is human”, it is necessary to +guard against mishaps and misuse: + +<ol> +<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> + everywhere that it is needed, so kernels built with + <tt>CONFIG_PROVE_RCU=y</tt> will spat if + <tt>rcu_dereference()</tt> is used outside of an + RCU read-side critical section. + Update-side code can use <tt>rcu_dereference_protected()</tt>, + which takes a + <a href="https://lwn.net/Articles/371986/">lockdep expression</a> + to indicate what is providing the protection. + If the indicated protection is not provided, a lockdep splat + is emitted. + + <p> + Code shared between readers and updaters can use + <tt>rcu_dereference_check()</tt>, which also takes a + lockdep expression, and emits a lockdep splat if neither + <tt>rcu_read_lock()</tt> nor the indicated protection + is in place. + In addition, <tt>rcu_dereference_raw()</tt> is used in those + (hopefully rare) cases where the required protection cannot + be easily described. + Finally, <tt>rcu_read_lock_held()</tt> is provided to + allow a function to verify that it has been invoked within + an RCU read-side critical section. + I was made aware of this set of requirements shortly after Thomas + Gleixner audited a number of RCU uses. +<li> A given function might wish to check for RCU-related preconditions + upon entry, before using any other RCU API. + The <tt>rcu_lockdep_assert()</tt> does this job, + asserting the expression in kernels having lockdep enabled + and doing nothing otherwise. +<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> + and <tt>rcu_dereference()</tt>, perhaps (incorrectly) + substituting a simple assignment. + To catch this sort of error, a given RCU-protected pointer may be + tagged with <tt>__rcu</tt>, after which running sparse + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain + about simple-assignment accesses to that pointer. + Arnd Bergmann made me aware of this requirement, and also + supplied the needed + <a href="https://lwn.net/Articles/376011/">patch series</a>. +<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> + will splat if a data element is passed to <tt>call_rcu()</tt> + twice in a row, without a grace period in between. + (This error is similar to a double free.) + The corresponding <tt>rcu_head</tt> structures that are + dynamically allocated are automatically tracked, but + <tt>rcu_head</tt> structures allocated on the stack + must be initialized with <tt>init_rcu_head_on_stack()</tt> + and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. + Similarly, statically allocated non-stack <tt>rcu_head</tt> + structures must be initialized with <tt>init_rcu_head()</tt> + and cleaned up with <tt>destroy_rcu_head()</tt>. + Mathieu Desnoyers made me aware of this requirement, and also + supplied the needed + <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. +<li> An infinite loop in an RCU read-side critical section will + eventually trigger an RCU CPU stall warning splat, with + the duration of “eventually” being controlled by the + <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or, + alternatively, by the + <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs + parameter. + However, RCU is not obligated to produce this splat + unless there is a grace period waiting on that particular + RCU read-side critical section. + <p> + Some extreme workloads might intentionally delay + RCU grace periods, and systems running those workloads can + be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt> + to suppress the splats. + This kernel parameter may also be set via <tt>sysfs</tt>. + Furthermore, RCU CPU stall warnings are counter-productive + during sysrq dumps and during panics. + RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and + <tt>rcu_sysrq_end()</tt> API members to be called before + and after long sysrq dumps. + RCU also supplies the <tt>rcu_panic()</tt> notifier that is + automatically invoked at the beginning of a panic to suppress + further RCU CPU stall warnings. + + <p> + This requirement made itself known in the early 1990s, pretty + much the first time that it was necessary to debug a CPU stall. + That said, the initial implementation in DYNIX/ptx was quite + generic in comparison with that of Linux. +<li> Although it would be very good to detect pointers leaking out + of RCU read-side critical sections, there is currently no + good way of doing this. + One complication is the need to distinguish between pointers + leaking and pointers that have been handed off from RCU to + some other synchronization mechanism, for example, reference + counting. +<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related + information is provided via both debugfs and event tracing. +<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and + <tt>rcu_dereference()</tt> to create typical linked + data structures can be surprisingly error-prone. + Therefore, RCU-protected + <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> + and, more recently, RCU-protected + <a href="https://lwn.net/Articles/612100/">hash tables</a> + are available. + Many other special-purpose RCU-protected data structures are + available in the Linux kernel and the userspace RCU library. +<li> Some linked structures are created at compile time, but still + require <tt>__rcu</tt> checking. + The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this + purpose. +<li> It is not necessary to use <tt>rcu_assign_pointer()</tt> + when creating linked structures that are to be published via + a single external pointer. + The <tt>RCU_INIT_POINTER()</tt> macro is provided for + this task and also for assigning <tt>NULL</tt> pointers + at runtime. +</ol> + +<p> +This not a hard-and-fast list: RCU's diagnostic capabilities will +continue to be guided by the number and type of usage bugs found +in real-world RCU usage. + +<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> + +<p> +The Linux kernel provides an interesting environment for all kinds of +software, including RCU. +Some of the relevant points of interest are as follows: + +<ol> +<li> <a href="#Configuration">Configuration</a>. +<li> <a href="#Firmware Interface">Firmware Interface</a>. +<li> <a href="#Early Boot">Early Boot</a>. +<li> <a href="#Interrupts and NMIs"> + Interrupts and non-maskable interrupts (NMIs)</a>. +<li> <a href="#Loadable Modules">Loadable Modules</a>. +<li> <a href="#Hotplug CPU">Hotplug CPU</a>. +<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. +<li> <a href="#Tracing and RCU">Tracing and RCU</a>. +<li> <a href="#Energy Efficiency">Energy Efficiency</a>. +<li> <a href="#Memory Efficiency">Memory Efficiency</a>. +<li> <a href="#Performance, Scalability, Response Time, and Reliability"> + Performance, Scalability, Response Time, and Reliability</a>. +</ol> + +<p> +This list is probably incomplete, but it does give a feel for the +most notable Linux-kernel complications. +Each of the following sections covers one of the above topics. + +<h3><a name="Configuration">Configuration</a></h3> + +<p> +RCU's goal is automatic configuration, so that almost nobody +needs to worry about RCU's <tt>Kconfig</tt> options. +And for almost all users, RCU does in fact work well +“out of the box.” + +<p> +However, there are specialized use cases that are handled by +kernel boot parameters and <tt>Kconfig</tt> options. +Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users +about new <tt>Kconfig</tt> options, which requires almost all of them +be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. + +<p> +This all should be quite obvious, but the fact remains that +Linus Torvalds recently had to +<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> +me of this requirement. + +<h3><a name="Firmware Interface">Firmware Interface</a></h3> + +<p> +In many cases, kernel obtains information about the system from the +firmware, and sometimes things are lost in translation. +Or the translation is accurate, but the original message is bogus. + +<p> +For example, some systems' firmware overreports the number of CPUs, +sometimes by a large factor. +If RCU naively believed the firmware, as it used to do, +it would create too many per-CPU kthreads. +Although the resulting system will still run correctly, the extra +kthreads needlessly consume memory and can cause confusion +when they show up in <tt>ps</tt> listings. + +<p> +RCU must therefore wait for a given CPU to actually come online before +it can allow itself to believe that the CPU actually exists. +The resulting “ghost CPUs” (which are never going to +come online) cause a number of +<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. + +<h3><a name="Early Boot">Early Boot</a></h3> + +<p> +The Linux kernel's boot sequence is an interesting process, +and RCU is used early, even before <tt>rcu_init()</tt> +is invoked. +In fact, a number of RCU's primitives can be used as soon as the +initial task's <tt>task_struct</tt> is available and the +boot CPU's per-CPU variables are set up. +The read-side primitives (<tt>rcu_read_lock()</tt>, +<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, +and <tt>rcu_access_pointer()</tt>) will operate normally very early on, +as will <tt>rcu_assign_pointer()</tt>. + +<p> +Although <tt>call_rcu()</tt> may be invoked at any +time during boot, callbacks are not guaranteed to be invoked until after +the scheduler is fully up and running. +This delay in callback invocation is due to the fact that RCU does not +invoke callbacks until it is fully initialized, and this full initialization +cannot occur until after the scheduler has initialized itself to the +point where RCU can spawn and run its kthreads. +In theory, it would be possible to invoke callbacks earlier, +however, this is not a panacea because there would be severe restrictions +on what operations those callbacks could invoke. + +<p> +Perhaps surprisingly, <tt>synchronize_rcu()</tt>, +<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> +(<a href="#Bottom-Half Flavor">discussed below</a>), +and +<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> +will all operate normally +during very early boot, the reason being that there is only one CPU +and preemption is disabled. +This means that the call <tt>synchronize_rcu()</tt> (or friends) +itself is a quiescent +state and thus a grace period, so the early-boot implementation can +be a no-op. + +<p> +Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> +continue to operate normally through the remainder of boot, courtesy +of the fact that preemption is disabled across their RCU read-side +critical sections and also courtesy of the fact that there is still +only one CPU. +However, once the scheduler starts initializing, preemption is enabled. +There is still only a single CPU, but the fact that preemption is enabled +means that the no-op implementation of <tt>synchronize_rcu()</tt> no +longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. +Therefore, as soon as the scheduler starts initializing, the early-boot +fastpath is disabled. +This means that <tt>synchronize_rcu()</tt> switches to its runtime +mode of operation where it posts callbacks, which in turn means that +any call to <tt>synchronize_rcu()</tt> will block until the corresponding +callback is invoked. +Unfortunately, the callback cannot be invoked until RCU's runtime +grace-period machinery is up and running, which cannot happen until +the scheduler has initialized itself sufficiently to allow RCU's +kthreads to be spawned. +Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler +initialization can result in deadlock. + +<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a> +So what happens with <tt>synchronize_rcu()</tt> during +scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> +kernels? +<br><a href="#qq14answer">Answer</a> + +<p> +I learned of these boot-time requirements as a result of a series of +system hangs. + +<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> + +<p> +The Linux kernel has interrupts, and RCU read-side critical sections are +legal within interrupt handlers and within interrupt-disabled regions +of code, as are invocations of <tt>call_rcu()</tt>. + +<p> +Some Linux-kernel architectures can enter an interrupt handler from +non-idle process context, and then just never leave it, instead stealthily +transitioning back to process context. +This trick is sometimes used to invoke system calls from inside the kernel. +These “half-interrupts” mean that RCU has to be very careful +about how it counts interrupt nesting levels. +I learned of this requirement the hard way during a rewrite +of RCU's dyntick-idle code. + +<p> +The Linux kernel has non-maskable interrupts (NMIs), and +RCU read-side critical sections are legal within NMI handlers. +Thankfully, RCU update-side primitives, including +<tt>call_rcu()</tt>, are prohibited within NMI handlers. + +<p> +The name notwithstanding, some Linux-kernel architectures +can have nested NMIs, which RCU must handle correctly. +Andy Lutomirski +<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> +with this requirement; +he also kindly surprised me with +<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> +that meets this requirement. + +<h3><a name="Loadable Modules">Loadable Modules</a></h3> + +<p> +The Linux kernel has loadable modules, and these modules can +also be unloaded. +After a given module has been unloaded, any attempt to call +one of its functions results in a segmentation fault. +The module-unload functions must therefore cancel any +delayed calls to loadable-module functions, for example, +any outstanding <tt>mod_timer()</tt> must be dealt with +via <tt>del_timer_sync()</tt> or similar. + +<p> +Unfortunately, there is no way to cancel an RCU callback; +once you invoke <tt>call_rcu()</tt>, the callback function is +going to eventually be invoked, unless the system goes down first. +Because it is normally considered socially irresponsible to crash the system +in response to a module unload request, we need some other way +to deal with in-flight RCU callbacks. + +<p> +RCU therefore provides +<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, +which waits until all in-flight RCU callbacks have been invoked. +If a module uses <tt>call_rcu()</tt>, its exit function should therefore +prevent any future invocation of <tt>call_rcu()</tt>, then invoke +<tt>rcu_barrier()</tt>. +In theory, the underlying module-unload code could invoke +<tt>rcu_barrier()</tt> unconditionally, but in practice this would +incur unacceptable latencies. + +<p> +Nikita Danilov noted this requirement for an analogous filesystem-unmount +situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. +The need for <tt>rcu_barrier()</tt> for module unloading became +apparent later. + +<h3><a name="Hotplug CPU">Hotplug CPU</a></h3> + +<p> +The Linux kernel supports CPU hotplug, which means that CPUs +can come and go. +It is of course illegal to use any RCU API member from an offline CPU. +This requirement was present from day one in DYNIX/ptx, but +on the other hand, the Linux kernel's CPU-hotplug implementation +is “interesting.” + +<p> +The Linux-kernel CPU-hotplug implementation has notifiers that +are used to allow the various kernel subsystems (including RCU) +to respond appropriately to a given CPU-hotplug operation. +Most RCU operations may be invoked from CPU-hotplug notifiers, +including even normal synchronous grace-period operations +such as <tt>synchronize_rcu()</tt>. +However, expedited grace-period operations such as +<tt>synchronize_rcu_expedited()</tt> are not supported, +due to the fact that current implementations block CPU-hotplug +operations, which could result in deadlock. + +<p> +In addition, all-callback-wait operations such as +<tt>rcu_barrier()</tt> are also not supported, due to the +fact that there are phases of CPU-hotplug operations where +the outgoing CPU's callbacks will not be invoked until after +the CPU-hotplug operation ends, which could also result in deadlock. + +<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> + +<p> +RCU depends on the scheduler, and the scheduler uses RCU to +protect some of its data structures. +This means the scheduler is forbidden from acquiring +the runqueue locks and the priority-inheritance locks +in the middle of an outermost RCU read-side critical section unless either +(1) it releases them before exiting that same +RCU read-side critical section, or +(2) interrupts are disabled across +that entire RCU read-side critical section. +This same prohibition also applies (recursively!) to any lock that is acquired +while holding any lock to which this prohibition applies. +Adhering to this rule prevents preemptible RCU from invoking +<tt>rcu_read_unlock_special()</tt> while either runqueue or +priority-inheritance locks are held, thus avoiding deadlock. + +<p> +Prior to v4.4, it was only necessary to disable preemption across +RCU read-side critical sections that acquired scheduler locks. +In v4.4, expedited grace periods started using IPIs, and these +IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath. +Therefore, this expedited-grace-period change required disabling of +interrupts, not just preemption. + +<p> +For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> +implementation must be written carefully to avoid similar deadlocks. +In particular, <tt>rcu_read_unlock()</tt> must tolerate an +interrupt where the interrupt handler invokes both +<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. +This possibility requires <tt>rcu_read_unlock()</tt> to use +negative nesting levels to avoid destructive recursion via +interrupt handler's use of RCU. + +<p> +This pair of mutual scheduler-RCU requirements came as a +<a href="https://lwn.net/Articles/453002/">complete surprise</a>. + +<p> +As noted above, RCU makes use of kthreads, and it is necessary to +avoid excessive CPU-time accumulation by these kthreads. +This requirement was no surprise, but RCU's violation of it +when running context-switch-heavy workloads when built with +<tt>CONFIG_NO_HZ_FULL=y</tt> +<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. +RCU has made good progress towards meeting this requirement, even +for context-switch-have <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, +but there is room for further improvement. + +<h3><a name="Tracing and RCU">Tracing and RCU</a></h3> + +<p> +It is possible to use tracing on RCU code, but tracing itself +uses RCU. +For this reason, <tt>rcu_dereference_raw_notrace()</tt> +is provided for use by tracing, which avoids the destructive +recursion that could otherwise ensue. +This API is also used by virtualization in some architectures, +where RCU readers execute in environments in which tracing +cannot be used. +The tracing folks both located the requirement and provided the +needed fix, so this surprise requirement was relatively painless. + +<h3><a name="Energy Efficiency">Energy Efficiency</a></h3> + +<p> +Interrupting idle CPUs is considered socially unacceptable, +especially by people with battery-powered embedded systems. +RCU therefore conserves energy by detecting which CPUs are +idle, including tracking CPUs that have been interrupted from idle. +This is a large part of the energy-efficiency requirement, +so I learned of this via an irate phone call. + +<p> +Because RCU avoids interrupting idle CPUs, it is illegal to +execute an RCU read-side critical section on an idle CPU. +(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat +if you try it.) +The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> +event tracing is provided to work around this restriction. +In addition, <tt>rcu_is_watching()</tt> may be used to +test whether or not it is currently legal to run RCU read-side +critical sections on this CPU. +I learned of the need for diagnostics on the one hand +and <tt>RCU_NONIDLE()</tt> on the other while inspecting +idle-loop code. +Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, +which is used quite heavily in the idle loop. + +<p> +It is similarly socially unacceptable to interrupt an +<tt>nohz_full</tt> CPU running in userspace. +RCU must therefore track <tt>nohz_full</tt> userspace +execution. +And in +<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> +kernels, RCU must separately track idle CPUs on the one hand and +CPUs that are either idle or executing in userspace on the other. +In both cases, RCU must be able to sample state at two points in +time, and be able to determine whether or not some other CPU spent +any time idle and/or executing in userspace. + +<p> +These energy-efficiency requirements have proven quite difficult to +understand and to meet, for example, there have been more than five +clean-sheet rewrites of RCU's energy-efficiency code, the last of +which was finally able to demonstrate +<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. +As noted earlier, +I learned of many of these requirements via angry phone calls: +Flaming me on the Linux-kernel mailing list was apparently not +sufficient to fully vent their ire at RCU's energy-efficiency bugs! + +<h3><a name="Memory Efficiency">Memory Efficiency</a></h3> + +<p> +Although small-memory non-realtime systems can simply use Tiny RCU, +code size is only one aspect of memory efficiency. +Another aspect is the size of the <tt>rcu_head</tt> structure +used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>. +Although this structure contains nothing more than a pair of pointers, +it does appear in many RCU-protected data structures, including +some that are size critical. +The <tt>page</tt> structure is a case in point, as evidenced by +the many occurrences of the <tt>union</tt> keyword within that structure. + +<p> +This need for memory efficiency is one reason that RCU uses hand-crafted +singly linked lists to track the <tt>rcu_head</tt> structures that +are waiting for a grace period to elapse. +It is also the reason why <tt>rcu_head</tt> structures do not contain +debug information, such as fields tracking the file and line of the +<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them. +Although this information might appear in debug-only kernel builds at some +point, in the meantime, the <tt>->func</tt> field will often provide +the needed debug information. + +<p> +However, in some cases, the need for memory efficiency leads to even +more extreme measures. +Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field +shares storage with a great many other structures that are used at +various points in the corresponding page's lifetime. +In order to correctly resolve certain +<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>, +the Linux kernel's memory-management subsystem needs a particular bit +to remain zero during all phases of grace-period processing, +and that bit happens to map to the bottom bit of the +<tt>rcu_head</tt> structure's <tt>->next</tt> field. +RCU makes this guarantee as long as <tt>call_rcu()</tt> +is used to post the callback, as opposed to <tt>kfree_rcu()</tt> +or some future “lazy” +variant of <tt>call_rcu()</tt> that might one day be created for +energy-efficiency purposes. + +<h3><a name="Performance, Scalability, Response Time, and Reliability"> +Performance, Scalability, Response Time, and Reliability</a></h3> + +<p> +Expanding on the +<a href="#Performance and Scalability">earlier discussion</a>, +RCU is used heavily by hot code paths in performance-critical +portions of the Linux kernel's networking, security, virtualization, +and scheduling code paths. +RCU must therefore use efficient implementations, especially in its +read-side primitives. +To that end, it would be good if preemptible RCU's implementation +of <tt>rcu_read_lock()</tt> could be inlined, however, doing +this requires resolving <tt>#include</tt> issues with the +<tt>task_struct</tt> structure. + +<p> +The Linux kernel supports hardware configurations with up to +4096 CPUs, which means that RCU must be extremely scalable. +Algorithms that involve frequent acquisitions of global locks or +frequent atomic operations on global variables simply cannot be +tolerated within the RCU implementation. +RCU therefore makes heavy use of a combining tree based on the +<tt>rcu_node</tt> structure. +RCU is required to tolerate all CPUs continuously invoking any +combination of RCU's runtime primitives with minimal per-operation +overhead. +In fact, in many cases, increasing load must <i>decrease</i> the +per-operation overhead, witness the batching optimizations for +<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, +<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. +As a general rule, RCU must cheerfully accept whatever the +rest of the Linux kernel decides to throw at it. + +<p> +The Linux kernel is used for real-time workloads, especially +in conjunction with the +<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. +The real-time-latency response requirements are such that the +traditional approach of disabling preemption across RCU +read-side critical sections is inappropriate. +Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore +use an RCU implementation that allows RCU read-side critical +sections to be preempted. +This requirement made its presence known after users made it +clear that an earlier +<a href="https://lwn.net/Articles/107930/">real-time patch</a> +did not meet their needs, in conjunction with some +<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> +encountered by a very early version of the -rt patchset. + +<p> +In addition, RCU must make do with a sub-100-microsecond real-time latency +budget. +In fact, on smaller systems with the -rt patchset, the Linux kernel +provides sub-20-microsecond real-time latencies for the whole kernel, +including RCU. +RCU's scalability and latency must therefore be sufficient for +these sorts of configurations. +To my surprise, the sub-100-microsecond real-time latency budget +<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> +applies to even the largest systems [PDF]</a>, +up to and including systems with 4096 CPUs. +This real-time requirement motivated the grace-period kthread, which +also simplified handling of a number of race conditions. + +<p> +Finally, RCU's status as a synchronization primitive means that +any RCU failure can result in arbitrary memory corruption that can be +extremely difficult to debug. +This means that RCU must be extremely reliable, which in +practice also means that RCU must have an aggressive stress-test +suite. +This stress-test suite is called <tt>rcutorture</tt>. + +<p> +Although the need for <tt>rcutorture</tt> was no surprise, +the current immense popularity of the Linux kernel is posing +interesting—and perhaps unprecedented—validation +challenges. +To see this, keep in mind that there are well over one billion +instances of the Linux kernel running today, given Android +smartphones, Linux-powered televisions, and servers. +This number can be expected to increase sharply with the advent of +the celebrated Internet of Things. + +<p> +Suppose that RCU contains a race condition that manifests on average +once per million years of runtime. +This bug will be occurring about three times per <i>day</i> across +the installed base. +RCU could simply hide behind hardware error rates, given that no one +should really expect their smartphone to last for a million years. +However, anyone taking too much comfort from this thought should +consider the fact that in most jurisdictions, a successful multi-year +test of a given mechanism, which might include a Linux kernel, +suffices for a number of types of safety-critical certifications. +In fact, rumor has it that the Linux kernel is already being used +in production for safety-critical applications. +I don't know about you, but I would feel quite bad if a bug in RCU +killed someone. +Which might explain my recent focus on validation and verification. + +<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> + +<p> +One of the more surprising things about RCU is that there are now +no fewer than five <i>flavors</i>, or API families. +In addition, the primary flavor that has been the sole focus up to +this point has two different implementations, non-preemptible and +preemptible. +The other four flavors are listed below, with requirements for each +described in a separate section. + +<ol> +<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> +<li> <a href="#Sched Flavor">Sched Flavor</a> +<li> <a href="#Sleepable RCU">Sleepable RCU</a> +<li> <a href="#Tasks RCU">Tasks RCU</a> +</ol> + +<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> + +<p> +The softirq-disable (AKA “bottom-half”, +hence the “_bh” abbreviations) +flavor of RCU, or <i>RCU-bh</i>, was developed by +Dipankar Sarma to provide a flavor of RCU that could withstand the +network-based denial-of-service attacks researched by Robert +Olsson. +These attacks placed so much networking load on the system +that some of the CPUs never exited softirq execution, +which in turn prevented those CPUs from ever executing a context switch, +which, in the RCU implementation of that time, prevented grace periods +from ever ending. +The result was an out-of-memory condition and a system hang. + +<p> +The solution was the creation of RCU-bh, which does +<tt>local_bh_disable()</tt> +across its read-side critical sections, and which uses the transition +from one type of softirq processing to another as a quiescent state +in addition to context switch, idle, user mode, and offline. +This means that RCU-bh grace periods can complete even when some of +the CPUs execute in softirq indefinitely, thus allowing algorithms +based on RCU-bh to withstand network-based denial-of-service attacks. + +<p> +Because +<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> +disable and re-enable softirq handlers, any attempt to start a softirq +handlers during the +RCU-bh read-side critical section will be deferred. +In this case, <tt>rcu_read_unlock_bh()</tt> +will invoke softirq processing, which can take considerable time. +One can of course argue that this softirq overhead should be associated +with the code following the RCU-bh read-side critical section rather +than <tt>rcu_read_unlock_bh()</tt>, but the fact +is that most profiling tools cannot be expected to make this sort +of fine distinction. +For example, suppose that a three-millisecond-long RCU-bh read-side +critical section executes during a time of heavy networking load. +There will very likely be an attempt to invoke at least one softirq +handler during that three milliseconds, but any such invocation will +be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. +This can of course make it appear at first glance as if +<tt>rcu_read_unlock_bh()</tt> was executing very slowly. + +<p> +The +<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> +includes +<tt>rcu_read_lock_bh()</tt>, +<tt>rcu_read_unlock_bh()</tt>, +<tt>rcu_dereference_bh()</tt>, +<tt>rcu_dereference_bh_check()</tt>, +<tt>synchronize_rcu_bh()</tt>, +<tt>synchronize_rcu_bh_expedited()</tt>, +<tt>call_rcu_bh()</tt>, +<tt>rcu_barrier_bh()</tt>, and +<tt>rcu_read_lock_bh_held()</tt>. + +<h3><a name="Sched Flavor">Sched Flavor</a></h3> + +<p> +Before preemptible RCU, waiting for an RCU grace period had the +side effect of also waiting for all pre-existing interrupt +and NMI handlers. +However, there are legitimate preemptible-RCU implementations that +do not have this property, given that any point in the code outside +of an RCU read-side critical section can be a quiescent state. +Therefore, <i>RCU-sched</i> was created, which follows “classic” +RCU in that an RCU-sched grace period waits for for pre-existing +interrupt and NMI handlers. +In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched +APIs have identical implementations, while kernels built with +<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. + +<p> +Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, +<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> +disable and re-enable preemption, respectively. +This means that if there was a preemption attempt during the +RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> +will enter the scheduler, with all the latency and overhead entailed. +Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look +as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly. +However, the highest-priority task won't be preempted, so that task +will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. + +<p> +The +<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> +includes +<tt>rcu_read_lock_sched()</tt>, +<tt>rcu_read_unlock_sched()</tt>, +<tt>rcu_read_lock_sched_notrace()</tt>, +<tt>rcu_read_unlock_sched_notrace()</tt>, +<tt>rcu_dereference_sched()</tt>, +<tt>rcu_dereference_sched_check()</tt>, +<tt>synchronize_sched()</tt>, +<tt>synchronize_rcu_sched_expedited()</tt>, +<tt>call_rcu_sched()</tt>, +<tt>rcu_barrier_sched()</tt>, and +<tt>rcu_read_lock_sched_held()</tt>. +However, anything that disables preemption also marks an RCU-sched +read-side critical section, including +<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, +<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, +and so on. + +<h3><a name="Sleepable RCU">Sleepable RCU</a></h3> + +<p> +For well over a decade, someone saying “I need to block within +an RCU read-side critical section” was a reliable indication +that this someone did not understand RCU. +After all, if you are always blocking in an RCU read-side critical +section, you can probably afford to use a higher-overhead synchronization +mechanism. +However, that changed with the advent of the Linux kernel's notifiers, +whose RCU read-side critical +sections almost never sleep, but sometimes need to. +This resulted in the introduction of +<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, +or <i>SRCU</i>. + +<p> +SRCU allows different domains to be defined, with each such domain +defined by an instance of an <tt>srcu_struct</tt> structure. +A pointer to this structure must be passed in to each SRCU function, +for example, <tt>synchronize_srcu(&ss)</tt>, where +<tt>ss</tt> is the <tt>srcu_struct</tt> structure. +The key benefit of these domains is that a slow SRCU reader in one +domain does not delay an SRCU grace period in some other domain. +That said, one consequence of these domains is that read-side code +must pass a “cookie” from <tt>srcu_read_lock()</tt> +to <tt>srcu_read_unlock()</tt>, for example, as follows: + +<blockquote> +<pre> + 1 int idx; + 2 + 3 idx = srcu_read_lock(&ss); + 4 do_something(); + 5 srcu_read_unlock(&ss, idx); +</pre> +</blockquote> + +<p> +As noted above, it is legal to block within SRCU read-side critical sections, +however, with great power comes great responsibility. +If you block forever in one of a given domain's SRCU read-side critical +sections, then that domain's grace periods will also be blocked forever. +Of course, one good way to block forever is to deadlock, which can +happen if any operation in a given domain's SRCU read-side critical +section can block waiting, either directly or indirectly, for that domain's +grace period to elapse. +For example, this results in a self-deadlock: + +<blockquote> +<pre> + 1 int idx; + 2 + 3 idx = srcu_read_lock(&ss); + 4 do_something(); + 5 synchronize_srcu(&ss); + 6 srcu_read_unlock(&ss, idx); +</pre> +</blockquote> + +<p> +However, if line 5 acquired a mutex that was held across +a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, +deadlock would still be possible. +Furthermore, if line 5 acquired a mutex that was held across +a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, +and if an <tt>ss1</tt>-domain SRCU read-side critical section +acquired another mutex that was held across as <tt>ss</tt>-domain +<tt>synchronize_srcu()</tt>, +deadlock would again be possible. +Such a deadlock cycle could extend across an arbitrarily large number +of different SRCU domains. +Again, with great power comes great responsibility. + +<p> +Unlike the other RCU flavors, SRCU read-side critical sections can +run on idle and even offline CPUs. +This ability requires that <tt>srcu_read_lock()</tt> and +<tt>srcu_read_unlock()</tt> contain memory barriers, which means +that SRCU readers will run a bit slower than would RCU readers. +It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> +API, which, in combination with <tt>srcu_read_unlock()</tt>, +guarantees a full memory barrier. + +<p> +The +<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> +includes +<tt>srcu_read_lock()</tt>, +<tt>srcu_read_unlock()</tt>, +<tt>srcu_dereference()</tt>, +<tt>srcu_dereference_check()</tt>, +<tt>synchronize_srcu()</tt>, +<tt>synchronize_srcu_expedited()</tt>, +<tt>call_srcu()</tt>, +<tt>srcu_barrier()</tt>, and +<tt>srcu_read_lock_held()</tt>. +It also includes +<tt>DEFINE_SRCU()</tt>, +<tt>DEFINE_STATIC_SRCU()</tt>, and +<tt>init_srcu_struct()</tt> +APIs for defining and initializing <tt>srcu_struct</tt> structures. + +<h3><a name="Tasks RCU">Tasks RCU</a></h3> + +<p> +Some forms of tracing use “tramopolines” to handle the +binary rewriting required to install different types of probes. +It would be good to be able to free old trampolines, which sounds +like a job for some form of RCU. +However, because it is necessary to be able to install a trace +anywhere in the code, it is not possible to use read-side markers +such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. +In addition, it does not work to have these markers in the trampoline +itself, because there would need to be instructions following +<tt>rcu_read_unlock()</tt>. +Although <tt>synchronize_rcu()</tt> would guarantee that execution +reached the <tt>rcu_read_unlock()</tt>, it would not be able to +guarantee that execution had completely left the trampoline. + +<p> +The solution, in the form of +<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, +is to have implicit +read-side critical sections that are delimited by voluntary context +switches, that is, calls to <tt>schedule()</tt>, +<tt>cond_resched_rcu_qs()</tt>, and +<tt>synchronize_rcu_tasks()</tt>. +In addition, transitions to and from userspace execution also delimit +tasks-RCU read-side critical sections. + +<p> +The tasks-RCU API is quite compact, consisting only of +<tt>call_rcu_tasks()</tt>, +<tt>synchronize_rcu_tasks()</tt>, and +<tt>rcu_barrier_tasks()</tt>. + +<h2><a name="Possible Future Changes">Possible Future Changes</a></h2> + +<p> +One of the tricks that RCU uses to attain update-side scalability is +to increase grace-period latency with increasing numbers of CPUs. +If this becomes a serious problem, it will be necessary to rework the +grace-period state machine so as to avoid the need for the additional +latency. + +<p> +Expedited grace periods scan the CPUs, so their latency and overhead +increases with increasing numbers of CPUs. +If this becomes a serious problem on large systems, it will be necessary +to do some redesign to avoid this scalability problem. + +<p> +RCU disables CPU hotplug in a few places, perhaps most notably in the +expedited grace-period and <tt>rcu_barrier()</tt> operations. +If there is a strong reason to use expedited grace periods in CPU-hotplug +notifiers, it will be necessary to avoid disabling CPU hotplug. +This would introduce some complexity, so there had better be a <i>very</i> +good reason. + +<p> +The tradeoff between grace-period latency on the one hand and interruptions +of other CPUs on the other hand may need to be re-examined. +The desire is of course for zero grace-period latency as well as zero +interprocessor interrupts undertaken during an expedited grace period +operation. +While this ideal is unlikely to be achievable, it is quite possible that +further improvements can be made. + +<p> +The multiprocessor implementations of RCU use a combining tree that +groups CPUs so as to reduce lock contention and increase cache locality. +However, this combining tree does not spread its memory across NUMA +nodes nor does it align the CPU groups with hardware features such +as sockets or cores. +Such spreading and alignment is currently believed to be unnecessary +because the hotpath read-side primitives do not access the combining +tree, nor does <tt>call_rcu()</tt> in the common case. +If you believe that your architecture needs such spreading and alignment, +then your architecture should also benefit from the +<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set +to the number of CPUs in a socket, NUMA node, or whatever. +If the number of CPUs is too large, use a fraction of the number of +CPUs. +If the number of CPUs is a large prime number, well, that certainly +is an “interesting” architectural choice! +More flexible arrangements might be considered, but only if +<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only +if the inadequacy has been demonstrated by a carefully run and +realistic system-level workload. + +<p> +Please note that arrangements that require RCU to remap CPU numbers will +require extremely good demonstration of need and full exploration of +alternatives. + +<p> +There is an embarrassingly large number of flavors of RCU, and this +number has been increasing over time. +Perhaps it will be possible to combine some at some future date. + +<p> +RCU's various kthreads are reasonably recent additions. +It is quite likely that adjustments will be required to more gracefully +handle extreme loads. +It might also be necessary to be able to relate CPU utilization by +RCU's kthreads and softirq handlers to the code that instigated this +CPU utilization. +For example, RCU callback overhead might be charged back to the +originating <tt>call_rcu()</tt> instance, though probably not +in production kernels. + +<h2><a name="Summary">Summary</a></h2> + +<p> +This document has presented more than two decade's worth of RCU +requirements. +Given that the requirements keep changing, this will not be the last +word on this subject, but at least it serves to get an important +subset of the requirements set forth. + +<h2><a name="Acknowledgments">Acknowledgments</a></h2> + +I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, +Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and +Andy Lutomirski for their help in rendering +this article human readable, and to Michelle Rankin for her support +of this effort. +Other contributions are acknowledged in the Linux kernel's git archive. +The cartoon is copyright (c) 2013 by Melissa Broussard, +and is provided +under the terms of the Creative Commons Attribution-Share Alike 3.0 +United States license. + +<h3><a name="Answers to Quick Quizzes"> +Answers to Quick Quizzes</a></h3> + +<a name="qq1answer"></a> +<p><b>Quick Quiz 1</b>: +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +<tt>synchronize_rcu()</tt>!!! +Just who are you trying to fool??? + + +</p><p><b>Answer</b>: +First, if updaters do not wish to be blocked by readers, they can use +<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will +be discussed later. +Second, even when using <tt>synchronize_rcu()</tt>, the other +update-side code does run concurrently with readers, whether pre-existing +or not. + + +</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a> + +<a name="qq2answer"></a> +<p><b>Quick Quiz 2</b>: +Why is the <tt>synchronize_rcu()</tt> on line 28 needed? + + +</p><p><b>Answer</b>: +Without that extra grace period, memory reordering could result in +<tt>do_something_dlm()</tt> executing <tt>do_something()</tt> +concurrently with the last bits of <tt>recovery()</tt>. + + +</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a> + +<a name="qq3answer"></a> +<p><b>Quick Quiz 3</b>: +But <tt>rcu_assign_pointer()</tt> does nothing to prevent the +two assignments to <tt>p->a</tt> and <tt>p->b</tt> +from being reordered. +Can't that also cause problems? + + +</p><p><b>Answer</b>: +No, it cannot. +The readers cannot see either of these two fields until +the assignment to <tt>gp</tt>, by which time both fields are +fully initialized. +So reordering the assignments +to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly +cause any problems. + + +</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a> + +<a name="qq4answer"></a> +<p><b>Quick Quiz 4</b>: +Without the <tt>rcu_dereference()</tt> or the +<tt>rcu_access_pointer()</tt>, what destructive optimizations +might the compiler make use of? + + +</p><p><b>Answer</b>: +Let's start with what happens to <tt>do_something_gp()</tt> +if it fails to use <tt>rcu_dereference()</tt>. +It could reuse a value formerly fetched from this same pointer. +It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time +manner, resulting in <i>load tearing</i>, in turn resulting a bytewise +mash-up of two distince pointer values. +It might even use value-speculation optimizations, where it makes a wrong +guess, but by the time it gets around to checking the value, an update +has changed the pointer to match the wrong guess. +Too bad about any dereferences that returned pre-initialization garbage +in the meantime! + +<p> +For <tt>remove_gp_synchronous()</tt>, as long as all modifications +to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, +the above optimizations are harmless. +However, +with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, +<tt>sparse</tt> will complain if you +define <tt>gp</tt> with <tt>__rcu</tt> and then +access it without using +either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. + + +</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a> + +<a name="qq5answer"></a> +<p><b>Quick Quiz 5</b>: +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of <tt>synchronize_rcu()</tt>? + + +</p><p><b>Answer</b>: +If RCU cannot tell whether or not a given +RCU read-side critical section starts before a +given instance of <tt>synchronize_rcu()</tt>, +then it must assume that the RCU read-side critical section +started first. +In other words, a given instance of <tt>synchronize_rcu()</tt> +can avoid waiting on a given RCU read-side critical section only +if it can prove that <tt>synchronize_rcu()</tt> started first. + + +</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a> + +<a name="qq6answer"></a> +<p><b>Quick Quiz 6</b>: +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers <i> really</i> required? + + +</p><p><b>Answer</b>: +Yes, they really are required. +To see why the first guarantee is required, consider the following +sequence of events: + +<ol> +<li> CPU 1: <tt>rcu_read_lock()</tt> +<li> CPU 1: <tt>q = rcu_dereference(gp); + /* Very likely to return p. */</tt> +<li> CPU 0: <tt>list_del_rcu(p);</tt> +<li> CPU 0: <tt>synchronize_rcu()</tt> starts. +<li> CPU 1: <tt>do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */</tt> +<li> CPU 1: <tt>rcu_read_unlock()</tt> +<li> CPU 0: <tt>synchronize_rcu()</tt> returns. +<li> CPU 0: <tt>kfree(p);</tt> +</ol> + +<p> +Therefore, there absolutely must be a full memory barrier between the +end of the RCU read-side critical section and the end of the +grace period. + +<p> +The sequence of events demonstrating the necessity of the second rule +is roughly similar: + +<ol> +<li> CPU 0: <tt>list_del_rcu(p);</tt> +<li> CPU 0: <tt>synchronize_rcu()</tt> starts. +<li> CPU 1: <tt>rcu_read_lock()</tt> +<li> CPU 1: <tt>q = rcu_dereference(gp); + /* Might return p if no memory barrier. */</tt> +<li> CPU 0: <tt>synchronize_rcu()</tt> returns. +<li> CPU 0: <tt>kfree(p);</tt> +<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> +<li> CPU 1: <tt>rcu_read_unlock()</tt> +</ol> + +<p> +And similarly, without a memory barrier between the beginning of the +grace period and the beginning of the RCU read-side critical section, +CPU 1 might end up accessing the freelist. + +<p> +The “as if” rule of course applies, so that any implementation +that acts as if the appropriate memory barriers were in place is a +correct implementation. +That said, it is much easier to fool yourself into believing that you have +adhered to the as-if rule than it is to actually adhere to it! + + +</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a> + +<a name="qq7answer"></a> +<p><b>Quick Quiz 7</b>: +But how does the upgrade-to-write operation exclude other readers? + + +</p><p><b>Answer</b>: +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. + + +</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a> + +<a name="qq8answer"></a> +<p><b>Quick Quiz 8</b>: +Can't the compiler also reorder this code? + + +</p><p><b>Answer</b>: +No, the volatile casts in <tt>READ_ONCE()</tt> and +<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in +this particular case. + + +</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a> + +<a name="qq9answer"></a> +<p><b>Quick Quiz 9</b>: +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? + + +</p><p><b>Answer</b>: +No. +Even if <tt>synchronize_rcu()</tt> were to wait until +all readers had completed, a new reader might start immediately after +<tt>synchronize_rcu()</tt> completed. +Therefore, the code following +<tt>synchronize_rcu()</tt> cannot rely on there being no readers +in any case. + + +</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a> + +<a name="qq10answer"></a> +<p><b>Quick Quiz 10</b>: +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? + + +</p><p><b>Answer</b>: +In theory, an infinite number. +In practice, an unknown number that is sensitive to both implementation +details and timing considerations. +Therefore, even in practice, RCU users must abide by the theoretical rather +than the practical answer. + + +</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a> + +<a name="qq11answer"></a> +<p><b>Quick Quiz 11</b>: +What about sleeping locks? + + +</p><p><b>Answer</b>: +These are forbidden within Linux-kernel RCU read-side critical sections +because it is not legal to place a quiescent state (in this case, +voluntary context switch) within an RCU read-side critical section. +However, sleeping locks may be used within userspace RCU read-side critical +sections, and also within Linux-kernel sleepable RCU +<a href="#Sleepable RCU">(SRCU)</a> +read-side critical sections. +In addition, the -rt patchset turns spinlocks into a sleeping locks so +that the corresponding critical sections can be preempted, which +also means that these sleeplockified spinlocks (but not other sleeping locks!) +may be acquire within -rt-Linux-kernel RCU read-side critical sections. + +<p> +Note that it <i>is</i> legal for a normal RCU read-side critical section +to conditionally acquire a sleeping locks (as in <tt>mutex_trylock()</tt>), +but only as long as it does not loop indefinitely attempting to +conditionally acquire that sleeping locks. +The key point is that things like <tt>mutex_trylock()</tt> +either return with the mutex held, or return an error indication if +the mutex was not immediately available. +Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. + + +</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a> + +<a name="qq12answer"></a> +<p><b>Quick Quiz 12</b>: +Why does line 19 use <tt>rcu_access_pointer()</tt>? +After all, <tt>call_rcu()</tt> on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that <tt>rcu_dereference()</tt> is required? + + +</p><p><b>Answer</b>: +Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes +any changes, including any insertions that <tt>rcu_dereference()</tt> +would protect against. +Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> +is released on line 25, which in turn means that +<tt>rcu_access_pointer()</tt> suffices. + + +</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a> + +<a name="qq13answer"></a> +<p><b>Quick Quiz 13</b>: +Earlier it was claimed that <tt>call_rcu()</tt> and +<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? + + +</p><p><b>Answer</b>: +We could define things this way, but keep in mind that this sort of +definition would say that updates in garbage-collected languages +cannot complete until the next time the garbage collector runs, +which does not seem at all reasonable. +The key point is that in most cases, an updater using either +<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the +next update as soon as it has invoked <tt>call_rcu()</tt> or +<tt>kfree_rcu()</tt>, without having to wait for a subsequent +grace period. + + +</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a> + +<a name="qq14answer"></a> +<p><b>Quick Quiz 14</b>: +So what happens with <tt>synchronize_rcu()</tt> during +scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> +kernels? + + +</p><p><b>Answer</b>: +In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> +maps directly to <tt>synchronize_sched()</tt>. +Therefore, <tt>synchronize_rcu()</tt> works normally throughout +boot in <tt>CONFIG_PREEMPT=n</tt> kernels. +However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, +so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> +during scheduler initialization. + + +</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a> + + +</body></html> diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx new file mode 100644 index 000000000000..3a97ba490c42 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -0,0 +1,2741 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"> + <html> + <head><title>A Tour Through RCU's Requirements [LWN.net]</title> + <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> + +<h1>A Tour Through RCU's Requirements</h1> + +<p>Copyright IBM Corporation, 2015</p> +<p>Author: Paul E. McKenney</p> +<p><i>The initial version of this document appeared in the +<a href="https://lwn.net/">LWN</a> articles +<a href="https://lwn.net/Articles/652156/">here</a>, +<a href="https://lwn.net/Articles/652677/">here</a>, and +<a href="https://lwn.net/Articles/653326/">here</a>.</i></p> + +<h2>Introduction</h2> + +<p> +Read-copy update (RCU) is a synchronization mechanism that is often +used as a replacement for reader-writer locking. +RCU is unusual in that updaters do not block readers, +which means that RCU's read-side primitives can be exceedingly fast +and scalable. +In addition, updaters can make useful forward progress concurrently +with readers. +However, all this concurrency between RCU readers and updaters does raise +the question of exactly what RCU readers are doing, which in turn +raises the question of exactly what RCU's requirements are. + +<p> +This document therefore summarizes RCU's requirements, and can be thought +of as an informal, high-level specification for RCU. +It is important to understand that RCU's specification is primarily +empirical in nature; +in fact, I learned about many of these requirements the hard way. +This situation might cause some consternation, however, not only +has this learning process been a lot of fun, but it has also been +a great privilege to work with so many people willing to apply +technologies in interesting new ways. + +<p> +All that aside, here are the categories of currently known RCU requirements: +</p> + +<ol> +<li> <a href="#Fundamental Requirements"> + Fundamental Requirements</a> +<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> +<li> <a href="#Parallelism Facts of Life"> + Parallelism Facts of Life</a> +<li> <a href="#Quality-of-Implementation Requirements"> + Quality-of-Implementation Requirements</a> +<li> <a href="#Linux Kernel Complications"> + Linux Kernel Complications</a> +<li> <a href="#Software-Engineering Requirements"> + Software-Engineering Requirements</a> +<li> <a href="#Other RCU Flavors"> + Other RCU Flavors</a> +<li> <a href="#Possible Future Changes"> + Possible Future Changes</a> +</ol> + +<p> +This is followed by a <a href="#Summary">summary</a>, +which is in turn followed by the inevitable +<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. + +<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> + +<p> +RCU's fundamental requirements are the closest thing RCU has to hard +mathematical requirements. +These are: + +<ol> +<li> <a href="#Grace-Period Guarantee"> + Grace-Period Guarantee</a> +<li> <a href="#Publish-Subscribe Guarantee"> + Publish-Subscribe Guarantee</a> +<li> <a href="#Memory-Barrier Guarantees"> + Memory-Barrier Guarantees</a> +<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> + RCU Primitives Guaranteed to Execute Unconditionally</a> +<li> <a href="#Guaranteed Read-to-Write Upgrade"> + Guaranteed Read-to-Write Upgrade</a> +</ol> + +<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> + +<p> +RCU's grace-period guarantee is unusual in being premeditated: +Jack Slingwine and I had this guarantee firmly in mind when we started +work on RCU (then called “rclock”) in the early 1990s. +That said, the past two decades of experience with RCU have produced +a much more detailed understanding of this guarantee. + +<p> +RCU's grace-period guarantee allows updaters to wait for the completion +of all pre-existing RCU read-side critical sections. +An RCU read-side critical section +begins with the marker <tt>rcu_read_lock()</tt> and ends with +the marker <tt>rcu_read_unlock()</tt>. +These markers may be nested, and RCU treats a nested set as one +big RCU read-side critical section. +Production-quality implementations of <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> are extremely lightweight, and in +fact have exactly zero overhead in Linux kernels built for production +use with <tt>CONFIG_PREEMPT=n</tt>. + +<p> +This guarantee allows ordering to be enforced with extremely low +overhead to readers, for example: + +<blockquote> +<pre> + 1 int x, y; + 2 + 3 void thread0(void) + 4 { + 5 rcu_read_lock(); + 6 r1 = READ_ONCE(x); + 7 r2 = READ_ONCE(y); + 8 rcu_read_unlock(); + 9 } +10 +11 void thread1(void) +12 { +13 WRITE_ONCE(x, 1); +14 synchronize_rcu(); +15 WRITE_ONCE(y, 1); +16 } +</pre> +</blockquote> + +<p> +Because the <tt>synchronize_rcu()</tt> on line 14 waits for +all pre-existing readers, any instance of <tt>thread0()</tt> that +loads a value of zero from <tt>x</tt> must complete before +<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must +also load a value of zero from <tt>y</tt>. +Similarly, any instance of <tt>thread0()</tt> that loads a value of +one from <tt>y</tt> must have started after the +<tt>synchronize_rcu()</tt> started, and must therefore also load +a value of one from <tt>x</tt>. +Therefore, the outcome: +<blockquote> +<pre> +(r1 == 0 && r2 == 1) +</pre> +</blockquote> +cannot happen. + +<p>@@QQ@@ +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +<tt>synchronize_rcu()</tt>!!! +Just who are you trying to fool??? +<p>@@QQA@@ +First, if updaters do not wish to be blocked by readers, they can use +<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will +be discussed later. +Second, even when using <tt>synchronize_rcu()</tt>, the other +update-side code does run concurrently with readers, whether pre-existing +or not. +<p>@@QQE@@ + +<p> +This scenario resembles one of the first uses of RCU in +<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, +which managed a distributed lock manager's transition into +a state suitable for handling recovery from node failure, +more or less as follows: + +<blockquote> +<pre> + 1 #define STATE_NORMAL 0 + 2 #define STATE_WANT_RECOVERY 1 + 3 #define STATE_RECOVERING 2 + 4 #define STATE_WANT_NORMAL 3 + 5 + 6 int state = STATE_NORMAL; + 7 + 8 void do_something_dlm(void) + 9 { +10 int state_snap; +11 +12 rcu_read_lock(); +13 state_snap = READ_ONCE(state); +14 if (state_snap == STATE_NORMAL) +15 do_something(); +16 else +17 do_something_carefully(); +18 rcu_read_unlock(); +19 } +20 +21 void start_recovery(void) +22 { +23 WRITE_ONCE(state, STATE_WANT_RECOVERY); +24 synchronize_rcu(); +25 WRITE_ONCE(state, STATE_RECOVERING); +26 recovery(); +27 WRITE_ONCE(state, STATE_WANT_NORMAL); +28 synchronize_rcu(); +29 WRITE_ONCE(state, STATE_NORMAL); +30 } +</pre> +</blockquote> + +<p> +The RCU read-side critical section in <tt>do_something_dlm()</tt> +works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> +to guarantee that <tt>do_something()</tt> never runs concurrently +with <tt>recovery()</tt>, but with little or no synchronization +overhead in <tt>do_something_dlm()</tt>. + +<p>@@QQ@@ +Why is the <tt>synchronize_rcu()</tt> on line 28 needed? +<p>@@QQA@@ +Without that extra grace period, memory reordering could result in +<tt>do_something_dlm()</tt> executing <tt>do_something()</tt> +concurrently with the last bits of <tt>recovery()</tt>. +<p>@@QQE@@ + +<p> +In order to avoid fatal problems such as deadlocks, +an RCU read-side critical section must not contain calls to +<tt>synchronize_rcu()</tt>. +Similarly, an RCU read-side critical section must not +contain anything that waits, directly or indirectly, on completion of +an invocation of <tt>synchronize_rcu()</tt>. + +<p> +Although RCU's grace-period guarantee is useful in and of itself, with +<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, +it would be good to be able to use RCU to coordinate read-side +access to linked data structures. +For this, the grace-period guarantee is not sufficient, as can +be seen in function <tt>add_gp_buggy()</tt> below. +We will look at the reader's code later, but in the meantime, just think of +the reader as locklessly picking up the <tt>gp</tt> pointer, +and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the +<tt>->a</tt> and <tt>->b</tt> fields. + +<blockquote> +<pre> + 1 bool add_gp_buggy(int a, int b) + 2 { + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); + 4 if (!p) + 5 return -ENOMEM; + 6 spin_lock(&gp_lock); + 7 if (rcu_access_pointer(gp)) { + 8 spin_unlock(&gp_lock); + 9 return false; +10 } +11 p->a = a; +12 p->b = a; +13 gp = p; /* ORDERING BUG */ +14 spin_unlock(&gp_lock); +15 return true; +16 } +</pre> +</blockquote> + +<p> +The problem is that both the compiler and weakly ordered CPUs are within +their rights to reorder this code as follows: + +<blockquote> +<pre> + 1 bool add_gp_buggy_optimized(int a, int b) + 2 { + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); + 4 if (!p) + 5 return -ENOMEM; + 6 spin_lock(&gp_lock); + 7 if (rcu_access_pointer(gp)) { + 8 spin_unlock(&gp_lock); + 9 return false; +10 } +<b>11 gp = p; /* ORDERING BUG */ +12 p->a = a; +13 p->b = a;</b> +14 spin_unlock(&gp_lock); +15 return true; +16 } +</pre> +</blockquote> + +<p> +If an RCU reader fetches <tt>gp</tt> just after +<tt>add_gp_buggy_optimized</tt> executes line 11, +it will see garbage in the <tt>->a</tt> and <tt>->b</tt> +fields. +And this is but one of many ways in which compiler and hardware optimizations +could cause trouble. +Therefore, we clearly need some way to prevent the compiler and the CPU from +reordering in this manner, which brings us to the publish-subscribe +guarantee discussed in the next section. + +<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> + +<p> +RCU's publish-subscribe guarantee allows data to be inserted +into a linked data structure without disrupting RCU readers. +The updater uses <tt>rcu_assign_pointer()</tt> to insert the +new data, and readers use <tt>rcu_dereference()</tt> to +access data, whether new or old. +The following shows an example of insertion: + +<blockquote> +<pre> + 1 bool add_gp(int a, int b) + 2 { + 3 p = kmalloc(sizeof(*p), GFP_KERNEL); + 4 if (!p) + 5 return -ENOMEM; + 6 spin_lock(&gp_lock); + 7 if (rcu_access_pointer(gp)) { + 8 spin_unlock(&gp_lock); + 9 return false; +10 } +11 p->a = a; +12 p->b = a; +13 rcu_assign_pointer(gp, p); +14 spin_unlock(&gp_lock); +15 return true; +16 } +</pre> +</blockquote> + +<p> +The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually +equivalent to a simple assignment statement, but also guarantees +that its assignment will +happen after the two assignments in lines 11 and 12, +similar to the C11 <tt>memory_order_release</tt> store operation. +It also prevents any number of “interesting” compiler +optimizations, for example, the use of <tt>gp</tt> as a scratch +location immediately preceding the assignment. + +<p>@@QQ@@ +But <tt>rcu_assign_pointer()</tt> does nothing to prevent the +two assignments to <tt>p->a</tt> and <tt>p->b</tt> +from being reordered. +Can't that also cause problems? +<p>@@QQA@@ +No, it cannot. +The readers cannot see either of these two fields until +the assignment to <tt>gp</tt>, by which time both fields are +fully initialized. +So reordering the assignments +to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly +cause any problems. +<p>@@QQE@@ + +<p> +It is tempting to assume that the reader need not do anything special +to control its accesses to the RCU-protected data, +as shown in <tt>do_something_gp_buggy()</tt> below: + +<blockquote> +<pre> + 1 bool do_something_gp_buggy(void) + 2 { + 3 rcu_read_lock(); + 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ + 5 if (p) { + 6 do_something(p->a, p->b); + 7 rcu_read_unlock(); + 8 return true; + 9 } +10 rcu_read_unlock(); +11 return false; +12 } +</pre> +</blockquote> + +<p> +However, this temptation must be resisted because there are a +surprisingly large number of ways that the compiler +(to say nothing of +<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) +can trip this code up. +For but one example, if the compiler were short of registers, it +might choose to refetch from <tt>gp</tt> rather than keeping +a separate copy in <tt>p</tt> as follows: + +<blockquote> +<pre> + 1 bool do_something_gp_buggy_optimized(void) + 2 { + 3 rcu_read_lock(); + 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ +<b> 5 do_something(gp->a, gp->b);</b> + 6 rcu_read_unlock(); + 7 return true; + 8 } + 9 rcu_read_unlock(); +10 return false; +11 } +</pre> +</blockquote> + +<p> +If this function ran concurrently with a series of updates that +replaced the current structure with a new one, +the fetches of <tt>gp->a</tt> +and <tt>gp->b</tt> might well come from two different structures, +which could cause serious confusion. +To prevent this (and much else besides), <tt>do_something_gp()</tt> uses +<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: + +<blockquote> +<pre> + 1 bool do_something_gp(void) + 2 { + 3 rcu_read_lock(); + 4 p = rcu_dereference(gp); + 5 if (p) { + 6 do_something(p->a, p->b); + 7 rcu_read_unlock(); + 8 return true; + 9 } +10 rcu_read_unlock(); +11 return false; +12 } +</pre> +</blockquote> + +<p> +The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) +memory barriers in the Linux kernel. +Should a +<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> +ever appear, then <tt>rcu_dereference()</tt> could be implemented +as a <tt>memory_order_consume</tt> load. +Regardless of the exact implementation, a pointer fetched by +<tt>rcu_dereference()</tt> may not be used outside of the +outermost RCU read-side critical section containing that +<tt>rcu_dereference()</tt>, unless protection of +the corresponding data element has been passed from RCU to some +other synchronization mechanism, most commonly locking or +<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. + +<p> +In short, updaters use <tt>rcu_assign_pointer()</tt> and readers +use <tt>rcu_dereference()</tt>, and these two RCU API elements +work together to ensure that readers have a consistent view of +newly added data elements. + +<p> +Of course, it is also necessary to remove elements from RCU-protected +data structures, for example, using the following process: + +<ol> +<li> Remove the data element from the enclosing structure. +<li> Wait for all pre-existing RCU read-side critical sections + to complete (because only pre-existing readers can possibly have + a reference to the newly removed data element). +<li> At this point, only the updater has a reference to the + newly removed data element, so it can safely reclaim + the data element, for example, by passing it to <tt>kfree()</tt>. +</ol> + +This process is implemented by <tt>remove_gp_synchronous()</tt>: + +<blockquote> +<pre> + 1 bool remove_gp_synchronous(void) + 2 { + 3 struct foo *p; + 4 + 5 spin_lock(&gp_lock); + 6 p = rcu_access_pointer(gp); + 7 if (!p) { + 8 spin_unlock(&gp_lock); + 9 return false; +10 } +11 rcu_assign_pointer(gp, NULL); +12 spin_unlock(&gp_lock); +13 synchronize_rcu(); +14 kfree(p); +15 return true; +16 } +</pre> +</blockquote> + +<p> +This function is straightforward, with line 13 waiting for a grace +period before line 14 frees the old data element. +This waiting ensures that readers will reach line 7 of +<tt>do_something_gp()</tt> before the data element referenced by +<tt>p</tt> is freed. +The <tt>rcu_access_pointer()</tt> on line 6 is similar to +<tt>rcu_dereference()</tt>, except that: + +<ol> +<li> The value returned by <tt>rcu_access_pointer()</tt> + cannot be dereferenced. + If you want to access the value pointed to as well as + the pointer itself, use <tt>rcu_dereference()</tt> + instead of <tt>rcu_access_pointer()</tt>. +<li> The call to <tt>rcu_access_pointer()</tt> need not be + protected. + In contrast, <tt>rcu_dereference()</tt> must either be + within an RCU read-side critical section or in a code + segment where the pointer cannot change, for example, in + code protected by the corresponding update-side lock. +</ol> + +<p>@@QQ@@ +Without the <tt>rcu_dereference()</tt> or the +<tt>rcu_access_pointer()</tt>, what destructive optimizations +might the compiler make use of? +<p>@@QQA@@ +Let's start with what happens to <tt>do_something_gp()</tt> +if it fails to use <tt>rcu_dereference()</tt>. +It could reuse a value formerly fetched from this same pointer. +It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time +manner, resulting in <i>load tearing</i>, in turn resulting a bytewise +mash-up of two distince pointer values. +It might even use value-speculation optimizations, where it makes a wrong +guess, but by the time it gets around to checking the value, an update +has changed the pointer to match the wrong guess. +Too bad about any dereferences that returned pre-initialization garbage +in the meantime! + +<p> +For <tt>remove_gp_synchronous()</tt>, as long as all modifications +to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, +the above optimizations are harmless. +However, +with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, +<tt>sparse</tt> will complain if you +define <tt>gp</tt> with <tt>__rcu</tt> and then +access it without using +either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. +<p>@@QQE@@ + +<p> +In short, RCU's publish-subscribe guarantee is provided by the combination +of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +<p> +This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling <tt>rcu_dereference()</tt> for subscription, nor did it +have anything resembling the <tt>smp_read_barrier_depends()</tt> +that was later subsumed into <tt>rcu_dereference()</tt>. +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good <i>two</i> hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees have provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting <tt>rcu_dereference()</tt>! + +<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3> + +<p> +The previous section's simple linked-data-structure scenario clearly +demonstrates the need for RCU's stringent memory-ordering guarantees on +systems with more than one CPU: + +<ol> +<li> Each CPU that has an RCU read-side critical section that + begins before <tt>synchronize_rcu()</tt> starts is + guaranteed to execute a full memory barrier between the time + that the RCU read-side critical section ends and the time that + <tt>synchronize_rcu()</tt> returns. + Without this guarantee, a pre-existing RCU read-side critical section + might hold a reference to the newly removed <tt>struct foo</tt> + after the <tt>kfree()</tt> on line 14 of + <tt>remove_gp_synchronous()</tt>. +<li> Each CPU that has an RCU read-side critical section that ends + after <tt>synchronize_rcu()</tt> returns is guaranteed + to execute a full memory barrier between the time that + <tt>synchronize_rcu()</tt> begins and the time that the RCU + read-side critical section begins. + Without this guarantee, a later RCU read-side critical section + running after the <tt>kfree()</tt> on line 14 of + <tt>remove_gp_synchronous()</tt> might + later run <tt>do_something_gp()</tt> and find the + newly deleted <tt>struct foo</tt>. +<li> If the task invoking <tt>synchronize_rcu()</tt> remains + on a given CPU, then that CPU is guaranteed to execute a full + memory barrier sometime during the execution of + <tt>synchronize_rcu()</tt>. + This guarantee ensures that the <tt>kfree()</tt> on + line 14 of <tt>remove_gp_synchronous()</tt> really does + execute after the removal on line 11. +<li> If the task invoking <tt>synchronize_rcu()</tt> migrates + among a group of CPUs during that invocation, then each of the + CPUs in that group is guaranteed to execute a full memory barrier + sometime during the execution of <tt>synchronize_rcu()</tt>. + This guarantee also ensures that the <tt>kfree()</tt> on + line 14 of <tt>remove_gp_synchronous()</tt> really does + execute after the removal on + line 11, but also in the case where the thread executing the + <tt>synchronize_rcu()</tt> migrates in the meantime. +</ol> + +<p>@@QQ@@ +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of <tt>synchronize_rcu()</tt>? +<p>@@QQA@@ +If RCU cannot tell whether or not a given +RCU read-side critical section starts before a +given instance of <tt>synchronize_rcu()</tt>, +then it must assume that the RCU read-side critical section +started first. +In other words, a given instance of <tt>synchronize_rcu()</tt> +can avoid waiting on a given RCU read-side critical section only +if it can prove that <tt>synchronize_rcu()</tt> started first. +<p>@@QQE@@ + +<p>@@QQ@@ +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers <i> really</i> required? +<p>@@QQA@@ +Yes, they really are required. +To see why the first guarantee is required, consider the following +sequence of events: + +<ol> +<li> CPU 1: <tt>rcu_read_lock()</tt> +<li> CPU 1: <tt>q = rcu_dereference(gp); + /* Very likely to return p. */</tt> +<li> CPU 0: <tt>list_del_rcu(p);</tt> +<li> CPU 0: <tt>synchronize_rcu()</tt> starts. +<li> CPU 1: <tt>do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */</tt> +<li> CPU 1: <tt>rcu_read_unlock()</tt> +<li> CPU 0: <tt>synchronize_rcu()</tt> returns. +<li> CPU 0: <tt>kfree(p);</tt> +</ol> + +<p> +Therefore, there absolutely must be a full memory barrier between the +end of the RCU read-side critical section and the end of the +grace period. + +<p> +The sequence of events demonstrating the necessity of the second rule +is roughly similar: + +<ol> +<li> CPU 0: <tt>list_del_rcu(p);</tt> +<li> CPU 0: <tt>synchronize_rcu()</tt> starts. +<li> CPU 1: <tt>rcu_read_lock()</tt> +<li> CPU 1: <tt>q = rcu_dereference(gp); + /* Might return p if no memory barrier. */</tt> +<li> CPU 0: <tt>synchronize_rcu()</tt> returns. +<li> CPU 0: <tt>kfree(p);</tt> +<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> +<li> CPU 1: <tt>rcu_read_unlock()</tt> +</ol> + +<p> +And similarly, without a memory barrier between the beginning of the +grace period and the beginning of the RCU read-side critical section, +CPU 1 might end up accessing the freelist. + +<p> +The “as if” rule of course applies, so that any implementation +that acts as if the appropriate memory barriers were in place is a +correct implementation. +That said, it is much easier to fool yourself into believing that you have +adhered to the as-if rule than it is to actually adhere to it! +<p>@@QQE@@ + +<p> +Note that these memory-barrier requirements do not replace the fundamental +RCU requirement that a grace period wait for all pre-existing readers. +On the contrary, the memory barriers called out in this section must operate in +such a way as to <i>enforce</i> this fundamental requirement. +Of course, different implementations enforce this requirement in different +ways, but enforce it they must. + +<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> + +<p> +The common-case RCU primitives are unconditional. +They are invoked, they do their job, and they return, with no possibility +of error, and no need to retry. +This is a key RCU design philosophy. + +<p> +However, this philosophy is pragmatic rather than pigheaded. +If someone comes up with a good justification for a particular conditional +RCU primitive, it might well be implemented and added. +After all, this guarantee was reverse-engineered, not premeditated. +The unconditional nature of the RCU primitives was initially an +accident of implementation, and later experience with synchronization +primitives with conditional primitives caused me to elevate this +accident to a guarantee. +Therefore, the justification for adding a conditional primitive to +RCU would need to be based on detailed and compelling use cases. + +<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> + +<p> +As far as RCU is concerned, it is always possible to carry out an +update within an RCU read-side critical section. +For example, that RCU read-side critical section might search for +a given data element, and then might acquire the update-side +spinlock in order to update that element, all while remaining +in that RCU read-side critical section. +Of course, it is necessary to exit the RCU read-side critical section +before invoking <tt>synchronize_rcu()</tt>, however, this +inconvenience can be avoided through use of the +<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members +described later in this document. + +<p>@@QQ@@ +But how does the upgrade-to-write operation exclude other readers? +<p>@@QQA@@ +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. +<p>@@QQE@@ + +<p> +This guarantee allows lookup code to be shared between read-side +and update-side code, and was premeditated, appearing in the earliest +DYNIX/ptx RCU documentation. + +<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> + +<p> +RCU provides extremely lightweight readers, and its read-side guarantees, +though quite useful, are correspondingly lightweight. +It is therefore all too easy to assume that RCU is guaranteeing more +than it really is. +Of course, the list of things that RCU does not guarantee is infinitely +long, however, the following sections list a few non-guarantees that +have caused confusion. +Except where otherwise noted, these non-guarantees were premeditated. + +<ol> +<li> <a href="#Readers Impose Minimal Ordering"> + Readers Impose Minimal Ordering</a> +<li> <a href="#Readers Do Not Exclude Updaters"> + Readers Do Not Exclude Updaters</a> +<li> <a href="#Updaters Only Wait For Old Readers"> + Updaters Only Wait For Old Readers</a> +<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> + Grace Periods Don't Partition Read-Side Critical Sections</a> +<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> + Read-Side Critical Sections Don't Partition Grace Periods</a> +<li> <a href="#Disabling Preemption Does Not Block Grace Periods"> + Disabling Preemption Does Not Block Grace Periods</a> +</ol> + +<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> + +<p> +Reader-side markers such as <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees +except through their interaction with the grace-period APIs such as +<tt>synchronize_rcu()</tt>. +To see this, consider the following pair of threads: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 WRITE_ONCE(x, 1); + 5 rcu_read_unlock(); + 6 rcu_read_lock(); + 7 WRITE_ONCE(y, 1); + 8 rcu_read_unlock(); + 9 } +10 +11 void thread1(void) +12 { +13 rcu_read_lock(); +14 r1 = READ_ONCE(y); +15 rcu_read_unlock(); +16 rcu_read_lock(); +17 r2 = READ_ONCE(x); +18 rcu_read_unlock(); +19 } +</pre> +</blockquote> + +<p> +After <tt>thread0()</tt> and <tt>thread1()</tt> execute +concurrently, it is quite possible to have + +<blockquote> +<pre> +(r1 == 1 && r2 == 0) +</pre> +</blockquote> + +(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), +which would not be possible if <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> had much in the way of ordering +properties. +But they do not, so the CPU is within its rights +to do significant reordering. +This is by design: Any significant ordering constraints would slow down +these fast-path APIs. + +<p>@@QQ@@ +Can't the compiler also reorder this code? +<p>@@QQA@@ +No, the volatile casts in <tt>READ_ONCE()</tt> and +<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in +this particular case. +<p>@@QQE@@ + +<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> + +<p> +Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> +exclude updates. +All they do is to prevent grace periods from ending. +The following example illustrates this: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 r1 = READ_ONCE(y); + 5 if (r1) { + 6 do_something_with_nonzero_x(); + 7 r2 = READ_ONCE(x); + 8 WARN_ON(!r2); /* BUG!!! */ + 9 } +10 rcu_read_unlock(); +11 } +12 +13 void thread1(void) +14 { +15 spin_lock(&my_lock); +16 WRITE_ONCE(x, 1); +17 WRITE_ONCE(y, 1); +18 spin_unlock(&my_lock); +19 } +</pre> +</blockquote> + +<p> +If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> +excluded the <tt>thread1()</tt> function's update, +the <tt>WARN_ON()</tt> could never fire. +But the fact is that <tt>rcu_read_lock()</tt> does not exclude +much of anything aside from subsequent grace periods, of which +<tt>thread1()</tt> has none, so the +<tt>WARN_ON()</tt> can and does fire. + +<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> + +<p> +It might be tempting to assume that after <tt>synchronize_rcu()</tt> +completes, there are no readers executing. +This temptation must be avoided because +new readers can start immediately after <tt>synchronize_rcu()</tt> +starts, and <tt>synchronize_rcu()</tt> is under no +obligation to wait for these new readers. + +<p>@@QQ@@ +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? +<p>@@QQA@@ +No. +Even if <tt>synchronize_rcu()</tt> were to wait until +all readers had completed, a new reader might start immediately after +<tt>synchronize_rcu()</tt> completed. +Therefore, the code following +<tt>synchronize_rcu()</tt> cannot rely on there being no readers +in any case. +<p>@@QQE@@ + +<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> +Grace Periods Don't Partition Read-Side Critical Sections</a></h3> + +<p> +It is tempting to assume that if any part of one RCU read-side critical +section precedes a given grace period, and if any part of another RCU +read-side critical section follows that same grace period, then all of +the first RCU read-side critical section must precede all of the second. +However, this just isn't the case: A single grace period does not +partition the set of RCU read-side critical sections. +An example of this situation can be illustrated as follows, where +<tt>x</tt>, <tt>y</tt>, and <tt>z</tt> are initially all zero: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 WRITE_ONCE(a, 1); + 5 WRITE_ONCE(b, 1); + 6 rcu_read_unlock(); + 7 } + 8 + 9 void thread1(void) +10 { +11 r1 = READ_ONCE(a); +12 synchronize_rcu(); +13 WRITE_ONCE(c, 1); +14 } +15 +16 void thread2(void) +17 { +18 rcu_read_lock(); +19 r2 = READ_ONCE(b); +20 r3 = READ_ONCE(c); +21 rcu_read_unlock(); +22 } +</pre> +</blockquote> + +<p> +It turns out that the outcome: + +<blockquote> +<pre> +(r1 == 1 && r2 == 0 && r3 == 1) +</pre> +</blockquote> + +is entirely possible. +The following figure show how this can happen, with each circled +<tt>QS</tt> indicating the point at which RCU recorded a +<i>quiescent state</i> for each thread, that is, a state in which +RCU knows that the thread cannot be in the midst of an RCU read-side +critical section that started before the current grace period: + +<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> + +<p> +If it is necessary to partition RCU read-side critical sections in this +manner, it is necessary to use two grace periods, where the first +grace period is known to end before the second grace period starts: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 WRITE_ONCE(a, 1); + 5 WRITE_ONCE(b, 1); + 6 rcu_read_unlock(); + 7 } + 8 + 9 void thread1(void) +10 { +11 r1 = READ_ONCE(a); +12 synchronize_rcu(); +13 WRITE_ONCE(c, 1); +14 } +15 +16 void thread2(void) +17 { +18 r2 = READ_ONCE(c); +19 synchronize_rcu(); +20 WRITE_ONCE(d, 1); +21 } +22 +23 void thread3(void) +24 { +25 rcu_read_lock(); +26 r3 = READ_ONCE(b); +27 r4 = READ_ONCE(d); +28 rcu_read_unlock(); +29 } +</pre> +</blockquote> + +<p> +Here, if <tt>(r1 == 1)</tt>, then +<tt>thread0()</tt>'s write to <tt>b</tt> must happen +before the end of <tt>thread1()</tt>'s grace period. +If in addition <tt>(r4 == 1)</tt>, then +<tt>thread3()</tt>'s read from <tt>b</tt> must happen +after the beginning of <tt>thread2()</tt>'s grace period. +If it is also the case that <tt>(r2 == 1)</tt>, then the +end of <tt>thread1()</tt>'s grace period must precede the +beginning of <tt>thread2()</tt>'s grace period. +This mean that the two RCU read-side critical sections cannot overlap, +guaranteeing that <tt>(r3 == 1)</tt>. +As a result, the outcome: + +<blockquote> +<pre> +(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) +</pre> +</blockquote> + +cannot happen. + +<p> +This non-requirement was also non-premeditated, but became apparent +when studying RCU's interaction with memory ordering. + +<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> +Read-Side Critical Sections Don't Partition Grace Periods</a></h3> + +<p> +It is also tempting to assume that if an RCU read-side critical section +happens between a pair of grace periods, then those grace periods cannot +overlap. +However, this temptation leads nowhere good, as can be illustrated by +the following, with all variables initially zero: + +<blockquote> +<pre> + 1 void thread0(void) + 2 { + 3 rcu_read_lock(); + 4 WRITE_ONCE(a, 1); + 5 WRITE_ONCE(b, 1); + 6 rcu_read_unlock(); + 7 } + 8 + 9 void thread1(void) +10 { +11 r1 = READ_ONCE(a); +12 synchronize_rcu(); +13 WRITE_ONCE(c, 1); +14 } +15 +16 void thread2(void) +17 { +18 rcu_read_lock(); +19 WRITE_ONCE(d, 1); +20 r2 = READ_ONCE(c); +21 rcu_read_unlock(); +22 } +23 +24 void thread3(void) +25 { +26 r3 = READ_ONCE(d); +27 synchronize_rcu(); +28 WRITE_ONCE(e, 1); +29 } +30 +31 void thread4(void) +32 { +33 rcu_read_lock(); +34 r4 = READ_ONCE(b); +35 r5 = READ_ONCE(e); +36 rcu_read_unlock(); +37 } +</pre> +</blockquote> + +<p> +In this case, the outcome: + +<blockquote> +<pre> +(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) +</pre> +</blockquote> + +is entirely possible, as illustrated below: + +<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> + +<p> +Again, an RCU read-side critical section can overlap almost all of a +given grace period, just so long as it does not overlap the entire +grace period. +As a result, an RCU read-side critical section cannot partition a pair +of RCU grace periods. + +<p>@@QQ@@ +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? +<p>@@QQA@@ +In theory, an infinite number. +In practice, an unknown number that is sensitive to both implementation +details and timing considerations. +Therefore, even in practice, RCU users must abide by the theoretical rather +than the practical answer. +<p>@@QQE@@ + +<h3><a name="Disabling Preemption Does Not Block Grace Periods"> +Disabling Preemption Does Not Block Grace Periods</a></h3> + +<p> +There was a time when disabling preemption on any given CPU would block +subsequent grace periods. +However, this was an accident of implementation and is not a requirement. +And in the current Linux-kernel implementation, disabling preemption +on a given CPU in fact does not block grace periods, as Oleg Nesterov +<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. + +<p> +If you need a preempt-disable region to block grace periods, you need to add +<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example +as follows: + +<blockquote> +<pre> + 1 preempt_disable(); + 2 rcu_read_lock(); + 3 do_something(); + 4 rcu_read_unlock(); + 5 preempt_enable(); + 6 + 7 /* Spinlocks implicitly disable preemption. */ + 8 spin_lock(&mylock); + 9 rcu_read_lock(); +10 do_something(); +11 rcu_read_unlock(); +12 spin_unlock(&mylock); +</pre> +</blockquote> + +<p> +In theory, you could enter the RCU read-side critical section first, +but it is more efficient to keep the entire RCU read-side critical +section contained in the preempt-disable region as shown above. +Of course, RCU read-side critical sections that extend outside of +preempt-disable regions will work correctly, but such critical sections +can be preempted, which forces <tt>rcu_read_unlock()</tt> to do +more work. +And no, this is <i>not</i> an invitation to enclose all of your RCU +read-side critical sections within preempt-disable regions, because +doing so would degrade real-time response. + +<p> +This non-requirement appeared with preemptible RCU. +If you need a grace period that waits on non-preemptible code regions, use +<a href="#Sched Flavor">RCU-sched</a>. + +<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> + +<p> +These parallelism facts of life are by no means specific to RCU, but +the RCU implementation must abide by them. +They therefore bear repeating: + +<ol> +<li> Any CPU or task may be delayed at any time, + and any attempts to avoid these delays by disabling + preemption, interrupts, or whatever are completely futile. + This is most obvious in preemptible user-level + environments and in virtualized environments (where + a given guest OS's VCPUs can be preempted at any time by + the underlying hypervisor), but can also happen in bare-metal + environments due to ECC errors, NMIs, and other hardware + events. + Although a delay of more than about 20 seconds can result + in splats, the RCU implementation is obligated to use + algorithms that can tolerate extremely long delays, but where + “extremely long” is not long enough to allow + wrap-around when incrementing a 64-bit counter. +<li> Both the compiler and the CPU can reorder memory accesses. + Where it matters, RCU must use compiler directives and + memory-barrier instructions to preserve ordering. +<li> Conflicting writes to memory locations in any given cache line + will result in expensive cache misses. + Greater numbers of concurrent writes and more-frequent + concurrent writes will result in more dramatic slowdowns. + RCU is therefore obligated to use algorithms that have + sufficient locality to avoid significant performance and + scalability problems. +<li> As a rough rule of thumb, only one CPU's worth of processing + may be carried out under the protection of any given exclusive + lock. + RCU must therefore use scalable locking designs. +<li> Counters are finite, especially on 32-bit systems. + RCU's use of counters must therefore tolerate counter wrap, + or be designed such that counter wrap would take way more + time than a single system is likely to run. + An uptime of ten years is quite possible, a runtime + of a century much less so. + As an example of the latter, RCU's dyntick-idle nesting counter + allows 54 bits for interrupt nesting level (this counter + is 64 bits even on a 32-bit system). + Overflowing this counter requires 2<sup>54</sup> + half-interrupts on a given CPU without that CPU ever going idle. + If a half-interrupt happened every microsecond, it would take + 570 years of runtime to overflow this counter, which is currently + believed to be an acceptably long time. +<li> Linux systems can have thousands of CPUs running a single + Linux kernel in a single shared-memory environment. + RCU must therefore pay close attention to high-end scalability. +</ol> + +<p> +This last parallelism fact of life means that RCU must pay special +attention to the preceding facts of life. +The idea that Linux might scale to systems with thousands of CPUs would +have been met with some skepticism in the 1990s, but these requirements +would have otherwise have been unsurprising, even in the early 1990s. + +<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> + +<p> +These sections list quality-of-implementation requirements. +Although an RCU implementation that ignores these requirements could +still be used, it would likely be subject to limitations that would +make it inappropriate for industrial-strength production use. +Classes of quality-of-implementation requirements are as follows: + +<ol> +<li> <a href="#Specialization">Specialization</a> +<li> <a href="#Performance and Scalability">Performance and Scalability</a> +<li> <a href="#Composability">Composability</a> +<li> <a href="#Corner Cases">Corner Cases</a> +</ol> + +<p> +These classes is covered in the following sections. + +<h3><a name="Specialization">Specialization</a></h3> + +<p> +RCU is and always has been intended primarily for read-mostly situations, as +illustrated by the following figure. +This means that RCU's read-side primitives are optimized, often at the +expense of its update-side primitives. + +<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> + +<p> +This focus on read-mostly situations means that RCU must interoperate +with other synchronization primitives. +For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> +examples discussed earlier use RCU to protect readers and locking to +coordinate updaters. +However, the need extends much farther, requiring that a variety of +synchronization primitives be legal within RCU read-side critical sections, +including spinlocks, sequence locks, atomic operations, reference +counters, and memory barriers. + +<p>@@QQ@@ +What about sleeping locks? +<p>@@QQA@@ +These are forbidden within Linux-kernel RCU read-side critical sections +because it is not legal to place a quiescent state (in this case, +voluntary context switch) within an RCU read-side critical section. +However, sleeping locks may be used within userspace RCU read-side critical +sections, and also within Linux-kernel sleepable RCU +<a href="#Sleepable RCU">(SRCU)</a> +read-side critical sections. +In addition, the -rt patchset turns spinlocks into a sleeping locks so +that the corresponding critical sections can be preempted, which +also means that these sleeplockified spinlocks (but not other sleeping locks!) +may be acquire within -rt-Linux-kernel RCU read-side critical sections. + +<p> +Note that it <i>is</i> legal for a normal RCU read-side critical section +to conditionally acquire a sleeping locks (as in <tt>mutex_trylock()</tt>), +but only as long as it does not loop indefinitely attempting to +conditionally acquire that sleeping locks. +The key point is that things like <tt>mutex_trylock()</tt> +either return with the mutex held, or return an error indication if +the mutex was not immediately available. +Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. +<p>@@QQE@@ + +<p> +It often comes as a surprise that many algorithms do not require a +consistent view of data, but many can function in that mode, +with network routing being the poster child. +Internet routing algorithms take significant time to propagate +updates, so that by the time an update arrives at a given system, +that system has been sending network traffic the wrong way for +a considerable length of time. +Having a few threads continue to send traffic the wrong way for a +few more milliseconds is clearly not a problem: In the worst case, +TCP retransmissions will eventually get the data where it needs to go. +In general, when tracking the state of the universe outside of the +computer, some level of inconsistency must be tolerated due to +speed-of-light delays if nothing else. + +<p> +Furthermore, uncertainty about external state is inherent in many cases. +For example, a pair of veternarians might use heartbeat to determine +whether or not a given cat was alive. +But how long should they wait after the last heartbeat to decide that +the cat is in fact dead? +Waiting less than 400 milliseconds makes no sense because this would +mean that a relaxed cat would be considered to cycle between death +and life more than 100 times per minute. +Moreover, just as with human beings, a cat's heart might stop for +some period of time, so the exact wait period is a judgment call. +One of our pair of veternarians might wait 30 seconds before pronouncing +the cat dead, while the other might insist on waiting a full minute. +The two veternarians would then disagree on the state of the cat during +the final 30 seconds of the minute following the last heartbeat, as +fancifully illustrated below: + +<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> + +<p> +Interestingly enough, this same situation applies to hardware. +When push comes to shove, how do we tell whether or not some +external server has failed? +We send messages to it periodically, and declare it failed if we +don't receive a response within a given period of time. +Policy decisions can usually tolerate short +periods of inconsistency. +The policy was decided some time ago, and is only now being put into +effect, so a few milliseconds of delay is normally inconsequential. + +<p> +However, there are algorithms that absolutely must see consistent data. +For example, the translation between a user-level SystemV semaphore +ID to the corresponding in-kernel data structure is protected by RCU, +but it is absolutely forbidden to update a semaphore that has just been +removed. +In the Linux kernel, this need for consistency is accommodated by acquiring +spinlocks located in the in-kernel data structure from within +the RCU read-side critical section, and this is indicated by the +green box in the figure above. +Many other techniques may be used, and are in fact used within the +Linux kernel. + +<p> +In short, RCU is not required to maintain consistency, and other +mechanisms may be used in concert with RCU when consistency is required. +RCU's specialization allows it to do its job extremely well, and its +ability to interoperate with other synchronization mechanisms allows +the right mix of synchronization tools to be used for a given job. + +<h3><a name="Performance and Scalability">Performance and Scalability</a></h3> + +<p> +Energy efficiency is a critical component of performance today, +and Linux-kernel RCU implementations must therefore avoid unnecessarily +awakening idle CPUs. +I cannot claim that this requirement was premeditated. +In fact, I learned of it during a telephone conversation in which I +was given “frank and open” feedback on the importance +of energy efficiency in battery-powered systems and on specific +energy-efficiency shortcomings of the Linux-kernel RCU implementation. +In my experience, the battery-powered embedded community will consider +any unnecessary wakeups to be extremely unfriendly acts. +So much so that mere Linux-kernel-mailing-list posts are +insufficient to vent their ire. + +<p> +Memory consumption is not particularly important for in most +situations, and has become decreasingly +so as memory sizes have expanded and memory +costs have plummeted. +However, as I learned from Matt Mackall's +<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> +efforts, memory footprint is critically important on single-CPU systems with +non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus +<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> +was born. +Josh Triplett has since taken over the small-memory banner with his +<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> +project, which resulted in +<a href="#Sleepable RCU">SRCU</a> +becoming optional for those kernels not needing it. + +<p> +The remaining performance requirements are, for the most part, +unsurprising. +For example, in keeping with RCU's read-side specialization, +<tt>rcu_dereference()</tt> should have negligible overhead (for +example, suppression of a few minor compiler optimizations). +Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> should have exactly zero overhead. + +<p> +In preemptible environments, in the case where the RCU read-side +critical section was not preempted (as will be the case for the +highest-priority real-time process), <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> should have minimal overhead. +In particular, they should not contain atomic read-modify-write +operations, memory-barrier instructions, preemption disabling, +interrupt disabling, or backwards branches. +However, in the case where the RCU read-side critical section was preempted, +<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. +This is why it is better to nest an RCU read-side critical section +within a preempt-disable region than vice versa, at least in cases +where that critical section is short enough to avoid unduly degrading +real-time latencies. + +<p> +The <tt>synchronize_rcu()</tt> grace-period-wait primitive is +optimized for throughput. +It may therefore incur several milliseconds of latency in addition to +the duration of the longest RCU read-side critical section. +On the other hand, multiple concurrent invocations of +<tt>synchronize_rcu()</tt> are required to use batching optimizations +so that they can be satisfied by a single underlying grace-period-wait +operation. +For example, in the Linux kernel, it is not unusual for a single +grace-period-wait operation to serve more than +<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> +of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation +overhead down to nearly zero. +However, the grace-period optimization is also required to avoid +measurable degradation of real-time scheduling and interrupt latencies. + +<p> +In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> +latencies are unacceptable. +In these cases, <tt>synchronize_rcu_expedited()</tt> may be used +instead, reducing the grace-period latency down to a few tens of +microseconds on small systems, at least in cases where the RCU read-side +critical sections are short. +There are currently no special latency requirements for +<tt>synchronize_rcu_expedited()</tt> on large systems, but, +consistent with the empirical nature of the RCU specification, +that is subject to change. +However, there most definitely are scalability requirements: +A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 +CPUs should at least make reasonable forward progress. +In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> +is permitted to impose modest degradation of real-time latency +on non-idle online CPUs. +That said, it will likely be necessary to take further steps to reduce this +degradation, hopefully to roughly that of a scheduling-clock interrupt. + +<p> +There are a number of situations where even +<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period +latency is unacceptable. +In these situations, the asynchronous <tt>call_rcu()</tt> can be +used in place of <tt>synchronize_rcu()</tt> as follows: + +<blockquote> +<pre> + 1 struct foo { + 2 int a; + 3 int b; + 4 struct rcu_head rh; + 5 }; + 6 + 7 static void remove_gp_cb(struct rcu_head *rhp) + 8 { + 9 struct foo *p = container_of(rhp, struct foo, rh); +10 +11 kfree(p); +12 } +13 +14 bool remove_gp_asynchronous(void) +15 { +16 struct foo *p; +17 +18 spin_lock(&gp_lock); +19 p = rcu_dereference(gp); +20 if (!p) { +21 spin_unlock(&gp_lock); +22 return false; +23 } +24 rcu_assign_pointer(gp, NULL); +25 call_rcu(&p->rh, remove_gp_cb); +26 spin_unlock(&gp_lock); +27 return true; +28 } +</pre> +</blockquote> + +<p> +A definition of <tt>struct foo</tt> is finally needed, and appears +on lines 1-5. +The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> +on line 25, and will be invoked after the end of a subsequent +grace period. +This gets the same effect as <tt>remove_gp_synchronous()</tt>, +but without forcing the updater to wait for a grace period to elapse. +The <tt>call_rcu()</tt> function may be used in a number of +situations where neither <tt>synchronize_rcu()</tt> nor +<tt>synchronize_rcu_expedited()</tt> would be legal, +including within preempt-disable code, <tt>local_bh_disable()</tt> code, +interrupt-disable code, and interrupt handlers. +However, even <tt>call_rcu()</tt> is illegal within NMI handlers. +The callback function (<tt>remove_gp_cb()</tt> in this case) will be +executed within softirq (software interrupt) environment within the +Linux kernel, +either within a real softirq handler or under the protection +of <tt>local_bh_disable()</tt>. +In both the Linux kernel and in userspace, it is bad practice to +write an RCU callback function that takes too long. +Long-running operations should be relegated to separate threads or +(in the Linux kernel) workqueues. + +<p>@@QQ@@ +Why does line 19 use <tt>rcu_access_pointer()</tt>? +After all, <tt>call_rcu()</tt> on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that <tt>rcu_dereference()</tt> is required? +<p>@@QQA@@ +Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes +any changes, including any insertions that <tt>rcu_dereference()</tt> +would protect against. +Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> +is released on line 25, which in turn means that +<tt>rcu_access_pointer()</tt> suffices. +<p>@@QQE@@ + +<p> +However, all that <tt>remove_gp_cb()</tt> is doing is +invoking <tt>kfree()</tt> on the data element. +This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, +which allows “fire and forget” operation as shown below: + +<blockquote> +<pre> + 1 struct foo { + 2 int a; + 3 int b; + 4 struct rcu_head rh; + 5 }; + 6 + 7 bool remove_gp_faf(void) + 8 { + 9 struct foo *p; +10 +11 spin_lock(&gp_lock); +12 p = rcu_dereference(gp); +13 if (!p) { +14 spin_unlock(&gp_lock); +15 return false; +16 } +17 rcu_assign_pointer(gp, NULL); +18 kfree_rcu(p, rh); +19 spin_unlock(&gp_lock); +20 return true; +21 } +</pre> +</blockquote> + +<p> +Note that <tt>remove_gp_faf()</tt> simply invokes +<tt>kfree_rcu()</tt> and proceeds, without any need to pay any +further attention to the subsequent grace period and <tt>kfree()</tt>. +It is permissible to invoke <tt>kfree_rcu()</tt> from the same +environments as for <tt>call_rcu()</tt>. +Interestingly enough, DYNIX/ptx had the equivalents of +<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not +<tt>synchronize_rcu()</tt>. +This was due to the fact that RCU was not heavily used within DYNIX/ptx, +so the very few places that needed something like +<tt>synchronize_rcu()</tt> simply open-coded it. + +<p>@@QQ@@ +Earlier it was claimed that <tt>call_rcu()</tt> and +<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? +<p>@@QQA@@ +We could define things this way, but keep in mind that this sort of +definition would say that updates in garbage-collected languages +cannot complete until the next time the garbage collector runs, +which does not seem at all reasonable. +The key point is that in most cases, an updater using either +<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the +next update as soon as it has invoked <tt>call_rcu()</tt> or +<tt>kfree_rcu()</tt>, without having to wait for a subsequent +grace period. +<p>@@QQE@@ + +<p> +But what if the updater must wait for the completion of code to be +executed after the end of the grace period, but has other tasks +that can be carried out in the meantime? +The polling-style <tt>get_state_synchronize_rcu()</tt> and +<tt>cond_synchronize_rcu()</tt> functions may be used for this +purpose, as shown below: + +<blockquote> +<pre> + 1 bool remove_gp_poll(void) + 2 { + 3 struct foo *p; + 4 unsigned long s; + 5 + 6 spin_lock(&gp_lock); + 7 p = rcu_access_pointer(gp); + 8 if (!p) { + 9 spin_unlock(&gp_lock); +10 return false; +11 } +12 rcu_assign_pointer(gp, NULL); +13 spin_unlock(&gp_lock); +14 s = get_state_synchronize_rcu(); +15 do_something_while_waiting(); +16 cond_synchronize_rcu(s); +17 kfree(p); +18 return true; +19 } +</pre> +</blockquote> + +<p> +On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a +“cookie” from RCU, +then line 15 carries out other tasks, +and finally, line 16 returns immediately if a grace period has +elapsed in the meantime, but otherwise waits as required. +The need for <tt>get_state_synchronize_rcu</tt> and +<tt>cond_synchronize_rcu()</tt> has appeared quite recently, +so it is too early to tell whether they will stand the test of time. + +<p> +RCU thus provides a range of tools to allow updaters to strike the +required tradeoff between latency, flexibility and CPU overhead. + +<h3><a name="Composability">Composability</a></h3> + +<p> +Composability has received much attention in recent years, perhaps in part +due to the collision of multicore hardware with object-oriented techniques +designed in single-threaded environments for single-threaded use. +And in theory, RCU read-side critical sections may be composed, and in +fact may be nested arbitrarily deeply. +In practice, as with all real-world implementations of composable +constructs, there are limitations. + +<p> +Implementations of RCU for which <tt>rcu_read_lock()</tt> +and <tt>rcu_read_unlock()</tt> generate no code, such as +Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be +nested arbitrarily deeply. +After all, there is no overhead. +Except that if all these instances of <tt>rcu_read_lock()</tt> +and <tt>rcu_read_unlock()</tt> are visible to the compiler, +compilation will eventually fail due to exhausting memory, +mass storage, or user patience, whichever comes first. +If the nesting is not visible to the compiler, as is the case with +mutually recursive functions each in its own translation unit, +stack overflow will result. +If the nesting takes the form of loops, either the control variable +will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. +Nevertheless, this class of RCU implementations is one +of the most composable constructs in existence. + +<p> +RCU implementations that explicitly track nesting depth +are limited by the nesting-depth counter. +For example, the Linux kernel's preemptible RCU limits nesting to +<tt>INT_MAX</tt>. +This should suffice for almost all practical purposes. +That said, a consecutive pair of RCU read-side critical sections +between which there is an operation that waits for a grace period +cannot be enclosed in another RCU read-side critical section. +This is because it is not legal to wait for a grace period within +an RCU read-side critical section: To do so would result either +in deadlock or +in RCU implicitly splitting the enclosing RCU read-side critical +section, neither of which is conducive to a long-lived and prosperous +kernel. + +<p> +It is worth noting that RCU is not alone in limiting composability. +For example, many transactional-memory implementations prohibit +composing a pair of transactions separated by an irrevocable +operation (for example, a network receive operation). +For another example, lock-based critical sections can be composed +surprisingly freely, but only if deadlock is avoided. + +<p> +In short, although RCU read-side critical sections are highly composable, +care is required in some situations, just as is the case for any other +composable synchronization mechanism. + +<h3><a name="Corner Cases">Corner Cases</a></h3> + +<p> +A given RCU workload might have an endless and intense stream of +RCU read-side critical sections, perhaps even so intense that there +was never a point in time during which there was not at least one +RCU read-side critical section in flight. +RCU cannot allow this situation to block grace periods: As long as +all the RCU read-side critical sections are finite, grace periods +must also be finite. + +<p> +That said, preemptible RCU implementations could potentially result +in RCU read-side critical sections being preempted for long durations, +which has the effect of creating a long-duration RCU read-side +critical section. +This situation can arise only in heavily loaded systems, but systems using +real-time priorities are of course more vulnerable. +Therefore, RCU priority boosting is provided to help deal with this +case. +That said, the exact requirements on RCU priority boosting will likely +evolve as more experience accumulates. + +<p> +Other workloads might have very high update rates. +Although one can argue that such workloads should instead use +something other than RCU, the fact remains that RCU must +handle such workloads gracefully. +This requirement is another factor driving batching of grace periods, +but it is also the driving force behind the checks for large numbers +of queued RCU callbacks in the <tt>call_rcu()</tt> code path. +Finally, high update rates should not delay RCU read-side critical +sections, although some read-side delays can occur when using +<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use +of <tt>try_stop_cpus()</tt>. +(In the future, <tt>synchronize_rcu_expedited()</tt> will be +converted to use lighter-weight inter-processor interrupts (IPIs), +but this will still disturb readers, though to a much smaller degree.) + +<p> +Although all three of these corner cases were understood in the early +1990s, a simple user-level test consisting of <tt>close(open(path))</tt> +in a tight loop +in the early 2000s suddenly provided a much deeper appreciation of the +high-update-rate corner case. +This test also motivated addition of some RCU code to react to high update +rates, for example, if a given CPU finds itself with more than 10,000 +RCU callbacks queued, it will cause RCU to take evasive action by +more aggressively starting grace periods and more aggressively forcing +completion of grace-period processing. +This evasive action causes the grace period to complete more quickly, +but at the cost of restricting RCU's batching optimizations, thus +increasing the CPU overhead incurred by that grace period. + +<h2><a name="Software-Engineering Requirements"> +Software-Engineering Requirements</a></h2> + +<p> +Between Murphy's Law and “To err is human”, it is necessary to +guard against mishaps and misuse: + +<ol> +<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> + everywhere that it is needed, so kernels built with + <tt>CONFIG_PROVE_RCU=y</tt> will spat if + <tt>rcu_dereference()</tt> is used outside of an + RCU read-side critical section. + Update-side code can use <tt>rcu_dereference_protected()</tt>, + which takes a + <a href="https://lwn.net/Articles/371986/">lockdep expression</a> + to indicate what is providing the protection. + If the indicated protection is not provided, a lockdep splat + is emitted. + + <p> + Code shared between readers and updaters can use + <tt>rcu_dereference_check()</tt>, which also takes a + lockdep expression, and emits a lockdep splat if neither + <tt>rcu_read_lock()</tt> nor the indicated protection + is in place. + In addition, <tt>rcu_dereference_raw()</tt> is used in those + (hopefully rare) cases where the required protection cannot + be easily described. + Finally, <tt>rcu_read_lock_held()</tt> is provided to + allow a function to verify that it has been invoked within + an RCU read-side critical section. + I was made aware of this set of requirements shortly after Thomas + Gleixner audited a number of RCU uses. +<li> A given function might wish to check for RCU-related preconditions + upon entry, before using any other RCU API. + The <tt>rcu_lockdep_assert()</tt> does this job, + asserting the expression in kernels having lockdep enabled + and doing nothing otherwise. +<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> + and <tt>rcu_dereference()</tt>, perhaps (incorrectly) + substituting a simple assignment. + To catch this sort of error, a given RCU-protected pointer may be + tagged with <tt>__rcu</tt>, after which running sparse + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain + about simple-assignment accesses to that pointer. + Arnd Bergmann made me aware of this requirement, and also + supplied the needed + <a href="https://lwn.net/Articles/376011/">patch series</a>. +<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> + will splat if a data element is passed to <tt>call_rcu()</tt> + twice in a row, without a grace period in between. + (This error is similar to a double free.) + The corresponding <tt>rcu_head</tt> structures that are + dynamically allocated are automatically tracked, but + <tt>rcu_head</tt> structures allocated on the stack + must be initialized with <tt>init_rcu_head_on_stack()</tt> + and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. + Similarly, statically allocated non-stack <tt>rcu_head</tt> + structures must be initialized with <tt>init_rcu_head()</tt> + and cleaned up with <tt>destroy_rcu_head()</tt>. + Mathieu Desnoyers made me aware of this requirement, and also + supplied the needed + <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. +<li> An infinite loop in an RCU read-side critical section will + eventually trigger an RCU CPU stall warning splat, with + the duration of “eventually” being controlled by the + <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or, + alternatively, by the + <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs + parameter. + However, RCU is not obligated to produce this splat + unless there is a grace period waiting on that particular + RCU read-side critical section. + <p> + Some extreme workloads might intentionally delay + RCU grace periods, and systems running those workloads can + be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt> + to suppress the splats. + This kernel parameter may also be set via <tt>sysfs</tt>. + Furthermore, RCU CPU stall warnings are counter-productive + during sysrq dumps and during panics. + RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and + <tt>rcu_sysrq_end()</tt> API members to be called before + and after long sysrq dumps. + RCU also supplies the <tt>rcu_panic()</tt> notifier that is + automatically invoked at the beginning of a panic to suppress + further RCU CPU stall warnings. + + <p> + This requirement made itself known in the early 1990s, pretty + much the first time that it was necessary to debug a CPU stall. + That said, the initial implementation in DYNIX/ptx was quite + generic in comparison with that of Linux. +<li> Although it would be very good to detect pointers leaking out + of RCU read-side critical sections, there is currently no + good way of doing this. + One complication is the need to distinguish between pointers + leaking and pointers that have been handed off from RCU to + some other synchronization mechanism, for example, reference + counting. +<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related + information is provided via both debugfs and event tracing. +<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and + <tt>rcu_dereference()</tt> to create typical linked + data structures can be surprisingly error-prone. + Therefore, RCU-protected + <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> + and, more recently, RCU-protected + <a href="https://lwn.net/Articles/612100/">hash tables</a> + are available. + Many other special-purpose RCU-protected data structures are + available in the Linux kernel and the userspace RCU library. +<li> Some linked structures are created at compile time, but still + require <tt>__rcu</tt> checking. + The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this + purpose. +<li> It is not necessary to use <tt>rcu_assign_pointer()</tt> + when creating linked structures that are to be published via + a single external pointer. + The <tt>RCU_INIT_POINTER()</tt> macro is provided for + this task and also for assigning <tt>NULL</tt> pointers + at runtime. +</ol> + +<p> +This not a hard-and-fast list: RCU's diagnostic capabilities will +continue to be guided by the number and type of usage bugs found +in real-world RCU usage. + +<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> + +<p> +The Linux kernel provides an interesting environment for all kinds of +software, including RCU. +Some of the relevant points of interest are as follows: + +<ol> +<li> <a href="#Configuration">Configuration</a>. +<li> <a href="#Firmware Interface">Firmware Interface</a>. +<li> <a href="#Early Boot">Early Boot</a>. +<li> <a href="#Interrupts and NMIs"> + Interrupts and non-maskable interrupts (NMIs)</a>. +<li> <a href="#Loadable Modules">Loadable Modules</a>. +<li> <a href="#Hotplug CPU">Hotplug CPU</a>. +<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. +<li> <a href="#Tracing and RCU">Tracing and RCU</a>. +<li> <a href="#Energy Efficiency">Energy Efficiency</a>. +<li> <a href="#Memory Efficiency">Memory Efficiency</a>. +<li> <a href="#Performance, Scalability, Response Time, and Reliability"> + Performance, Scalability, Response Time, and Reliability</a>. +</ol> + +<p> +This list is probably incomplete, but it does give a feel for the +most notable Linux-kernel complications. +Each of the following sections covers one of the above topics. + +<h3><a name="Configuration">Configuration</a></h3> + +<p> +RCU's goal is automatic configuration, so that almost nobody +needs to worry about RCU's <tt>Kconfig</tt> options. +And for almost all users, RCU does in fact work well +“out of the box.” + +<p> +However, there are specialized use cases that are handled by +kernel boot parameters and <tt>Kconfig</tt> options. +Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users +about new <tt>Kconfig</tt> options, which requires almost all of them +be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. + +<p> +This all should be quite obvious, but the fact remains that +Linus Torvalds recently had to +<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> +me of this requirement. + +<h3><a name="Firmware Interface">Firmware Interface</a></h3> + +<p> +In many cases, kernel obtains information about the system from the +firmware, and sometimes things are lost in translation. +Or the translation is accurate, but the original message is bogus. + +<p> +For example, some systems' firmware overreports the number of CPUs, +sometimes by a large factor. +If RCU naively believed the firmware, as it used to do, +it would create too many per-CPU kthreads. +Although the resulting system will still run correctly, the extra +kthreads needlessly consume memory and can cause confusion +when they show up in <tt>ps</tt> listings. + +<p> +RCU must therefore wait for a given CPU to actually come online before +it can allow itself to believe that the CPU actually exists. +The resulting “ghost CPUs” (which are never going to +come online) cause a number of +<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. + +<h3><a name="Early Boot">Early Boot</a></h3> + +<p> +The Linux kernel's boot sequence is an interesting process, +and RCU is used early, even before <tt>rcu_init()</tt> +is invoked. +In fact, a number of RCU's primitives can be used as soon as the +initial task's <tt>task_struct</tt> is available and the +boot CPU's per-CPU variables are set up. +The read-side primitives (<tt>rcu_read_lock()</tt>, +<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, +and <tt>rcu_access_pointer()</tt>) will operate normally very early on, +as will <tt>rcu_assign_pointer()</tt>. + +<p> +Although <tt>call_rcu()</tt> may be invoked at any +time during boot, callbacks are not guaranteed to be invoked until after +the scheduler is fully up and running. +This delay in callback invocation is due to the fact that RCU does not +invoke callbacks until it is fully initialized, and this full initialization +cannot occur until after the scheduler has initialized itself to the +point where RCU can spawn and run its kthreads. +In theory, it would be possible to invoke callbacks earlier, +however, this is not a panacea because there would be severe restrictions +on what operations those callbacks could invoke. + +<p> +Perhaps surprisingly, <tt>synchronize_rcu()</tt>, +<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> +(<a href="#Bottom-Half Flavor">discussed below</a>), +and +<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> +will all operate normally +during very early boot, the reason being that there is only one CPU +and preemption is disabled. +This means that the call <tt>synchronize_rcu()</tt> (or friends) +itself is a quiescent +state and thus a grace period, so the early-boot implementation can +be a no-op. + +<p> +Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> +continue to operate normally through the remainder of boot, courtesy +of the fact that preemption is disabled across their RCU read-side +critical sections and also courtesy of the fact that there is still +only one CPU. +However, once the scheduler starts initializing, preemption is enabled. +There is still only a single CPU, but the fact that preemption is enabled +means that the no-op implementation of <tt>synchronize_rcu()</tt> no +longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. +Therefore, as soon as the scheduler starts initializing, the early-boot +fastpath is disabled. +This means that <tt>synchronize_rcu()</tt> switches to its runtime +mode of operation where it posts callbacks, which in turn means that +any call to <tt>synchronize_rcu()</tt> will block until the corresponding +callback is invoked. +Unfortunately, the callback cannot be invoked until RCU's runtime +grace-period machinery is up and running, which cannot happen until +the scheduler has initialized itself sufficiently to allow RCU's +kthreads to be spawned. +Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler +initialization can result in deadlock. + +<p>@@QQ@@ +So what happens with <tt>synchronize_rcu()</tt> during +scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> +kernels? +<p>@@QQA@@ +In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> +maps directly to <tt>synchronize_sched()</tt>. +Therefore, <tt>synchronize_rcu()</tt> works normally throughout +boot in <tt>CONFIG_PREEMPT=n</tt> kernels. +However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, +so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> +during scheduler initialization. +<p>@@QQE@@ + +<p> +I learned of these boot-time requirements as a result of a series of +system hangs. + +<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> + +<p> +The Linux kernel has interrupts, and RCU read-side critical sections are +legal within interrupt handlers and within interrupt-disabled regions +of code, as are invocations of <tt>call_rcu()</tt>. + +<p> +Some Linux-kernel architectures can enter an interrupt handler from +non-idle process context, and then just never leave it, instead stealthily +transitioning back to process context. +This trick is sometimes used to invoke system calls from inside the kernel. +These “half-interrupts” mean that RCU has to be very careful +about how it counts interrupt nesting levels. +I learned of this requirement the hard way during a rewrite +of RCU's dyntick-idle code. + +<p> +The Linux kernel has non-maskable interrupts (NMIs), and +RCU read-side critical sections are legal within NMI handlers. +Thankfully, RCU update-side primitives, including +<tt>call_rcu()</tt>, are prohibited within NMI handlers. + +<p> +The name notwithstanding, some Linux-kernel architectures +can have nested NMIs, which RCU must handle correctly. +Andy Lutomirski +<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> +with this requirement; +he also kindly surprised me with +<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> +that meets this requirement. + +<h3><a name="Loadable Modules">Loadable Modules</a></h3> + +<p> +The Linux kernel has loadable modules, and these modules can +also be unloaded. +After a given module has been unloaded, any attempt to call +one of its functions results in a segmentation fault. +The module-unload functions must therefore cancel any +delayed calls to loadable-module functions, for example, +any outstanding <tt>mod_timer()</tt> must be dealt with +via <tt>del_timer_sync()</tt> or similar. + +<p> +Unfortunately, there is no way to cancel an RCU callback; +once you invoke <tt>call_rcu()</tt>, the callback function is +going to eventually be invoked, unless the system goes down first. +Because it is normally considered socially irresponsible to crash the system +in response to a module unload request, we need some other way +to deal with in-flight RCU callbacks. + +<p> +RCU therefore provides +<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, +which waits until all in-flight RCU callbacks have been invoked. +If a module uses <tt>call_rcu()</tt>, its exit function should therefore +prevent any future invocation of <tt>call_rcu()</tt>, then invoke +<tt>rcu_barrier()</tt>. +In theory, the underlying module-unload code could invoke +<tt>rcu_barrier()</tt> unconditionally, but in practice this would +incur unacceptable latencies. + +<p> +Nikita Danilov noted this requirement for an analogous filesystem-unmount +situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. +The need for <tt>rcu_barrier()</tt> for module unloading became +apparent later. + +<h3><a name="Hotplug CPU">Hotplug CPU</a></h3> + +<p> +The Linux kernel supports CPU hotplug, which means that CPUs +can come and go. +It is of course illegal to use any RCU API member from an offline CPU. +This requirement was present from day one in DYNIX/ptx, but +on the other hand, the Linux kernel's CPU-hotplug implementation +is “interesting.” + +<p> +The Linux-kernel CPU-hotplug implementation has notifiers that +are used to allow the various kernel subsystems (including RCU) +to respond appropriately to a given CPU-hotplug operation. +Most RCU operations may be invoked from CPU-hotplug notifiers, +including even normal synchronous grace-period operations +such as <tt>synchronize_rcu()</tt>. +However, expedited grace-period operations such as +<tt>synchronize_rcu_expedited()</tt> are not supported, +due to the fact that current implementations block CPU-hotplug +operations, which could result in deadlock. + +<p> +In addition, all-callback-wait operations such as +<tt>rcu_barrier()</tt> are also not supported, due to the +fact that there are phases of CPU-hotplug operations where +the outgoing CPU's callbacks will not be invoked until after +the CPU-hotplug operation ends, which could also result in deadlock. + +<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> + +<p> +RCU depends on the scheduler, and the scheduler uses RCU to +protect some of its data structures. +This means the scheduler is forbidden from acquiring +the runqueue locks and the priority-inheritance locks +in the middle of an outermost RCU read-side critical section unless either +(1) it releases them before exiting that same +RCU read-side critical section, or +(2) interrupts are disabled across +that entire RCU read-side critical section. +This same prohibition also applies (recursively!) to any lock that is acquired +while holding any lock to which this prohibition applies. +Adhering to this rule prevents preemptible RCU from invoking +<tt>rcu_read_unlock_special()</tt> while either runqueue or +priority-inheritance locks are held, thus avoiding deadlock. + +<p> +Prior to v4.4, it was only necessary to disable preemption across +RCU read-side critical sections that acquired scheduler locks. +In v4.4, expedited grace periods started using IPIs, and these +IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath. +Therefore, this expedited-grace-period change required disabling of +interrupts, not just preemption. + +<p> +For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> +implementation must be written carefully to avoid similar deadlocks. +In particular, <tt>rcu_read_unlock()</tt> must tolerate an +interrupt where the interrupt handler invokes both +<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. +This possibility requires <tt>rcu_read_unlock()</tt> to use +negative nesting levels to avoid destructive recursion via +interrupt handler's use of RCU. + +<p> +This pair of mutual scheduler-RCU requirements came as a +<a href="https://lwn.net/Articles/453002/">complete surprise</a>. + +<p> +As noted above, RCU makes use of kthreads, and it is necessary to +avoid excessive CPU-time accumulation by these kthreads. +This requirement was no surprise, but RCU's violation of it +when running context-switch-heavy workloads when built with +<tt>CONFIG_NO_HZ_FULL=y</tt> +<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. +RCU has made good progress towards meeting this requirement, even +for context-switch-have <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, +but there is room for further improvement. + +<h3><a name="Tracing and RCU">Tracing and RCU</a></h3> + +<p> +It is possible to use tracing on RCU code, but tracing itself +uses RCU. +For this reason, <tt>rcu_dereference_raw_notrace()</tt> +is provided for use by tracing, which avoids the destructive +recursion that could otherwise ensue. +This API is also used by virtualization in some architectures, +where RCU readers execute in environments in which tracing +cannot be used. +The tracing folks both located the requirement and provided the +needed fix, so this surprise requirement was relatively painless. + +<h3><a name="Energy Efficiency">Energy Efficiency</a></h3> + +<p> +Interrupting idle CPUs is considered socially unacceptable, +especially by people with battery-powered embedded systems. +RCU therefore conserves energy by detecting which CPUs are +idle, including tracking CPUs that have been interrupted from idle. +This is a large part of the energy-efficiency requirement, +so I learned of this via an irate phone call. + +<p> +Because RCU avoids interrupting idle CPUs, it is illegal to +execute an RCU read-side critical section on an idle CPU. +(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat +if you try it.) +The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> +event tracing is provided to work around this restriction. +In addition, <tt>rcu_is_watching()</tt> may be used to +test whether or not it is currently legal to run RCU read-side +critical sections on this CPU. +I learned of the need for diagnostics on the one hand +and <tt>RCU_NONIDLE()</tt> on the other while inspecting +idle-loop code. +Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, +which is used quite heavily in the idle loop. + +<p> +It is similarly socially unacceptable to interrupt an +<tt>nohz_full</tt> CPU running in userspace. +RCU must therefore track <tt>nohz_full</tt> userspace +execution. +And in +<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> +kernels, RCU must separately track idle CPUs on the one hand and +CPUs that are either idle or executing in userspace on the other. +In both cases, RCU must be able to sample state at two points in +time, and be able to determine whether or not some other CPU spent +any time idle and/or executing in userspace. + +<p> +These energy-efficiency requirements have proven quite difficult to +understand and to meet, for example, there have been more than five +clean-sheet rewrites of RCU's energy-efficiency code, the last of +which was finally able to demonstrate +<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. +As noted earlier, +I learned of many of these requirements via angry phone calls: +Flaming me on the Linux-kernel mailing list was apparently not +sufficient to fully vent their ire at RCU's energy-efficiency bugs! + +<h3><a name="Memory Efficiency">Memory Efficiency</a></h3> + +<p> +Although small-memory non-realtime systems can simply use Tiny RCU, +code size is only one aspect of memory efficiency. +Another aspect is the size of the <tt>rcu_head</tt> structure +used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>. +Although this structure contains nothing more than a pair of pointers, +it does appear in many RCU-protected data structures, including +some that are size critical. +The <tt>page</tt> structure is a case in point, as evidenced by +the many occurrences of the <tt>union</tt> keyword within that structure. + +<p> +This need for memory efficiency is one reason that RCU uses hand-crafted +singly linked lists to track the <tt>rcu_head</tt> structures that +are waiting for a grace period to elapse. +It is also the reason why <tt>rcu_head</tt> structures do not contain +debug information, such as fields tracking the file and line of the +<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them. +Although this information might appear in debug-only kernel builds at some +point, in the meantime, the <tt>->func</tt> field will often provide +the needed debug information. + +<p> +However, in some cases, the need for memory efficiency leads to even +more extreme measures. +Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field +shares storage with a great many other structures that are used at +various points in the corresponding page's lifetime. +In order to correctly resolve certain +<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>, +the Linux kernel's memory-management subsystem needs a particular bit +to remain zero during all phases of grace-period processing, +and that bit happens to map to the bottom bit of the +<tt>rcu_head</tt> structure's <tt>->next</tt> field. +RCU makes this guarantee as long as <tt>call_rcu()</tt> +is used to post the callback, as opposed to <tt>kfree_rcu()</tt> +or some future “lazy” +variant of <tt>call_rcu()</tt> that might one day be created for +energy-efficiency purposes. + +<h3><a name="Performance, Scalability, Response Time, and Reliability"> +Performance, Scalability, Response Time, and Reliability</a></h3> + +<p> +Expanding on the +<a href="#Performance and Scalability">earlier discussion</a>, +RCU is used heavily by hot code paths in performance-critical +portions of the Linux kernel's networking, security, virtualization, +and scheduling code paths. +RCU must therefore use efficient implementations, especially in its +read-side primitives. +To that end, it would be good if preemptible RCU's implementation +of <tt>rcu_read_lock()</tt> could be inlined, however, doing +this requires resolving <tt>#include</tt> issues with the +<tt>task_struct</tt> structure. + +<p> +The Linux kernel supports hardware configurations with up to +4096 CPUs, which means that RCU must be extremely scalable. +Algorithms that involve frequent acquisitions of global locks or +frequent atomic operations on global variables simply cannot be +tolerated within the RCU implementation. +RCU therefore makes heavy use of a combining tree based on the +<tt>rcu_node</tt> structure. +RCU is required to tolerate all CPUs continuously invoking any +combination of RCU's runtime primitives with minimal per-operation +overhead. +In fact, in many cases, increasing load must <i>decrease</i> the +per-operation overhead, witness the batching optimizations for +<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, +<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. +As a general rule, RCU must cheerfully accept whatever the +rest of the Linux kernel decides to throw at it. + +<p> +The Linux kernel is used for real-time workloads, especially +in conjunction with the +<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. +The real-time-latency response requirements are such that the +traditional approach of disabling preemption across RCU +read-side critical sections is inappropriate. +Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore +use an RCU implementation that allows RCU read-side critical +sections to be preempted. +This requirement made its presence known after users made it +clear that an earlier +<a href="https://lwn.net/Articles/107930/">real-time patch</a> +did not meet their needs, in conjunction with some +<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> +encountered by a very early version of the -rt patchset. + +<p> +In addition, RCU must make do with a sub-100-microsecond real-time latency +budget. +In fact, on smaller systems with the -rt patchset, the Linux kernel +provides sub-20-microsecond real-time latencies for the whole kernel, +including RCU. +RCU's scalability and latency must therefore be sufficient for +these sorts of configurations. +To my surprise, the sub-100-microsecond real-time latency budget +<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> +applies to even the largest systems [PDF]</a>, +up to and including systems with 4096 CPUs. +This real-time requirement motivated the grace-period kthread, which +also simplified handling of a number of race conditions. + +<p> +Finally, RCU's status as a synchronization primitive means that +any RCU failure can result in arbitrary memory corruption that can be +extremely difficult to debug. +This means that RCU must be extremely reliable, which in +practice also means that RCU must have an aggressive stress-test +suite. +This stress-test suite is called <tt>rcutorture</tt>. + +<p> +Although the need for <tt>rcutorture</tt> was no surprise, +the current immense popularity of the Linux kernel is posing +interesting—and perhaps unprecedented—validation +challenges. +To see this, keep in mind that there are well over one billion +instances of the Linux kernel running today, given Android +smartphones, Linux-powered televisions, and servers. +This number can be expected to increase sharply with the advent of +the celebrated Internet of Things. + +<p> +Suppose that RCU contains a race condition that manifests on average +once per million years of runtime. +This bug will be occurring about three times per <i>day</i> across +the installed base. +RCU could simply hide behind hardware error rates, given that no one +should really expect their smartphone to last for a million years. +However, anyone taking too much comfort from this thought should +consider the fact that in most jurisdictions, a successful multi-year +test of a given mechanism, which might include a Linux kernel, +suffices for a number of types of safety-critical certifications. +In fact, rumor has it that the Linux kernel is already being used +in production for safety-critical applications. +I don't know about you, but I would feel quite bad if a bug in RCU +killed someone. +Which might explain my recent focus on validation and verification. + +<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> + +<p> +One of the more surprising things about RCU is that there are now +no fewer than five <i>flavors</i>, or API families. +In addition, the primary flavor that has been the sole focus up to +this point has two different implementations, non-preemptible and +preemptible. +The other four flavors are listed below, with requirements for each +described in a separate section. + +<ol> +<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> +<li> <a href="#Sched Flavor">Sched Flavor</a> +<li> <a href="#Sleepable RCU">Sleepable RCU</a> +<li> <a href="#Tasks RCU">Tasks RCU</a> +</ol> + +<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> + +<p> +The softirq-disable (AKA “bottom-half”, +hence the “_bh” abbreviations) +flavor of RCU, or <i>RCU-bh</i>, was developed by +Dipankar Sarma to provide a flavor of RCU that could withstand the +network-based denial-of-service attacks researched by Robert +Olsson. +These attacks placed so much networking load on the system +that some of the CPUs never exited softirq execution, +which in turn prevented those CPUs from ever executing a context switch, +which, in the RCU implementation of that time, prevented grace periods +from ever ending. +The result was an out-of-memory condition and a system hang. + +<p> +The solution was the creation of RCU-bh, which does +<tt>local_bh_disable()</tt> +across its read-side critical sections, and which uses the transition +from one type of softirq processing to another as a quiescent state +in addition to context switch, idle, user mode, and offline. +This means that RCU-bh grace periods can complete even when some of +the CPUs execute in softirq indefinitely, thus allowing algorithms +based on RCU-bh to withstand network-based denial-of-service attacks. + +<p> +Because +<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> +disable and re-enable softirq handlers, any attempt to start a softirq +handlers during the +RCU-bh read-side critical section will be deferred. +In this case, <tt>rcu_read_unlock_bh()</tt> +will invoke softirq processing, which can take considerable time. +One can of course argue that this softirq overhead should be associated +with the code following the RCU-bh read-side critical section rather +than <tt>rcu_read_unlock_bh()</tt>, but the fact +is that most profiling tools cannot be expected to make this sort +of fine distinction. +For example, suppose that a three-millisecond-long RCU-bh read-side +critical section executes during a time of heavy networking load. +There will very likely be an attempt to invoke at least one softirq +handler during that three milliseconds, but any such invocation will +be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. +This can of course make it appear at first glance as if +<tt>rcu_read_unlock_bh()</tt> was executing very slowly. + +<p> +The +<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> +includes +<tt>rcu_read_lock_bh()</tt>, +<tt>rcu_read_unlock_bh()</tt>, +<tt>rcu_dereference_bh()</tt>, +<tt>rcu_dereference_bh_check()</tt>, +<tt>synchronize_rcu_bh()</tt>, +<tt>synchronize_rcu_bh_expedited()</tt>, +<tt>call_rcu_bh()</tt>, +<tt>rcu_barrier_bh()</tt>, and +<tt>rcu_read_lock_bh_held()</tt>. + +<h3><a name="Sched Flavor">Sched Flavor</a></h3> + +<p> +Before preemptible RCU, waiting for an RCU grace period had the +side effect of also waiting for all pre-existing interrupt +and NMI handlers. +However, there are legitimate preemptible-RCU implementations that +do not have this property, given that any point in the code outside +of an RCU read-side critical section can be a quiescent state. +Therefore, <i>RCU-sched</i> was created, which follows “classic” +RCU in that an RCU-sched grace period waits for for pre-existing +interrupt and NMI handlers. +In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched +APIs have identical implementations, while kernels built with +<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. + +<p> +Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, +<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> +disable and re-enable preemption, respectively. +This means that if there was a preemption attempt during the +RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> +will enter the scheduler, with all the latency and overhead entailed. +Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look +as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly. +However, the highest-priority task won't be preempted, so that task +will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. + +<p> +The +<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> +includes +<tt>rcu_read_lock_sched()</tt>, +<tt>rcu_read_unlock_sched()</tt>, +<tt>rcu_read_lock_sched_notrace()</tt>, +<tt>rcu_read_unlock_sched_notrace()</tt>, +<tt>rcu_dereference_sched()</tt>, +<tt>rcu_dereference_sched_check()</tt>, +<tt>synchronize_sched()</tt>, +<tt>synchronize_rcu_sched_expedited()</tt>, +<tt>call_rcu_sched()</tt>, +<tt>rcu_barrier_sched()</tt>, and +<tt>rcu_read_lock_sched_held()</tt>. +However, anything that disables preemption also marks an RCU-sched +read-side critical section, including +<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, +<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, +and so on. + +<h3><a name="Sleepable RCU">Sleepable RCU</a></h3> + +<p> +For well over a decade, someone saying “I need to block within +an RCU read-side critical section” was a reliable indication +that this someone did not understand RCU. +After all, if you are always blocking in an RCU read-side critical +section, you can probably afford to use a higher-overhead synchronization +mechanism. +However, that changed with the advent of the Linux kernel's notifiers, +whose RCU read-side critical +sections almost never sleep, but sometimes need to. +This resulted in the introduction of +<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, +or <i>SRCU</i>. + +<p> +SRCU allows different domains to be defined, with each such domain +defined by an instance of an <tt>srcu_struct</tt> structure. +A pointer to this structure must be passed in to each SRCU function, +for example, <tt>synchronize_srcu(&ss)</tt>, where +<tt>ss</tt> is the <tt>srcu_struct</tt> structure. +The key benefit of these domains is that a slow SRCU reader in one +domain does not delay an SRCU grace period in some other domain. +That said, one consequence of these domains is that read-side code +must pass a “cookie” from <tt>srcu_read_lock()</tt> +to <tt>srcu_read_unlock()</tt>, for example, as follows: + +<blockquote> +<pre> + 1 int idx; + 2 + 3 idx = srcu_read_lock(&ss); + 4 do_something(); + 5 srcu_read_unlock(&ss, idx); +</pre> +</blockquote> + +<p> +As noted above, it is legal to block within SRCU read-side critical sections, +however, with great power comes great responsibility. +If you block forever in one of a given domain's SRCU read-side critical +sections, then that domain's grace periods will also be blocked forever. +Of course, one good way to block forever is to deadlock, which can +happen if any operation in a given domain's SRCU read-side critical +section can block waiting, either directly or indirectly, for that domain's +grace period to elapse. +For example, this results in a self-deadlock: + +<blockquote> +<pre> + 1 int idx; + 2 + 3 idx = srcu_read_lock(&ss); + 4 do_something(); + 5 synchronize_srcu(&ss); + 6 srcu_read_unlock(&ss, idx); +</pre> +</blockquote> + +<p> +However, if line 5 acquired a mutex that was held across +a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, +deadlock would still be possible. +Furthermore, if line 5 acquired a mutex that was held across +a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, +and if an <tt>ss1</tt>-domain SRCU read-side critical section +acquired another mutex that was held across as <tt>ss</tt>-domain +<tt>synchronize_srcu()</tt>, +deadlock would again be possible. +Such a deadlock cycle could extend across an arbitrarily large number +of different SRCU domains. +Again, with great power comes great responsibility. + +<p> +Unlike the other RCU flavors, SRCU read-side critical sections can +run on idle and even offline CPUs. +This ability requires that <tt>srcu_read_lock()</tt> and +<tt>srcu_read_unlock()</tt> contain memory barriers, which means +that SRCU readers will run a bit slower than would RCU readers. +It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> +API, which, in combination with <tt>srcu_read_unlock()</tt>, +guarantees a full memory barrier. + +<p> +The +<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> +includes +<tt>srcu_read_lock()</tt>, +<tt>srcu_read_unlock()</tt>, +<tt>srcu_dereference()</tt>, +<tt>srcu_dereference_check()</tt>, +<tt>synchronize_srcu()</tt>, +<tt>synchronize_srcu_expedited()</tt>, +<tt>call_srcu()</tt>, +<tt>srcu_barrier()</tt>, and +<tt>srcu_read_lock_held()</tt>. +It also includes +<tt>DEFINE_SRCU()</tt>, +<tt>DEFINE_STATIC_SRCU()</tt>, and +<tt>init_srcu_struct()</tt> +APIs for defining and initializing <tt>srcu_struct</tt> structures. + +<h3><a name="Tasks RCU">Tasks RCU</a></h3> + +<p> +Some forms of tracing use “tramopolines” to handle the +binary rewriting required to install different types of probes. +It would be good to be able to free old trampolines, which sounds +like a job for some form of RCU. +However, because it is necessary to be able to install a trace +anywhere in the code, it is not possible to use read-side markers +such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. +In addition, it does not work to have these markers in the trampoline +itself, because there would need to be instructions following +<tt>rcu_read_unlock()</tt>. +Although <tt>synchronize_rcu()</tt> would guarantee that execution +reached the <tt>rcu_read_unlock()</tt>, it would not be able to +guarantee that execution had completely left the trampoline. + +<p> +The solution, in the form of +<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, +is to have implicit +read-side critical sections that are delimited by voluntary context +switches, that is, calls to <tt>schedule()</tt>, +<tt>cond_resched_rcu_qs()</tt>, and +<tt>synchronize_rcu_tasks()</tt>. +In addition, transitions to and from userspace execution also delimit +tasks-RCU read-side critical sections. + +<p> +The tasks-RCU API is quite compact, consisting only of +<tt>call_rcu_tasks()</tt>, +<tt>synchronize_rcu_tasks()</tt>, and +<tt>rcu_barrier_tasks()</tt>. + +<h2><a name="Possible Future Changes">Possible Future Changes</a></h2> + +<p> +One of the tricks that RCU uses to attain update-side scalability is +to increase grace-period latency with increasing numbers of CPUs. +If this becomes a serious problem, it will be necessary to rework the +grace-period state machine so as to avoid the need for the additional +latency. + +<p> +Expedited grace periods scan the CPUs, so their latency and overhead +increases with increasing numbers of CPUs. +If this becomes a serious problem on large systems, it will be necessary +to do some redesign to avoid this scalability problem. + +<p> +RCU disables CPU hotplug in a few places, perhaps most notably in the +expedited grace-period and <tt>rcu_barrier()</tt> operations. +If there is a strong reason to use expedited grace periods in CPU-hotplug +notifiers, it will be necessary to avoid disabling CPU hotplug. +This would introduce some complexity, so there had better be a <i>very</i> +good reason. + +<p> +The tradeoff between grace-period latency on the one hand and interruptions +of other CPUs on the other hand may need to be re-examined. +The desire is of course for zero grace-period latency as well as zero +interprocessor interrupts undertaken during an expedited grace period +operation. +While this ideal is unlikely to be achievable, it is quite possible that +further improvements can be made. + +<p> +The multiprocessor implementations of RCU use a combining tree that +groups CPUs so as to reduce lock contention and increase cache locality. +However, this combining tree does not spread its memory across NUMA +nodes nor does it align the CPU groups with hardware features such +as sockets or cores. +Such spreading and alignment is currently believed to be unnecessary +because the hotpath read-side primitives do not access the combining +tree, nor does <tt>call_rcu()</tt> in the common case. +If you believe that your architecture needs such spreading and alignment, +then your architecture should also benefit from the +<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set +to the number of CPUs in a socket, NUMA node, or whatever. +If the number of CPUs is too large, use a fraction of the number of +CPUs. +If the number of CPUs is a large prime number, well, that certainly +is an “interesting” architectural choice! +More flexible arrangements might be considered, but only if +<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only +if the inadequacy has been demonstrated by a carefully run and +realistic system-level workload. + +<p> +Please note that arrangements that require RCU to remap CPU numbers will +require extremely good demonstration of need and full exploration of +alternatives. + +<p> +There is an embarrassingly large number of flavors of RCU, and this +number has been increasing over time. +Perhaps it will be possible to combine some at some future date. + +<p> +RCU's various kthreads are reasonably recent additions. +It is quite likely that adjustments will be required to more gracefully +handle extreme loads. +It might also be necessary to be able to relate CPU utilization by +RCU's kthreads and softirq handlers to the code that instigated this +CPU utilization. +For example, RCU callback overhead might be charged back to the +originating <tt>call_rcu()</tt> instance, though probably not +in production kernels. + +<h2><a name="Summary">Summary</a></h2> + +<p> +This document has presented more than two decade's worth of RCU +requirements. +Given that the requirements keep changing, this will not be the last +word on this subject, but at least it serves to get an important +subset of the requirements set forth. + +<h2><a name="Acknowledgments">Acknowledgments</a></h2> + +I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, +Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and +Andy Lutomirski for their help in rendering +this article human readable, and to Michelle Rankin for her support +of this effort. +Other contributions are acknowledged in the Linux kernel's git archive. +The cartoon is copyright (c) 2013 by Melissa Broussard, +and is provided +under the terms of the Creative Commons Attribution-Share Alike 3.0 +United States license. + +<p>@@QQAL@@ + +</body></html> diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh new file mode 100755 index 000000000000..d354f069559b --- /dev/null +++ b/Documentation/RCU/Design/htmlqqz.sh @@ -0,0 +1,108 @@ +#!/bin/sh +# +# Usage: sh htmlqqz.sh file +# +# Extracts and converts quick quizzes in a proto-HTML document file.htmlx. +# Commands, all of which must be on a line by themselves: +# +# "<p>@@QQ@@": Start of a quick quiz. +# "<p>@@QQA@@": Start of a quick-quiz answer. +# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. +# "<p>@@QQAL@@": Place to put quick-quiz answer list. +# +# Places the result in file.html. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (c) 2013 Paul E. McKenney, IBM Corporation. + +fn=$1 +if test ! -r $fn.htmlx +then + echo "Error: $fn.htmlx unreadable." + exit 1 +fi + +echo "<!-- DO NOT HAND EDIT. -->" > $fn.html +echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html +awk < $fn.htmlx >> $fn.html ' + +state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" { + print $0; + if ($0 ~ /^<p>@@QQ/) + print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr" + next; +} + +state == "" && $1 == "<p>@@QQ@@" { + qqn++; + qqlineno = NR; + haveqq = 1; + state = "qq"; + print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>" + next; +} + +state == "qq" && $1 != "<p>@@QQA@@" { + qq[qqn] = qq[qqn] $0 "\n"; + print $0 + if ($0 ~ /^<p>@@QQ/) + print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr" + next; +} + +state == "qq" && $1 == "<p>@@QQA@@" { + state = "qqa"; + print "<br><a href=\"#qq" qqn "answer\">Answer</a>" + next; +} + +state == "qqa" && $1 != "<p>@@QQE@@" { + qqa[qqn] = qqa[qqn] $0 "\n"; + if ($0 ~ /^<p>@@QQ/) + print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr" + next; +} + +state == "qqa" && $1 == "<p>@@QQE@@" { + state = ""; + next; +} + +state == "" && $1 == "<p>@@QQAL@@" { + haveqq = ""; + print "<h3><a name=\"Answers to Quick Quizzes\">" + print "Answers to Quick Quizzes</a></h3>" + print ""; + for (i = 1; i <= qqn; i++) { + print "<a name=\"qq" i "answer\"></a>" + print "<p><b>Quick Quiz " i "</b>:" + print qq[i]; + print ""; + print "</p><p><b>Answer</b>:" + print qqa[i]; + print ""; + print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>" + print ""; + } + next; +} + +END { + if (state != "") + print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" + else if (haveqq) + print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr" +}' diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d18fc8..d8186da15ca1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3296,18 +3296,35 @@ bytes respectively. Such letter suffixes can also be entirely omitted. rcutorture.verbose= [KNL] Enable additional printk() statements. + rcupdate.rcu_cpu_stall_suppress= [KNL] + Suppress RCU CPU stall warning messages. + + rcupdate.rcu_cpu_stall_timeout= [KNL] + Set timeout for RCU CPU stall warning messages. + rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for example, synchronize_rcu_expedited() instead of synchronize_rcu(). This reduces latency, but can increase CPU utilization, degrade real-time latency, and degrade energy efficiency. - - rcupdate.rcu_cpu_stall_suppress= [KNL] - Suppress RCU CPU stall warning messages. - - rcupdate.rcu_cpu_stall_timeout= [KNL] - Set timeout for RCU CPU stall warning messages. + No effect on CONFIG_TINY_RCU kernels. + + rcupdate.rcu_normal= [KNL] + Use only normal grace-period primitives, + for example, synchronize_rcu() instead of + synchronize_rcu_expedited(). This improves + real-time latency, CPU utilization, and + energy efficiency, but can expose users to + increased grace-period latency. This parameter + overrides rcupdate.rcu_expedited. No effect on + CONFIG_TINY_RCU kernels. + + rcupdate.rcu_normal_after_boot= [KNL] + Once boot has completed (that is, after + rcu_end_inkernel_boot() has been invoked), use + only normal grace-period primitives. No effect + on CONFIG_TINY_RCU kernels. rcupdate.rcu_task_stall_timeout= [KNL] Set timeout in jiffies for RCU task stall warning diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index aef9487303d0..85304ebd187c 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -194,7 +194,7 @@ There are some minimal guarantees that may be expected of a CPU: (*) On any given CPU, dependent memory accesses will be issued in order, with respect to itself. This means that for: - WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q); + Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q); the CPU will issue the following memory operations: @@ -202,9 +202,9 @@ There are some minimal guarantees that may be expected of a CPU: and always in that order. On most systems, smp_read_barrier_depends() does nothing, but it is required for DEC Alpha. The READ_ONCE() - and WRITE_ONCE() are required to prevent compiler mischief. Please - note that you should normally use something like rcu_dereference() - instead of open-coding smp_read_barrier_depends(). + is required to prevent compiler mischief. Please note that you + should normally use something like rcu_dereference() instead of + open-coding smp_read_barrier_depends(). (*) Overlapping loads and stores within a particular CPU will appear to be ordered within that CPU. This means that for: diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 5381a728d23e..e5139402e7f8 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -133,6 +133,12 @@ static void sysrq_handle_crash(int key) { char *killer = NULL; + /* we need to release the RCU read lock here, + * otherwise we get an annoying + * 'BUG: sleeping function called from invalid context' + * complaint from the kernel before the panic. + */ + rcu_read_unlock(); panic_on_oops = 1; /* force panic */ wmb(); *killer = 1; diff --git a/include/linux/list.h b/include/linux/list.h index 993395a2e55c..5356f4d661a7 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -24,7 +24,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) { - list->next = list; + WRITE_ONCE(list->next, list); list->prev = list; } @@ -42,7 +42,7 @@ static inline void __list_add(struct list_head *new, next->prev = new; new->next = next; new->prev = prev; - prev->next = new; + WRITE_ONCE(prev->next, new); } #else extern void __list_add(struct list_head *new, @@ -186,7 +186,7 @@ static inline int list_is_last(const struct list_head *list, */ static inline int list_empty(const struct list_head *head) { - return head->next == head; + return READ_ONCE(head->next) == head; } /** @@ -608,7 +608,7 @@ static inline int hlist_unhashed(const struct hlist_node *h) static inline int hlist_empty(const struct hlist_head *h) { - return !h->first; + return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) @@ -642,7 +642,7 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) n->next = first; if (first) first->pprev = &n->next; - h->first = n; + WRITE_ONCE(h->first, n); n->pprev = &h->first; } @@ -653,14 +653,14 @@ static inline void hlist_add_before(struct hlist_node *n, n->pprev = next->pprev; n->next = next; next->pprev = &n->next; - *(n->pprev) = n; + WRITE_ONCE(*(n->pprev), n); } static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; - prev->next = n; + WRITE_ONCE(prev->next, n); n->pprev = &prev->next; if (n->next) diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 8132214e8efd..ee7229a6c06a 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -70,7 +70,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, static inline int hlist_bl_empty(const struct hlist_bl_head *h) { - return !((unsigned long)h->first & ~LIST_BL_LOCKMASK); + return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 444d2b1313bd..b01fe1009084 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -57,7 +57,7 @@ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { - return is_a_nulls(h->first); + return is_a_nulls(READ_ONCE(h->first)); } static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 5ed540986019..14ec1652daf4 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -179,32 +179,31 @@ static inline void list_replace_rcu(struct list_head *old, } /** - * list_splice_init_rcu - splice an RCU-protected list into an existing list. + * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice - * @head: the place in the list to splice the first list into + * @prev: points to the last element of the existing list + * @next: points to the first element of the existing list * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... * - * @head can be RCU-read traversed concurrently with this function. + * The list pointed to by @prev and @next can be RCU-read traversed + * concurrently with this function. * * Note that this function blocks. * - * Important note: the caller must take whatever action is necessary to - * prevent any other updates to @head. In principle, it is possible - * to modify the list as soon as sync() begins execution. - * If this sort of thing becomes necessary, an alternative version - * based on call_rcu() could be created. But only if -really- - * needed -- there is no shortage of RCU API members. + * Important note: the caller must take whatever action is necessary to prevent + * any other updates to the existing list. In principle, it is possible to + * modify the list as soon as sync() begins execution. If this sort of thing + * becomes necessary, an alternative version based on call_rcu() could be + * created. But only if -really- needed -- there is no shortage of RCU API + * members. */ -static inline void list_splice_init_rcu(struct list_head *list, - struct list_head *head, - void (*sync)(void)) +static inline void __list_splice_init_rcu(struct list_head *list, + struct list_head *prev, + struct list_head *next, + void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; - struct list_head *at = head->next; - - if (list_empty(list)) - return; /* * "first" and "last" tracking list, so initialize it. RCU readers @@ -231,10 +230,40 @@ static inline void list_splice_init_rcu(struct list_head *list, * this function. */ - last->next = at; - rcu_assign_pointer(list_next_rcu(head), first); - first->prev = head; - at->prev = last; + last->next = next; + rcu_assign_pointer(list_next_rcu(prev), first); + first->prev = prev; + next->prev = last; +} + +/** + * list_splice_init_rcu - splice an RCU-protected list into an existing list, + * designed for stacks. + * @list: the RCU-protected list to splice + * @head: the place in the existing list to splice the first list into + * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... + */ +static inline void list_splice_init_rcu(struct list_head *list, + struct list_head *head, + void (*sync)(void)) +{ + if (!list_empty(list)) + __list_splice_init_rcu(list, head, head->next, sync); +} + +/** + * list_splice_tail_init_rcu - splice an RCU-protected list into an existing + * list, designed for queues. + * @list: the RCU-protected list to splice + * @head: the place in the existing list to splice the first list into + * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... + */ +static inline void list_splice_tail_init_rcu(struct list_head *list, + struct list_head *head, + void (*sync)(void)) +{ + if (!list_empty(list)) + __list_splice_init_rcu(list, head->prev, head, sync); } /** @@ -305,6 +334,42 @@ static inline void list_splice_init_rcu(struct list_head *list, pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** + * list_entry_lockless - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu(), but requires some implicit RCU + * read-side guarding. One example is running within a special + * exception-time environment where preemption is disabled and where + * lockdep cannot be invoked (in which case updaters must use RCU-sched, + * as in synchronize_sched(), call_rcu_sched(), and friends). Another + * example is when items are added to the list, but never deleted. + */ +#define list_entry_lockless(ptr, type, member) \ + container_of((typeof(ptr))lockless_dereference(ptr), type, member) + +/** + * list_for_each_entry_lockless - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu(), but requires some implicit RCU + * read-side guarding. One example is running within a special + * exception-time environment where preemption is disabled and where + * lockdep cannot be invoked (in which case updaters must use RCU-sched, + * as in synchronize_sched(), call_rcu_sched(), and friends). Another + * example is when items are added to the list, but never deleted. + */ +#define list_for_each_entry_lockless(pos, head, member) \ + for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) + +/** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a0189ba67fde..14e6f47ee16f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -48,10 +48,17 @@ #include <asm/barrier.h> +#ifndef CONFIG_TINY_RCU extern int rcu_expedited; /* for sysctl */ +extern int rcu_normal; /* also for sysctl */ +#endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ +{ + return true; +} static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ { return false; @@ -65,6 +72,7 @@ static inline void rcu_unexpedite_gp(void) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); @@ -321,7 +329,6 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); -void rcu_end_inkernel_boot(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); @@ -329,6 +336,12 @@ struct notifier_block; int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); +#ifndef CONFIG_TINY_RCU +void rcu_end_inkernel_boot(void); +#else /* #ifndef CONFIG_TINY_RCU */ +static inline void rcu_end_inkernel_boot(void) { } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); @@ -379,9 +392,9 @@ static inline void rcu_init_nohz(void) */ #define RCU_NONIDLE(a) \ do { \ - rcu_irq_enter(); \ + rcu_irq_enter_irqson(); \ do { a; } while (0); \ - rcu_irq_exit(); \ + rcu_irq_exit_irqson(); \ } while (0) /* @@ -741,7 +754,7 @@ static inline void rcu_preempt_sleep_check(void) * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * - * The tracing version of rcu_dereference_raw() must not call + * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4c1aaf9cce7b..64809aea661c 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -181,6 +181,14 @@ static inline void rcu_irq_enter(void) { } +static inline void rcu_irq_exit_irqson(void) +{ +} + +static inline void rcu_irq_enter_irqson(void) +{ +} + static inline void rcu_irq_exit(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 60d15a080d7c..ad1eda9fa4da 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -37,7 +37,7 @@ void rcu_cpu_stall_reset(void); /* * Note a virtualization-based context switch. This is simply a * wrapper around rcu_note_context_switch(), which allows TINY_RCU - * to save a few bytes. + * to save a few bytes. The caller must have disabled interrupts. */ static inline void rcu_virt_note_context_switch(int cpu) { @@ -97,6 +97,8 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +void rcu_irq_enter_irqson(void); +void rcu_irq_exit_irqson(void); void exit_rcu(void); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 696a339c592c..7834a8a8bf1e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -171,8 +171,8 @@ extern void syscall_unregfunc(void); TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond), \ - rcu_irq_enter(), \ - rcu_irq_exit()); \ + rcu_irq_enter_irqson(), \ + rcu_irq_exit_irqson()); \ } #else #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) diff --git a/init/main.c b/init/main.c index 9e64d7097f1a..c6ebefafa496 100644 --- a/init/main.c +++ b/init/main.c @@ -943,6 +943,8 @@ static int __ref kernel_init(void *unused) flush_delayed_fput(); + rcu_end_inkernel_boot(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b26464061..152da4a48867 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -20,7 +20,7 @@ #include <linux/capability.h> #include <linux/compiler.h> -#include <linux/rcupdate.h> /* rcu_expedited */ +#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +#ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", rcu_expedited); + return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, } KERNEL_ATTR_RW(rcu_expedited); +int rcu_normal; +static ssize_t rcu_normal_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); +} +static ssize_t rcu_normal_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_normal)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_normal); +#endif /* #ifndef CONFIG_TINY_RCU */ + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif +#ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, +#endif NULL }; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e260df..d2988d047d66 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -162,6 +162,27 @@ static int rcu_torture_writer_state; #define RTWS_SYNC 7 #define RTWS_STUTTER 8 #define RTWS_STOPPING 9 +static const char * const rcu_torture_writer_state_names[] = { + "RTWS_FIXED_DELAY", + "RTWS_DELAY", + "RTWS_REPLACE", + "RTWS_DEF_FREE", + "RTWS_EXP_SYNC", + "RTWS_COND_GET", + "RTWS_COND_SYNC", + "RTWS_SYNC", + "RTWS_STUTTER", + "RTWS_STOPPING", +}; + +static const char *rcu_torture_writer_state_getname(void) +{ + unsigned int i = READ_ONCE(rcu_torture_writer_state); + + if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) + return "???"; + return rcu_torture_writer_state_names[i]; +} #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea5a41b..9b9cdd549caa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_gp_is_expedited() + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..e41dd4131f7a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); /* Data structures. */ -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -246,24 +242,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); - } + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -300,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -340,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -376,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -605,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) * The caller must have disabled interrupts to prevent races with * normal callback registry. */ -static int +static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { int i; if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ + return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ + return true; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ + return false; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ + return true; /* Yes, CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ + return true; /* Yes, CBs for future grace period. */ + return false; /* No grace period needed. */ } /* @@ -740,7 +732,7 @@ void rcu_user_enter(void) * * Exit from an interrupt handler, which might possibly result in entering * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your @@ -753,11 +745,10 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; @@ -768,6 +759,17 @@ void rcu_irq_exit(void) else rcu_eqs_enter_common(oldval, true); rcu_sysidle_enter(1); +} + +/* + * Wrapper for rcu_irq_exit() where interrupts are enabled. + */ +void rcu_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_exit(); local_irq_restore(flags); } @@ -865,7 +867,7 @@ void rcu_user_exit(void) * * Enter an interrupt handler, which might possibly result in exiting * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * Note that the Linux kernel is fully capable of entering an interrupt * handler that it never exits, for example when doing upcalls to @@ -881,11 +883,10 @@ void rcu_user_exit(void) */ void rcu_irq_enter(void) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; @@ -896,6 +897,17 @@ void rcu_irq_enter(void) else rcu_eqs_exit_common(oldval, true); rcu_sysidle_exit(1); +} + +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. + */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_enter(); local_irq_restore(flags); } @@ -1187,6 +1199,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + +/* * Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) @@ -1196,12 +1218,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + if (j - gpa > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, - rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_flags, + gp_state_getname(rsp->gp_state), rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + if (rsp->gp_kthread) + sched_show_task(rsp->gp_kthread); + } } /* @@ -1214,7 +1240,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) @@ -1237,7 +1263,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1256,7 +1282,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -1327,7 +1353,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -1534,10 +1560,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) { - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); - } + if (rnp != rnp_root) + raw_spin_lock_rcu_node(rnp_root); /* * Get a new grace-period number. If there really is no grace @@ -1786,11 +1810,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && rdp->completed == READ_ONCE(rnp->completed) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - smp_mb__after_unlock_lock(); needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -1805,21 +1828,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) } /* - * Initialize a new grace period. Return 0 if no grace period required. + * Initialize a new grace period. Return false if no grace period required. */ -static int rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(struct rcu_state *rsp) { unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1829,7 +1851,7 @@ static int rcu_gp_init(struct rcu_state *rsp) * Not supposed to be able to happen. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } /* Advance to a new grace period and initialize state. */ @@ -1847,8 +1869,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ @@ -1904,8 +1925,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1923,7 +1943,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rsp->gp_activity, jiffies); } - return 1; + return true; } /* @@ -1973,8 +1993,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); @@ -1993,8 +2012,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -2019,8 +2037,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * grace period is recorded in any of the rcu_node structures. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); @@ -2035,8 +2052,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ @@ -2284,8 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); oldmask = rnp_c->qsmask; } @@ -2332,8 +2347,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2355,8 +2369,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || @@ -2582,8 +2595,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (!rnp) break; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); /* GP memory ordering. */ + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { @@ -2611,8 +2623,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2809,8 +2820,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_for_each_leaf_node(rsp, rnp) { cond_resched_rcu_qs(); mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2881,8 +2891,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -2914,7 +2923,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* Does this CPU require a not-yet-started grace period? */ local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); if (needwake) @@ -3005,8 +3014,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (!rcu_gp_in_progress(rsp)) { struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); if (needwake) @@ -3365,7 +3373,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ s = (READ_ONCE(*sp) + 3) & ~0x1; smp_mb(); /* Above access must not bleed into critical section. */ return s; @@ -3392,6 +3399,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + smp_mb(); /* Caller's modifications seen first by other CPUs. */ return rcu_seq_snap(&rsp->expedited_sequence); } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) @@ -3426,8 +3434,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * CPUs for the current rcu_node structure up the rcu_node tree. */ rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; /* No new CPUs, nothing to do. */ @@ -3447,8 +3454,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rnp_up = rnp->parent; done = false; while (rnp_up) { - raw_spin_lock_irqsave(&rnp_up->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; @@ -3472,8 +3478,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) sync_exp_reset_tree_hotplug(rsp); rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -3531,8 +3536,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } @@ -3549,8 +3553,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); __rcu_report_exp_rnp(rsp, rnp, wake, flags); } @@ -3564,8 +3567,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -3609,7 +3611,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { - struct rcu_data *rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3623,7 +3625,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { if (sync_exp_work_done(rsp, rnp0, NULL, - &rsp->expedited_workdone0, s)) + &rdp->expedited_workdone0, s)) return NULL; return rnp0; } @@ -3637,14 +3639,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone2, s)) + &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) @@ -3654,7 +3655,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone3, s)) + &rdp->expedited_workdone3, s)) return NULL; return rnp1; } @@ -3708,8 +3709,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; @@ -3741,24 +3741,22 @@ retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; - } else { - /* Failed, raced with offline. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, - flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave(&rnp->lock, - flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3773,6 +3771,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; + int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; @@ -3785,7 +3784,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); - if (ret > 0) + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ @@ -3795,14 +3794,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); + ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - (void)rcu_print_task_exp_stall(rnp); + ndetected = rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; if (!(rnp->expmask & mask)) continue; + ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[cpu_online(cpu)], @@ -3811,8 +3812,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } mask <<= 1; } - pr_cont(" } %lu jiffies s: %lu\n", - jiffies - jiffies_start, rsp->expedited_sequence); + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (!ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } rcu_for_each_leaf_node(rsp, rnp) { mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { @@ -3847,6 +3863,16 @@ void synchronize_sched_expedited(void) struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); @@ -4135,7 +4161,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (rnp == NULL) return; - raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ } @@ -4152,7 +4178,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); @@ -4179,7 +4205,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -4198,8 +4224,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinitnext |= mask; rnp->expmaskinitnext |= mask; if (!rdp->beenonline) @@ -4327,14 +4352,14 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rsp->gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); } rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); @@ -4385,12 +4410,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) +static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4576,8 +4603,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state); + rcu_init_one(&rcu_sched_state); if (dump_tree) rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e238d4dc..83360b4f4352 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -178,6 +178,8 @@ struct rcu_node { /* beginning of each expedited GP. */ unsigned long expmaskinitnext; /* Online CPUs for next expedited GP. */ + /* Any CPU that has ever been online will */ + /* have its bit set. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -384,6 +386,10 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + atomic_long_t expedited_workdone0; /* # done by others #0. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -498,10 +504,6 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ @@ -545,6 +547,18 @@ struct rcu_state { #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#ifndef RCU_TREE_NONCORE +static const char * const gp_state_names[] = { + "RCU_GP_IDLE", + "RCU_GP_WAIT_GPS", + "RCU_GP_DONE_GPS", + "RCU_GP_WAIT_FQS", + "RCU_GP_DOING_FQS", + "RCU_GP_CLEANUP", + "RCU_GP_CLEANED", +}; +#endif /* #ifndef RCU_TREE_NONCORE */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ @@ -664,3 +678,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #else /* #ifdef CONFIG_PPC */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_PPC */ + +/* + * Wrappers for the rcu_node::lock acquire. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + */ +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&(rnp)->lock, flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) +{ + bool locked = raw_spin_trylock(&rnp->lock); + + if (locked) + smp_mb__after_unlock_lock(); + return locked; +} diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..9467a8b7e756 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ /* * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. + * messages about anything out of the ordinary. */ static void __init rcu_bootup_announce_oddness(void) { @@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. */ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -286,12 +284,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * Remove this task from the list it blocked on. The task - * now remains queued on the rcu_node corresponding to - * the CPU it first blocked on, so the first attempt to - * acquire the task's rcu_node's ->lock will succeed. - * Keep the loop and add a WARN_ON() out of sheer paranoia. + * now remains queued on the rcu_node corresponding to the + * CPU it first blocked on, so there is no longer any need + * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - if (rnp == t->rcu_blocked_node) - break; - WARN_ON_ONCE(1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } + rnp = t->rcu_blocked_node; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + WARN_ON_ONCE(rnp != t->rcu_blocked_node); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -527,7 +516,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void) struct rcu_state *rsp = rcu_state_p; unsigned long s; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + s = rcu_exp_gp_seq_snap(rsp); rnp_unlock = exp_funnel_lock(rsp, s); @@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(rcu_state_p, rcu_data_p); + rcu_init_one(rcu_state_p); } /* @@ -989,8 +984,7 @@ static int rcu_boost(struct rcu_node *rnp) READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* * Recheck under the lock: all tasks in need of boosting @@ -1176,8 +1170,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = kthread_prio; @@ -1524,7 +1517,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1538,10 +1532,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - /* * If a non-lazy callback arrived at a CPU having only lazy * callbacks, invoke RCU core for the side-effect of recalculating @@ -1567,8 +1557,7 @@ static void rcu_prepare_for_idle(void) if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ if (needwake) @@ -2068,8 +2057,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093cc9b5c..1088e64f01ad 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update tracing for classic implementation + * Read-Copy Update tracing for hierarchical implementation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 + * Author: Paul E. McKenney * * Papers: http://www.rdrop.com/users/paulmck/RCU * @@ -33,9 +34,7 @@ #include <linux/sched.h> #include <linux/atomic.h> #include <linux/bitops.h> -#include <linux/module.h> #include <linux/completion.h> -#include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = { static int show_rcuexp(struct seq_file *m, void *v) { + int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; - + struct rcu_data *rdp; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->expedited_workdone0); + s1 += atomic_long_read(&rdp->expedited_workdone1); + s2 += atomic_long_read(&rdp->expedited_workdone2); + s3 += atomic_long_read(&rdp->expedited_workdone3); + } seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_workdone0), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_workdone3), + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); @@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) unsigned long gpmax; struct rcu_node *rnp = &rsp->node[0]; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); completed = READ_ONCE(rsp->completed); gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) @@ -487,16 +492,4 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutree_trace_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a40f0..76b94e19430b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate"); #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); +module_param(rcu_normal, int, 0); +static int rcu_normal_after_boot; +module_param(rcu_normal_after_boot, int, 0); +#endif /* #ifndef CONFIG_TINY_RCU */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -113,6 +118,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); #ifndef CONFIG_TINY_RCU +/* + * Should expedited grace-period primitives always fall back to their + * non-expedited counterparts? Intended for use within RCU. Note + * that if the user specifies both rcu_expedited and rcu_normal, then + * rcu_normal wins. + */ +bool rcu_gp_is_normal(void) +{ + return READ_ONCE(rcu_normal); +} + static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); @@ -157,8 +173,6 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); -#endif /* #ifndef CONFIG_TINY_RCU */ - /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -166,8 +180,12 @@ void rcu_end_inkernel_boot(void) { if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) rcu_unexpedite_gp(); + if (rcu_normal_after_boot) + WRITE_ONCE(rcu_normal, 1); } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PREEMPT_RCU /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 732e993b564b..1ef0d7aeab47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3109,7 +3109,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3128,13 +3127,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ diff --git a/lib/list_debug.c b/lib/list_debug.c index c24c2f7e296f..3859bf63561c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -37,7 +37,7 @@ void __list_add(struct list_head *new, next->prev = new; new->next = next; new->prev = prev; - prev->next = new; + WRITE_ONCE(prev->next, new); } EXPORT_SYMBOL(__list_add); diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 5236e073919d..0f80eefb0bfd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -38,8 +38,6 @@ # # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> -grace=120 - T=/tmp/kvm-test-1-run.sh.$$ trap 'rm -rf $T' 0 touch $T @@ -152,7 +150,7 @@ fi qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" # Generate architecture-specific and interaction-specific qemu arguments -qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$builddir/console.log"`" +qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" # Generate qemu -append arguments qemu_append="`identify_qemu_append "$QEMU"`" @@ -168,7 +166,7 @@ then touch $resdir/buildonly exit 0 fi -echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log +echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd ( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & qemu_pid=$! @@ -214,7 +212,7 @@ then else break fi - if test $kruntime -ge $((seconds + grace)) + if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) then echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 kill -KILL $qemu_pid @@ -224,6 +222,5 @@ then done fi -cp $builddir/console.log $resdir parse-torture.sh $resdir/console.log $title parse-console.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index f6483609ebc2..4a431767f77a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -42,6 +42,7 @@ TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KMAKE_ARG="" +TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu resdir="" configs="" @@ -149,6 +150,11 @@ do resdir=$2 shift ;; + --shutdown-grace) + checkarg --shutdown-grace "(seconds)" "$#" "$2" '^[0-9]*$' '^error' + TORTURE_SHUTDOWN_GRACE=$2 + shift + ;; --torture) checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' TORTURE_SUITE=$2 @@ -266,6 +272,7 @@ TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC +TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE if ! test -e $resdir then @@ -307,10 +314,10 @@ awk < $T/cfgcpu.pack \ } # Dump out the scripting required to run one test batch. -function dump(first, pastlast) +function dump(first, pastlast, batchnum) { - print "echo ----Start batch: `date`"; - print "echo ----Start batch: `date` >> " rd "/log"; + print "echo ----Start batch " batchnum ": `date`"; + print "echo ----Start batch " batchnum ": `date` >> " rd "/log"; jn=1 for (j = first; j < pastlast; j++) { builddir=KVM "/b" jn @@ -371,25 +378,28 @@ END { njobs = i; nc = ncpus; first = 0; + batchnum = 1; # Each pass through the following loop considers one test. for (i = 0; i < njobs; i++) { if (ncpus == 0) { # Sequential test specified, each test its own batch. - dump(i, i + 1); + dump(i, i + 1, batchnum); first = i; + batchnum++; } else if (nc < cpus[i] && i != 0) { # Out of CPUs, dump out a batch. - dump(first, i); + dump(first, i, batchnum); first = i; nc = ncpus; + batchnum++; } # Account for the CPUs needed by the current test. nc -= cpus[i]; } # Dump the last batch. if (ncpus != 0) - dump(first, i); + dump(first, i, batchnum); }' >> $T/script cat << ___EOF___ >> $T/script diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index d8f35cf116be..844787a0d7be 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -24,9 +24,6 @@ # # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> -T=/tmp/abat-chk-badness.sh.$$ -trap 'rm -f $T' 0 - file="$1" title="$2" @@ -36,9 +33,41 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T -if test -s $T +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +if test -s $1.diags then print_warning Assertion failure in $file $title - cat $T + # cat $1.diags + summary="" + n_badness=`grep -c Badness $1` + if test "$n_badness" -ne 0 + then + summary="$summary Badness: $n_badness" + fi + n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'` + if test "$n_warn" -ne 0 + then + summary="$summary Warnings: $n_warn" + fi + n_bugs=`egrep -c 'BUG|Oops:' $1` + if test "$n_bugs" -ne 0 + then + summary="$summary Bugs: $n_bugs" + fi + n_calltrace=`grep -c 'Call Trace:' $1` + if test "$n_calltrace" -ne 0 + then + summary="$summary Call Traces: $n_calltrace" + fi + n_lockdep=`grep -c =========== $1` + if test "$n_badness" -ne 0 + then + summary="$summary lockdep: $n_badness" + fi + n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1` + if test "$n_stalls" -ne 0 + then + summary="$summary Stalls: $n_stalls" + fi + print_warning Summary: $summary fi diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt index 9ef33a743b73..24396ae8355b 100644 --- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt +++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt @@ -20,7 +20,6 @@ CONFIG_PROVE_RCU CONFIG_NO_HZ_FULL_SYSIDLE CONFIG_RCU_NOCB_CPU -CONFIG_RCU_USER_QS Meaningless for TINY_RCU. diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 657f3a035488..4e2b1893d40d 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -72,10 +72,6 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE Always used in KVM testing. -CONFIG_RCU_USER_QS - - Redundant with CONFIG_NO_HZ_FULL. - CONFIG_PREEMPT_RCU CONFIG_TREE_RCU |