summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg474
-rw-r--r--Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg499
-rw-r--r--Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg695
-rw-r--r--Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg741
-rw-r--r--Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg858
-rw-r--r--Documentation/RCU/Design/Data-Structures/Data-Structures.html1333
-rw-r--r--Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg939
-rw-r--r--Documentation/RCU/Design/Data-Structures/TreeLevel.svg828
-rw-r--r--Documentation/RCU/Design/Data-Structures/TreeMapping.svg305
-rw-r--r--Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg380
-rw-r--r--Documentation/RCU/Design/Data-Structures/blkd_task.svg843
-rw-r--r--Documentation/RCU/Design/Data-Structures/nxtlist.svg396
-rw-r--r--Documentation/RCU/Design/Requirements/2013-08-is-it-dead.pngbin100825 -> 0 bytes
-rw-r--r--Documentation/RCU/Design/Requirements/RCUApplicability.svg237
-rw-r--r--Documentation/RCU/Design/Requirements/Requirements.html941
-rw-r--r--Documentation/RCU/Design/Requirements/Requirements.htmlx2741
-rwxr-xr-xDocumentation/RCU/Design/htmlqqz.sh108
-rw-r--r--Documentation/RCU/trace.txt10
-rw-r--r--Documentation/RCU/whatisRCU.txt22
-rw-r--r--Documentation/devicetree/bindings/regmap/regmap.txt59
-rw-r--r--Documentation/kernel-parameters.txt38
-rw-r--r--Documentation/locking/lockdep-design.txt4
-rw-r--r--Documentation/memory-barriers.txt117
-rw-r--r--Documentation/sysctl/kernel.txt14
-rw-r--r--Makefile2
-rw-r--r--arch/alpha/include/asm/rwsem.h18
-rw-r--r--arch/arm/boot/dts/at91sam9x5.dtsi2
-rw-r--r--arch/arm/boot/dts/sama5d2.dtsi2
-rw-r--r--arch/arm/include/asm/efi.h37
-rw-r--r--arch/arm/kernel/efi.c41
-rw-r--r--arch/arm/kernel/hw_breakpoint.c4
-rw-r--r--arch/arm/kernel/perf_callchain.c2
-rw-r--r--arch/arm/kernel/setup.c3
-rw-r--r--arch/arm64/Kconfig.platforms1
-rw-r--r--arch/arm64/include/asm/efi.h37
-rw-r--r--arch/arm64/kernel/efi.c57
-rw-r--r--arch/arm64/kernel/hw_breakpoint.c4
-rw-r--r--arch/arm64/kernel/image.h1
-rw-r--r--arch/arm64/kernel/perf_callchain.c4
-rw-r--r--arch/arm64/net/bpf_jit_comp.c1
-rw-r--r--arch/ia64/include/asm/rwsem.h22
-rw-r--r--arch/ia64/kernel/efi.c2
-rw-r--r--arch/metag/kernel/perf_callchain.c2
-rw-r--r--arch/mips/kernel/perf_event.c4
-rw-r--r--arch/powerpc/perf/callchain.c4
-rw-r--r--arch/s390/include/asm/rwsem.h18
-rw-r--r--arch/sh/include/asm/Kbuild1
-rw-r--r--arch/sh/include/asm/rwsem.h132
-rw-r--r--arch/sparc/include/asm/Kbuild1
-rw-r--r--arch/sparc/include/asm/rwsem.h124
-rw-r--r--arch/sparc/kernel/perf_event.c6
-rw-r--r--arch/x86/Kconfig15
-rw-r--r--arch/x86/boot/compressed/eboot.c308
-rw-r--r--arch/x86/boot/compressed/eboot.h74
-rw-r--r--arch/x86/configs/kvm_guest.config3
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl4
-rw-r--r--arch/x86/events/Kconfig36
-rw-r--r--arch/x86/events/Makefile9
-rw-r--r--arch/x86/events/amd/uncore.c2
-rw-r--r--arch/x86/events/core.c10
-rw-r--r--arch/x86/events/intel/Makefile9
-rw-r--r--arch/x86/events/intel/bts.c105
-rw-r--r--arch/x86/events/intel/core.c162
-rw-r--r--arch/x86/events/intel/cstate.c547
-rw-r--r--arch/x86/events/intel/ds.c6
-rw-r--r--arch/x86/events/intel/lbr.c31
-rw-r--r--arch/x86/events/intel/pt.c320
-rw-r--r--arch/x86/events/intel/pt.h68
-rw-r--r--arch/x86/events/intel/rapl.c183
-rw-r--r--arch/x86/events/intel/uncore.c216
-rw-r--r--arch/x86/events/intel/uncore_snbep.c7
-rw-r--r--arch/x86/events/msr.c38
-rw-r--r--arch/x86/events/perf_event.h5
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/cpufeatures.h9
-rw-r--r--arch/x86/include/asm/efi.h52
-rw-r--r--arch/x86/include/asm/mce.h19
-rw-r--r--arch/x86/include/asm/msr-index.h35
-rw-r--r--arch/x86/include/asm/rwsem.h42
-rw-r--r--arch/x86/include/asm/uaccess.h8
-rw-r--r--arch/x86/kernel/cpu/common.c10
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-genpool.c46
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c30
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c160
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c94
-rw-r--r--arch/x86/kernel/reboot.c9
-rw-r--r--arch/x86/kernel/signal.c23
-rw-r--r--arch/x86/kernel/sysfb_efi.c15
-rw-r--r--arch/x86/kernel/uprobes.c4
-rw-r--r--arch/x86/kvm/emulate.c6
-rw-r--r--arch/x86/lib/rwsem.S16
-rw-r--r--arch/x86/mm/pageattr.c8
-rw-r--r--arch/x86/platform/efi/efi.c133
-rw-r--r--arch/x86/platform/efi/efi_64.c10
-rw-r--r--arch/x86/platform/efi/quirks.c10
-rw-r--r--arch/x86/ras/Kconfig2
-rw-r--r--arch/x86/ras/Makefile2
-rw-r--r--arch/x86/ras/mce_amd_inj.c31
-rw-r--r--arch/xtensa/include/asm/Kbuild1
-rw-r--r--arch/xtensa/include/asm/rwsem.h131
-rw-r--r--arch/xtensa/kernel/perf_event.c4
-rw-r--r--block/blk-map.c47
-rw-r--r--crypto/testmgr.c27
-rw-r--r--drivers/base/regmap/internal.h1
-rw-r--r--drivers/base/regmap/regmap-mmio.c5
-rw-r--r--drivers/base/regmap/regmap-spmi.c2
-rw-r--r--drivers/edac/mce_amd.c9
-rw-r--r--drivers/firmware/efi/Kconfig25
-rw-r--r--drivers/firmware/efi/Makefile5
-rw-r--r--drivers/firmware/efi/arm-init.c96
-rw-r--r--drivers/firmware/efi/arm-runtime.c45
-rw-r--r--drivers/firmware/efi/capsule-loader.c343
-rw-r--r--drivers/firmware/efi/capsule.c308
-rw-r--r--drivers/firmware/efi/efi.c48
-rw-r--r--drivers/firmware/efi/efibc.c113
-rw-r--r--drivers/firmware/efi/efivars.c5
-rw-r--r--drivers/firmware/efi/fake_mem.c43
-rw-r--r--drivers/firmware/efi/libstub/Makefile2
-rw-r--r--drivers/firmware/efi/libstub/arm-stub.c77
-rw-r--r--drivers/firmware/efi/libstub/arm32-stub.c37
-rw-r--r--drivers/firmware/efi/libstub/efi-stub-helper.c6
-rw-r--r--drivers/firmware/efi/libstub/gop.c354
-rw-r--r--drivers/firmware/efi/memattr.c182
-rw-r--r--drivers/firmware/efi/reboot.c12
-rw-r--r--drivers/firmware/efi/runtime-wrappers.c60
-rw-r--r--drivers/firmware/efi/vars.c56
-rw-r--r--drivers/gpu/drm/amd/amdgpu/atombios_dp.c4
-rw-r--r--drivers/gpu/drm/i915/i915_debugfs.c16
-rw-r--r--drivers/gpu/drm/i915/i915_reg.h2
-rw-r--r--drivers/gpu/drm/i915/intel_audio.c9
-rw-r--r--drivers/gpu/drm/i915/intel_crt.c8
-rw-r--r--drivers/gpu/drm/i915/intel_ddi.c24
-rw-r--r--drivers/gpu/drm/i915/intel_display.c5
-rw-r--r--drivers/gpu/drm/i915/intel_dp_mst.c22
-rw-r--r--drivers/gpu/drm/i915/intel_drv.h2
-rw-r--r--drivers/gpu/drm/i915/intel_lvds.c4
-rw-r--r--drivers/gpu/drm/i915/intel_pm.c6
-rw-r--r--drivers/gpu/drm/radeon/atombios_crtc.c10
-rw-r--r--drivers/gpu/drm/radeon/atombios_dp.c4
-rw-r--r--drivers/gpu/drm/radeon/radeon_dp_auxch.c2
-rw-r--r--drivers/input/misc/max8997_haptic.c6
-rw-r--r--drivers/input/misc/twl6040-vibra.c1
-rw-r--r--drivers/input/mouse/byd.c4
-rw-r--r--drivers/media/v4l2-core/videobuf2-v4l2.c6
-rw-r--r--drivers/net/ethernet/apm/xgene/xgene_enet_cle.c11
-rw-r--r--drivers/net/ethernet/apm/xgene/xgene_enet_cle.h2
-rw-r--r--drivers/net/ethernet/apm/xgene/xgene_enet_hw.c19
-rw-r--r--drivers/net/ethernet/apm/xgene/xgene_enet_hw.h8
-rw-r--r--drivers/net/ethernet/apm/xgene/xgene_enet_main.c75
-rw-r--r--drivers/net/ethernet/apm/xgene/xgene_enet_main.h18
-rw-r--r--drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h2
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt.c63
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt.h2
-rw-r--r--drivers/net/ethernet/cavium/thunder/nicvf_queues.c4
-rw-r--r--drivers/net/ethernet/ezchip/nps_enet.c30
-rw-r--r--drivers/net/ethernet/ezchip/nps_enet.h2
-rw-r--r--drivers/net/ethernet/marvell/Kconfig2
-rw-r--r--drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c8
-rw-r--r--drivers/net/ethernet/renesas/ravb_main.c2
-rw-r--r--drivers/net/phy/phy.c8
-rw-r--r--drivers/net/wireless/intel/iwlwifi/mvm/tx.c83
-rw-r--r--drivers/net/xen-netback/netback.c1
-rw-r--r--drivers/perf/arm_pmu.c8
-rw-r--r--drivers/pinctrl/pinctrl-at91-pio4.c2
-rw-r--r--drivers/powercap/intel_rapl.c69
-rw-r--r--drivers/regulator/axp20x-regulator.c12
-rw-r--r--drivers/regulator/da9063-regulator.c2
-rw-r--r--drivers/regulator/gpio-regulator.c2
-rw-r--r--drivers/regulator/s2mps11.c28
-rw-r--r--drivers/scsi/device_handler/scsi_dh_alua.c1
-rw-r--r--drivers/scsi/qla1280.c2
-rw-r--r--drivers/spi/spi-fsl-dspi.c4
-rw-r--r--drivers/spi/spi-omap2-mcspi.c62
-rw-r--r--drivers/spi/spi-pxa2xx.c2
-rw-r--r--drivers/spi/spi-ti-qspi.c45
-rw-r--r--drivers/video/fbdev/Kconfig2
-rw-r--r--drivers/video/fbdev/efifb.c21
-rw-r--r--drivers/xen/efi.c1
-rw-r--r--fs/ecryptfs/file.c71
-rw-r--r--fs/efivarfs/file.c2
-rw-r--r--fs/efivarfs/super.c3
-rw-r--r--fs/isofs/rock.c13
-rw-r--r--fs/kernfs/dir.c6
-rw-r--r--fs/kernfs/mount.c15
-rw-r--r--fs/namei.c92
-rw-r--r--fs/ocfs2/acl.c87
-rw-r--r--fs/ocfs2/acl.h5
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/namei.c23
-rw-r--r--fs/ocfs2/refcounttree.c17
-rw-r--r--fs/ocfs2/xattr.c14
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/open.c12
-rw-r--r--fs/overlayfs/super.c4
-rw-r--r--fs/splice.c3
-rw-r--r--include/asm-generic/rwsem.h13
-rw-r--r--include/linux/atomic.h4
-rw-r--r--include/linux/dcache.h12
-rw-r--r--include/linux/efi.h167
-rw-r--r--include/linux/kernfs.h2
-rw-r--r--include/linux/lockdep.h15
-rw-r--r--include/linux/mfd/samsung/s2mps11.h2
-rw-r--r--include/linux/mm.h9
-rw-r--r--include/linux/namei.h2
-rw-r--r--include/linux/perf_event.h139
-rw-r--r--include/linux/proportions.h137
-rw-r--r--include/linux/rcupdate.h30
-rw-r--r--include/linux/rcutiny.h16
-rw-r--r--include/linux/rcutree.h2
-rw-r--r--include/linux/rwsem-spinlock.h2
-rw-r--r--include/linux/rwsem.h3
-rw-r--r--include/linux/sched.h21
-rw-r--r--include/linux/signal.h4
-rw-r--r--include/linux/swap.h6
-rw-r--r--include/linux/uio.h1
-rw-r--r--include/trace/events/rcu.h79
-rw-r--r--include/uapi/linux/if.h28
-rw-r--r--include/uapi/linux/libc-compat.h44
-rw-r--r--include/uapi/linux/perf_event.h4
-rw-r--r--include/uapi/linux/signal.h5
-rw-r--r--include/uapi/linux/tc_act/Kbuild1
-rw-r--r--kernel/bpf/stackmap.c8
-rw-r--r--kernel/cgroup.c63
-rw-r--r--kernel/events/callchain.c35
-rw-r--r--kernel/events/core.c911
-rw-r--r--kernel/events/internal.h10
-rw-r--r--kernel/events/ring_buffer.c128
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/locking/locktorture.c25
-rw-r--r--kernel/locking/qspinlock_stat.h24
-rw-r--r--kernel/locking/rwsem-spinlock.c19
-rw-r--r--kernel/locking/rwsem-xadd.c38
-rw-r--r--kernel/locking/rwsem.c19
-rw-r--r--kernel/rcu/Makefile1
-rw-r--r--kernel/rcu/rcuperf.c655
-rw-r--r--kernel/rcu/rcutorture.c29
-rw-r--r--kernel/rcu/tree.c302
-rw-r--r--kernel/rcu/tree.h20
-rw-r--r--kernel/rcu/tree_plugin.h37
-rw-r--r--kernel/rcu/tree_trace.c13
-rw-r--r--kernel/rcu/update.c4
-rw-r--r--kernel/signal.c29
-rw-r--r--kernel/sysctl.c12
-rw-r--r--kernel/time/tick-sched.c4
-rw-r--r--kernel/torture.c4
-rw-r--r--kernel/trace/trace_event_perf.c3
-rw-r--r--kernel/workqueue.c11
-rw-r--r--lib/Kconfig.debug33
-rw-r--r--lib/Makefile2
-rw-r--r--lib/asn1_decoder.c16
-rw-r--r--lib/iov_iter.c19
-rw-r--r--lib/proportions.c407
-rw-r--r--mm/huge_memory.c71
-rw-r--r--mm/ksm.c15
-rw-r--r--mm/memory.c22
-rw-r--r--mm/swapfile.c13
-rw-r--r--net/ipv4/fib_semantics.c2
-rw-r--r--net/ipv4/ip_gre.c7
-rw-r--r--net/ipv4/tcp_output.c6
-rw-r--r--net/ipv6/route.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c6
-rw-r--r--net/netfilter/nfnetlink_acct.c2
-rw-r--r--net/netfilter/xt_IDLETIMER.c1
-rw-r--r--net/openvswitch/conntrack.c13
-rw-r--r--net/sched/act_ife.c14
-rw-r--r--net/sched/act_ipt.c19
-rw-r--r--net/sched/act_mirred.c19
-rw-r--r--net/sched/act_simple.c18
-rw-r--r--net/sched/act_skbedit.c18
-rw-r--r--net/sched/act_vlan.c22
-rw-r--r--net/x25/x25_facilities.c1
-rw-r--r--sound/pci/hda/hda_sysfs.c8
-rw-r--r--sound/pci/hda/patch_hdmi.c7
-rw-r--r--sound/pci/hda/patch_realtek.c13
-rw-r--r--sound/usb/quirks.c3
-rw-r--r--tools/Makefile3
-rw-r--r--tools/build/Makefile.feature10
-rw-r--r--tools/build/feature/Makefile27
-rw-r--r--tools/build/feature/test-all.c5
-rw-r--r--tools/build/feature/test-bpf.c3
-rw-r--r--tools/build/feature/test-dwarf_getlocations.c12
-rw-r--r--tools/build/feature/test-libunwind-aarch64.c26
-rw-r--r--tools/build/feature/test-libunwind-arm.c27
-rw-r--r--tools/build/feature/test-libunwind-debug-frame-aarch64.c16
-rw-r--r--tools/build/feature/test-libunwind-debug-frame-arm.c16
-rw-r--r--tools/build/feature/test-libunwind-x86.c27
-rw-r--r--tools/build/feature/test-libunwind-x86_64.c27
-rw-r--r--tools/lib/api/fs/fs.c13
-rw-r--r--tools/lib/api/fs/fs.h2
-rw-r--r--tools/lib/traceevent/parse-filter.c4
-rw-r--r--tools/perf/Documentation/intel-pt.txt7
-rw-r--r--tools/perf/Documentation/itrace.txt8
-rw-r--r--tools/perf/Documentation/perf-annotate.txt2
-rw-r--r--tools/perf/Documentation/perf-diff.txt2
-rw-r--r--tools/perf/Documentation/perf-list.txt107
-rw-r--r--tools/perf/Documentation/perf-mem.txt8
-rw-r--r--tools/perf/Documentation/perf-record.txt13
-rw-r--r--tools/perf/Documentation/perf-report.txt4
-rw-r--r--tools/perf/Documentation/perf-sched.txt16
-rw-r--r--tools/perf/Documentation/perf-script.txt14
-rw-r--r--tools/perf/Documentation/perf-top.txt2
-rw-r--r--tools/perf/Documentation/perf-trace.txt32
-rw-r--r--tools/perf/Makefile.perf15
-rw-r--r--tools/perf/arch/powerpc/Makefile1
-rw-r--r--tools/perf/arch/powerpc/util/dwarf-regs.c40
-rw-r--r--tools/perf/arch/powerpc/util/sym-handling.c43
-rw-r--r--tools/perf/arch/x86/Makefile23
-rw-r--r--tools/perf/arch/x86/entry/syscalls/syscall_64.tbl376
-rwxr-xr-xtools/perf/arch/x86/entry/syscalls/syscalltbl.sh39
-rw-r--r--tools/perf/arch/x86/tests/perf-time-to-tsc.c2
-rw-r--r--tools/perf/arch/x86/util/dwarf-regs.c8
-rw-r--r--tools/perf/arch/x86/util/intel-bts.c5
-rw-r--r--tools/perf/arch/x86/util/intel-pt.c5
-rw-r--r--tools/perf/arch/x86/util/tsc.c32
-rw-r--r--tools/perf/arch/x86/util/tsc.h17
-rw-r--r--tools/perf/bench/futex-lock-pi.c2
-rw-r--r--tools/perf/bench/futex.h6
-rw-r--r--tools/perf/bench/mem-functions.c22
-rw-r--r--tools/perf/builtin-config.c39
-rw-r--r--tools/perf/builtin-diff.c4
-rw-r--r--tools/perf/builtin-help.c18
-rw-r--r--tools/perf/builtin-inject.c1
-rw-r--r--tools/perf/builtin-kmem.c2
-rw-r--r--tools/perf/builtin-kvm.c2
-rw-r--r--tools/perf/builtin-mem.c11
-rw-r--r--tools/perf/builtin-record.c267
-rw-r--r--tools/perf/builtin-report.c19
-rw-r--r--tools/perf/builtin-sched.c198
-rw-r--r--tools/perf/builtin-script.c124
-rw-r--r--tools/perf/builtin-stat.c15
-rw-r--r--tools/perf/builtin-top.c39
-rw-r--r--tools/perf/builtin-trace.c1085
-rw-r--r--tools/perf/config/Makefile11
-rw-r--r--tools/perf/jvmti/jvmti_agent.c43
-rw-r--r--tools/perf/perf.c16
-rw-r--r--tools/perf/perf.h1
-rw-r--r--tools/perf/scripts/python/export-to-postgresql.py52
-rw-r--r--tools/perf/tests/Build2
-rw-r--r--tools/perf/tests/backward-ring-buffer.c151
-rw-r--r--tools/perf/tests/bpf.c2
-rw-r--r--tools/perf/tests/builtin-test.c8
-rw-r--r--tools/perf/tests/code-reading.c2
-rw-r--r--tools/perf/tests/dso-data.c2
-rw-r--r--tools/perf/tests/event-times.c236
-rw-r--r--tools/perf/tests/event_update.c2
-rw-r--r--tools/perf/tests/hists_common.c2
-rw-r--r--tools/perf/tests/hists_cumulate.c4
-rw-r--r--tools/perf/tests/hists_filter.c2
-rw-r--r--tools/perf/tests/hists_link.c4
-rw-r--r--tools/perf/tests/hists_output.c4
-rw-r--r--tools/perf/tests/keep-tracking.c2
-rw-r--r--tools/perf/tests/openat-syscall-tp-fields.c2
-rw-r--r--tools/perf/tests/perf-record.c2
-rw-r--r--tools/perf/tests/switch-tracking.c2
-rw-r--r--tools/perf/tests/tests.h2
-rw-r--r--tools/perf/tests/vmlinux-kallsyms.c11
-rw-r--r--tools/perf/trace/beauty/eventfd.c38
-rw-r--r--tools/perf/trace/beauty/flock.c31
-rw-r--r--tools/perf/trace/beauty/futex_op.c44
-rw-r--r--tools/perf/trace/beauty/mmap.c158
-rw-r--r--tools/perf/trace/beauty/mode_t.c68
-rw-r--r--tools/perf/trace/beauty/msg_flags.c62
-rw-r--r--tools/perf/trace/beauty/open_flags.c56
-rw-r--r--tools/perf/trace/beauty/perf_event_open.c43
-rw-r--r--tools/perf/trace/beauty/pid.c21
-rw-r--r--tools/perf/trace/beauty/sched_policy.c44
-rw-r--r--tools/perf/trace/beauty/seccomp.c52
-rw-r--r--tools/perf/trace/beauty/signum.c53
-rw-r--r--tools/perf/trace/beauty/socket_type.c60
-rw-r--r--tools/perf/trace/beauty/waitid_options.c26
-rw-r--r--tools/perf/ui/browsers/hists.c40
-rw-r--r--tools/perf/ui/gtk/hists.c2
-rw-r--r--tools/perf/ui/hist.c2
-rw-r--r--tools/perf/ui/stdio/hist.c3
-rw-r--r--tools/perf/util/Build8
-rw-r--r--tools/perf/util/annotate.c4
-rw-r--r--tools/perf/util/auxtrace.c7
-rw-r--r--tools/perf/util/auxtrace.h2
-rw-r--r--tools/perf/util/bpf-loader.c143
-rw-r--r--tools/perf/util/bpf-loader.h19
-rw-r--r--tools/perf/util/build-id.c36
-rw-r--r--tools/perf/util/cache.h19
-rw-r--r--tools/perf/util/call-path.c122
-rw-r--r--tools/perf/util/call-path.h77
-rw-r--r--tools/perf/util/callchain.c9
-rw-r--r--tools/perf/util/callchain.h9
-rw-r--r--tools/perf/util/config.c222
-rw-r--r--tools/perf/util/config.h26
-rw-r--r--tools/perf/util/cpumap.c12
-rw-r--r--tools/perf/util/cpumap.h2
-rw-r--r--tools/perf/util/data.c41
-rw-r--r--tools/perf/util/data.h11
-rw-r--r--tools/perf/util/db-export.c89
-rw-r--r--tools/perf/util/db-export.h3
-rw-r--r--tools/perf/util/dso.c4
-rw-r--r--tools/perf/util/dwarf-aux.c61
-rw-r--r--tools/perf/util/event.c13
-rw-r--r--tools/perf/util/event.h9
-rw-r--r--tools/perf/util/evlist.c174
-rw-r--r--tools/perf/util/evlist.h22
-rw-r--r--tools/perf/util/evsel.c162
-rw-r--r--tools/perf/util/evsel.h28
-rw-r--r--tools/perf/util/evsel_fprintf.c212
-rw-r--r--tools/perf/util/header.c33
-rw-r--r--tools/perf/util/help-unknown-cmd.c30
-rw-r--r--tools/perf/util/hist.c23
-rw-r--r--tools/perf/util/hist.h12
-rw-r--r--tools/perf/util/intel-bts.c5
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-decoder.c2
-rw-r--r--tools/perf/util/intel-pt.c22
-rw-r--r--tools/perf/util/jitdump.c41
-rw-r--r--tools/perf/util/jitdump.h3
-rw-r--r--tools/perf/util/machine.c105
-rw-r--r--tools/perf/util/machine.h7
-rw-r--r--tools/perf/util/map.c16
-rw-r--r--tools/perf/util/ordered-events.c9
-rw-r--r--tools/perf/util/ordered-events.h1
-rw-r--r--tools/perf/util/parse-events.c60
-rw-r--r--tools/perf/util/pmu.c23
-rw-r--r--tools/perf/util/probe-event.c406
-rw-r--r--tools/perf/util/probe-event.h5
-rw-r--r--tools/perf/util/probe-file.c3
-rw-r--r--tools/perf/util/probe-finder.c36
-rw-r--r--tools/perf/util/python-ext-sources1
-rw-r--r--tools/perf/util/quote.c36
-rw-r--r--tools/perf/util/quote.h2
-rw-r--r--tools/perf/util/rb_resort.h149
-rw-r--r--tools/perf/util/record.c5
-rw-r--r--tools/perf/util/scripting-engines/trace-event-perl.c126
-rw-r--r--tools/perf/util/scripting-engines/trace-event-python.c47
-rw-r--r--tools/perf/util/session.c115
-rw-r--r--tools/perf/util/session.h12
-rw-r--r--tools/perf/util/sort.c38
-rw-r--r--tools/perf/util/sort.h7
-rw-r--r--tools/perf/util/stat.c4
-rw-r--r--tools/perf/util/strbuf.c93
-rw-r--r--tools/perf/util/strbuf.h25
-rw-r--r--tools/perf/util/symbol-elf.c20
-rw-r--r--tools/perf/util/symbol.c108
-rw-r--r--tools/perf/util/symbol.h19
-rw-r--r--tools/perf/util/symbol_fprintf.c71
-rw-r--r--tools/perf/util/syscalltbl.c134
-rw-r--r--tools/perf/util/syscalltbl.h20
-rw-r--r--tools/perf/util/thread-stack.c139
-rw-r--r--tools/perf/util/thread-stack.h31
-rw-r--r--tools/perf/util/thread.c21
-rw-r--r--tools/perf/util/thread.h8
-rw-r--r--tools/perf/util/thread_map.c22
-rw-r--r--tools/perf/util/thread_map.h3
-rw-r--r--tools/perf/util/tool.h1
-rw-r--r--tools/perf/util/trigger.h94
-rw-r--r--tools/perf/util/tsc.h21
-rw-r--r--tools/perf/util/unwind-libunwind.c25
-rw-r--r--tools/perf/util/util.c38
-rw-r--r--tools/perf/util/util.h15
-rw-r--r--tools/perf/util/wrapper.c29
-rw-r--r--tools/testing/selftests/Makefile1
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/jitter.sh90
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh121
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh96
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck.sh5
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh59
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh24
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE042
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuperf/TREE20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuperf/TREE5423
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh52
-rw-r--r--tools/testing/selftests/sigaltstack/Makefile8
-rw-r--r--tools/testing/selftests/sigaltstack/sas.c176
474 files changed, 23068 insertions, 8896 deletions
diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg
new file mode 100644
index 000000000000..727e270b11e4
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg
@@ -0,0 +1,474 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:28:20 2015 -->
+
+<!-- Magnification: 3.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="9.1in"
+ height="8.9in"
+ viewBox="-66 -66 10932 10707"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="BigTreeClassicRCU.fig">
+ <metadata
+ id="metadata106">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs104">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3864"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="973"
+ inkscape:window-height="1137"
+ id="namedview102"
+ showgrid="false"
+ inkscape:zoom="0.9743589"
+ inkscape:cx="409.50003"
+ inkscape:cy="400.49997"
+ inkscape:window-x="915"
+ inkscape:window-y="24"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="0"
+ width="10800"
+ height="5625"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="1125"
+ y="3600"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="3825"
+ y="900"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="6525"
+ y="3600"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect12" />
+ <!-- Line -->
+ <polyline
+ points="3375,6525 3375,5046 "
+ style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline14" />
+ <!-- Arrowhead on XXXpoint 3375 6525 - 3375 4860-->
+ <!-- Circle -->
+ <circle
+ cx="7425"
+ cy="6075"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle18" />
+ <!-- Circle -->
+ <circle
+ cx="7875"
+ cy="6075"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle20" />
+ <!-- Circle -->
+ <circle
+ cx="8325"
+ cy="6075"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle22" />
+ <!-- Circle -->
+ <circle
+ cx="2025"
+ cy="6075"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle24" />
+ <!-- Circle -->
+ <circle
+ cx="2475"
+ cy="6075"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle26" />
+ <!-- Circle -->
+ <circle
+ cx="2925"
+ cy="6075"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle28" />
+ <!-- Circle -->
+ <circle
+ cx="4725"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle30" />
+ <!-- Circle -->
+ <circle
+ cx="5175"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle32" />
+ <!-- Circle -->
+ <circle
+ cx="5625"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle34" />
+ <!-- Line: box -->
+ <rect
+ x="2025"
+ y="6525"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect36" />
+ <!-- Line -->
+ <polyline
+ points="2475,3600 3975,2310 "
+ style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline38" />
+ <!-- Arrowhead on XXXpoint 2475 3600 - 4116 2190-->
+ <!-- Line -->
+ <polyline
+ points="7875,3600 6372,2310 "
+ style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline42" />
+ <!-- Arrowhead on XXXpoint 7875 3600 - 6231 2190-->
+ <!-- Line -->
+ <polyline
+ points="6975,8775 6975,5046 "
+ style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline46" />
+ <!-- Arrowhead on XXXpoint 6975 8775 - 6975 4860-->
+ <!-- Line -->
+ <polyline
+ points="1575,8775 1575,5046 "
+ style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline50" />
+ <!-- Arrowhead on XXXpoint 1575 8775 - 1575 4860-->
+ <!-- Line -->
+ <polyline
+ points="8775,6525 8775,5046 "
+ style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline54" />
+ <!-- Arrowhead on XXXpoint 8775 6525 - 8775 4860-->
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1575"
+ y="9225"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text58">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1575"
+ y="9675"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text60">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1575"
+ y="10350"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text62">CPU 0</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3375"
+ y="6975"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text64">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3375"
+ y="7425"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text66">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3375"
+ y="8100"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text68">CPU 15</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6975"
+ y="9225"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text70">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6975"
+ y="9675"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text72">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6975"
+ y="10350"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text74">CPU 1007</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8730"
+ y="6930"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text76">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8730"
+ y="7380"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text78">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8730"
+ y="8055"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text80">CPU 1023</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="225"
+ y="450"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="start"
+ id="text82">struct rcu_state</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2475"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text84">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2475"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text86">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7875"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text88">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7875"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text90">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5175"
+ y="1350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text92">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5175"
+ y="1800"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text94">rcu_node</text>
+ <!-- Line: box -->
+ <rect
+ x="225"
+ y="8775"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect96" />
+ <!-- Line: box -->
+ <rect
+ x="5625"
+ y="8775"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect98" />
+ <!-- Line: box -->
+ <rect
+ x="7380"
+ y="6480"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect100" />
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg
new file mode 100644
index 000000000000..9bbb1944f962
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg
@@ -0,0 +1,499 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:26:09 2015 -->
+
+<!-- Magnification: 2.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="5.7in"
+ height="6.6in"
+ viewBox="-44 -44 6838 7888"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="BigTreeClassicRCUBH.fig">
+ <metadata
+ id="metadata110">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs108">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3868"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow2Mend"
+ style="overflow:visible;">
+ <path
+ id="path3886"
+ style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+ d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+ transform="scale(0.6) rotate(180) translate(0,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="878"
+ inkscape:window-height="1148"
+ id="namedview106"
+ showgrid="false"
+ inkscape:zoom="1.3547758"
+ inkscape:cx="256.5"
+ inkscape:cy="297"
+ inkscape:window-x="45"
+ inkscape:window-y="24"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="450"
+ y="0"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="4950"
+ y="4950"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="600"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="450"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect12" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="1050"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect14" />
+ <!-- Circle -->
+ <circle
+ cx="2850"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle16" />
+ <!-- Circle -->
+ <circle
+ cx="3150"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle18" />
+ <!-- Circle -->
+ <circle
+ cx="3450"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle20" />
+ <!-- Circle -->
+ <circle
+ cx="1350"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle22" />
+ <!-- Circle -->
+ <circle
+ cx="1650"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle24" />
+ <!-- Circle -->
+ <circle
+ cx="1950"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle26" />
+ <!-- Circle -->
+ <circle
+ cx="4350"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle28" />
+ <!-- Circle -->
+ <circle
+ cx="4650"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle30" />
+ <!-- Circle -->
+ <circle
+ cx="4950"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle32" />
+ <!-- Line -->
+ <polyline
+ points="1350,3450 2350,2590 "
+ style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline34" />
+ <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
+ <!-- Line -->
+ <polyline
+ points="4950,3450 3948,2590 "
+ style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline38" />
+ <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="3450"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect42" />
+ <!-- Line -->
+ <polyline
+ points="2250,5400 2250,4414 "
+ style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline44" />
+ <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
+ <!-- Line: box -->
+ <rect
+ x="1500"
+ y="5400"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect48" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="6600"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect50" />
+ <!-- Line: box -->
+ <rect
+ x="3750"
+ y="3450"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect52" />
+ <!-- Line: box -->
+ <rect
+ x="4500"
+ y="5400"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect54" />
+ <!-- Line: box -->
+ <rect
+ x="3300"
+ y="6600"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect56" />
+ <!-- Line: box -->
+ <rect
+ x="2250"
+ y="1650"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect58" />
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6450"
+ y="300"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text60">rcu_bh</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="1950"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text62">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="2250"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text64">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="3750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text66">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text68">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text70">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="3750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text72">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="5700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text74">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="6000"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text76">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="6900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text78">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text80">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="5700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text82">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="6000"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text84">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="6900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text86">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text88">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="450"
+ y="1350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="start"
+ id="text90">struct rcu_state</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6000"
+ y="750"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text92">rcu_sched</text>
+ <!-- Line -->
+ <polyline
+ points="5250,5400 5250,4414 "
+ style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline94" />
+ <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
+ <!-- Line -->
+ <polyline
+ points="4050,6600 4050,4414 "
+ style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline98" />
+ <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
+ <!-- Line -->
+ <polyline
+ points="1050,6600 1050,4414 "
+ style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline102" />
+ <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg
new file mode 100644
index 000000000000..21ba7823479d
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg
@@ -0,0 +1,695 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:20:02 2015 -->
+
+<!-- Magnification: 2.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="5.7in"
+ height="8.6in"
+ viewBox="-44 -44 6838 10288"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="BigTreeClassicRCUBHdyntick.fig">
+ <metadata
+ id="metadata166">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs164">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3924"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow2Lend"
+ style="overflow:visible;">
+ <path
+ id="path3936"
+ style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+ d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+ transform="scale(1.1) rotate(180) translate(1,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="845"
+ inkscape:window-height="988"
+ id="namedview162"
+ showgrid="false"
+ inkscape:zoom="1.0452196"
+ inkscape:cx="256.5"
+ inkscape:cy="387.00003"
+ inkscape:window-x="356"
+ inkscape:window-y="61"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="450"
+ y="0"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="4950"
+ y="4950"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="600"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect10" />
+ <!-- Line -->
+ <polyline
+ points="5250,8100 5688,5912 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline12" />
+ <!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790-->
+ <polyline
+ points="5714 6068 5704 5822 5598 6044 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline14" />
+ <!-- Line -->
+ <polyline
+ points="4050,9300 4486,7262 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline16" />
+ <!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140-->
+ <polyline
+ points="4514 7418 4506 7172 4396 7394 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline18" />
+ <!-- Line -->
+ <polyline
+ points="1040,9300 1476,7262 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline20" />
+ <!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140-->
+ <polyline
+ points="1504 7418 1496 7172 1386 7394 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline22" />
+ <!-- Line -->
+ <polyline
+ points="2240,8100 2676,6062 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline24" />
+ <!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940-->
+ <polyline
+ points="2704 6218 2696 5972 2586 6194 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline26" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="450"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect28" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="1050"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect30" />
+ <!-- Line -->
+ <polyline
+ points="1350,3450 2350,2590 "
+ style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline32" />
+ <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
+ <!-- Line -->
+ <polyline
+ points="4950,3450 3948,2590 "
+ style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline36" />
+ <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
+ <!-- Line -->
+ <polyline
+ points="4050,6600 4050,4414 "
+ style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline40" />
+ <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
+ <!-- Line -->
+ <polyline
+ points="1050,6600 1050,4414 "
+ style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline44" />
+ <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
+ <!-- Line -->
+ <polyline
+ points="2250,5400 2250,4414 "
+ style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline48" />
+ <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
+ <!-- Line -->
+ <polyline
+ points="2250,8100 2250,6364 "
+ style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
+ id="polyline52" />
+ <!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240-->
+ <!-- Line -->
+ <polyline
+ points="1050,9300 1050,7564 "
+ style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
+ id="polyline56" />
+ <!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440-->
+ <!-- Line -->
+ <polyline
+ points="4050,9300 4050,7564 "
+ style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
+ id="polyline60" />
+ <!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440-->
+ <!-- Line -->
+ <polyline
+ points="5250,8100 5250,6364 "
+ style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
+ id="polyline64" />
+ <!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240-->
+ <!-- Circle -->
+ <circle
+ cx="2850"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle68" />
+ <!-- Circle -->
+ <circle
+ cx="3150"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle70" />
+ <!-- Circle -->
+ <circle
+ cx="3450"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle72" />
+ <!-- Circle -->
+ <circle
+ cx="1350"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle74" />
+ <!-- Circle -->
+ <circle
+ cx="1650"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle76" />
+ <!-- Circle -->
+ <circle
+ cx="1950"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle78" />
+ <!-- Circle -->
+ <circle
+ cx="4350"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle80" />
+ <!-- Circle -->
+ <circle
+ cx="4650"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle82" />
+ <!-- Circle -->
+ <circle
+ cx="4950"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle84" />
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="3450"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect86" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="6600"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect88" />
+ <!-- Line: box -->
+ <rect
+ x="3750"
+ y="3450"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect90" />
+ <!-- Line: box -->
+ <rect
+ x="4500"
+ y="5400"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect92" />
+ <!-- Line: box -->
+ <rect
+ x="3300"
+ y="6600"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect94" />
+ <!-- Line: box -->
+ <rect
+ x="2250"
+ y="1650"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect96" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="9300"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect98" />
+ <!-- Line: box -->
+ <rect
+ x="1350"
+ y="8100"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect100" />
+ <!-- Line: box -->
+ <rect
+ x="3000"
+ y="9300"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect102" />
+ <!-- Line: box -->
+ <rect
+ x="4350"
+ y="8100"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect104" />
+ <!-- Line: box -->
+ <rect
+ x="1500"
+ y="5400"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect106" />
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6450"
+ y="300"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text108">rcu_bh</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="1950"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text110">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="2250"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text112">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="3750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text114">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text116">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text118">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="3750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text120">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="5700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text122">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="6000"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text124">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="6900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text126">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text128">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="5700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text130">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="6000"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text132">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="6900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text134">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text136">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="450"
+ y="1350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="start"
+ id="text138">struct rcu_state</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="9600"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text140">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="9900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text142">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="9600"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text144">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="9900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text146">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="8400"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text148">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="8700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text150">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5400"
+ y="8400"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text152">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5400"
+ y="8700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text154">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6000"
+ y="750"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text156">rcu_sched</text>
+ <!-- Line -->
+ <polyline
+ points="5250,5400 5250,4414 "
+ style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline158" />
+ <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg
new file mode 100644
index 000000000000..15adcac036c7
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg
@@ -0,0 +1,741 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:32:59 2015 -->
+
+<!-- Magnification: 2.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="6.1in"
+ height="8.9in"
+ viewBox="-44 -44 7288 10738"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="BigTreePreemptRCUBHdyntick.fig">
+ <metadata
+ id="metadata182">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs180">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3940"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="874"
+ inkscape:window-height="1148"
+ id="namedview178"
+ showgrid="false"
+ inkscape:zoom="1.2097379"
+ inkscape:cx="274.5"
+ inkscape:cy="400.49997"
+ inkscape:window-x="946"
+ inkscape:window-y="24"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="900"
+ y="0"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="1200"
+ y="600"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="5400"
+ y="4950"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="450"
+ y="450"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect12" />
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="1050"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect14" />
+ <!-- Line: box -->
+ <rect
+ x="4950"
+ y="5400"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect16" />
+ <!-- Line -->
+ <polyline
+ points="5250,8550 5688,6362 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline18" />
+ <!-- Arrowhead on XXXpoint 5250 8550 - 5710 6240-->
+ <polyline
+ points="5714 6518 5704 6272 5598 6494 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline20" />
+ <!-- Line -->
+ <polyline
+ points="4050,9750 4486,7712 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline22" />
+ <!-- Arrowhead on XXXpoint 4050 9750 - 4512 7590-->
+ <polyline
+ points="4514 7868 4506 7622 4396 7844 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline24" />
+ <!-- Line -->
+ <polyline
+ points="1040,9750 1476,7712 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline26" />
+ <!-- Arrowhead on XXXpoint 1040 9750 - 1502 7590-->
+ <polyline
+ points="1504 7868 1496 7622 1386 7844 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline28" />
+ <!-- Line -->
+ <polyline
+ points="2240,8550 2676,6512 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline30" />
+ <!-- Arrowhead on XXXpoint 2240 8550 - 2702 6390-->
+ <polyline
+ points="2704 6668 2696 6422 2586 6644 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline32" />
+ <!-- Line -->
+ <polyline
+ points="4050,9750 5682,6360 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline34" />
+ <!-- Arrowhead on XXXpoint 4050 9750 - 5736 6246-->
+ <polyline
+ points="5672 6518 5722 6276 5562 6466 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline36" />
+ <!-- Line -->
+ <polyline
+ points="1010,9750 2642,6360 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline38" />
+ <!-- Arrowhead on XXXpoint 1010 9750 - 2696 6246-->
+ <polyline
+ points="2632 6518 2682 6276 2522 6466 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline40" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="900"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect42" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="1500"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect44" />
+ <!-- Line -->
+ <polyline
+ points="1350,3900 2350,3040 "
+ style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline46" />
+ <!-- Arrowhead on XXXpoint 1350 3900 - 2444 2960-->
+ <!-- Line -->
+ <polyline
+ points="4950,3900 3948,3040 "
+ style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline50" />
+ <!-- Arrowhead on XXXpoint 4950 3900 - 3854 2960-->
+ <!-- Line -->
+ <polyline
+ points="4050,7050 4050,4864 "
+ style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline54" />
+ <!-- Arrowhead on XXXpoint 4050 7050 - 4050 4740-->
+ <!-- Line -->
+ <polyline
+ points="1050,7050 1050,4864 "
+ style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline58" />
+ <!-- Arrowhead on XXXpoint 1050 7050 - 1050 4740-->
+ <!-- Line -->
+ <polyline
+ points="2250,5850 2250,4864 "
+ style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline62" />
+ <!-- Arrowhead on XXXpoint 2250 5850 - 2250 4740-->
+ <!-- Line -->
+ <polyline
+ points="2250,8550 2250,6814 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline66" />
+ <!-- Arrowhead on XXXpoint 2250 8550 - 2250 6690-->
+ <!-- Line -->
+ <polyline
+ points="1050,9750 1050,8014 "
+ style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline70" />
+ <!-- Arrowhead on XXXpoint 1050 9750 - 1050 7890-->
+ <!-- Line -->
+ <polyline
+ points="4050,9750 4050,8014 "
+ style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline74" />
+ <!-- Arrowhead on XXXpoint 4050 9750 - 4050 7890-->
+ <!-- Line -->
+ <polyline
+ points="5250,8550 5250,6814 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline78" />
+ <!-- Arrowhead on XXXpoint 5250 8550 - 5250 6690-->
+ <!-- Circle -->
+ <circle
+ cx="2850"
+ cy="4350"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle82" />
+ <!-- Circle -->
+ <circle
+ cx="3150"
+ cy="4350"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle84" />
+ <!-- Circle -->
+ <circle
+ cx="3450"
+ cy="4350"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle86" />
+ <!-- Circle -->
+ <circle
+ cx="1350"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle88" />
+ <!-- Circle -->
+ <circle
+ cx="1650"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle90" />
+ <!-- Circle -->
+ <circle
+ cx="1950"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle92" />
+ <!-- Circle -->
+ <circle
+ cx="4350"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle94" />
+ <!-- Circle -->
+ <circle
+ cx="4650"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle96" />
+ <!-- Circle -->
+ <circle
+ cx="4950"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle98" />
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="3900"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect100" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="7050"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect102" />
+ <!-- Line: box -->
+ <rect
+ x="3750"
+ y="3900"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect104" />
+ <!-- Line: box -->
+ <rect
+ x="4500"
+ y="5850"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect106" />
+ <!-- Line: box -->
+ <rect
+ x="3300"
+ y="7050"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect108" />
+ <!-- Line: box -->
+ <rect
+ x="2250"
+ y="2100"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect110" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="9750"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect112" />
+ <!-- Line: box -->
+ <rect
+ x="1350"
+ y="8550"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect114" />
+ <!-- Line: box -->
+ <rect
+ x="3000"
+ y="9750"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect116" />
+ <!-- Line: box -->
+ <rect
+ x="4350"
+ y="8550"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect118" />
+ <!-- Line: box -->
+ <rect
+ x="1500"
+ y="5850"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect120" />
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6450"
+ y="750"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text122">rcu_bh</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="2400"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text124">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="2700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text126">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="4200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text128">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text130">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text132">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="4200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text134">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="6150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text136">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="6450"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text138">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="7350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text140">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="7650"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text142">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="6150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text144">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="6450"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text146">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="7350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text148">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="7650"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text150">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="450"
+ y="1800"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="start"
+ id="text152">struct rcu_state</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="10050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text154">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="10350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text156">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="10050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text158">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="10350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text160">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="8850"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text162">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="9150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text164">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5400"
+ y="8850"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text166">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5400"
+ y="9150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text168">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6900"
+ y="300"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text170">rcu_preempt</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6000"
+ y="1200"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text172">rcu_sched</text>
+ <!-- Line -->
+ <polyline
+ points="5250,5850 5250,4864 "
+ style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline174" />
+ <!-- Arrowhead on XXXpoint 5250 5850 - 5250 4740-->
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg
new file mode 100644
index 000000000000..bbc3801470d0
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg
@@ -0,0 +1,858 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:29:48 2015 -->
+
+<!-- Magnification: 2.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="7.4in"
+ height="9.9in"
+ viewBox="-44 -44 8938 11938"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="BigTreePreemptRCUBHdyntickCB.svg">
+ <metadata
+ id="metadata212">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs210">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3970"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="881"
+ inkscape:window-height="1128"
+ id="namedview208"
+ showgrid="false"
+ inkscape:zoom="1.0195195"
+ inkscape:cx="333"
+ inkscape:cy="445.49997"
+ inkscape:window-x="936"
+ inkscape:window-y="24"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="900"
+ y="0"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="1200"
+ y="600"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="5400"
+ y="4950"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="450"
+ y="450"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect12" />
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="1050"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect14" />
+ <!-- Line: box -->
+ <rect
+ x="4950"
+ y="5400"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect16" />
+ <!-- Line -->
+ <polyline
+ points="5250,8550 5688,6362 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline18" />
+ <!-- Arrowhead on XXXpoint 5250 8550 - 5710 6240-->
+ <polyline
+ points="5714 6518 5704 6272 5598 6494 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline20" />
+ <!-- Line -->
+ <polyline
+ points="4050,9750 4486,7712 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline22" />
+ <!-- Arrowhead on XXXpoint 4050 9750 - 4512 7590-->
+ <polyline
+ points="4514 7868 4506 7622 4396 7844 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline24" />
+ <!-- Line -->
+ <polyline
+ points="1040,9750 1476,7712 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline26" />
+ <!-- Arrowhead on XXXpoint 1040 9750 - 1502 7590-->
+ <polyline
+ points="1504 7868 1496 7622 1386 7844 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline28" />
+ <!-- Line -->
+ <polyline
+ points="2240,8550 2676,6512 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline30" />
+ <!-- Arrowhead on XXXpoint 2240 8550 - 2702 6390-->
+ <polyline
+ points="2704 6668 2696 6422 2586 6644 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline32" />
+ <!-- Line -->
+ <polyline
+ points="4050,9600 5692,6062 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline34" />
+ <!-- Arrowhead on XXXpoint 4050 9600 - 5744 5948-->
+ <polyline
+ points="5682 6220 5730 5978 5574 6170 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline36" />
+ <!-- Line -->
+ <polyline
+ points="1086,9600 2728,6062 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline38" />
+ <!-- Arrowhead on XXXpoint 1086 9600 - 2780 5948-->
+ <polyline
+ points="2718 6220 2766 5978 2610 6170 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline40" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="900"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect42" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="1500"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect44" />
+ <!-- Line -->
+ <polyline
+ points="1350,3900 2350,3040 "
+ style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline46" />
+ <!-- Arrowhead on XXXpoint 1350 3900 - 2444 2960-->
+ <!-- Line -->
+ <polyline
+ points="4950,3900 3948,3040 "
+ style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline50" />
+ <!-- Arrowhead on XXXpoint 4950 3900 - 3854 2960-->
+ <!-- Line -->
+ <polyline
+ points="4050,7050 4050,4864 "
+ style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline54" />
+ <!-- Arrowhead on XXXpoint 4050 7050 - 4050 4740-->
+ <!-- Line -->
+ <polyline
+ points="1050,7050 1050,4864 "
+ style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline58" />
+ <!-- Arrowhead on XXXpoint 1050 7050 - 1050 4740-->
+ <!-- Line -->
+ <polyline
+ points="2250,5850 2250,4864 "
+ style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline62" />
+ <!-- Arrowhead on XXXpoint 2250 5850 - 2250 4740-->
+ <!-- Line -->
+ <polyline
+ points="2250,8550 2250,6814 "
+ style="stroke:#00ff00;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline66" />
+ <!-- Arrowhead on XXXpoint 2250 8550 - 2250 6690-->
+ <!-- Line -->
+ <polyline
+ points="1050,9750 1050,8014 "
+ style="stroke:#00ff00;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline70" />
+ <!-- Arrowhead on XXXpoint 1050 9750 - 1050 7890-->
+ <!-- Line -->
+ <polyline
+ points="4050,9750 4050,8014 "
+ style="stroke:#00ff00;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline74" />
+ <!-- Arrowhead on XXXpoint 4050 9750 - 4050 7890-->
+ <!-- Line -->
+ <polyline
+ points="5250,8550 5250,6814 "
+ style="stroke:#00ff00;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline78" />
+ <!-- Arrowhead on XXXpoint 5250 8550 - 5250 6690-->
+ <!-- Line -->
+ <polyline
+ points="6000,6300 8048,7910 "
+ style="stroke:#87cfff;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
+ id="polyline82" />
+ <!-- Arrowhead on XXXpoint 6000 6300 - 8146 7986-->
+ <!-- Circle -->
+ <circle
+ cx="2850"
+ cy="4350"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle86" />
+ <!-- Circle -->
+ <circle
+ cx="3150"
+ cy="4350"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle88" />
+ <!-- Circle -->
+ <circle
+ cx="3450"
+ cy="4350"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle90" />
+ <!-- Circle -->
+ <circle
+ cx="1350"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle92" />
+ <!-- Circle -->
+ <circle
+ cx="1650"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle94" />
+ <!-- Circle -->
+ <circle
+ cx="1950"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle96" />
+ <!-- Circle -->
+ <circle
+ cx="4350"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle98" />
+ <!-- Circle -->
+ <circle
+ cx="4650"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle100" />
+ <!-- Circle -->
+ <circle
+ cx="4950"
+ cy="5550"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle102" />
+ <!-- Line: box -->
+ <rect
+ x="7350"
+ y="7950"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect104" />
+ <!-- Line: box -->
+ <rect
+ x="7350"
+ y="9450"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect106" />
+ <!-- Line -->
+ <polyline
+ points="8100,8850 8100,9384 "
+ style="stroke:#000000;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
+ id="polyline108" />
+ <!-- Arrowhead on XXXpoint 8100 8850 - 8100 9510-->
+ <!-- Line: box -->
+ <rect
+ x="7350"
+ y="10950"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect112" />
+ <!-- Line -->
+ <polyline
+ points="8100,10350 8100,10884 "
+ style="stroke:#000000;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
+ id="polyline114" />
+ <!-- Arrowhead on XXXpoint 8100 10350 - 8100 11010-->
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="3900"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect118" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="7050"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect120" />
+ <!-- Line: box -->
+ <rect
+ x="3750"
+ y="3900"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect122" />
+ <!-- Line: box -->
+ <rect
+ x="4500"
+ y="5850"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect124" />
+ <!-- Line: box -->
+ <rect
+ x="3300"
+ y="7050"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect126" />
+ <!-- Line: box -->
+ <rect
+ x="2250"
+ y="2100"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect128" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="9750"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect130" />
+ <!-- Line: box -->
+ <rect
+ x="1350"
+ y="8550"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect132" />
+ <!-- Line: box -->
+ <rect
+ x="3000"
+ y="9750"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect134" />
+ <!-- Line: box -->
+ <rect
+ x="4350"
+ y="8550"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect136" />
+ <!-- Line: box -->
+ <rect
+ x="1500"
+ y="5850"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect138" />
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8100"
+ y="8250"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text140">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8100"
+ y="8550"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text142">rcu_head</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8100"
+ y="9750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text144">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8100"
+ y="10050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text146">rcu_head</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8100"
+ y="11250"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text148">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8100"
+ y="11550"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text150">rcu_head</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6000"
+ y="1200"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text152">rcu_sched</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6450"
+ y="750"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text154">rcu_bh</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="2400"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text156">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="2700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text158">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="4200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text160">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text162">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text164">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="4200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text166">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="6150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text168">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="6450"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text170">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="7350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text172">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="7650"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text174">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="6150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text176">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="6450"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text178">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="7350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text180">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="7650"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text182">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="450"
+ y="1800"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="start"
+ id="text184">struct rcu_state</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="10050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text186">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="10350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text188">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="10050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text190">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="10350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text192">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="8850"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text194">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="9150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text196">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5400"
+ y="8850"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text198">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5400"
+ y="9150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text200">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6900"
+ y="300"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text202">rcu_preempt</text>
+ <!-- Line -->
+ <polyline
+ points="5250,5850 5250,4864 "
+ style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline204" />
+ <!-- Arrowhead on XXXpoint 5250 5850 - 5250 4740-->
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
new file mode 100644
index 000000000000..7eb47ac25ad7
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@@ -0,0 +1,1333 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+ "http://www.w3.org/TR/html4/loose.dtd">
+ <html>
+ <head><title>A Tour Through TREE_RCU's Data Structures [LWN.net]</title>
+ <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
+
+ <p>January 27, 2016</p>
+ <p>This article was contributed by Paul E.&nbsp;McKenney</p>
+
+<h3>Introduction</h3>
+
+This document describes RCU's major data structures and their relationship
+to each other.
+
+<ol>
+<li> <a href="#Data-Structure Relationships">
+ Data-Structure Relationships</a>
+<li> <a href="#The rcu_state Structure">
+ The <tt>rcu_state</tt> Structure</a>
+<li> <a href="#The rcu_node Structure">
+ The <tt>rcu_node</tt> Structure</a>
+<li> <a href="#The rcu_data Structure">
+ The <tt>rcu_data</tt> Structure</a>
+<li> <a href="#The rcu_dynticks Structure">
+ The <tt>rcu_dynticks</tt> Structure</a>
+<li> <a href="#The rcu_head Structure">
+ The <tt>rcu_head</tt> Structure</a>
+<li> <a href="#RCU-Specific Fields in the task_struct Structure">
+ RCU-Specific Fields in the <tt>task_struct</tt> Structure</a>
+<li> <a href="#Accessor Functions">
+ Accessor Functions</a>
+</ol>
+
+At the end we have the
+<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
+
+<h3><a name="Data-Structure Relationships">Data-Structure Relationships</a></h3>
+
+<p>RCU is for all intents and purposes a large state machine, and its
+data structures maintain the state in such a way as to allow RCU readers
+to execute extremely quickly, while also processing the RCU grace periods
+requested by updaters in an efficient and extremely scalable fashion.
+The efficiency and scalability of RCU updaters is provided primarily
+by a combining tree, as shown below:
+
+</p><p><img src="BigTreeClassicRCU.svg" alt="BigTreeClassicRCU.svg" width="30%">
+
+</p><p>This diagram shows an enclosing <tt>rcu_state</tt> structure
+containing a tree of <tt>rcu_node</tt> structures.
+Each leaf node of the <tt>rcu_node</tt> tree has up to 16
+<tt>rcu_data</tt> structures associated with it, so that there
+are <tt>NR_CPUS</tt> number of <tt>rcu_data</tt> structures,
+one for each possible CPU.
+This structure is adjusted at boot time, if needed, to handle the
+common case where <tt>nr_cpu_ids</tt> is much less than
+<tt>NR_CPUS</tt>.
+For example, a number of Linux distributions set <tt>NR_CPUS=4096</tt>,
+which results in a three-level <tt>rcu_node</tt> tree.
+If the actual hardware has only 16 CPUs, RCU will adjust itself
+at boot time, resulting in an <tt>rcu_node</tt> tree with only a single node.
+
+</p><p>The purpose of this combining tree is to allow per-CPU events
+such as quiescent states, dyntick-idle transitions,
+and CPU hotplug operations to be processed efficiently
+and scalably.
+Quiescent states are recorded by the per-CPU <tt>rcu_data</tt> structures,
+and other events are recorded by the leaf-level <tt>rcu_node</tt>
+structures.
+All of these events are combined at each level of the tree until finally
+grace periods are completed at the tree's root <tt>rcu_node</tt>
+structure.
+A grace period can be completed at the root once every CPU
+(or, in the case of <tt>CONFIG_PREEMPT_RCU</tt>, task)
+has passed through a quiescent state.
+Once a grace period has completed, record of that fact is propagated
+back down the tree.
+
+</p><p>As can be seen from the diagram, on a 64-bit system
+a two-level tree with 64 leaves can accommodate 1,024 CPUs, with a fanout
+of 64 at the root and a fanout of 16 at the leaves.
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Why isn't the fanout at the leaves also 64?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Because there are more types of events that affect the leaf-level
+ <tt>rcu_node</tt> structures than further up the tree.
+ Therefore, if the leaf <tt>rcu_node</tt> structures have fanout of
+ 64, the contention on these structures' <tt>-&gt;lock</tt>
+ becomes excessive.
+ Experimentation on a wide variety of systems has shown that a fanout
+ of 16 works well for the leaves of the <tt>rcu_node</tt> tree.
+ </font>
+
+ <p><font color="ffffff">Of course, further experience with
+ systems having hundreds or thousands of CPUs may demonstrate
+ that the fanout for the non-leaf <tt>rcu_node</tt> structures
+ must also be reduced.
+ Such reduction can be easily carried out when and if it proves
+ necessary.
+ In the meantime, if you are using such a system and running into
+ contention problems on the non-leaf <tt>rcu_node</tt> structures,
+ you may use the <tt>CONFIG_RCU_FANOUT</tt> kernel configuration
+ parameter to reduce the non-leaf fanout as needed.
+ </font>
+
+ <p><font color="ffffff">Kernels built for systems with
+ strong NUMA characteristics might also need to adjust
+ <tt>CONFIG_RCU_FANOUT</tt> so that the domains of the
+ <tt>rcu_node</tt> structures align with hardware boundaries.
+ However, there has thus far been no need for this.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>If your system has more than 1,024 CPUs (or more than 512 CPUs on
+a 32-bit system), then RCU will automatically add more levels to the
+tree.
+For example, if you are crazy enough to build a 64-bit system with 65,536
+CPUs, RCU would configure the <tt>rcu_node</tt> tree as follows:
+
+</p><p><img src="HugeTreeClassicRCU.svg" alt="HugeTreeClassicRCU.svg" width="50%">
+
+</p><p>RCU currently permits up to a four-level tree, which on a 64-bit system
+accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for
+32-bit systems.
+On the other hand, you can set <tt>CONFIG_RCU_FANOUT</tt> to be
+as small as 2 if you wish, which would permit only 16 CPUs, which
+is useful for testing.
+
+</p><p>This multi-level combining tree allows us to get most of the
+performance and scalability
+benefits of partitioning, even though RCU grace-period detection is
+inherently a global operation.
+The trick here is that only the last CPU to report a quiescent state
+into a given <tt>rcu_node</tt> structure need advance to the <tt>rcu_node</tt>
+structure at the next level up the tree.
+This means that at the leaf-level <tt>rcu_node</tt> structure, only
+one access out of sixteen will progress up the tree.
+For the internal <tt>rcu_node</tt> structures, the situation is even
+more extreme: Only one access out of sixty-four will progress up
+the tree.
+Because the vast majority of the CPUs do not progress up the tree,
+the lock contention remains roughly constant up the tree.
+No matter how many CPUs there are in the system, at most 64 quiescent-state
+reports per grace period will progress all the way to the root
+<tt>rcu_node</tt> structure, thus ensuring that the lock contention
+on that root <tt>rcu_node</tt> structure remains acceptably low.
+
+</p><p>In effect, the combining tree acts like a big shock absorber,
+keeping lock contention under control at all tree levels regardless
+of the level of loading on the system.
+
+</p><p>The Linux kernel actually supports multiple flavors of RCU
+running concurrently, so RCU builds separate data structures for each
+flavor.
+For example, for <tt>CONFIG_TREE_RCU=y</tt> kernels, RCU provides
+rcu_sched and rcu_bh, as shown below:
+
+</p><p><img src="BigTreeClassicRCUBH.svg" alt="BigTreeClassicRCUBH.svg" width="33%">
+
+</p><p>Energy efficiency is increasingly important, and for that
+reason the Linux kernel provides <tt>CONFIG_NO_HZ_IDLE</tt>, which
+turns off the scheduling-clock interrupts on idle CPUs, which in
+turn allows those CPUs to attain deeper sleep states and to consume
+less energy.
+CPUs whose scheduling-clock interrupts have been turned off are
+said to be in <i>dyntick-idle mode</i>.
+RCU must handle dyntick-idle CPUs specially
+because RCU would otherwise wake up each CPU on every grace period,
+which would defeat the whole purpose of <tt>CONFIG_NO_HZ_IDLE</tt>.
+RCU uses the <tt>rcu_dynticks</tt> structure to track
+which CPUs are in dyntick-idle mode, as shown below:
+
+</p><p><img src="BigTreeClassicRCUBHdyntick.svg" alt="BigTreeClassicRCUBHdyntick.svg" width="33%">
+
+</p><p>However, if a CPU is in dyntick-idle mode, it is in that mode
+for all flavors of RCU.
+Therefore, a single <tt>rcu_dynticks</tt> structure is allocated per
+CPU, and all of a given CPU's <tt>rcu_data</tt> structures share
+that <tt>rcu_dynticks</tt>, as shown in the figure.
+
+</p><p>Kernels built with <tt>CONFIG_PREEMPT_RCU</tt> support
+rcu_preempt in addition to rcu_sched and rcu_bh, as shown below:
+
+</p><p><img src="BigTreePreemptRCUBHdyntick.svg" alt="BigTreePreemptRCUBHdyntick.svg" width="35%">
+
+</p><p>RCU updaters wait for normal grace periods by registering
+RCU callbacks, either directly via <tt>call_rcu()</tt> and
+friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>,
+there being a separate interface per flavor of RCU)
+or indirectly via <tt>synchronize_rcu()</tt> and friends.
+RCU callbacks are represented by <tt>rcu_head</tt> structures,
+which are queued on <tt>rcu_data</tt> structures while they are
+waiting for a grace period to elapse, as shown in the following figure:
+
+</p><p><img src="BigTreePreemptRCUBHdyntickCB.svg" alt="BigTreePreemptRCUBHdyntickCB.svg" width="40%">
+
+</p><p>This figure shows how <tt>TREE_RCU</tt>'s and
+<tt>PREEMPT_RCU</tt>'s major data structures are related.
+Lesser data structures will be introduced with the algorithms that
+make use of them.
+
+</p><p>Note that each of the data structures in the above figure has
+its own synchronization:
+
+<p><ol>
+<li> Each <tt>rcu_state</tt> structure has a lock and a mutex,
+ and some fields are protected by the corresponding root
+ <tt>rcu_node</tt> structure's lock.
+<li> Each <tt>rcu_node</tt> structure has a spinlock.
+<li> The fields in <tt>rcu_data</tt> are private to the corresponding
+ CPU, although a few can be read and written by other CPUs.
+<li> Similarly, the fields in <tt>rcu_dynticks</tt> are private
+ to the corresponding CPU, although a few can be read by
+ other CPUs.
+</ol>
+
+<p>It is important to note that different data structures can have
+very different ideas about the state of RCU at any given time.
+For but one example, awareness of the start or end of a given RCU
+grace period propagates slowly through the data structures.
+This slow propagation is absolutely necessary for RCU to have good
+read-side performance.
+If this balkanized implementation seems foreign to you, one useful
+trick is to consider each instance of these data structures to be
+a different person, each having the usual slightly different
+view of reality.
+
+</p><p>The general role of each of these data structures is as
+follows:
+
+</p><ol>
+<li> <tt>rcu_state</tt>:
+ This structure forms the interconnection between the
+ <tt>rcu_node</tt> and <tt>rcu_data</tt> structures,
+ tracks grace periods, serves as short-term repository
+ for callbacks orphaned by CPU-hotplug events,
+ maintains <tt>rcu_barrier()</tt> state,
+ tracks expedited grace-period state,
+ and maintains state used to force quiescent states when
+ grace periods extend too long.
+<li> <tt>rcu_node</tt>: This structure forms the combining
+ tree that propagates quiescent-state
+ information from the leaves to the root, and also propagates
+ grace-period information from the root to the leaves.
+ It provides local copies of the grace-period state in order
+ to allow this information to be accessed in a synchronized
+ manner without suffering the scalability limitations that
+ would otherwise be imposed by global locking.
+ In <tt>CONFIG_PREEMPT_RCU</tt> kernels, it manages the lists
+ of tasks that have blocked while in their current
+ RCU read-side critical section.
+ In <tt>CONFIG_PREEMPT_RCU</tt> with
+ <tt>CONFIG_RCU_BOOST</tt>, it manages the
+ per-<tt>rcu_node</tt> priority-boosting
+ kernel threads (kthreads) and state.
+ Finally, it records CPU-hotplug state in order to determine
+ which CPUs should be ignored during a given grace period.
+<li> <tt>rcu_data</tt>: This per-CPU structure is the
+ focus of quiescent-state detection and RCU callback queuing.
+ It also tracks its relationship to the corresponding leaf
+ <tt>rcu_node</tt> structure to allow more-efficient
+ propagation of quiescent states up the <tt>rcu_node</tt>
+ combining tree.
+ Like the <tt>rcu_node</tt> structure, it provides a local
+ copy of the grace-period information to allow for-free
+ synchronized
+ access to this information from the corresponding CPU.
+ Finally, this structure records past dyntick-idle state
+ for the corresponding CPU and also tracks statistics.
+<li> <tt>rcu_dynticks</tt>:
+ This per-CPU structure tracks the current dyntick-idle
+ state for the corresponding CPU.
+ Unlike the other three structures, the <tt>rcu_dynticks</tt>
+ structure is not replicated per RCU flavor.
+<li> <tt>rcu_head</tt>:
+ This structure represents RCU callbacks, and is the
+ only structure allocated and managed by RCU users.
+ The <tt>rcu_head</tt> structure is normally embedded
+ within the RCU-protected data structure.
+</ol>
+
+<p>If all you wanted from this article was a general notion of how
+RCU's data structures are related, you are done.
+Otherwise, each of the following sections gives more details on
+the <tt>rcu_state</tt>, <tt>rcu_node</tt>, <tt>rcu_data</tt>,
+and <tt>rcu_dynticks</tt> data structures.
+
+<h3><a name="The rcu_state Structure">
+The <tt>rcu_state</tt> Structure</a></h3>
+
+<p>The <tt>rcu_state</tt> structure is the base structure that
+represents a flavor of RCU.
+This structure forms the interconnection between the
+<tt>rcu_node</tt> and <tt>rcu_data</tt> structures,
+tracks grace periods, contains the lock used to
+synchronize with CPU-hotplug events,
+and maintains state used to force quiescent states when
+grace periods extend too long.
+
+</p><p>A few of the <tt>rcu_state</tt> structure's fields are discussed,
+singly and in groups, in the following sections.
+The more specialized fields are covered in the discussion of their
+use.
+
+<h5>Relationship to rcu_node and rcu_data Structures</h5>
+
+This portion of the <tt>rcu_state</tt> structure is declared
+as follows:
+
+<pre>
+ 1 struct rcu_node node[NUM_RCU_NODES];
+ 2 struct rcu_node *level[NUM_RCU_LVLS + 1];
+ 3 struct rcu_data __percpu *rda;
+</pre>
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Wait a minute!
+ You said that the <tt>rcu_node</tt> structures formed a tree,
+ but they are declared as a flat array!
+ What gives?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ The tree is laid out in the array.
+ The first node in the array is the head, the next set of nodes in the
+ array are children of the head node, and so on until the last set of
+ nodes in the array are the leaves.
+ </font>
+
+ <p><font color="ffffff">See the following diagrams to see how
+ this works.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>The <tt>rcu_node</tt> tree is embedded into the
+<tt>-&gt;node[]</tt> array as shown in the following figure:
+
+</p><p><img src="TreeMapping.svg" alt="TreeMapping.svg" width="40%">
+
+</p><p>One interesting consequence of this mapping is that a
+breadth-first traversal of the tree is implemented as a simple
+linear scan of the array, which is in fact what the
+<tt>rcu_for_each_node_breadth_first()</tt> macro does.
+This macro is used at the beginning and end of grace periods.
+
+</p><p>Each entry of the <tt>-&gt;level</tt> array references
+the first <tt>rcu_node</tt> structure on the corresponding level
+of the tree, for example, as shown below:
+
+</p><p><img src="TreeMappingLevel.svg" alt="TreeMappingLevel.svg" width="40%">
+
+</p><p>The zero<sup>th</sup> element of the array references the root
+<tt>rcu_node</tt> structure, the first element references the
+first child of the root <tt>rcu_node</tt>, and finally the second
+element references the first leaf <tt>rcu_node</tt> structure.
+
+</p><p>For whatever it is worth, if you draw the tree to be tree-shaped
+rather than array-shaped, it is easy to draw a planar representation:
+
+</p><p><img src="TreeLevel.svg" alt="TreeLevel.svg" width="60%">
+
+</p><p>Finally, the <tt>-&gt;rda</tt> field references a per-CPU
+pointer to the corresponding CPU's <tt>rcu_data</tt> structure.
+
+</p><p>All of these fields are constant once initialization is complete,
+and therefore need no protection.
+
+<h5>Grace-Period Tracking</h5>
+
+<p>This portion of the <tt>rcu_state</tt> structure is declared
+as follows:
+
+<pre>
+ 1 unsigned long gpnum;
+ 2 unsigned long completed;
+</pre>
+
+<p>RCU grace periods are numbered, and
+the <tt>-&gt;gpnum</tt> field contains the number of the grace
+period that started most recently.
+The <tt>-&gt;completed</tt> field contains the number of the
+grace period that completed most recently.
+If the two fields are equal, the RCU grace period that most recently
+started has already completed, and therefore the corresponding
+flavor of RCU is idle.
+If <tt>-&gt;gpnum</tt> is one greater than <tt>-&gt;completed</tt>,
+then <tt>-&gt;gpnum</tt> gives the number of the current RCU
+grace period, which has not yet completed.
+Any other combination of values indicates that something is broken.
+These two fields are protected by the root <tt>rcu_node</tt>'s
+<tt>-&gt;lock</tt> field.
+
+</p><p>There are <tt>-&gt;gpnum</tt> and <tt>-&gt;completed</tt> fields
+in the <tt>rcu_node</tt> and <tt>rcu_data</tt> structures
+as well.
+The fields in the <tt>rcu_state</tt> structure represent the
+most current values, and those of the other structures are compared
+in order to detect the start of a new grace period in a distributed
+fashion.
+The values flow from <tt>rcu_state</tt> to <tt>rcu_node</tt>
+(down the tree from the root to the leaves) to <tt>rcu_data</tt>.
+
+<h5>Miscellaneous</h5>
+
+<p>This portion of the <tt>rcu_state</tt> structure is declared
+as follows:
+
+<pre>
+ 1 unsigned long gp_max;
+ 2 char abbr;
+ 3 char *name;
+</pre>
+
+<p>The <tt>-&gt;gp_max</tt> field tracks the duration of the longest
+grace period in jiffies.
+It is protected by the root <tt>rcu_node</tt>'s <tt>-&gt;lock</tt>.
+
+<p>The <tt>-&gt;name</tt> field points to the name of the RCU flavor
+(for example, &ldquo;rcu_sched&rdquo;), and is constant.
+The <tt>-&gt;abbr</tt> field contains a one-character abbreviation,
+for example, &ldquo;s&rdquo; for RCU-sched.
+
+<h3><a name="The rcu_node Structure">
+The <tt>rcu_node</tt> Structure</a></h3>
+
+<p>The <tt>rcu_node</tt> structures form the combining
+tree that propagates quiescent-state
+information from the leaves to the root and also that propagates
+grace-period information from the root down to the leaves.
+They provide local copies of the grace-period state in order
+to allow this information to be accessed in a synchronized
+manner without suffering the scalability limitations that
+would otherwise be imposed by global locking.
+In <tt>CONFIG_PREEMPT_RCU</tt> kernels, they manage the lists
+of tasks that have blocked while in their current
+RCU read-side critical section.
+In <tt>CONFIG_PREEMPT_RCU</tt> with
+<tt>CONFIG_RCU_BOOST</tt>, they manage the
+per-<tt>rcu_node</tt> priority-boosting
+kernel threads (kthreads) and state.
+Finally, they record CPU-hotplug state in order to determine
+which CPUs should be ignored during a given grace period.
+
+</p><p>The <tt>rcu_node</tt> structure's fields are discussed,
+singly and in groups, in the following sections.
+
+<h5>Connection to Combining Tree</h5>
+
+<p>This portion of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+ 1 struct rcu_node *parent;
+ 2 u8 level;
+ 3 u8 grpnum;
+ 4 unsigned long grpmask;
+ 5 int grplo;
+ 6 int grphi;
+</pre>
+
+<p>The <tt>-&gt;parent</tt> pointer references the <tt>rcu_node</tt>
+one level up in the tree, and is <tt>NULL</tt> for the root
+<tt>rcu_node</tt>.
+The RCU implementation makes heavy use of this field to push quiescent
+states up the tree.
+The <tt>-&gt;level</tt> field gives the level in the tree, with
+the root being at level zero, its children at level one, and so on.
+The <tt>-&gt;grpnum</tt> field gives this node's position within
+the children of its parent, so this number can range between 0 and 31
+on 32-bit systems and between 0 and 63 on 64-bit systems.
+The <tt>-&gt;level</tt> and <tt>-&gt;grpnum</tt> fields are
+used only during initialization and for tracing.
+The <tt>-&gt;grpmask</tt> field is the bitmask counterpart of
+<tt>-&gt;grpnum</tt>, and therefore always has exactly one bit set.
+This mask is used to clear the bit corresponding to this <tt>rcu_node</tt>
+structure in its parent's bitmasks, which are described later.
+Finally, the <tt>-&gt;grplo</tt> and <tt>-&gt;grphi</tt> fields
+contain the lowest and highest numbered CPU served by this
+<tt>rcu_node</tt> structure, respectively.
+
+</p><p>All of these fields are constant, and thus do not require any
+synchronization.
+
+<h5>Synchronization</h5>
+
+<p>This field of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+ 1 raw_spinlock_t lock;
+</pre>
+
+<p>This field is used to protect the remaining fields in this structure,
+unless otherwise stated.
+That said, all of the fields in this structure can be accessed without
+locking for tracing purposes.
+Yes, this can result in confusing traces, but better some tracing confusion
+than to be heisenbugged out of existence.
+
+<h5>Grace-Period Tracking</h5>
+
+<p>This portion of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+ 1 unsigned long gpnum;
+ 2 unsigned long completed;
+</pre>
+
+<p>These fields are the counterparts of the fields of the same name in
+the <tt>rcu_state</tt> structure.
+They each may lag up to one behind their <tt>rcu_state</tt>
+counterparts.
+If a given <tt>rcu_node</tt> structure's <tt>-&gt;gpnum</tt> and
+<tt>-&gt;completed</tt> fields are equal, then this <tt>rcu_node</tt>
+structure believes that RCU is idle.
+Otherwise, as with the <tt>rcu_state</tt> structure,
+the <tt>-&gt;gpnum</tt> field will be one greater than the
+<tt>-&gt;completed</tt> field, with <tt>-&gt;gpnum</tt>
+indicating which grace period this <tt>rcu_node</tt> believes
+is still being waited for.
+
+</p><p>The <tt>-&gt;gpnum</tt> field of each <tt>rcu_node</tt>
+structure is updated at the beginning
+of each grace period, and the <tt>-&gt;completed</tt> fields are
+updated at the end of each grace period.
+
+<h5>Quiescent-State Tracking</h5>
+
+<p>These fields manage the propagation of quiescent states up the
+combining tree.
+
+</p><p>This portion of the <tt>rcu_node</tt> structure has fields
+as follows:
+
+<pre>
+ 1 unsigned long qsmask;
+ 2 unsigned long expmask;
+ 3 unsigned long qsmaskinit;
+ 4 unsigned long expmaskinit;
+</pre>
+
+<p>The <tt>-&gt;qsmask</tt> field tracks which of this
+<tt>rcu_node</tt> structure's children still need to report
+quiescent states for the current normal grace period.
+Such children will have a value of 1 in their corresponding bit.
+Note that the leaf <tt>rcu_node</tt> structures should be
+thought of as having <tt>rcu_data</tt> structures as their
+children.
+Similarly, the <tt>-&gt;expmask</tt> field tracks which
+of this <tt>rcu_node</tt> structure's children still need to report
+quiescent states for the current expedited grace period.
+An expedited grace period has
+the same conceptual properties as a normal grace period, but the
+expedited implementation accepts extreme CPU overhead to obtain
+much lower grace-period latency, for example, consuming a few
+tens of microseconds worth of CPU time to reduce grace-period
+duration from milliseconds to tens of microseconds.
+The <tt>-&gt;qsmaskinit</tt> field tracks which of this
+<tt>rcu_node</tt> structure's children cover for at least
+one online CPU.
+This mask is used to initialize <tt>-&gt;qsmask</tt>,
+and <tt>-&gt;expmaskinit</tt> is used to initialize
+<tt>-&gt;expmask</tt> at the beginning of the
+normal and expedited grace periods, respectively.
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Why are these bitmasks protected by locking?
+ Come on, haven't you heard of atomic instructions???
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Lockless grace-period computation! Such a tantalizing possibility!
+ </font>
+
+ <p><font color="ffffff">But consider the following sequence of events:
+ </font>
+
+ <ol>
+ <li> <font color="ffffff">CPU&nbsp;0 has been in dyntick-idle
+ mode for quite some time.
+ When it wakes up, it notices that the current RCU
+ grace period needs it to report in, so it sets a
+ flag where the scheduling clock interrupt will find it.
+ </font><p>
+ <li> <font color="ffffff">Meanwhile, CPU&nbsp;1 is running
+ <tt>force_quiescent_state()</tt>,
+ and notices that CPU&nbsp;0 has been in dyntick idle mode,
+ which qualifies as an extended quiescent state.
+ </font><p>
+ <li> <font color="ffffff">CPU&nbsp;0's scheduling clock
+ interrupt fires in the
+ middle of an RCU read-side critical section, and notices
+ that the RCU core needs something, so commences RCU softirq
+ processing.
+ </font>
+ <p>
+ <li> <font color="ffffff">CPU&nbsp;0's softirq handler
+ executes and is just about ready
+ to report its quiescent state up the <tt>rcu_node</tt>
+ tree.
+ </font><p>
+ <li> <font color="ffffff">But CPU&nbsp;1 beats it to the punch,
+ completing the current
+ grace period and starting a new one.
+ </font><p>
+ <li> <font color="ffffff">CPU&nbsp;0 now reports its quiescent
+ state for the wrong
+ grace period.
+ That grace period might now end before the RCU read-side
+ critical section.
+ If that happens, disaster will ensue.
+ </font>
+ </ol>
+
+ <p><font color="ffffff">So the locking is absolutely required in
+ order to coordinate
+ clearing of the bits with the grace-period numbers in
+ <tt>-&gt;gpnum</tt> and <tt>-&gt;completed</tt>.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<h5>Blocked-Task Management</h5>
+
+<p><tt>PREEMPT_RCU</tt> allows tasks to be preempted in the
+midst of their RCU read-side critical sections, and these tasks
+must be tracked explicitly.
+The details of exactly why and how they are tracked will be covered
+in a separate article on RCU read-side processing.
+For now, it is enough to know that the <tt>rcu_node</tt>
+structure tracks them.
+
+<pre>
+ 1 struct list_head blkd_tasks;
+ 2 struct list_head *gp_tasks;
+ 3 struct list_head *exp_tasks;
+ 4 bool wait_blkd_tasks;
+</pre>
+
+<p>The <tt>-&gt;blkd_tasks</tt> field is a list header for
+the list of blocked and preempted tasks.
+As tasks undergo context switches within RCU read-side critical
+sections, their <tt>task_struct</tt> structures are enqueued
+(via the <tt>task_struct</tt>'s <tt>-&gt;rcu_node_entry</tt>
+field) onto the head of the <tt>-&gt;blkd_tasks</tt> list for the
+leaf <tt>rcu_node</tt> structure corresponding to the CPU
+on which the outgoing context switch executed.
+As these tasks later exit their RCU read-side critical sections,
+they remove themselves from the list.
+This list is therefore in reverse time order, so that if one of the tasks
+is blocking the current grace period, all subsequent tasks must
+also be blocking that same grace period.
+Therefore, a single pointer into this list suffices to track
+all tasks blocking a given grace period.
+That pointer is stored in <tt>-&gt;gp_tasks</tt> for normal
+grace periods and in <tt>-&gt;exp_tasks</tt> for expedited
+grace periods.
+These last two fields are <tt>NULL</tt> if either there is
+no grace period in flight or if there are no blocked tasks
+preventing that grace period from completing.
+If either of these two pointers is referencing a task that
+removes itself from the <tt>-&gt;blkd_tasks</tt> list,
+then that task must advance the pointer to the next task on
+the list, or set the pointer to <tt>NULL</tt> if there
+are no subsequent tasks on the list.
+
+</p><p>For example, suppose that tasks&nbsp;T1, T2, and&nbsp;T3 are
+all hard-affinitied to the largest-numbered CPU in the system.
+Then if task&nbsp;T1 blocked in an RCU read-side
+critical section, then an expedited grace period started,
+then task&nbsp;T2 blocked in an RCU read-side critical section,
+then a normal grace period started, and finally task&nbsp;T3 blocked
+in an RCU read-side critical section, then the state of the
+last leaf <tt>rcu_node</tt> structure's blocked-task list
+would be as shown below:
+
+</p><p><img src="blkd_task.svg" alt="blkd_task.svg" width="60%">
+
+</p><p>Task&nbsp;T1 is blocking both grace periods, task&nbsp;T2 is
+blocking only the normal grace period, and task&nbsp;T3 is blocking
+neither grace period.
+Note that these tasks will not remove themselves from this list
+immediately upon resuming execution.
+They will instead remain on the list until they execute the outermost
+<tt>rcu_read_unlock()</tt> that ends their RCU read-side critical
+section.
+
+<p>
+The <tt>-&gt;wait_blkd_tasks</tt> field indicates whether or not
+the current grace period is waiting on a blocked task.
+
+<h5>Sizing the <tt>rcu_node</tt> Array</h5>
+
+<p>The <tt>rcu_node</tt> array is sized via a series of
+C-preprocessor expressions as follows:
+
+<pre>
+ 1 #ifdef CONFIG_RCU_FANOUT
+ 2 #define RCU_FANOUT CONFIG_RCU_FANOUT
+ 3 #else
+ 4 # ifdef CONFIG_64BIT
+ 5 # define RCU_FANOUT 64
+ 6 # else
+ 7 # define RCU_FANOUT 32
+ 8 # endif
+ 9 #endif
+10
+11 #ifdef CONFIG_RCU_FANOUT_LEAF
+12 #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
+13 #else
+14 # ifdef CONFIG_64BIT
+15 # define RCU_FANOUT_LEAF 64
+16 # else
+17 # define RCU_FANOUT_LEAF 32
+18 # endif
+19 #endif
+20
+21 #define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+22 #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
+23 #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
+24 #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
+25
+26 #if NR_CPUS &lt;= RCU_FANOUT_1
+27 # define RCU_NUM_LVLS 1
+28 # define NUM_RCU_LVL_0 1
+29 # define NUM_RCU_NODES NUM_RCU_LVL_0
+30 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
+31 # define RCU_NODE_NAME_INIT { "rcu_node_0" }
+32 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
+33 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
+34 #elif NR_CPUS &lt;= RCU_FANOUT_2
+35 # define RCU_NUM_LVLS 2
+36 # define NUM_RCU_LVL_0 1
+37 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+38 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
+39 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
+40 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
+41 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+42 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
+43 #elif NR_CPUS &lt;= RCU_FANOUT_3
+44 # define RCU_NUM_LVLS 3
+45 # define NUM_RCU_LVL_0 1
+46 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+47 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+48 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
+49 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
+50 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
+51 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+52 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
+53 #elif NR_CPUS &lt;= RCU_FANOUT_4
+54 # define RCU_NUM_LVLS 4
+55 # define NUM_RCU_LVL_0 1
+56 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+57 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+58 # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+59 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+60 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
+61 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
+62 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+63 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
+64 #else
+65 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+66 #endif
+</pre>
+
+<p>The maximum number of levels in the <tt>rcu_node</tt> structure
+is currently limited to four, as specified by lines&nbsp;21-24
+and the structure of the subsequent &ldquo;if&rdquo; statement.
+For 32-bit systems, this allows 16*32*32*32=524,288 CPUs, which
+should be sufficient for the next few years at least.
+For 64-bit systems, 16*64*64*64=4,194,304 CPUs is allowed, which
+should see us through the next decade or so.
+This four-level tree also allows kernels built with
+<tt>CONFIG_RCU_FANOUT=8</tt> to support up to 4096 CPUs,
+which might be useful in very large systems having eight CPUs per
+socket (but please note that no one has yet shown any measurable
+performance degradation due to misaligned socket and <tt>rcu_node</tt>
+boundaries).
+In addition, building kernels with a full four levels of <tt>rcu_node</tt>
+tree permits better testing of RCU's combining-tree code.
+
+</p><p>The <tt>RCU_FANOUT</tt> symbol controls how many children
+are permitted at each non-leaf level of the <tt>rcu_node</tt> tree.
+If the <tt>CONFIG_RCU_FANOUT</tt> Kconfig option is not specified,
+it is set based on the word size of the system, which is also
+the Kconfig default.
+
+</p><p>The <tt>RCU_FANOUT_LEAF</tt> symbol controls how many CPUs are
+handled by each leaf <tt>rcu_node</tt> structure.
+Experience has shown that allowing a given leaf <tt>rcu_node</tt>
+structure to handle 64 CPUs, as permitted by the number of bits in
+the <tt>-&gt;qsmask</tt> field on a 64-bit system, results in
+excessive contention for the leaf <tt>rcu_node</tt> structures'
+<tt>-&gt;lock</tt> fields.
+The number of CPUs per leaf <tt>rcu_node</tt> structure is therefore
+limited to 16 given the default value of <tt>CONFIG_RCU_FANOUT_LEAF</tt>.
+If <tt>CONFIG_RCU_FANOUT_LEAF</tt> is unspecified, the value
+selected is based on the word size of the system, just as for
+<tt>CONFIG_RCU_FANOUT</tt>.
+Lines&nbsp;11-19 perform this computation.
+
+</p><p>Lines&nbsp;21-24 compute the maximum number of CPUs supported by
+a single-level (which contains a single <tt>rcu_node</tt> structure),
+two-level, three-level, and four-level <tt>rcu_node</tt> tree,
+respectively, given the fanout specified by <tt>RCU_FANOUT</tt>
+and <tt>RCU_FANOUT_LEAF</tt>.
+These numbers of CPUs are retained in the
+<tt>RCU_FANOUT_1</tt>,
+<tt>RCU_FANOUT_2</tt>,
+<tt>RCU_FANOUT_3</tt>, and
+<tt>RCU_FANOUT_4</tt>
+C-preprocessor variables, respectively.
+
+</p><p>These variables are used to control the C-preprocessor <tt>#if</tt>
+statement spanning lines&nbsp;26-66 that computes the number of
+<tt>rcu_node</tt> structures required for each level of the tree,
+as well as the number of levels required.
+The number of levels is placed in the <tt>RCU_NUM_LVLS</tt>
+C-preprocessor variable by lines&nbsp;27, 35, 44, and&nbsp;54.
+The number of <tt>rcu_node</tt> structures for the topmost level
+of the tree is always exactly one, and this value is unconditionally
+placed into <tt>NUM_RCU_LVL_0</tt> by lines&nbsp;28, 36, 45, and&nbsp;55.
+The rest of the levels (if any) of the <tt>rcu_node</tt> tree
+are computed by dividing the maximum number of CPUs by the
+fanout supported by the number of levels from the current level down,
+rounding up. This computation is performed by lines&nbsp;37,
+46-47, and&nbsp;56-58.
+Lines&nbsp;31-33, 40-42, 50-52, and&nbsp;61-63 create initializers
+for lockdep lock-class names.
+Finally, lines&nbsp;64-66 produce an error if the maximum number of
+CPUs is too large for the specified fanout.
+
+<h3><a name="The rcu_data Structure">
+The <tt>rcu_data</tt> Structure</a></h3>
+
+<p>The <tt>rcu_data</tt> maintains the per-CPU state for the
+corresponding flavor of RCU.
+The fields in this structure may be accessed only from the corresponding
+CPU (and from tracing) unless otherwise stated.
+This structure is the
+focus of quiescent-state detection and RCU callback queuing.
+It also tracks its relationship to the corresponding leaf
+<tt>rcu_node</tt> structure to allow more-efficient
+propagation of quiescent states up the <tt>rcu_node</tt>
+combining tree.
+Like the <tt>rcu_node</tt> structure, it provides a local
+copy of the grace-period information to allow for-free
+synchronized
+access to this information from the corresponding CPU.
+Finally, this structure records past dyntick-idle state
+for the corresponding CPU and also tracks statistics.
+
+</p><p>The <tt>rcu_data</tt> structure's fields are discussed,
+singly and in groups, in the following sections.
+
+<h5>Connection to Other Data Structures</h5>
+
+<p>This portion of the <tt>rcu_data</tt> structure is declared
+as follows:
+
+<pre>
+ 1 int cpu;
+ 2 struct rcu_state *rsp;
+ 3 struct rcu_node *mynode;
+ 4 struct rcu_dynticks *dynticks;
+ 5 unsigned long grpmask;
+ 6 bool beenonline;
+</pre>
+
+<p>The <tt>-&gt;cpu</tt> field contains the number of the
+corresponding CPU, the <tt>-&gt;rsp</tt> pointer references
+the corresponding <tt>rcu_state</tt> structure (and is most frequently
+used to locate the name of the corresponding flavor of RCU for tracing),
+and the <tt>-&gt;mynode</tt> field references the corresponding
+<tt>rcu_node</tt> structure.
+The <tt>-&gt;mynode</tt> is used to propagate quiescent states
+up the combining tree.
+<p>The <tt>-&gt;dynticks</tt> pointer references the
+<tt>rcu_dynticks</tt> structure corresponding to this
+CPU.
+Recall that a single per-CPU instance of the <tt>rcu_dynticks</tt>
+structure is shared among all flavors of RCU.
+These first four fields are constant and therefore require no
+synchronization.
+
+</p><p>The <tt>-&gt;grpmask</tt> field indicates the bit in
+the <tt>-&gt;mynode-&gt;qsmask</tt> corresponding to this
+<tt>rcu_data</tt> structure, and is also used when propagating
+quiescent states.
+The <tt>-&gt;beenonline</tt> flag is set whenever the corresponding
+CPU comes online, which means that the debugfs tracing need not dump
+out any <tt>rcu_data</tt> structure for which this flag is not set.
+
+<h5>Quiescent-State and Grace-Period Tracking</h5>
+
+<p>This portion of the <tt>rcu_data</tt> structure is declared
+as follows:
+
+<pre>
+ 1 unsigned long completed;
+ 2 unsigned long gpnum;
+ 3 bool cpu_no_qs;
+ 4 bool core_needs_qs;
+ 5 bool gpwrap;
+ 6 unsigned long rcu_qs_ctr_snap;
+</pre>
+
+<p>The <tt>completed</tt> and <tt>gpnum</tt>
+fields are the counterparts of the fields of the same name
+in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.
+They may each lag up to one behind their <tt>rcu_node</tt>
+counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and
+<tt>CONFIG_NO_HZ_FULL</tt> kernels can lag
+arbitrarily far behind for CPUs in dyntick-idle mode (but these counters
+will catch up upon exit from dyntick-idle mode).
+If a given <tt>rcu_data</tt> structure's <tt>-&gt;gpnum</tt> and
+<tt>-&gt;completed</tt> fields are equal, then this <tt>rcu_data</tt>
+structure believes that RCU is idle.
+Otherwise, as with the <tt>rcu_state</tt> and <tt>rcu_node</tt>
+structures,
+the <tt>-&gt;gpnum</tt> field will be one greater than the
+<tt>-&gt;completed</tt> field, with <tt>-&gt;gpnum</tt>
+indicating which grace period this <tt>rcu_data</tt> believes
+is still being waited for.
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ All this replication of the grace period numbers can only cause
+ massive confusion.
+ Why not just keep a global pair of counters and be done with it???
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Because if there was only a single global pair of grace-period
+ numbers, there would need to be a single global lock to allow
+ safely accessing and updating them.
+ And if we are not going to have a single global lock, we need
+ to carefully manage the numbers on a per-node basis.
+ Recall from the answer to a previous Quick Quiz that the consequences
+ of applying a previously sampled quiescent state to the wrong
+ grace period are quite severe.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>The <tt>-&gt;cpu_no_qs</tt> flag indicates that the
+CPU has not yet passed through a quiescent state,
+while the <tt>-&gt;core_needs_qs</tt> flag indicates that the
+RCU core needs a quiescent state from the corresponding CPU.
+The <tt>-&gt;gpwrap</tt> field indicates that the corresponding
+CPU has remained idle for so long that the <tt>completed</tt>
+and <tt>gpnum</tt> counters are in danger of overflow, which
+will cause the CPU to disregard the values of its counters on
+its next exit from idle.
+Finally, the <tt>rcu_qs_ctr_snap</tt> field is used to detect
+cases where a given operation has resulted in a quiescent state
+for all flavors of RCU, for example, <tt>cond_resched_rcu_qs()</tt>.
+
+<h5>RCU Callback Handling</h5>
+
+<p>In the absence of CPU-hotplug events, RCU callbacks are invoked by
+the same CPU that registered them.
+This is strictly a cache-locality optimization: callbacks can and
+do get invoked on CPUs other than the one that registered them.
+After all, if the CPU that registered a given callback has gone
+offline before the callback can be invoked, there really is no other
+choice.
+
+</p><p>This portion of the <tt>rcu_data</tt> structure is declared
+as follows:
+
+<pre>
+ 1 struct rcu_head *nxtlist;
+ 2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
+ 3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
+ 4 long qlen_lazy;
+ 5 long qlen;
+ 6 long qlen_last_fqs_check;
+ 7 unsigned long n_force_qs_snap;
+ 8 unsigned long n_cbs_invoked;
+ 9 unsigned long n_cbs_orphaned;
+10 unsigned long n_cbs_adopted;
+11 long blimit;
+</pre>
+
+<p>The <tt>-&gt;nxtlist</tt> pointer and the
+<tt>-&gt;nxttail[]</tt> array form a four-segment list with
+older callbacks near the head and newer ones near the tail.
+Each segment contains callbacks with the corresponding relationship
+to the current grace period.
+The pointer out of the end of each of the four segments is referenced
+by the element of the <tt>-&gt;nxttail[]</tt> array indexed by
+<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
+<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
+<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
+grace period), and
+<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
+with a specific grace period)
+respectively, as shown in the following figure.
+
+</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
+
+</p><p>In this figure, the <tt>-&gt;nxtlist</tt> pointer references the
+first
+RCU callback in the list.
+The <tt>-&gt;nxttail[RCU_DONE_TAIL]</tt> array element references
+the <tt>-&gt;nxtlist</tt> pointer itself, indicating that none
+of the callbacks is ready to invoke.
+The <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt> array element references callback
+CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
+CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period.
+The <tt>-&gt;nxttail[RCU_NEXT_READY_TAIL]</tt> array element
+references the same RCU callback that <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt>
+does, which indicates that there are no callbacks waiting on the next
+RCU grace period.
+The <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element references
+CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
+remaining RCU callbacks have not yet been assigned to an RCU grace
+period.
+Note that the <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element
+always references the last RCU callback's <tt>-&gt;next</tt> pointer
+unless the callback list is empty, in which case it references
+the <tt>-&gt;nxtlist</tt> pointer.
+
+</p><p>CPUs advance their callbacks from the
+<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
+<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
+as grace periods advance.
+The CPU advances the callbacks in its <tt>rcu_data</tt> structure
+whenever it notices that another RCU grace period has completed.
+The CPU detects the completion of an RCU grace period by noticing
+that the value of its <tt>rcu_data</tt> structure's
+<tt>-&gt;completed</tt> field differs from that of its leaf
+<tt>rcu_node</tt> structure.
+Recall that each <tt>rcu_node</tt> structure's
+<tt>-&gt;completed</tt> field is updated at the end of each
+grace period.
+
+</p><p>The <tt>-&gt;nxtcompleted[]</tt> array records grace-period
+numbers corresponding to the list segments.
+This allows CPUs that go idle for extended periods to determine
+which of their callbacks are ready to be invoked after reawakening.
+
+</p><p>The <tt>-&gt;qlen</tt> counter contains the number of
+callbacks in <tt>-&gt;nxtlist</tt>, and the
+<tt>-&gt;qlen_lazy</tt> contains the number of those callbacks that
+are known to only free memory, and whose invocation can therefore
+be safely deferred.
+The <tt>-&gt;qlen_last_fqs_check</tt> and
+<tt>-&gt;n_force_qs_snap</tt> coordinate the forcing of quiescent
+states from <tt>call_rcu()</tt> and friends when callback
+lists grow excessively long.
+
+</p><p>The <tt>-&gt;n_cbs_invoked</tt>,
+<tt>-&gt;n_cbs_orphaned</tt>, and <tt>-&gt;n_cbs_adopted</tt>
+fields count the number of callbacks invoked,
+sent to other CPUs when this CPU goes offline,
+and received from other CPUs when those other CPUs go offline.
+Finally, the <tt>-&gt;blimit</tt> counter is the maximum number of
+RCU callbacks that may be invoked at a given time.
+
+<h5>Dyntick-Idle Handling</h5>
+
+<p>This portion of the <tt>rcu_data</tt> structure is declared
+as follows:
+
+<pre>
+ 1 int dynticks_snap;
+ 2 unsigned long dynticks_fqs;
+</pre>
+
+The <tt>-&gt;dynticks_snap</tt> field is used to take a snapshot
+of the corresponding CPU's dyntick-idle state when forcing
+quiescent states, and is therefore accessed from other CPUs.
+Finally, the <tt>-&gt;dynticks_fqs</tt> field is used to
+count the number of times this CPU is determined to be in
+dyntick-idle state, and is used for tracing and debugging purposes.
+
+<h3><a name="The rcu_dynticks Structure">
+The <tt>rcu_dynticks</tt> Structure</a></h3>
+
+<p>The <tt>rcu_dynticks</tt> maintains the per-CPU dyntick-idle state
+for the corresponding CPU.
+Unlike the other structures, <tt>rcu_dynticks</tt> is not
+replicated over the different flavors of RCU.
+The fields in this structure may be accessed only from the corresponding
+CPU (and from tracing) unless otherwise stated.
+Its fields are as follows:
+
+<pre>
+ 1 int dynticks_nesting;
+ 2 int dynticks_nmi_nesting;
+ 3 atomic_t dynticks;
+</pre>
+
+<p>The <tt>-&gt;dynticks_nesting</tt> field counts the
+nesting depth of normal interrupts.
+In addition, this counter is incremented when exiting dyntick-idle
+mode and decremented when entering it.
+This counter can therefore be thought of as counting the number
+of reasons why this CPU cannot be permitted to enter dyntick-idle
+mode, aside from non-maskable interrupts (NMIs).
+NMIs are counted by the <tt>-&gt;dynticks_nmi_nesting</tt>
+field, except that NMIs that interrupt non-dyntick-idle execution
+are not counted.
+
+</p><p>Finally, the <tt>-&gt;dynticks</tt> field counts the corresponding
+CPU's transitions to and from dyntick-idle mode, so that this counter
+has an even value when the CPU is in dyntick-idle mode and an odd
+value otherwise.
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Why not just count all NMIs?
+ Wouldn't that be simpler and less error prone?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ It seems simpler only until you think hard about how to go about
+ updating the <tt>rcu_dynticks</tt> structure's
+ <tt>-&gt;dynticks</tt> field.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>Additional fields are present for some special-purpose
+builds, and are discussed separately.
+
+<h3><a name="The rcu_head Structure">
+The <tt>rcu_head</tt> Structure</a></h3>
+
+<p>Each <tt>rcu_head</tt> structure represents an RCU callback.
+These structures are normally embedded within RCU-protected data
+structures whose algorithms use asynchronous grace periods.
+In contrast, when using algorithms that block waiting for RCU grace periods,
+RCU users need not provide <tt>rcu_head</tt> structures.
+
+</p><p>The <tt>rcu_head</tt> structure has fields as follows:
+
+<pre>
+ 1 struct rcu_head *next;
+ 2 void (*func)(struct rcu_head *head);
+</pre>
+
+<p>The <tt>-&gt;next</tt> field is used
+to link the <tt>rcu_head</tt> structures together in the
+lists within the <tt>rcu_data</tt> structures.
+The <tt>-&gt;func</tt> field is a pointer to the function
+to be called when the callback is ready to be invoked, and
+this function is passed a pointer to the <tt>rcu_head</tt>
+structure.
+However, <tt>kfree_rcu()</tt> uses the <tt>-&gt;func</tt>
+field to record the offset of the <tt>rcu_head</tt>
+structure within the enclosing RCU-protected data structure.
+
+</p><p>Both of these fields are used internally by RCU.
+From the viewpoint of RCU users, this structure is an
+opaque &ldquo;cookie&rdquo;.
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Given that the callback function <tt>-&gt;func</tt>
+ is passed a pointer to the <tt>rcu_head</tt> structure,
+ how is that function supposed to find the beginning of the
+ enclosing RCU-protected data structure?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ In actual practice, there is a separate callback function per
+ type of RCU-protected data structure.
+ The callback function can therefore use the <tt>container_of()</tt>
+ macro in the Linux kernel (or other pointer-manipulation facilities
+ in other software environments) to find the beginning of the
+ enclosing structure.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<h3><a name="RCU-Specific Fields in the task_struct Structure">
+RCU-Specific Fields in the <tt>task_struct</tt> Structure</a></h3>
+
+<p>The <tt>CONFIG_PREEMPT_RCU</tt> implementation uses some
+additional fields in the <tt>task_struct</tt> structure:
+
+<pre>
+ 1 #ifdef CONFIG_PREEMPT_RCU
+ 2 int rcu_read_lock_nesting;
+ 3 union rcu_special rcu_read_unlock_special;
+ 4 struct list_head rcu_node_entry;
+ 5 struct rcu_node *rcu_blocked_node;
+ 6 #endif /* #ifdef CONFIG_PREEMPT_RCU */
+ 7 #ifdef CONFIG_TASKS_RCU
+ 8 unsigned long rcu_tasks_nvcsw;
+ 9 bool rcu_tasks_holdout;
+10 struct list_head rcu_tasks_holdout_list;
+11 int rcu_tasks_idle_cpu;
+12 #endif /* #ifdef CONFIG_TASKS_RCU */
+</pre>
+
+<p>The <tt>-&gt;rcu_read_lock_nesting</tt> field records the
+nesting level for RCU read-side critical sections, and
+the <tt>-&gt;rcu_read_unlock_special</tt> field is a bitmask
+that records special conditions that require <tt>rcu_read_unlock()</tt>
+to do additional work.
+The <tt>-&gt;rcu_node_entry</tt> field is used to form lists of
+tasks that have blocked within preemptible-RCU read-side critical
+sections and the <tt>-&gt;rcu_blocked_node</tt> field references
+the <tt>rcu_node</tt> structure whose list this task is a member of,
+or <tt>NULL</tt> if it is not blocked within a preemptible-RCU
+read-side critical section.
+
+<p>The <tt>-&gt;rcu_tasks_nvcsw</tt> field tracks the number of
+voluntary context switches that this task had undergone at the
+beginning of the current tasks-RCU grace period,
+<tt>-&gt;rcu_tasks_holdout</tt> is set if the current tasks-RCU
+grace period is waiting on this task, <tt>-&gt;rcu_tasks_holdout_list</tt>
+is a list element enqueuing this task on the holdout list,
+and <tt>-&gt;rcu_tasks_idle_cpu</tt> tracks which CPU this
+idle task is running on, but only if the task is currently running,
+that is, if the CPU is currently idle.
+
+<h3><a name="Accessor Functions">
+Accessor Functions</a></h3>
+
+<p>The following listing shows the
+<tt>rcu_get_root()</tt>, <tt>rcu_for_each_node_breadth_first()</tt>,
+<tt>rcu_for_each_nonleaf_node_breadth_first()</tt>, and
+<tt>rcu_for_each_leaf_node()</tt> function and macros:
+
+<pre>
+ 1 static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+ 2 {
+ 3 return &amp;rsp-&gt;node[0];
+ 4 }
+ 5
+ 6 #define rcu_for_each_node_breadth_first(rsp, rnp) \
+ 7 for ((rnp) = &amp;(rsp)-&gt;node[0]; \
+ 8 (rnp) &lt; &amp;(rsp)-&gt;node[NUM_RCU_NODES]; (rnp)++)
+ 9
+ 10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
+ 11 for ((rnp) = &amp;(rsp)-&gt;node[0]; \
+ 12 (rnp) &lt; (rsp)-&gt;level[NUM_RCU_LVLS - 1]; (rnp)++)
+ 13
+ 14 #define rcu_for_each_leaf_node(rsp, rnp) \
+ 15 for ((rnp) = (rsp)-&gt;level[NUM_RCU_LVLS - 1]; \
+ 16 (rnp) &lt; &amp;(rsp)-&gt;node[NUM_RCU_NODES]; (rnp)++)
+</pre>
+
+<p>The <tt>rcu_get_root()</tt> simply returns a pointer to the
+first element of the specified <tt>rcu_state</tt> structure's
+<tt>-&gt;node[]</tt> array, which is the root <tt>rcu_node</tt>
+structure.
+
+</p><p>As noted earlier, the <tt>rcu_for_each_node_breadth_first()</tt>
+macro takes advantage of the layout of the <tt>rcu_node</tt>
+structures in the <tt>rcu_state</tt> structure's
+<tt>-&gt;node[]</tt> array, performing a breadth-first traversal by
+simply traversing the array in order.
+The <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> macro operates
+similarly, but traverses only the first part of the array, thus excluding
+the leaf <tt>rcu_node</tt> structures.
+Finally, the <tt>rcu_for_each_leaf_node()</tt> macro traverses only
+the last part of the array, thus traversing only the leaf
+<tt>rcu_node</tt> structures.
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ What do <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> and
+ <tt>rcu_for_each_leaf_node()</tt> do if the <tt>rcu_node</tt> tree
+ contains only a single node?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ In the single-node case,
+ <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> is a no-op
+ and <tt>rcu_for_each_leaf_node()</tt> traverses the single node.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<h3><a name="Summary">
+Summary</a></h3>
+
+So each flavor of RCU is represented by an <tt>rcu_state</tt> structure,
+which contains a combining tree of <tt>rcu_node</tt> and
+<tt>rcu_data</tt> structures.
+Finally, in <tt>CONFIG_NO_HZ_IDLE</tt> kernels, each CPU's dyntick-idle
+state is tracked by an <tt>rcu_dynticks</tt> structure.
+
+If you made it this far, you are well prepared to read the code
+walkthroughs in the other articles in this series.
+
+<h3><a name="Acknowledgments">
+Acknowledgments</a></h3>
+
+I owe thanks to Cyrill Gorcunov, Mathieu Desnoyers, Dhaval Giani, Paul
+Turner, Abhishek Srivastava, Matt Kowalczyk, and Serge Hallyn
+for helping me get this document into a more human-readable state.
+
+<h3><a name="Legal Statement">
+Legal Statement</a></h3>
+
+<p>This work represents the view of the author and does not necessarily
+represent the view of IBM.
+
+</p><p>Linux is a registered trademark of Linus Torvalds.
+
+</p><p>Other company, product, and service names may be trademarks or
+service marks of others.
+
+</body></html>
diff --git a/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg
new file mode 100644
index 000000000000..2bf12b468206
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg
@@ -0,0 +1,939 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:37:22 2015 -->
+
+<!-- Magnification: 3.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="15.1in"
+ height="11.2in"
+ viewBox="-66 -66 18087 13407"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="HugeTreeClassicRCU.fig">
+ <metadata
+ id="metadata224">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs222">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3982"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="1134"
+ inkscape:window-height="789"
+ id="namedview220"
+ showgrid="false"
+ inkscape:zoom="0.60515873"
+ inkscape:cx="679.5"
+ inkscape:cy="504"
+ inkscape:window-x="786"
+ inkscape:window-y="24"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="450"
+ y="0"
+ width="17100"
+ height="8325"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="11025"
+ y="3600"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="4275"
+ y="3600"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="5400"
+ y="6300"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect12" />
+ <!-- Line: box -->
+ <rect
+ x="9900"
+ y="6300"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect14" />
+ <!-- Line: box -->
+ <rect
+ x="14400"
+ y="6300"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect16" />
+ <!-- Line: box -->
+ <rect
+ x="900"
+ y="6300"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect18" />
+ <!-- Line: box -->
+ <rect
+ x="7650"
+ y="900"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect20" />
+ <!-- Line -->
+ <polyline
+ points="3150,9225 3150,7746 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline22" />
+ <!-- Arrowhead on XXXpoint 3150 9225 - 3150 7560-->
+ <!-- Circle -->
+ <circle
+ cx="8550"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle26" />
+ <!-- Circle -->
+ <circle
+ cx="9000"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle28" />
+ <!-- Circle -->
+ <circle
+ cx="9450"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle30" />
+ <!-- Line -->
+ <polyline
+ points="6750,6300 8250,5010 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline32" />
+ <!-- Arrowhead on XXXpoint 6750 6300 - 8391 4890-->
+ <!-- Line -->
+ <polyline
+ points="11250,6300 9747,5010 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline36" />
+ <!-- Arrowhead on XXXpoint 11250 6300 - 9606 4890-->
+ <!-- Circle -->
+ <circle
+ cx="13950"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle40" />
+ <!-- Circle -->
+ <circle
+ cx="13500"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle42" />
+ <!-- Circle -->
+ <circle
+ cx="13050"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle44" />
+ <!-- Circle -->
+ <circle
+ cx="9450"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle46" />
+ <!-- Circle -->
+ <circle
+ cx="9000"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle48" />
+ <!-- Circle -->
+ <circle
+ cx="8550"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle50" />
+ <!-- Circle -->
+ <circle
+ cx="4950"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle52" />
+ <!-- Circle -->
+ <circle
+ cx="4500"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle54" />
+ <!-- Circle -->
+ <circle
+ cx="4050"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle56" />
+ <!-- Circle -->
+ <circle
+ cx="1800"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle58" />
+ <!-- Circle -->
+ <circle
+ cx="2250"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle60" />
+ <!-- Circle -->
+ <circle
+ cx="2700"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle62" />
+ <!-- Circle -->
+ <circle
+ cx="15300"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle64" />
+ <!-- Circle -->
+ <circle
+ cx="15750"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle66" />
+ <!-- Circle -->
+ <circle
+ cx="16200"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle68" />
+ <!-- Circle -->
+ <circle
+ cx="10800"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle70" />
+ <!-- Circle -->
+ <circle
+ cx="11250"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle72" />
+ <!-- Circle -->
+ <circle
+ cx="11700"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle74" />
+ <!-- Circle -->
+ <circle
+ cx="6300"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle76" />
+ <!-- Circle -->
+ <circle
+ cx="6750"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle78" />
+ <!-- Circle -->
+ <circle
+ cx="7200"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle80" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="11475"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect82" />
+ <!-- Line: box -->
+ <rect
+ x="1800"
+ y="9225"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect84" />
+ <!-- Line: box -->
+ <rect
+ x="4500"
+ y="11475"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect86" />
+ <!-- Line: box -->
+ <rect
+ x="6300"
+ y="9270"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect88" />
+ <!-- Line: box -->
+ <rect
+ x="8955"
+ y="11475"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect90" />
+ <!-- Line: box -->
+ <rect
+ x="10755"
+ y="9270"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect92" />
+ <!-- Line: box -->
+ <rect
+ x="13455"
+ y="11475"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect94" />
+ <!-- Line: box -->
+ <rect
+ x="15255"
+ y="9270"
+ width="2700"
+ height="1800"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect96" />
+ <!-- Line -->
+ <polyline
+ points="11700,3600 10197,2310 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline98" />
+ <!-- Arrowhead on XXXpoint 11700 3600 - 10056 2190-->
+ <!-- Line -->
+ <polyline
+ points="6300,3600 7800,2310 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline102" />
+ <!-- Arrowhead on XXXpoint 6300 3600 - 7941 2190-->
+ <!-- Line -->
+ <polyline
+ points="3150,6300 4650,5010 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline106" />
+ <!-- Arrowhead on XXXpoint 3150 6300 - 4791 4890-->
+ <!-- Line -->
+ <polyline
+ points="14850,6300 13347,5010 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline110" />
+ <!-- Arrowhead on XXXpoint 14850 6300 - 13206 4890-->
+ <!-- Line -->
+ <polyline
+ points="1350,11475 1350,7746 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline114" />
+ <!-- Arrowhead on XXXpoint 1350 11475 - 1350 7560-->
+ <!-- Line -->
+ <polyline
+ points="16650,9225 16650,7746 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline118" />
+ <!-- Arrowhead on XXXpoint 16650 9225 - 16650 7560-->
+ <!-- Line -->
+ <polyline
+ points="14850,11475 14850,7746 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline122" />
+ <!-- Arrowhead on XXXpoint 14850 11475 - 14850 7560-->
+ <!-- Line -->
+ <polyline
+ points="12150,9225 12150,7746 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline126" />
+ <!-- Arrowhead on XXXpoint 12150 9225 - 12150 7560-->
+ <!-- Line -->
+ <polyline
+ points="10350,11475 10350,7746 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline130" />
+ <!-- Arrowhead on XXXpoint 10350 11475 - 10350 7560-->
+ <!-- Line -->
+ <polyline
+ points="7650,9225 7650,7746 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline134" />
+ <!-- Arrowhead on XXXpoint 7650 9225 - 7650 7560-->
+ <!-- Line -->
+ <polyline
+ points="5850,11475 5850,7746 "
+ style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline138" />
+ <!-- Arrowhead on XXXpoint 5850 11475 - 5850 7560-->
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="12375"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text142">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="12375"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text144">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5625"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text146">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5625"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text148">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6750"
+ y="6750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text150">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6750"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text152">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11250"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text154">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11250"
+ y="6750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text156">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="15750"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text158">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="15750"
+ y="6750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text160">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="6750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text162">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text164">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1350"
+ y="13050"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text166">CPU 0</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1350"
+ y="11925"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text168">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1350"
+ y="12375"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text170">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="10800"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text172">CPU 15</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="9675"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text174">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="10125"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text176">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5850"
+ y="11925"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text178">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5850"
+ y="12375"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text180">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5850"
+ y="13050"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text182">CPU 21823</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7650"
+ y="10845"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text184">CPU 21839</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7650"
+ y="10170"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text186">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7650"
+ y="9720"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text188">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="10305"
+ y="11925"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text190">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="10305"
+ y="12375"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text192">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="10305"
+ y="13050"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text194">CPU 43679</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="12105"
+ y="10845"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text196">CPU 43695</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="12105"
+ y="10170"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text198">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="12105"
+ y="9720"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text200">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="14805"
+ y="11925"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text202">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="14805"
+ y="12375"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text204">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="14805"
+ y="13050"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text206">CPU 65519</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="16605"
+ y="10845"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text208">CPU 65535</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="16605"
+ y="10170"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text210">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="16605"
+ y="9720"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text212">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="675"
+ y="450"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="start"
+ id="text214">struct rcu_state</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="9000"
+ y="1350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text216">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="9000"
+ y="1800"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text218">rcu_node</text>
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/TreeLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg
new file mode 100644
index 000000000000..7a7eb3bac95c
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg
@@ -0,0 +1,828 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:41:29 2015 -->
+
+<!-- Magnification: 3.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="17.7in"
+ height="10.4in"
+ viewBox="-66 -66 21237 12507"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="TreeLevel.fig">
+ <metadata
+ id="metadata216">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs214">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3974"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="1023"
+ inkscape:window-height="1148"
+ id="namedview212"
+ showgrid="false"
+ inkscape:zoom="0.55869424"
+ inkscape:cx="796.50006"
+ inkscape:cy="467.99997"
+ inkscape:window-x="897"
+ inkscape:window-y="24"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="0"
+ width="20655"
+ height="8325"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="14130"
+ y="3600"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="7380"
+ y="3600"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="8505"
+ y="6300"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect12" />
+ <!-- Line: box -->
+ <rect
+ x="13005"
+ y="6300"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect14" />
+ <!-- Line: box -->
+ <rect
+ x="17505"
+ y="6300"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect16" />
+ <!-- Line: box -->
+ <rect
+ x="4005"
+ y="6300"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect18" />
+ <!-- Line: box -->
+ <rect
+ x="10755"
+ y="900"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect20" />
+ <!-- Line -->
+ <polyline
+ points="6255,9225 6255,7746 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline22" />
+ <!-- Arrowhead on XXXpoint 6255 9225 - 6255 7560-->
+ <!-- Circle -->
+ <circle
+ cx="11655"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle26" />
+ <!-- Circle -->
+ <circle
+ cx="12105"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle28" />
+ <!-- Circle -->
+ <circle
+ cx="12555"
+ cy="4275"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle30" />
+ <!-- Line -->
+ <polyline
+ points="9855,6300 11355,5010 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline32" />
+ <!-- Arrowhead on XXXpoint 9855 6300 - 11496 4890-->
+ <!-- Line -->
+ <polyline
+ points="14355,6300 12852,5010 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline36" />
+ <!-- Arrowhead on XXXpoint 14355 6300 - 12711 4890-->
+ <!-- Circle -->
+ <circle
+ cx="17055"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle40" />
+ <!-- Circle -->
+ <circle
+ cx="16605"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle42" />
+ <!-- Circle -->
+ <circle
+ cx="16155"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle44" />
+ <!-- Circle -->
+ <circle
+ cx="12555"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle46" />
+ <!-- Circle -->
+ <circle
+ cx="12105"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle48" />
+ <!-- Circle -->
+ <circle
+ cx="11655"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle50" />
+ <!-- Circle -->
+ <circle
+ cx="8055"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle52" />
+ <!-- Circle -->
+ <circle
+ cx="7605"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle54" />
+ <!-- Circle -->
+ <circle
+ cx="7155"
+ cy="6975"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle56" />
+ <!-- Circle -->
+ <circle
+ cx="4905"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle58" />
+ <!-- Circle -->
+ <circle
+ cx="5355"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle60" />
+ <!-- Circle -->
+ <circle
+ cx="5805"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle62" />
+ <!-- Circle -->
+ <circle
+ cx="18405"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle64" />
+ <!-- Circle -->
+ <circle
+ cx="18855"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle66" />
+ <!-- Circle -->
+ <circle
+ cx="19305"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle68" />
+ <!-- Circle -->
+ <circle
+ cx="13905"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle70" />
+ <!-- Circle -->
+ <circle
+ cx="14355"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle72" />
+ <!-- Circle -->
+ <circle
+ cx="14805"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle74" />
+ <!-- Circle -->
+ <circle
+ cx="9405"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle76" />
+ <!-- Circle -->
+ <circle
+ cx="9855"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle78" />
+ <!-- Circle -->
+ <circle
+ cx="10305"
+ cy="8775"
+ r="114"
+ style="fill:#000000;stroke:#000000;stroke-width:21;"
+ id="circle80" />
+ <!-- Line: box -->
+ <rect
+ x="225"
+ y="1125"
+ width="3150"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect82" />
+ <!-- Line: box -->
+ <rect
+ x="225"
+ y="2250"
+ width="3150"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect84" />
+ <!-- Line: box -->
+ <rect
+ x="225"
+ y="3375"
+ width="3150"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect86" />
+ <!-- Line -->
+ <polyline
+ points="14805,3600 13302,2310 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline88" />
+ <!-- Arrowhead on XXXpoint 14805 3600 - 13161 2190-->
+ <!-- Line -->
+ <polyline
+ points="9405,3600 10905,2310 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline92" />
+ <!-- Arrowhead on XXXpoint 9405 3600 - 11046 2190-->
+ <!-- Line -->
+ <polyline
+ points="6255,6300 7755,5010 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline96" />
+ <!-- Arrowhead on XXXpoint 6255 6300 - 7896 4890-->
+ <!-- Line -->
+ <polyline
+ points="17955,6300 16452,5010 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline100" />
+ <!-- Arrowhead on XXXpoint 17955 6300 - 16311 4890-->
+ <!-- Line -->
+ <polyline
+ points="4455,11025 4455,7746 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline104" />
+ <!-- Arrowhead on XXXpoint 4455 11025 - 4455 7560-->
+ <!-- Line -->
+ <polyline
+ points="19755,9225 19755,7746 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline108" />
+ <!-- Arrowhead on XXXpoint 19755 9225 - 19755 7560-->
+ <!-- Line -->
+ <polyline
+ points="17955,11025 17955,7746 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline112" />
+ <!-- Arrowhead on XXXpoint 17955 11025 - 17955 7560-->
+ <!-- Line -->
+ <polyline
+ points="15255,9225 15255,7746 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline116" />
+ <!-- Arrowhead on XXXpoint 15255 9225 - 15255 7560-->
+ <!-- Line -->
+ <polyline
+ points="13455,11025 13455,7746 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline120" />
+ <!-- Arrowhead on XXXpoint 13455 11025 - 13455 7560-->
+ <!-- Line -->
+ <polyline
+ points="10755,9225 10755,7746 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline124" />
+ <!-- Arrowhead on XXXpoint 10755 9225 - 10755 7560-->
+ <!-- Line -->
+ <polyline
+ points="8955,11025 8955,7746 "
+ style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline128" />
+ <!-- Arrowhead on XXXpoint 8955 11025 - 8955 7560-->
+ <!-- Line: box -->
+ <rect
+ x="12105"
+ y="11025"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect132" />
+ <!-- Line: box -->
+ <rect
+ x="13905"
+ y="9225"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect134" />
+ <!-- Line: box -->
+ <rect
+ x="16605"
+ y="11025"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect136" />
+ <!-- Line: box -->
+ <rect
+ x="18405"
+ y="9225"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect138" />
+ <!-- Line: box -->
+ <rect
+ x="9405"
+ y="9225"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect140" />
+ <!-- Line: box -->
+ <rect
+ x="7605"
+ y="11025"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect142" />
+ <!-- Line: box -->
+ <rect
+ x="4905"
+ y="9225"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect144" />
+ <!-- Line: box -->
+ <rect
+ x="3105"
+ y="11025"
+ width="2700"
+ height="1350"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect146" />
+ <!-- Line -->
+ <polyline
+ points="3375,1575 10701,1575 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline148" />
+ <!-- Arrowhead on XXXpoint 3375 1575 - 10890 1575-->
+ <!-- Line -->
+ <polyline
+ points="3375,3825 4050,3825 4050,5400 2700,5400 2700,6975 3951,6975 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline152" />
+ <!-- Arrowhead on XXXpoint 2700 6975 - 4140 6975-->
+ <!-- Line -->
+ <polyline
+ points="3375,2700 5175,2700 5175,4275 7326,4275 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline156" />
+ <!-- Arrowhead on XXXpoint 5175 4275 - 7515 4275-->
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="15480"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text160">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="15480"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text162">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8730"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text164">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8730"
+ y="4500"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text166">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="9855"
+ y="6750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text168">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="9855"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text170">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="14355"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text172">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="14355"
+ y="6750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text174">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="18855"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text176">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="18855"
+ y="6750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text178">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5355"
+ y="6750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text180">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5355"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text182">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="450"
+ y="1800"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="324"
+ text-anchor="start"
+ id="text184">-&gt;level[0]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="450"
+ y="2925"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="324"
+ text-anchor="start"
+ id="text186">-&gt;level[1]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="450"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="324"
+ text-anchor="start"
+ id="text188">-&gt;level[2]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="12105"
+ y="1350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text190">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="12105"
+ y="1800"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="middle"
+ id="text192">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6255"
+ y="10125"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text194">CPU 15</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4455"
+ y="11925"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text196">CPU 0</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="19755"
+ y="10125"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text198">CPU 65535</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="17955"
+ y="11925"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text200">CPU 65519</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="15255"
+ y="10125"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text202">CPU 43695</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="13455"
+ y="11925"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text204">CPU 43679</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="10755"
+ y="10125"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text206">CPU 21839</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="8955"
+ y="11925"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text208">CPU 21823</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="225"
+ y="450"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="288"
+ text-anchor="start"
+ id="text210">struct rcu_state</text>
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/TreeMapping.svg b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg
new file mode 100644
index 000000000000..729cfa9e6cdb
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg
@@ -0,0 +1,305 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:43:22 2015 -->
+
+<!-- Magnification: 1.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="3.1in"
+ height="0.9in"
+ viewBox="-12 -12 3699 1074"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="TreeMapping.fig">
+ <metadata
+ id="metadata66">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs64">
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow2Lend"
+ style="overflow:visible;">
+ <path
+ id="path3836"
+ style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+ d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+ transform="scale(1.1) rotate(180) translate(1,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow2Mend"
+ style="overflow:visible;">
+ <path
+ id="path3842"
+ style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+ d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+ transform="scale(0.6) rotate(180) translate(0,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3824"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="991"
+ inkscape:window-height="606"
+ id="namedview62"
+ showgrid="false"
+ inkscape:zoom="3.0752688"
+ inkscape:cx="139.5"
+ inkscape:cy="40.5"
+ inkscape:window-x="891"
+ inkscape:window-y="177"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="0"
+ width="3675"
+ height="1050"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="75"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="600"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="1125"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect12" />
+ <!-- Line: box -->
+ <rect
+ x="1650"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect14" />
+ <!-- Line: box -->
+ <rect
+ x="2175"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect16" />
+ <!-- Line: box -->
+ <rect
+ x="3225"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect18" />
+ <!-- Line -->
+ <polyline
+ points="675,375 675,150 300,150 300,358 "
+ style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline20" />
+ <!-- Arrowhead on XXXpoint 300 150 - 300 390-->
+ <!-- Line -->
+ <polyline
+ points="1200,675 1200,900 300,900 300,691 "
+ style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline24" />
+ <!-- Arrowhead on XXXpoint 300 900 - 300 660-->
+ <!-- Line -->
+ <polyline
+ points="1725,375 1725,150 900,150 900,358 "
+ style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline28" />
+ <!-- Arrowhead on XXXpoint 900 150 - 900 390-->
+ <!-- Line -->
+ <polyline
+ points="2250,375 2250,75 825,75 825,358 "
+ style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline32" />
+ <!-- Arrowhead on XXXpoint 825 75 - 825 390-->
+ <!-- Line -->
+ <polyline
+ points="2775,675 2775,900 1425,900 1425,691 "
+ style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline36" />
+ <!-- Arrowhead on XXXpoint 1425 900 - 1425 660-->
+ <!-- Line -->
+ <polyline
+ points="3300,675 3300,975 1350,975 1350,691 "
+ style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline40" />
+ <!-- Arrowhead on XXXpoint 1350 975 - 1350 660-->
+ <!-- Line: box -->
+ <rect
+ x="2700"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect44" />
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="300"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text46">0:7 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1350"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text48">4:7 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1875"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text50">0:1 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text52">2:3 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2925"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text54">4:5 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3450"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text56">6:7 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="825"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text58">0:3 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3600"
+ y="150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="end"
+ id="text60">struct rcu_state</text>
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg
new file mode 100644
index 000000000000..5b416a4b8453
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg
@@ -0,0 +1,380 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:45:19 2015 -->
+
+<!-- Magnification: 1.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="3.1in"
+ height="1.8in"
+ viewBox="-12 -12 3699 2124"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="TreeMappingLevel.svg">
+ <metadata
+ id="metadata98">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title />
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs96">
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow2Lend"
+ style="overflow:visible;">
+ <path
+ id="path3868"
+ style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+ d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+ transform="scale(1.1) rotate(180) translate(1,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="1598"
+ inkscape:window-height="1211"
+ id="namedview94"
+ showgrid="false"
+ inkscape:zoom="5.2508961"
+ inkscape:cx="139.5"
+ inkscape:cy="81"
+ inkscape:window-x="840"
+ inkscape:window-y="122"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="0"
+ width="3675"
+ height="2100"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="75"
+ y="1350"
+ width="750"
+ height="225"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="75"
+ y="1575"
+ width="750"
+ height="225"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="75"
+ y="1800"
+ width="750"
+ height="225"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect12" />
+ <!-- Arc -->
+ <path
+ style="stroke:#000000;stroke-width:7;stroke-linecap:butt;"
+ d="M 1800,900 A 118 118 0 0 0 1800 1125 "
+ id="path14" />
+ <!-- Arc -->
+ <path
+ style="stroke:#000000;stroke-width:7;stroke-linecap:butt;"
+ d="M 750,900 A 75 75 0 0 0 750 1050 "
+ id="path16" />
+ <!-- Line -->
+ <polyline
+ points="750,900 750,691 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline18" />
+ <!-- Arrowhead on XXXpoint 750 900 - 750 660-->
+ <!-- Line: box -->
+ <rect
+ x="75"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect22" />
+ <!-- Line: box -->
+ <rect
+ x="600"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect24" />
+ <!-- Line: box -->
+ <rect
+ x="1650"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect26" />
+ <!-- Line: box -->
+ <rect
+ x="2175"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect28" />
+ <!-- Line: box -->
+ <rect
+ x="3225"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect30" />
+ <!-- Line -->
+ <polyline
+ points="675,375 675,150 300,150 300,358 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline32" />
+ <!-- Arrowhead on XXXpoint 300 150 - 300 390-->
+ <!-- Line -->
+ <polyline
+ points="1725,375 1725,150 900,150 900,358 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline36" />
+ <!-- Arrowhead on XXXpoint 900 150 - 900 390-->
+ <!-- Line -->
+ <polyline
+ points="2250,375 2250,75 825,75 825,358 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline40" />
+ <!-- Arrowhead on XXXpoint 825 75 - 825 390-->
+ <!-- Line -->
+ <polyline
+ points="2775,675 2775,975 1425,975 1425,691 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline44" />
+ <!-- Arrowhead on XXXpoint 1425 975 - 1425 660-->
+ <!-- Line: box -->
+ <rect
+ x="2700"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect48" />
+ <!-- Line: box -->
+ <rect
+ x="1125"
+ y="375"
+ width="375"
+ height="300"
+ rx="0"
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect50" />
+ <!-- Line -->
+ <polyline
+ points="3300,675 3300,1050 1350,1050 1350,691 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline52" />
+ <!-- Arrowhead on XXXpoint 1350 1050 - 1350 660-->
+ <!-- Line -->
+ <polyline
+ points="825,1425 975,1425 975,1200 225,1200 225,691 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline56" />
+ <!-- Arrowhead on XXXpoint 225 1200 - 225 660-->
+ <!-- Line -->
+ <polyline
+ points="1200,675 1200,975 300,975 300,691 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline60" />
+ <!-- Arrowhead on XXXpoint 300 975 - 300 660-->
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="150"
+ y="1500"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="108"
+ text-anchor="start"
+ id="text64">-&gt;level[0]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="150"
+ y="1725"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="108"
+ text-anchor="start"
+ id="text66">-&gt;level[1]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="150"
+ y="1950"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="108"
+ text-anchor="start"
+ id="text68">-&gt;level[2]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="300"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text70">0:7 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1350"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text72">4:7 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1875"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text74">0:1 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text76">2:3 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2925"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text78">4:5 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3450"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text80">6:7 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="825"
+ y="525"
+ fill="#000000"
+ font-family="Times"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="middle"
+ id="text82">0:3 </text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3600"
+ y="150"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="normal"
+ font-size="96"
+ text-anchor="end"
+ id="text84">struct rcu_state</text>
+ <!-- Line -->
+ <polyline
+ points="825,1875 1800,1875 1800,1125 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:none"
+ id="polyline86" />
+ <!-- Line -->
+ <polyline
+ points="1800,900 1800,691 "
+ style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
+ id="polyline88" />
+ <!-- Arrowhead on XXXpoint 1800 900 - 1800 660-->
+ <!-- Line -->
+ <polyline
+ points="825,1650 1200,1650 1200,1125 750,1125 750,1050 "
+ style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline92" />
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/blkd_task.svg b/Documentation/RCU/Design/Data-Structures/blkd_task.svg
new file mode 100644
index 000000000000..00e810bb8419
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg
@@ -0,0 +1,843 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:35:03 2015 -->
+
+<!-- Magnification: 2.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="10.1in"
+ height="8.6in"
+ viewBox="-44 -44 12088 10288"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="blkd_task.fig">
+ <metadata
+ id="metadata212">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs210">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3970"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="1087"
+ inkscape:window-height="1144"
+ id="namedview208"
+ showgrid="false"
+ inkscape:zoom="1.0495049"
+ inkscape:cx="454.50003"
+ inkscape:cy="387.00003"
+ inkscape:window-x="833"
+ inkscape:window-y="28"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="450"
+ y="0"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="4950"
+ y="4950"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="600"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect10" />
+ <!-- Line -->
+ <polyline
+ points="5250,8100 5688,5912 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline12" />
+ <!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790-->
+ <polyline
+ points="5714 6068 5704 5822 5598 6044 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline14" />
+ <!-- Line -->
+ <polyline
+ points="4050,9300 4486,7262 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline16" />
+ <!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140-->
+ <polyline
+ points="4514 7418 4506 7172 4396 7394 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline18" />
+ <!-- Line -->
+ <polyline
+ points="1040,9300 1476,7262 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline20" />
+ <!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140-->
+ <polyline
+ points="1504 7418 1496 7172 1386 7394 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline22" />
+ <!-- Line -->
+ <polyline
+ points="2240,8100 2676,6062 "
+ style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="polyline24" />
+ <!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940-->
+ <polyline
+ points="2704 6218 2696 5972 2586 6194 "
+ style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
+ id="polyline26" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="450"
+ width="6300"
+ height="7350"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
+ id="rect28" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="1050"
+ width="5700"
+ height="3750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
+ id="rect30" />
+ <!-- Line -->
+ <polyline
+ points="1350,3450 2350,2590 "
+ style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline32" />
+ <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
+ <!-- Line -->
+ <polyline
+ points="4950,3450 3948,2590 "
+ style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline36" />
+ <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
+ <!-- Line -->
+ <polyline
+ points="4050,6600 4050,4414 "
+ style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline40" />
+ <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
+ <!-- Line -->
+ <polyline
+ points="1050,6600 1050,4414 "
+ style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline44" />
+ <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
+ <!-- Line -->
+ <polyline
+ points="2250,5400 2250,4414 "
+ style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline48" />
+ <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
+ <!-- Line -->
+ <polyline
+ points="2250,8100 2250,6364 "
+ style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline52" />
+ <!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240-->
+ <!-- Line -->
+ <polyline
+ points="1050,9300 1050,7564 "
+ style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline56" />
+ <!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440-->
+ <!-- Line -->
+ <polyline
+ points="4050,9300 4050,7564 "
+ style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline60" />
+ <!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440-->
+ <!-- Line -->
+ <polyline
+ points="5250,8100 5250,6364 "
+ style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline64" />
+ <!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240-->
+ <!-- Circle -->
+ <circle
+ cx="2850"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle68" />
+ <!-- Circle -->
+ <circle
+ cx="3150"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle70" />
+ <!-- Circle -->
+ <circle
+ cx="3450"
+ cy="3900"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle72" />
+ <!-- Circle -->
+ <circle
+ cx="1350"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle74" />
+ <!-- Circle -->
+ <circle
+ cx="1650"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle76" />
+ <!-- Circle -->
+ <circle
+ cx="1950"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle78" />
+ <!-- Circle -->
+ <circle
+ cx="4350"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle80" />
+ <!-- Circle -->
+ <circle
+ cx="4650"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle82" />
+ <!-- Circle -->
+ <circle
+ cx="4950"
+ cy="5100"
+ r="76"
+ style="fill:#000000;stroke:#000000;stroke-width:14;"
+ id="circle84" />
+ <!-- Line: box -->
+ <rect
+ x="750"
+ y="3450"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect86" />
+ <!-- Line: box -->
+ <rect
+ x="300"
+ y="6600"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect88" />
+ <!-- Line: box -->
+ <rect
+ x="4500"
+ y="5400"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect90" />
+ <!-- Line: box -->
+ <rect
+ x="3300"
+ y="6600"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect92" />
+ <!-- Line: box -->
+ <rect
+ x="2250"
+ y="1650"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect94" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="9300"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect96" />
+ <!-- Line: box -->
+ <rect
+ x="1350"
+ y="8100"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect98" />
+ <!-- Line: box -->
+ <rect
+ x="3000"
+ y="9300"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect100" />
+ <!-- Line: box -->
+ <rect
+ x="4350"
+ y="8100"
+ width="2100"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
+ id="rect102" />
+ <!-- Line: box -->
+ <rect
+ x="1500"
+ y="5400"
+ width="1500"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect104" />
+ <!-- Line -->
+ <polygon
+ points="5550,3450 7350,2850 7350,5100 5550,4350 5550,3450 "
+ style="stroke:#000000;stroke-width:14; stroke-linejoin:miter; stroke-linecap:butt; stroke-dasharray:120 120;fill:#ffbfbf; "
+ id="polygon106" />
+ <!-- Line -->
+ <polyline
+ points="9300,3150 10734,3150 "
+ style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline108" />
+ <!-- Arrowhead on XXXpoint 9300 3150 - 10860 3150-->
+ <!-- Line: box -->
+ <rect
+ x="10800"
+ y="2850"
+ width="1200"
+ height="750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect112" />
+ <!-- Line -->
+ <polyline
+ points="11400,3600 11400,4284 "
+ style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline114" />
+ <!-- Arrowhead on XXXpoint 11400 3600 - 11400 4410-->
+ <!-- Line: box -->
+ <rect
+ x="10800"
+ y="4350"
+ width="1200"
+ height="750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect118" />
+ <!-- Line -->
+ <polyline
+ points="11400,5100 11400,5784 "
+ style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline120" />
+ <!-- Arrowhead on XXXpoint 11400 5100 - 11400 5910-->
+ <!-- Line: box -->
+ <rect
+ x="10800"
+ y="5850"
+ width="1200"
+ height="750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect124" />
+ <!-- Line -->
+ <polyline
+ points="9300,3900 9900,3900 9900,4650 10734,4650 "
+ style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline126" />
+ <!-- Arrowhead on XXXpoint 9900 4650 - 10860 4650-->
+ <!-- Line -->
+ <polyline
+ points="9300,4650 9600,4650 9600,6150 10734,6150 "
+ style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline130" />
+ <!-- Arrowhead on XXXpoint 9600 6150 - 10860 6150-->
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6450"
+ y="300"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text134">rcu_bh</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="1950"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text136">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="3150"
+ y="2250"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text138">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="3750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text140">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1650"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text142">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="5700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text144">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2250"
+ y="6000"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text146">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="6900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text148">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text150">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="5700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text152">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5250"
+ y="6000"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text154">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="6900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text156">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="7200"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text158">rcu_data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="450"
+ y="1350"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="start"
+ id="text160">struct rcu_state</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="9600"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text162">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="1050"
+ y="9900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text164">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="9600"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text166">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4050"
+ y="9900"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text168">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="8400"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text170">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="2400"
+ y="8700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text172">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5400"
+ y="8400"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text174">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="5400"
+ y="8700"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text176">rcu_dynticks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="6000"
+ y="750"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="192"
+ text-anchor="end"
+ id="text178">rcu_sched</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11400"
+ y="3300"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="216"
+ text-anchor="middle"
+ id="text180">T3</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11400"
+ y="4800"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="216"
+ text-anchor="middle"
+ id="text182">T2</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11400"
+ y="6300"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="216"
+ text-anchor="middle"
+ id="text184">T1</text>
+ <!-- Line -->
+ <polyline
+ points="5250,5400 5250,4414 "
+ style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline186" />
+ <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
+ <!-- Line: box -->
+ <rect
+ x="3750"
+ y="3450"
+ width="1800"
+ height="900"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect190" />
+ <!-- Line: box -->
+ <rect
+ x="7350"
+ y="2850"
+ width="1950"
+ height="750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect192" />
+ <!-- Line: box -->
+ <rect
+ x="7350"
+ y="3600"
+ width="1950"
+ height="750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect194" />
+ <!-- Line: box -->
+ <rect
+ x="7350"
+ y="4350"
+ width="1950"
+ height="750"
+ rx="0"
+ style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
+ id="rect196" />
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text198">rcu_node</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="4650"
+ y="3750"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="middle"
+ id="text200">struct</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7500"
+ y="3300"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="start"
+ id="text202">blkd_tasks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7500"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="start"
+ id="text204">gp_tasks</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7500"
+ y="4800"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="192"
+ text-anchor="start"
+ id="text206">exp_tasks</text>
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
new file mode 100644
index 000000000000..abc4cc73a097
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
@@ -0,0 +1,396 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
+
+<!-- CreationDate: Wed Dec 9 17:39:46 2015 -->
+
+<!-- Magnification: 3.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="10.4in"
+ height="10.4in"
+ viewBox="-66 -66 12507 12507"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.4 r9939"
+ sodipodi:docname="nxtlist.fig">
+ <metadata
+ id="metadata94">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs92">
+ <marker
+ inkscape:stockid="Arrow1Mend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Mend"
+ style="overflow:visible;">
+ <path
+ id="path3852"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
+ transform="scale(0.4) rotate(180) translate(10,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="925"
+ inkscape:window-height="928"
+ id="namedview90"
+ showgrid="false"
+ inkscape:zoom="0.80021373"
+ inkscape:cx="467.99997"
+ inkscape:cy="467.99997"
+ inkscape:window-x="948"
+ inkscape:window-y="73"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4" />
+ <g
+ style="stroke-width:.025in; fill:none"
+ id="g4">
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="0"
+ width="7875"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="1125"
+ width="7875"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="2250"
+ width="7875"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="3375"
+ width="7875"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect12" />
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="4500"
+ width="7875"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
+ id="rect14" />
+ <!-- Line: box -->
+ <rect
+ x="10575"
+ y="0"
+ width="1800"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect16" />
+ <!-- Line: box -->
+ <rect
+ x="10575"
+ y="1125"
+ width="1800"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect18" />
+ <!-- Line -->
+ <polyline
+ points="11475,2250 11475,3276 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline20" />
+ <!-- Arrowhead on XXXpoint 11475 2250 - 11475 3465-->
+ <!-- Line: box -->
+ <rect
+ x="10575"
+ y="6750"
+ width="1800"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect24" />
+ <!-- Line: box -->
+ <rect
+ x="10575"
+ y="7875"
+ width="1800"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect26" />
+ <!-- Line: box -->
+ <rect
+ x="10575"
+ y="10125"
+ width="1800"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect28" />
+ <!-- Line: box -->
+ <rect
+ x="10575"
+ y="11250"
+ width="1800"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect30" />
+ <!-- Line: box -->
+ <rect
+ x="10575"
+ y="3375"
+ width="1800"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect32" />
+ <!-- Line -->
+ <polyline
+ points="11475,5625 11475,6651 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline34" />
+ <!-- Arrowhead on XXXpoint 11475 5625 - 11475 6840-->
+ <!-- Line -->
+ <polyline
+ points="7875,225 10476,225 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline38" />
+ <!-- Arrowhead on XXXpoint 7875 225 - 10665 225-->
+ <!-- Line -->
+ <polyline
+ points="7875,1350 9675,1350 9675,675 7971,675 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline42" />
+ <!-- Arrowhead on XXXpoint 9675 675 - 7785 675-->
+ <!-- Line -->
+ <polyline
+ points="7875,2475 9675,2475 9675,4725 10476,4725 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline46" />
+ <!-- Arrowhead on XXXpoint 9675 4725 - 10665 4725-->
+ <!-- Line -->
+ <polyline
+ points="7875,3600 9225,3600 9225,5175 10476,5175 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline50" />
+ <!-- Arrowhead on XXXpoint 9225 5175 - 10665 5175-->
+ <!-- Line -->
+ <polyline
+ points="7875,4725 8775,4725 8775,11475 10476,11475 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline54" />
+ <!-- Arrowhead on XXXpoint 8775 11475 - 10665 11475-->
+ <!-- Line: box -->
+ <rect
+ x="10575"
+ y="4500"
+ width="1800"
+ height="1125"
+ rx="0"
+ style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; "
+ id="rect58" />
+ <!-- Line -->
+ <polyline
+ points="11475,9000 11475,10026 "
+ style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
+ id="polyline60" />
+ <!-- Arrowhead on XXXpoint 11475 9000 - 11475 10215-->
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="225"
+ y="675"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="324"
+ text-anchor="start"
+ id="text64">nxtlist</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="225"
+ y="1800"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="324"
+ text-anchor="start"
+ id="text66">nxttail[RCU_DONE_TAIL]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="225"
+ y="2925"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="324"
+ text-anchor="start"
+ id="text68">nxttail[RCU_WAIT_TAIL]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="225"
+ y="4050"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="324"
+ text-anchor="start"
+ id="text70">nxttail[RCU_NEXT_READY_TAIL]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="225"
+ y="5175"
+ fill="#000000"
+ font-family="Courier"
+ font-style="normal"
+ font-weight="bold"
+ font-size="324"
+ text-anchor="start"
+ id="text72">nxttail[RCU_NEXT_TAIL]</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11475"
+ y="675"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text74">CB 1</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11475"
+ y="1800"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text76">next</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11475"
+ y="7425"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text78">CB 3</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11475"
+ y="8550"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text80">next</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11475"
+ y="10800"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text82">CB 4</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11475"
+ y="11925"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text84">next</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11475"
+ y="4050"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text86">CB 2</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="11475"
+ y="5175"
+ fill="#000000"
+ font-family="Helvetica"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ text-anchor="middle"
+ id="text88">next</text>
+ </g>
+</svg>
diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
deleted file mode 100644
index 7496a55e4e7b..000000000000
--- a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
+++ /dev/null
Binary files differ
diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg
deleted file mode 100644
index ebcbeee391ed..000000000000
--- a/Documentation/RCU/Design/Requirements/RCUApplicability.svg
+++ /dev/null
@@ -1,237 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Creator: fig2dev Version 3.2 Patchlevel 5d -->
-
-<!-- CreationDate: Tue Mar 4 18:34:25 2014 -->
-
-<!-- Magnification: 3.000 -->
-
-<svg
- xmlns:dc="http://purl.org/dc/elements/1.1/"
- xmlns:cc="http://creativecommons.org/ns#"
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
- xmlns:svg="http://www.w3.org/2000/svg"
- xmlns="http://www.w3.org/2000/svg"
- xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
- xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
- width="1089.1382"
- height="668.21368"
- viewBox="-2121 -36 14554.634 8876.4061"
- id="svg2"
- version="1.1"
- inkscape:version="0.48.3.1 r9886"
- sodipodi:docname="RCUApplicability.svg">
- <metadata
- id="metadata40">
- <rdf:RDF>
- <cc:Work
- rdf:about="">
- <dc:format>image/svg+xml</dc:format>
- <dc:type
- rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:title />
- </cc:Work>
- </rdf:RDF>
- </metadata>
- <defs
- id="defs38" />
- <sodipodi:namedview
- pagecolor="#ffffff"
- bordercolor="#666666"
- borderopacity="1"
- objecttolerance="10"
- gridtolerance="10"
- guidetolerance="10"
- inkscape:pageopacity="0"
- inkscape:pageshadow="2"
- inkscape:window-width="849"
- inkscape:window-height="639"
- id="namedview36"
- showgrid="false"
- inkscape:zoom="0.51326165"
- inkscape:cx="544.56912"
- inkscape:cy="334.10686"
- inkscape:window-x="149"
- inkscape:window-y="448"
- inkscape:window-maximized="0"
- inkscape:current-layer="g4"
- fit-margin-top="5"
- fit-margin-left="5"
- fit-margin-right="5"
- fit-margin-bottom="5" />
- <g
- style="fill:none;stroke-width:0.025in"
- id="g4"
- transform="translate(-2043.6828,14.791398)">
- <!-- Line: box -->
- <rect
- x="0"
- y="0"
- width="14400"
- height="8775"
- rx="0"
- style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
- id="rect6" />
- <!-- Line: box -->
- <rect
- x="1350"
- y="0"
- width="11700"
- height="6075"
- rx="0"
- style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
- id="rect8" />
- <!-- Line: box -->
- <rect
- x="2700"
- y="0"
- width="9000"
- height="4275"
- rx="0"
- style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
- id="rect10" />
- <!-- Line: box -->
- <rect
- x="4050"
- y="0"
- width="6300"
- height="2475"
- rx="0"
- style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
- id="rect12" />
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="900"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text14"
- sodipodi:linespacing="125%"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- id="tspan3017">Read-Mostly, Stale &amp;</tspan></text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="1350"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text16"
- sodipodi:linespacing="125%"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- id="tspan3019">Inconsistent Data OK</tspan></text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="1800"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text18"
- sodipodi:linespacing="125%"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- id="tspan3021">(RCU Works Great!!!)</tspan></text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="3825"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text20"
- sodipodi:linespacing="125%"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- id="tspan3023">(RCU Works Well)</tspan></text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="3375"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text22"
- sodipodi:linespacing="125%"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="5175"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text24"
- sodipodi:linespacing="125%"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- id="tspan3027">Read-Write, Need Consistent Data</tspan></text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="6975"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text26"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="5625"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text28"
- sodipodi:linespacing="125%"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- id="tspan3029">(RCU Might Be OK...)</tspan></text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="7875"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text30"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="8325"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text32"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text>
- <!-- Text -->
- <text
- xml:space="preserve"
- x="7200"
- y="7425"
- font-style="normal"
- font-weight="normal"
- font-size="324"
- id="text34"
- style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
- sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text>
- </g>
-</svg>
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index a725f9900ec8..e7e24b3e86e2 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -1,5 +1,3 @@
-<!-- DO NOT HAND EDIT. -->
-<!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
@@ -65,8 +63,8 @@ All that aside, here are the categories of currently known RCU requirements:
<p>
This is followed by a <a href="#Summary">summary</a>,
-which is in turn followed by the inevitable
-<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
+however, the answer to each quick quiz immediately follows the quiz.
+Select the big white space with your mouse to see the answer.
<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
@@ -153,13 +151,27 @@ Therefore, the outcome:
</blockquote>
cannot happen.
-<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a>
-Wait a minute!
-You said that updaters can make useful forward progress concurrently
-with readers, but pre-existing readers will block
-<tt>synchronize_rcu()</tt>!!!
-Just who are you trying to fool???
-<br><a href="#qq1answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Wait a minute!
+ You said that updaters can make useful forward progress concurrently
+ with readers, but pre-existing readers will block
+ <tt>synchronize_rcu()</tt>!!!
+ Just who are you trying to fool???
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ First, if updaters do not wish to be blocked by readers, they can use
+ <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
+ be discussed later.
+ Second, even when using <tt>synchronize_rcu()</tt>, the other
+ update-side code does run concurrently with readers, whether
+ pre-existing or not.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
This scenario resembles one of the first uses of RCU in
@@ -210,9 +222,20 @@ to guarantee that <tt>do_something()</tt> never runs concurrently
with <tt>recovery()</tt>, but with little or no synchronization
overhead in <tt>do_something_dlm()</tt>.
-<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a>
-Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
-<br><a href="#qq2answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Without that extra grace period, memory reordering could result in
+ <tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
+ concurrently with the last bits of <tt>recovery()</tt>.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
In order to avoid fatal problems such as deadlocks,
@@ -332,12 +355,27 @@ It also prevents any number of &ldquo;interesting&rdquo; compiler
optimizations, for example, the use of <tt>gp</tt> as a scratch
location immediately preceding the assignment.
-<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a>
-But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
-two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
-from being reordered.
-Can't that also cause problems?
-<br><a href="#qq3answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
+ two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
+ from being reordered.
+ Can't that also cause problems?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ No, it cannot.
+ The readers cannot see either of these two fields until
+ the assignment to <tt>gp</tt>, by which time both fields are
+ fully initialized.
+ So reordering the assignments
+ to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
+ cause any problems.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
It is tempting to assume that the reader need not do anything special
@@ -494,11 +532,42 @@ The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
code protected by the corresponding update-side lock.
</ol>
-<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a>
-Without the <tt>rcu_dereference()</tt> or the
-<tt>rcu_access_pointer()</tt>, what destructive optimizations
-might the compiler make use of?
-<br><a href="#qq4answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Without the <tt>rcu_dereference()</tt> or the
+ <tt>rcu_access_pointer()</tt>, what destructive optimizations
+ might the compiler make use of?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Let's start with what happens to <tt>do_something_gp()</tt>
+ if it fails to use <tt>rcu_dereference()</tt>.
+ It could reuse a value formerly fetched from this same pointer.
+ It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
+ manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
+ mash-up of two distinct pointer values.
+ It might even use value-speculation optimizations, where it makes
+ a wrong guess, but by the time it gets around to checking the
+ value, an update has changed the pointer to match the wrong guess.
+ Too bad about any dereferences that returned pre-initialization garbage
+ in the meantime!
+ </font>
+
+ <p><font color="ffffff">
+ For <tt>remove_gp_synchronous()</tt>, as long as all modifications
+ to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
+ the above optimizations are harmless.
+ However,
+ with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
+ <tt>sparse</tt> will complain if you
+ define <tt>gp</tt> with <tt>__rcu</tt> and then
+ access it without using
+ either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
In short, RCU's publish-subscribe guarantee is provided by the combination
@@ -571,17 +640,156 @@ systems with more than one CPU:
<tt>synchronize_rcu()</tt> migrates in the meantime.
</ol>
-<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a>
-Given that multiple CPUs can start RCU read-side critical sections
-at any time without any ordering whatsoever, how can RCU possibly tell whether
-or not a given RCU read-side critical section starts before a
-given instance of <tt>synchronize_rcu()</tt>?
-<br><a href="#qq5answer">Answer</a>
-
-<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a>
-The first and second guarantees require unbelievably strict ordering!
-Are all these memory barriers <i> really</i> required?
-<br><a href="#qq6answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Given that multiple CPUs can start RCU read-side critical sections
+ at any time without any ordering whatsoever, how can RCU possibly
+ tell whether or not a given RCU read-side critical section starts
+ before a given instance of <tt>synchronize_rcu()</tt>?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ If RCU cannot tell whether or not a given
+ RCU read-side critical section starts before a
+ given instance of <tt>synchronize_rcu()</tt>,
+ then it must assume that the RCU read-side critical section
+ started first.
+ In other words, a given instance of <tt>synchronize_rcu()</tt>
+ can avoid waiting on a given RCU read-side critical section only
+ if it can prove that <tt>synchronize_rcu()</tt> started first.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ The first and second guarantees require unbelievably strict ordering!
+ Are all these memory barriers <i> really</i> required?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Yes, they really are required.
+ To see why the first guarantee is required, consider the following
+ sequence of events:
+ </font>
+
+ <ol>
+ <li> <font color="ffffff">
+ CPU 1: <tt>rcu_read_lock()</tt>
+ </font>
+ <li> <font color="ffffff">
+ CPU 1: <tt>q = rcu_dereference(gp);
+ /* Very likely to return p. */</tt>
+ </font>
+ <li> <font color="ffffff">
+ CPU 0: <tt>list_del_rcu(p);</tt>
+ </font>
+ <li> <font color="ffffff">
+ CPU 0: <tt>synchronize_rcu()</tt> starts.
+ </font>
+ <li> <font color="ffffff">
+ CPU 1: <tt>do_something_with(q-&gt;a);
+ /* No smp_mb(), so might happen after kfree(). */</tt>
+ </font>
+ <li> <font color="ffffff">
+ CPU 1: <tt>rcu_read_unlock()</tt>
+ </font>
+ <li> <font color="ffffff">
+ CPU 0: <tt>synchronize_rcu()</tt> returns.
+ </font>
+ <li> <font color="ffffff">
+ CPU 0: <tt>kfree(p);</tt>
+ </font>
+ </ol>
+
+ <p><font color="ffffff">
+ Therefore, there absolutely must be a full memory barrier between the
+ end of the RCU read-side critical section and the end of the
+ grace period.
+ </font>
+
+ <p><font color="ffffff">
+ The sequence of events demonstrating the necessity of the second rule
+ is roughly similar:
+ </font>
+
+ <ol>
+ <li> <font color="ffffff">CPU 0: <tt>list_del_rcu(p);</tt>
+ </font>
+ <li> <font color="ffffff">CPU 0: <tt>synchronize_rcu()</tt> starts.
+ </font>
+ <li> <font color="ffffff">CPU 1: <tt>rcu_read_lock()</tt>
+ </font>
+ <li> <font color="ffffff">CPU 1: <tt>q = rcu_dereference(gp);
+ /* Might return p if no memory barrier. */</tt>
+ </font>
+ <li> <font color="ffffff">CPU 0: <tt>synchronize_rcu()</tt> returns.
+ </font>
+ <li> <font color="ffffff">CPU 0: <tt>kfree(p);</tt>
+ </font>
+ <li> <font color="ffffff">
+ CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
+ </font>
+ <li> <font color="ffffff">CPU 1: <tt>rcu_read_unlock()</tt>
+ </font>
+ </ol>
+
+ <p><font color="ffffff">
+ And similarly, without a memory barrier between the beginning of the
+ grace period and the beginning of the RCU read-side critical section,
+ CPU&nbsp;1 might end up accessing the freelist.
+ </font>
+
+ <p><font color="ffffff">
+ The &ldquo;as if&rdquo; rule of course applies, so that any
+ implementation that acts as if the appropriate memory barriers
+ were in place is a correct implementation.
+ That said, it is much easier to fool yourself into believing
+ that you have adhered to the as-if rule than it is to actually
+ adhere to it!
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ You claim that <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>
+ generate absolutely no code in some kernel builds.
+ This means that the compiler might arbitrarily rearrange consecutive
+ RCU read-side critical sections.
+ Given such rearrangement, if a given RCU read-side critical section
+ is done, how can you be sure that all prior RCU read-side critical
+ sections are done?
+ Won't the compiler rearrangements make that impossible to determine?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ In cases where <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>
+ generate absolutely no code, RCU infers quiescent states only at
+ special locations, for example, within the scheduler.
+ Because calls to <tt>schedule()</tt> had better prevent calling-code
+ accesses to shared variables from being rearranged across the call to
+ <tt>schedule()</tt>, if RCU detects the end of a given RCU read-side
+ critical section, it will necessarily detect the end of all prior
+ RCU read-side critical sections, no matter how aggressively the
+ compiler scrambles the code.
+ </font>
+
+ <p><font color="ffffff">
+ Again, this all assumes that the compiler cannot scramble code across
+ calls to the scheduler, out of interrupt handlers, into the idle loop,
+ into user-mode code, and so on.
+ But if your kernel build allows that sort of scrambling, you have broken
+ far more than just RCU!
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
Note that these memory-barrier requirements do not replace the fundamental
@@ -626,9 +834,19 @@ inconvenience can be avoided through use of the
<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
described later in this document.
-<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a>
-But how does the upgrade-to-write operation exclude other readers?
-<br><a href="#qq7answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ But how does the upgrade-to-write operation exclude other readers?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ It doesn't, just like normal RCU updates, which also do not exclude
+ RCU readers.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
This guarantee allows lookup code to be shared between read-side
@@ -714,9 +932,20 @@ to do significant reordering.
This is by design: Any significant ordering constraints would slow down
these fast-path APIs.
-<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a>
-Can't the compiler also reorder this code?
-<br><a href="#qq8answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Can't the compiler also reorder this code?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ No, the volatile casts in <tt>READ_ONCE()</tt> and
+ <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
+ this particular case.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
@@ -769,10 +998,28 @@ new readers can start immediately after <tt>synchronize_rcu()</tt>
starts, and <tt>synchronize_rcu()</tt> is under no
obligation to wait for these new readers.
-<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a>
-Suppose that synchronize_rcu() did wait until all readers had completed.
-Would the updater be able to rely on this?
-<br><a href="#qq9answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Suppose that synchronize_rcu() did wait until <i>all</i>
+ readers had completed instead of waiting only on
+ pre-existing readers.
+ For how long would the updater be able to rely on there
+ being no readers?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ For no time at all.
+ Even if <tt>synchronize_rcu()</tt> were to wait until
+ all readers had completed, a new reader might start immediately after
+ <tt>synchronize_rcu()</tt> completed.
+ Therefore, the code following
+ <tt>synchronize_rcu()</tt> can <i>never</i> rely on there being
+ no readers.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
@@ -969,11 +1216,24 @@ grace period.
As a result, an RCU read-side critical section cannot partition a pair
of RCU grace periods.
-<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a>
-How long a sequence of grace periods, each separated by an RCU read-side
-critical section, would be required to partition the RCU read-side
-critical sections at the beginning and end of the chain?
-<br><a href="#qq10answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ How long a sequence of grace periods, each separated by an RCU
+ read-side critical section, would be required to partition the RCU
+ read-side critical sections at the beginning and end of the chain?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ In theory, an infinite number.
+ In practice, an unknown number that is sensitive to both implementation
+ details and timing considerations.
+ Therefore, even in practice, RCU users must abide by the
+ theoretical rather than the practical answer.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<h3><a name="Disabling Preemption Does Not Block Grace Periods">
Disabling Preemption Does Not Block Grace Periods</a></h3>
@@ -1109,12 +1369,27 @@ These classes is covered in the following sections.
<h3><a name="Specialization">Specialization</a></h3>
<p>
-RCU is and always has been intended primarily for read-mostly situations, as
-illustrated by the following figure.
-This means that RCU's read-side primitives are optimized, often at the
+RCU is and always has been intended primarily for read-mostly situations,
+which means that RCU's read-side primitives are optimized, often at the
expense of its update-side primitives.
+Experience thus far is captured by the following list of situations:
-<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
+<ol>
+<li> Read-mostly data, where stale and inconsistent data is not
+ a problem: RCU works great!
+<li> Read-mostly data, where data must be consistent:
+ RCU works well.
+<li> Read-write data, where data must be consistent:
+ RCU <i>might</i> work OK.
+ Or not.
+<li> Write-mostly data, where data must be consistent:
+ RCU is very unlikely to be the right tool for the job,
+ with the following exceptions, where RCU can provide:
+ <ol type=a>
+ <li> Existence guarantees for update-friendly mechanisms.
+ <li> Wait-free read-side primitives for real-time use.
+ </ol>
+</ol>
<p>
This focus on read-mostly situations means that RCU must interoperate
@@ -1127,9 +1402,43 @@ synchronization primitives be legal within RCU read-side critical sections,
including spinlocks, sequence locks, atomic operations, reference
counters, and memory barriers.
-<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a>
-What about sleeping locks?
-<br><a href="#qq11answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ What about sleeping locks?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ These are forbidden within Linux-kernel RCU read-side critical
+ sections because it is not legal to place a quiescent state
+ (in this case, voluntary context switch) within an RCU read-side
+ critical section.
+ However, sleeping locks may be used within userspace RCU read-side
+ critical sections, and also within Linux-kernel sleepable RCU
+ <a href="#Sleepable RCU"><font color="ffffff">(SRCU)</font></a>
+ read-side critical sections.
+ In addition, the -rt patchset turns spinlocks into
+ sleeping locks so that the corresponding critical sections
+ can be preempted, which also means that these sleeplockified
+ spinlocks (but not other sleeping locks!) may be acquired within
+ -rt-Linux-kernel RCU read-side critical sections.
+ </font>
+
+ <p><font color="ffffff">
+ Note that it <i>is</i> legal for a normal RCU read-side
+ critical section to conditionally acquire a sleeping lock
+ (as in <tt>mutex_trylock()</tt>), but only as long as it does
+ not loop indefinitely attempting to conditionally acquire that
+ sleeping lock.
+ The key point is that things like <tt>mutex_trylock()</tt>
+ either return with the mutex held, or return an error indication if
+ the mutex was not immediately available.
+ Either way, <tt>mutex_trylock()</tt> returns immediately without
+ sleeping.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
It often comes as a surprise that many algorithms do not require a
@@ -1160,10 +1469,7 @@ some period of time, so the exact wait period is a judgment call.
One of our pair of veterinarians might wait 30 seconds before pronouncing
the cat dead, while the other might insist on waiting a full minute.
The two veterinarians would then disagree on the state of the cat during
-the final 30 seconds of the minute following the last heartbeat, as
-fancifully illustrated below:
-
-<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
+the final 30 seconds of the minute following the last heartbeat.
<p>
Interestingly enough, this same situation applies to hardware.
@@ -1343,7 +1649,8 @@ situations where neither <tt>synchronize_rcu()</tt> nor
<tt>synchronize_rcu_expedited()</tt> would be legal,
including within preempt-disable code, <tt>local_bh_disable()</tt> code,
interrupt-disable code, and interrupt handlers.
-However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
+However, even <tt>call_rcu()</tt> is illegal within NMI handlers
+and from idle and offline CPUs.
The callback function (<tt>remove_gp_cb()</tt> in this case) will be
executed within softirq (software interrupt) environment within the
Linux kernel,
@@ -1354,12 +1661,27 @@ write an RCU callback function that takes too long.
Long-running operations should be relegated to separate threads or
(in the Linux kernel) workqueues.
-<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a>
-Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
-After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
-structure, which would interact badly with concurrent insertions.
-Doesn't this mean that <tt>rcu_dereference()</tt> is required?
-<br><a href="#qq12answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
+ After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
+ structure, which would interact badly with concurrent insertions.
+ Doesn't this mean that <tt>rcu_dereference()</tt> is required?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes
+ any changes, including any insertions that <tt>rcu_dereference()</tt>
+ would protect against.
+ Therefore, any insertions will be delayed until after
+ <tt>-&gt;gp_lock</tt>
+ is released on line&nbsp;25, which in turn means that
+ <tt>rcu_access_pointer()</tt> suffices.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
However, all that <tt>remove_gp_cb()</tt> is doing is
@@ -1406,14 +1728,31 @@ This was due to the fact that RCU was not heavily used within DYNIX/ptx,
so the very few places that needed something like
<tt>synchronize_rcu()</tt> simply open-coded it.
-<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a>
-Earlier it was claimed that <tt>call_rcu()</tt> and
-<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
-by readers.
-But how can that be correct, given that the invocation of the callback
-and the freeing of the memory (respectively) must still wait for
-a grace period to elapse?
-<br><a href="#qq13answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Earlier it was claimed that <tt>call_rcu()</tt> and
+ <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
+ by readers.
+ But how can that be correct, given that the invocation of the callback
+ and the freeing of the memory (respectively) must still wait for
+ a grace period to elapse?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="#ffffff">
+ We could define things this way, but keep in mind that this sort of
+ definition would say that updates in garbage-collected languages
+ cannot complete until the next time the garbage collector runs,
+ which does not seem at all reasonable.
+ The key point is that in most cases, an updater using either
+ <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
+ next update as soon as it has invoked <tt>call_rcu()</tt> or
+ <tt>kfree_rcu()</tt>, without having to wait for a subsequent
+ grace period.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
But what if the updater must wait for the completion of code to be
@@ -1838,11 +2177,26 @@ kthreads to be spawned.
Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
initialization can result in deadlock.
-<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a>
-So what happens with <tt>synchronize_rcu()</tt> during
-scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
-kernels?
-<br><a href="#qq14answer">Answer</a>
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ So what happens with <tt>synchronize_rcu()</tt> during
+ scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
+ kernels?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="#ffffff">
+ In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
+ maps directly to <tt>synchronize_sched()</tt>.
+ Therefore, <tt>synchronize_rcu()</tt> works normally throughout
+ boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
+ However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
+ so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
+ during scheduler initialization.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
<p>
I learned of these boot-time requirements as a result of a series of
@@ -2171,6 +2525,14 @@ This real-time requirement motivated the grace-period kthread, which
also simplified handling of a number of race conditions.
<p>
+RCU must avoid degrading real-time response for CPU-bound threads, whether
+executing in usermode (which is one use case for
+<tt>CONFIG_NO_HZ_FULL=y</tt>) or in the kernel.
+That said, CPU-bound loops in the kernel must execute
+<tt>cond_resched_rcu_qs()</tt> at least once per few tens of milliseconds
+in order to avoid receiving an IPI from RCU.
+
+<p>
Finally, RCU's status as a synchronization primitive means that
any RCU failure can result in arbitrary memory corruption that can be
extremely difficult to debug.
@@ -2223,6 +2585,8 @@ described in a separate section.
<li> <a href="#Sched Flavor">Sched Flavor</a>
<li> <a href="#Sleepable RCU">Sleepable RCU</a>
<li> <a href="#Tasks RCU">Tasks RCU</a>
+<li> <a href="#Waiting for Multiple Grace Periods">
+ Waiting for Multiple Grace Periods</a>
</ol>
<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
@@ -2472,6 +2836,94 @@ The tasks-RCU API is quite compact, consisting only of
<tt>synchronize_rcu_tasks()</tt>, and
<tt>rcu_barrier_tasks()</tt>.
+<h3><a name="Waiting for Multiple Grace Periods">
+Waiting for Multiple Grace Periods</a></h3>
+
+<p>
+Perhaps you have an RCU-protected data structure that is accessed from
+RCU read-side critical sections, from softirq handlers, and from
+hardware interrupt handlers.
+That is three flavors of RCU: the normal flavor, the bottom-half flavor,
+and the sched flavor.
+How do you wait for a compound grace period?
+
+<p>
+The best approach is usually to &ldquo;just say no!&rdquo; and
+insert <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>
+around each RCU read-side critical section, regardless of what
+environment it happens to be in.
+But suppose that some of the RCU read-side critical sections are
+on extremely hot code paths, and that use of <tt>CONFIG_PREEMPT=n</tt>
+is not a viable option, so that <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> are not free.
+What then?
+
+<p>
+You <i>could</i> wait on all three grace periods in succession, as follows:
+
+<blockquote>
+<pre>
+ 1 synchronize_rcu();
+ 2 synchronize_rcu_bh();
+ 3 synchronize_sched();
+</pre>
+</blockquote>
+
+<p>
+This works, but triples the update-side latency penalty.
+In cases where this is not acceptable, <tt>synchronize_rcu_mult()</tt>
+may be used to wait on all three flavors of grace period concurrently:
+
+<blockquote>
+<pre>
+ 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched);
+</pre>
+</blockquote>
+
+<p>
+But what if it is necessary to also wait on SRCU?
+This can be done as follows:
+
+<blockquote>
+<pre>
+ 1 static void call_my_srcu(struct rcu_head *head,
+ 2 void (*func)(struct rcu_head *head))
+ 3 {
+ 4 call_srcu(&amp;my_srcu, head, func);
+ 5 }
+ 6
+ 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu);
+</pre>
+</blockquote>
+
+<p>
+If you needed to wait on multiple different flavors of SRCU
+(but why???), you would need to create a wrapper function resembling
+<tt>call_my_srcu()</tt> for each SRCU flavor.
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ But what if I need to wait for multiple RCU flavors, but I also need
+ the grace periods to be expedited?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="#ffffff">
+ If you are using expedited grace periods, there should be less penalty
+ for waiting on them in succession.
+ But if that is nevertheless a problem, you can use workqueues
+ or multiple kthreads to wait on the various expedited grace
+ periods concurrently.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>
+Again, it is usually better to adjust the RCU read-side critical sections
+to use a single flavor of RCU, but when this is not feasible, you can use
+<tt>synchronize_rcu_mult()</tt>.
+
<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
<p>
@@ -2569,329 +3021,4 @@ and is provided
under the terms of the Creative Commons Attribution-Share Alike 3.0
United States license.
-<h3><a name="Answers to Quick Quizzes">
-Answers to Quick Quizzes</a></h3>
-
-<a name="qq1answer"></a>
-<p><b>Quick Quiz 1</b>:
-Wait a minute!
-You said that updaters can make useful forward progress concurrently
-with readers, but pre-existing readers will block
-<tt>synchronize_rcu()</tt>!!!
-Just who are you trying to fool???
-
-
-</p><p><b>Answer</b>:
-First, if updaters do not wish to be blocked by readers, they can use
-<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
-be discussed later.
-Second, even when using <tt>synchronize_rcu()</tt>, the other
-update-side code does run concurrently with readers, whether pre-existing
-or not.
-
-
-</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a>
-
-<a name="qq2answer"></a>
-<p><b>Quick Quiz 2</b>:
-Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
-
-
-</p><p><b>Answer</b>:
-Without that extra grace period, memory reordering could result in
-<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
-concurrently with the last bits of <tt>recovery()</tt>.
-
-
-</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a>
-
-<a name="qq3answer"></a>
-<p><b>Quick Quiz 3</b>:
-But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
-two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
-from being reordered.
-Can't that also cause problems?
-
-
-</p><p><b>Answer</b>:
-No, it cannot.
-The readers cannot see either of these two fields until
-the assignment to <tt>gp</tt>, by which time both fields are
-fully initialized.
-So reordering the assignments
-to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
-cause any problems.
-
-
-</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a>
-
-<a name="qq4answer"></a>
-<p><b>Quick Quiz 4</b>:
-Without the <tt>rcu_dereference()</tt> or the
-<tt>rcu_access_pointer()</tt>, what destructive optimizations
-might the compiler make use of?
-
-
-</p><p><b>Answer</b>:
-Let's start with what happens to <tt>do_something_gp()</tt>
-if it fails to use <tt>rcu_dereference()</tt>.
-It could reuse a value formerly fetched from this same pointer.
-It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
-manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
-mash-up of two distinct pointer values.
-It might even use value-speculation optimizations, where it makes a wrong
-guess, but by the time it gets around to checking the value, an update
-has changed the pointer to match the wrong guess.
-Too bad about any dereferences that returned pre-initialization garbage
-in the meantime!
-
-<p>
-For <tt>remove_gp_synchronous()</tt>, as long as all modifications
-to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
-the above optimizations are harmless.
-However,
-with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
-<tt>sparse</tt> will complain if you
-define <tt>gp</tt> with <tt>__rcu</tt> and then
-access it without using
-either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
-
-
-</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a>
-
-<a name="qq5answer"></a>
-<p><b>Quick Quiz 5</b>:
-Given that multiple CPUs can start RCU read-side critical sections
-at any time without any ordering whatsoever, how can RCU possibly tell whether
-or not a given RCU read-side critical section starts before a
-given instance of <tt>synchronize_rcu()</tt>?
-
-
-</p><p><b>Answer</b>:
-If RCU cannot tell whether or not a given
-RCU read-side critical section starts before a
-given instance of <tt>synchronize_rcu()</tt>,
-then it must assume that the RCU read-side critical section
-started first.
-In other words, a given instance of <tt>synchronize_rcu()</tt>
-can avoid waiting on a given RCU read-side critical section only
-if it can prove that <tt>synchronize_rcu()</tt> started first.
-
-
-</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a>
-
-<a name="qq6answer"></a>
-<p><b>Quick Quiz 6</b>:
-The first and second guarantees require unbelievably strict ordering!
-Are all these memory barriers <i> really</i> required?
-
-
-</p><p><b>Answer</b>:
-Yes, they really are required.
-To see why the first guarantee is required, consider the following
-sequence of events:
-
-<ol>
-<li> CPU 1: <tt>rcu_read_lock()</tt>
-<li> CPU 1: <tt>q = rcu_dereference(gp);
- /* Very likely to return p. */</tt>
-<li> CPU 0: <tt>list_del_rcu(p);</tt>
-<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
-<li> CPU 1: <tt>do_something_with(q-&gt;a);
- /* No smp_mb(), so might happen after kfree(). */</tt>
-<li> CPU 1: <tt>rcu_read_unlock()</tt>
-<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
-<li> CPU 0: <tt>kfree(p);</tt>
-</ol>
-
-<p>
-Therefore, there absolutely must be a full memory barrier between the
-end of the RCU read-side critical section and the end of the
-grace period.
-
-<p>
-The sequence of events demonstrating the necessity of the second rule
-is roughly similar:
-
-<ol>
-<li> CPU 0: <tt>list_del_rcu(p);</tt>
-<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
-<li> CPU 1: <tt>rcu_read_lock()</tt>
-<li> CPU 1: <tt>q = rcu_dereference(gp);
- /* Might return p if no memory barrier. */</tt>
-<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
-<li> CPU 0: <tt>kfree(p);</tt>
-<li> CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
-<li> CPU 1: <tt>rcu_read_unlock()</tt>
-</ol>
-
-<p>
-And similarly, without a memory barrier between the beginning of the
-grace period and the beginning of the RCU read-side critical section,
-CPU&nbsp;1 might end up accessing the freelist.
-
-<p>
-The &ldquo;as if&rdquo; rule of course applies, so that any implementation
-that acts as if the appropriate memory barriers were in place is a
-correct implementation.
-That said, it is much easier to fool yourself into believing that you have
-adhered to the as-if rule than it is to actually adhere to it!
-
-
-</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a>
-
-<a name="qq7answer"></a>
-<p><b>Quick Quiz 7</b>:
-But how does the upgrade-to-write operation exclude other readers?
-
-
-</p><p><b>Answer</b>:
-It doesn't, just like normal RCU updates, which also do not exclude
-RCU readers.
-
-
-</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a>
-
-<a name="qq8answer"></a>
-<p><b>Quick Quiz 8</b>:
-Can't the compiler also reorder this code?
-
-
-</p><p><b>Answer</b>:
-No, the volatile casts in <tt>READ_ONCE()</tt> and
-<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
-this particular case.
-
-
-</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a>
-
-<a name="qq9answer"></a>
-<p><b>Quick Quiz 9</b>:
-Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
-Would the updater be able to rely on this?
-
-
-</p><p><b>Answer</b>:
-No.
-Even if <tt>synchronize_rcu()</tt> were to wait until
-all readers had completed, a new reader might start immediately after
-<tt>synchronize_rcu()</tt> completed.
-Therefore, the code following
-<tt>synchronize_rcu()</tt> cannot rely on there being no readers
-in any case.
-
-
-</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a>
-
-<a name="qq10answer"></a>
-<p><b>Quick Quiz 10</b>:
-How long a sequence of grace periods, each separated by an RCU read-side
-critical section, would be required to partition the RCU read-side
-critical sections at the beginning and end of the chain?
-
-
-</p><p><b>Answer</b>:
-In theory, an infinite number.
-In practice, an unknown number that is sensitive to both implementation
-details and timing considerations.
-Therefore, even in practice, RCU users must abide by the theoretical rather
-than the practical answer.
-
-
-</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a>
-
-<a name="qq11answer"></a>
-<p><b>Quick Quiz 11</b>:
-What about sleeping locks?
-
-
-</p><p><b>Answer</b>:
-These are forbidden within Linux-kernel RCU read-side critical sections
-because it is not legal to place a quiescent state (in this case,
-voluntary context switch) within an RCU read-side critical section.
-However, sleeping locks may be used within userspace RCU read-side critical
-sections, and also within Linux-kernel sleepable RCU
-<a href="#Sleepable RCU">(SRCU)</a>
-read-side critical sections.
-In addition, the -rt patchset turns spinlocks into sleeping locks so
-that the corresponding critical sections can be preempted, which
-also means that these sleeplockified spinlocks (but not other sleeping locks!)
-may be acquired within -rt-Linux-kernel RCU read-side critical sections.
-
-<p>
-Note that it <i>is</i> legal for a normal RCU read-side critical section
-to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
-but only as long as it does not loop indefinitely attempting to
-conditionally acquire that sleeping lock.
-The key point is that things like <tt>mutex_trylock()</tt>
-either return with the mutex held, or return an error indication if
-the mutex was not immediately available.
-Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
-
-
-</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a>
-
-<a name="qq12answer"></a>
-<p><b>Quick Quiz 12</b>:
-Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
-After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
-structure, which would interact badly with concurrent insertions.
-Doesn't this mean that <tt>rcu_dereference()</tt> is required?
-
-
-</p><p><b>Answer</b>:
-Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes
-any changes, including any insertions that <tt>rcu_dereference()</tt>
-would protect against.
-Therefore, any insertions will be delayed until after <tt>-&gt;gp_lock</tt>
-is released on line&nbsp;25, which in turn means that
-<tt>rcu_access_pointer()</tt> suffices.
-
-
-</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a>
-
-<a name="qq13answer"></a>
-<p><b>Quick Quiz 13</b>:
-Earlier it was claimed that <tt>call_rcu()</tt> and
-<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
-by readers.
-But how can that be correct, given that the invocation of the callback
-and the freeing of the memory (respectively) must still wait for
-a grace period to elapse?
-
-
-</p><p><b>Answer</b>:
-We could define things this way, but keep in mind that this sort of
-definition would say that updates in garbage-collected languages
-cannot complete until the next time the garbage collector runs,
-which does not seem at all reasonable.
-The key point is that in most cases, an updater using either
-<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
-next update as soon as it has invoked <tt>call_rcu()</tt> or
-<tt>kfree_rcu()</tt>, without having to wait for a subsequent
-grace period.
-
-
-</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a>
-
-<a name="qq14answer"></a>
-<p><b>Quick Quiz 14</b>:
-So what happens with <tt>synchronize_rcu()</tt> during
-scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
-kernels?
-
-
-</p><p><b>Answer</b>:
-In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
-maps directly to <tt>synchronize_sched()</tt>.
-Therefore, <tt>synchronize_rcu()</tt> works normally throughout
-boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
-However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
-so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
-during scheduler initialization.
-
-
-</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a>
-
-
</body></html>
diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx
deleted file mode 100644
index 3a97ba490c42..000000000000
--- a/Documentation/RCU/Design/Requirements/Requirements.htmlx
+++ /dev/null
@@ -1,2741 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
- "http://www.w3.org/TR/html4/loose.dtd">
- <html>
- <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
- <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
-
-<h1>A Tour Through RCU's Requirements</h1>
-
-<p>Copyright IBM Corporation, 2015</p>
-<p>Author: Paul E.&nbsp;McKenney</p>
-<p><i>The initial version of this document appeared in the
-<a href="https://lwn.net/">LWN</a> articles
-<a href="https://lwn.net/Articles/652156/">here</a>,
-<a href="https://lwn.net/Articles/652677/">here</a>, and
-<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
-
-<h2>Introduction</h2>
-
-<p>
-Read-copy update (RCU) is a synchronization mechanism that is often
-used as a replacement for reader-writer locking.
-RCU is unusual in that updaters do not block readers,
-which means that RCU's read-side primitives can be exceedingly fast
-and scalable.
-In addition, updaters can make useful forward progress concurrently
-with readers.
-However, all this concurrency between RCU readers and updaters does raise
-the question of exactly what RCU readers are doing, which in turn
-raises the question of exactly what RCU's requirements are.
-
-<p>
-This document therefore summarizes RCU's requirements, and can be thought
-of as an informal, high-level specification for RCU.
-It is important to understand that RCU's specification is primarily
-empirical in nature;
-in fact, I learned about many of these requirements the hard way.
-This situation might cause some consternation; however, not only
-has this learning process been a lot of fun, but it has also been
-a great privilege to work with so many people willing to apply
-technologies in interesting new ways.
-
-<p>
-All that aside, here are the categories of currently known RCU requirements:
-</p>
-
-<ol>
-<li> <a href="#Fundamental Requirements">
- Fundamental Requirements</a>
-<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
-<li> <a href="#Parallelism Facts of Life">
- Parallelism Facts of Life</a>
-<li> <a href="#Quality-of-Implementation Requirements">
- Quality-of-Implementation Requirements</a>
-<li> <a href="#Linux Kernel Complications">
- Linux Kernel Complications</a>
-<li> <a href="#Software-Engineering Requirements">
- Software-Engineering Requirements</a>
-<li> <a href="#Other RCU Flavors">
- Other RCU Flavors</a>
-<li> <a href="#Possible Future Changes">
- Possible Future Changes</a>
-</ol>
-
-<p>
-This is followed by a <a href="#Summary">summary</a>,
-which is in turn followed by the inevitable
-<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
-
-<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
-
-<p>
-RCU's fundamental requirements are the closest thing RCU has to hard
-mathematical requirements.
-These are:
-
-<ol>
-<li> <a href="#Grace-Period Guarantee">
- Grace-Period Guarantee</a>
-<li> <a href="#Publish-Subscribe Guarantee">
- Publish-Subscribe Guarantee</a>
-<li> <a href="#Memory-Barrier Guarantees">
- Memory-Barrier Guarantees</a>
-<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally">
- RCU Primitives Guaranteed to Execute Unconditionally</a>
-<li> <a href="#Guaranteed Read-to-Write Upgrade">
- Guaranteed Read-to-Write Upgrade</a>
-</ol>
-
-<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
-
-<p>
-RCU's grace-period guarantee is unusual in being premeditated:
-Jack Slingwine and I had this guarantee firmly in mind when we started
-work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s.
-That said, the past two decades of experience with RCU have produced
-a much more detailed understanding of this guarantee.
-
-<p>
-RCU's grace-period guarantee allows updaters to wait for the completion
-of all pre-existing RCU read-side critical sections.
-An RCU read-side critical section
-begins with the marker <tt>rcu_read_lock()</tt> and ends with
-the marker <tt>rcu_read_unlock()</tt>.
-These markers may be nested, and RCU treats a nested set as one
-big RCU read-side critical section.
-Production-quality implementations of <tt>rcu_read_lock()</tt> and
-<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
-fact have exactly zero overhead in Linux kernels built for production
-use with <tt>CONFIG_PREEMPT=n</tt>.
-
-<p>
-This guarantee allows ordering to be enforced with extremely low
-overhead to readers, for example:
-
-<blockquote>
-<pre>
- 1 int x, y;
- 2
- 3 void thread0(void)
- 4 {
- 5 rcu_read_lock();
- 6 r1 = READ_ONCE(x);
- 7 r2 = READ_ONCE(y);
- 8 rcu_read_unlock();
- 9 }
-10
-11 void thread1(void)
-12 {
-13 WRITE_ONCE(x, 1);
-14 synchronize_rcu();
-15 WRITE_ONCE(y, 1);
-16 }
-</pre>
-</blockquote>
-
-<p>
-Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for
-all pre-existing readers, any instance of <tt>thread0()</tt> that
-loads a value of zero from <tt>x</tt> must complete before
-<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
-also load a value of zero from <tt>y</tt>.
-Similarly, any instance of <tt>thread0()</tt> that loads a value of
-one from <tt>y</tt> must have started after the
-<tt>synchronize_rcu()</tt> started, and must therefore also load
-a value of one from <tt>x</tt>.
-Therefore, the outcome:
-<blockquote>
-<pre>
-(r1 == 0 &amp;&amp; r2 == 1)
-</pre>
-</blockquote>
-cannot happen.
-
-<p>@@QQ@@
-Wait a minute!
-You said that updaters can make useful forward progress concurrently
-with readers, but pre-existing readers will block
-<tt>synchronize_rcu()</tt>!!!
-Just who are you trying to fool???
-<p>@@QQA@@
-First, if updaters do not wish to be blocked by readers, they can use
-<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
-be discussed later.
-Second, even when using <tt>synchronize_rcu()</tt>, the other
-update-side code does run concurrently with readers, whether pre-existing
-or not.
-<p>@@QQE@@
-
-<p>
-This scenario resembles one of the first uses of RCU in
-<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
-which managed a distributed lock manager's transition into
-a state suitable for handling recovery from node failure,
-more or less as follows:
-
-<blockquote>
-<pre>
- 1 #define STATE_NORMAL 0
- 2 #define STATE_WANT_RECOVERY 1
- 3 #define STATE_RECOVERING 2
- 4 #define STATE_WANT_NORMAL 3
- 5
- 6 int state = STATE_NORMAL;
- 7
- 8 void do_something_dlm(void)
- 9 {
-10 int state_snap;
-11
-12 rcu_read_lock();
-13 state_snap = READ_ONCE(state);
-14 if (state_snap == STATE_NORMAL)
-15 do_something();
-16 else
-17 do_something_carefully();
-18 rcu_read_unlock();
-19 }
-20
-21 void start_recovery(void)
-22 {
-23 WRITE_ONCE(state, STATE_WANT_RECOVERY);
-24 synchronize_rcu();
-25 WRITE_ONCE(state, STATE_RECOVERING);
-26 recovery();
-27 WRITE_ONCE(state, STATE_WANT_NORMAL);
-28 synchronize_rcu();
-29 WRITE_ONCE(state, STATE_NORMAL);
-30 }
-</pre>
-</blockquote>
-
-<p>
-The RCU read-side critical section in <tt>do_something_dlm()</tt>
-works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
-to guarantee that <tt>do_something()</tt> never runs concurrently
-with <tt>recovery()</tt>, but with little or no synchronization
-overhead in <tt>do_something_dlm()</tt>.
-
-<p>@@QQ@@
-Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
-<p>@@QQA@@
-Without that extra grace period, memory reordering could result in
-<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
-concurrently with the last bits of <tt>recovery()</tt>.
-<p>@@QQE@@
-
-<p>
-In order to avoid fatal problems such as deadlocks,
-an RCU read-side critical section must not contain calls to
-<tt>synchronize_rcu()</tt>.
-Similarly, an RCU read-side critical section must not
-contain anything that waits, directly or indirectly, on completion of
-an invocation of <tt>synchronize_rcu()</tt>.
-
-<p>
-Although RCU's grace-period guarantee is useful in and of itself, with
-<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
-it would be good to be able to use RCU to coordinate read-side
-access to linked data structures.
-For this, the grace-period guarantee is not sufficient, as can
-be seen in function <tt>add_gp_buggy()</tt> below.
-We will look at the reader's code later, but in the meantime, just think of
-the reader as locklessly picking up the <tt>gp</tt> pointer,
-and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
-<tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields.
-
-<blockquote>
-<pre>
- 1 bool add_gp_buggy(int a, int b)
- 2 {
- 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
- 4 if (!p)
- 5 return -ENOMEM;
- 6 spin_lock(&amp;gp_lock);
- 7 if (rcu_access_pointer(gp)) {
- 8 spin_unlock(&amp;gp_lock);
- 9 return false;
-10 }
-11 p-&gt;a = a;
-12 p-&gt;b = a;
-13 gp = p; /* ORDERING BUG */
-14 spin_unlock(&amp;gp_lock);
-15 return true;
-16 }
-</pre>
-</blockquote>
-
-<p>
-The problem is that both the compiler and weakly ordered CPUs are within
-their rights to reorder this code as follows:
-
-<blockquote>
-<pre>
- 1 bool add_gp_buggy_optimized(int a, int b)
- 2 {
- 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
- 4 if (!p)
- 5 return -ENOMEM;
- 6 spin_lock(&amp;gp_lock);
- 7 if (rcu_access_pointer(gp)) {
- 8 spin_unlock(&amp;gp_lock);
- 9 return false;
-10 }
-<b>11 gp = p; /* ORDERING BUG */
-12 p-&gt;a = a;
-13 p-&gt;b = a;</b>
-14 spin_unlock(&amp;gp_lock);
-15 return true;
-16 }
-</pre>
-</blockquote>
-
-<p>
-If an RCU reader fetches <tt>gp</tt> just after
-<tt>add_gp_buggy_optimized</tt> executes line&nbsp;11,
-it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt>
-fields.
-And this is but one of many ways in which compiler and hardware optimizations
-could cause trouble.
-Therefore, we clearly need some way to prevent the compiler and the CPU from
-reordering in this manner, which brings us to the publish-subscribe
-guarantee discussed in the next section.
-
-<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3>
-
-<p>
-RCU's publish-subscribe guarantee allows data to be inserted
-into a linked data structure without disrupting RCU readers.
-The updater uses <tt>rcu_assign_pointer()</tt> to insert the
-new data, and readers use <tt>rcu_dereference()</tt> to
-access data, whether new or old.
-The following shows an example of insertion:
-
-<blockquote>
-<pre>
- 1 bool add_gp(int a, int b)
- 2 {
- 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
- 4 if (!p)
- 5 return -ENOMEM;
- 6 spin_lock(&amp;gp_lock);
- 7 if (rcu_access_pointer(gp)) {
- 8 spin_unlock(&amp;gp_lock);
- 9 return false;
-10 }
-11 p-&gt;a = a;
-12 p-&gt;b = a;
-13 rcu_assign_pointer(gp, p);
-14 spin_unlock(&amp;gp_lock);
-15 return true;
-16 }
-</pre>
-</blockquote>
-
-<p>
-The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually
-equivalent to a simple assignment statement, but also guarantees
-that its assignment will
-happen after the two assignments in lines&nbsp;11 and&nbsp;12,
-similar to the C11 <tt>memory_order_release</tt> store operation.
-It also prevents any number of &ldquo;interesting&rdquo; compiler
-optimizations, for example, the use of <tt>gp</tt> as a scratch
-location immediately preceding the assignment.
-
-<p>@@QQ@@
-But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
-two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
-from being reordered.
-Can't that also cause problems?
-<p>@@QQA@@
-No, it cannot.
-The readers cannot see either of these two fields until
-the assignment to <tt>gp</tt>, by which time both fields are
-fully initialized.
-So reordering the assignments
-to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
-cause any problems.
-<p>@@QQE@@
-
-<p>
-It is tempting to assume that the reader need not do anything special
-to control its accesses to the RCU-protected data,
-as shown in <tt>do_something_gp_buggy()</tt> below:
-
-<blockquote>
-<pre>
- 1 bool do_something_gp_buggy(void)
- 2 {
- 3 rcu_read_lock();
- 4 p = gp; /* OPTIMIZATIONS GALORE!!! */
- 5 if (p) {
- 6 do_something(p-&gt;a, p-&gt;b);
- 7 rcu_read_unlock();
- 8 return true;
- 9 }
-10 rcu_read_unlock();
-11 return false;
-12 }
-</pre>
-</blockquote>
-
-<p>
-However, this temptation must be resisted because there are a
-surprisingly large number of ways that the compiler
-(to say nothing of
-<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
-can trip this code up.
-For but one example, if the compiler were short of registers, it
-might choose to refetch from <tt>gp</tt> rather than keeping
-a separate copy in <tt>p</tt> as follows:
-
-<blockquote>
-<pre>
- 1 bool do_something_gp_buggy_optimized(void)
- 2 {
- 3 rcu_read_lock();
- 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */
-<b> 5 do_something(gp-&gt;a, gp-&gt;b);</b>
- 6 rcu_read_unlock();
- 7 return true;
- 8 }
- 9 rcu_read_unlock();
-10 return false;
-11 }
-</pre>
-</blockquote>
-
-<p>
-If this function ran concurrently with a series of updates that
-replaced the current structure with a new one,
-the fetches of <tt>gp-&gt;a</tt>
-and <tt>gp-&gt;b</tt> might well come from two different structures,
-which could cause serious confusion.
-To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
-<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
-
-<blockquote>
-<pre>
- 1 bool do_something_gp(void)
- 2 {
- 3 rcu_read_lock();
- 4 p = rcu_dereference(gp);
- 5 if (p) {
- 6 do_something(p-&gt;a, p-&gt;b);
- 7 rcu_read_unlock();
- 8 return true;
- 9 }
-10 rcu_read_unlock();
-11 return false;
-12 }
-</pre>
-</blockquote>
-
-<p>
-The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
-memory barriers in the Linux kernel.
-Should a
-<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
-ever appear, then <tt>rcu_dereference()</tt> could be implemented
-as a <tt>memory_order_consume</tt> load.
-Regardless of the exact implementation, a pointer fetched by
-<tt>rcu_dereference()</tt> may not be used outside of the
-outermost RCU read-side critical section containing that
-<tt>rcu_dereference()</tt>, unless protection of
-the corresponding data element has been passed from RCU to some
-other synchronization mechanism, most commonly locking or
-<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
-
-<p>
-In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
-use <tt>rcu_dereference()</tt>, and these two RCU API elements
-work together to ensure that readers have a consistent view of
-newly added data elements.
-
-<p>
-Of course, it is also necessary to remove elements from RCU-protected
-data structures, for example, using the following process:
-
-<ol>
-<li> Remove the data element from the enclosing structure.
-<li> Wait for all pre-existing RCU read-side critical sections
- to complete (because only pre-existing readers can possibly have
- a reference to the newly removed data element).
-<li> At this point, only the updater has a reference to the
- newly removed data element, so it can safely reclaim
- the data element, for example, by passing it to <tt>kfree()</tt>.
-</ol>
-
-This process is implemented by <tt>remove_gp_synchronous()</tt>:
-
-<blockquote>
-<pre>
- 1 bool remove_gp_synchronous(void)
- 2 {
- 3 struct foo *p;
- 4
- 5 spin_lock(&amp;gp_lock);
- 6 p = rcu_access_pointer(gp);
- 7 if (!p) {
- 8 spin_unlock(&amp;gp_lock);
- 9 return false;
-10 }
-11 rcu_assign_pointer(gp, NULL);
-12 spin_unlock(&amp;gp_lock);
-13 synchronize_rcu();
-14 kfree(p);
-15 return true;
-16 }
-</pre>
-</blockquote>
-
-<p>
-This function is straightforward, with line&nbsp;13 waiting for a grace
-period before line&nbsp;14 frees the old data element.
-This waiting ensures that readers will reach line&nbsp;7 of
-<tt>do_something_gp()</tt> before the data element referenced by
-<tt>p</tt> is freed.
-The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
-<tt>rcu_dereference()</tt>, except that:
-
-<ol>
-<li> The value returned by <tt>rcu_access_pointer()</tt>
- cannot be dereferenced.
- If you want to access the value pointed to as well as
- the pointer itself, use <tt>rcu_dereference()</tt>
- instead of <tt>rcu_access_pointer()</tt>.
-<li> The call to <tt>rcu_access_pointer()</tt> need not be
- protected.
- In contrast, <tt>rcu_dereference()</tt> must either be
- within an RCU read-side critical section or in a code
- segment where the pointer cannot change, for example, in
- code protected by the corresponding update-side lock.
-</ol>
-
-<p>@@QQ@@
-Without the <tt>rcu_dereference()</tt> or the
-<tt>rcu_access_pointer()</tt>, what destructive optimizations
-might the compiler make use of?
-<p>@@QQA@@
-Let's start with what happens to <tt>do_something_gp()</tt>
-if it fails to use <tt>rcu_dereference()</tt>.
-It could reuse a value formerly fetched from this same pointer.
-It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
-manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
-mash-up of two distinct pointer values.
-It might even use value-speculation optimizations, where it makes a wrong
-guess, but by the time it gets around to checking the value, an update
-has changed the pointer to match the wrong guess.
-Too bad about any dereferences that returned pre-initialization garbage
-in the meantime!
-
-<p>
-For <tt>remove_gp_synchronous()</tt>, as long as all modifications
-to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
-the above optimizations are harmless.
-However,
-with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
-<tt>sparse</tt> will complain if you
-define <tt>gp</tt> with <tt>__rcu</tt> and then
-access it without using
-either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
-<p>@@QQE@@
-
-<p>
-In short, RCU's publish-subscribe guarantee is provided by the combination
-of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
-This guarantee allows data elements to be safely added to RCU-protected
-linked data structures without disrupting RCU readers.
-This guarantee can be used in combination with the grace-period
-guarantee to also allow data elements to be removed from RCU-protected
-linked data structures, again without disrupting RCU readers.
-
-<p>
-This guarantee was only partially premeditated.
-DYNIX/ptx used an explicit memory barrier for publication, but had nothing
-resembling <tt>rcu_dereference()</tt> for subscription, nor did it
-have anything resembling the <tt>smp_read_barrier_depends()</tt>
-that was later subsumed into <tt>rcu_dereference()</tt>.
-The need for these operations made itself known quite suddenly at a
-late-1990s meeting with the DEC Alpha architects, back in the days when
-DEC was still a free-standing company.
-It took the Alpha architects a good hour to convince me that any sort
-of barrier would ever be needed, and it then took me a good <i>two</i> hours
-to convince them that their documentation did not make this point clear.
-More recent work with the C and C++ standards committees has provided
-much education on tricks and traps from the compiler.
-In short, compilers were much less tricky in the early 1990s, but in
-2015, don't even think about omitting <tt>rcu_dereference()</tt>!
-
-<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3>
-
-<p>
-The previous section's simple linked-data-structure scenario clearly
-demonstrates the need for RCU's stringent memory-ordering guarantees on
-systems with more than one CPU:
-
-<ol>
-<li> Each CPU that has an RCU read-side critical section that
- begins before <tt>synchronize_rcu()</tt> starts is
- guaranteed to execute a full memory barrier between the time
- that the RCU read-side critical section ends and the time that
- <tt>synchronize_rcu()</tt> returns.
- Without this guarantee, a pre-existing RCU read-side critical section
- might hold a reference to the newly removed <tt>struct foo</tt>
- after the <tt>kfree()</tt> on line&nbsp;14 of
- <tt>remove_gp_synchronous()</tt>.
-<li> Each CPU that has an RCU read-side critical section that ends
- after <tt>synchronize_rcu()</tt> returns is guaranteed
- to execute a full memory barrier between the time that
- <tt>synchronize_rcu()</tt> begins and the time that the RCU
- read-side critical section begins.
- Without this guarantee, a later RCU read-side critical section
- running after the <tt>kfree()</tt> on line&nbsp;14 of
- <tt>remove_gp_synchronous()</tt> might
- later run <tt>do_something_gp()</tt> and find the
- newly deleted <tt>struct foo</tt>.
-<li> If the task invoking <tt>synchronize_rcu()</tt> remains
- on a given CPU, then that CPU is guaranteed to execute a full
- memory barrier sometime during the execution of
- <tt>synchronize_rcu()</tt>.
- This guarantee ensures that the <tt>kfree()</tt> on
- line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
- execute after the removal on line&nbsp;11.
-<li> If the task invoking <tt>synchronize_rcu()</tt> migrates
- among a group of CPUs during that invocation, then each of the
- CPUs in that group is guaranteed to execute a full memory barrier
- sometime during the execution of <tt>synchronize_rcu()</tt>.
- This guarantee also ensures that the <tt>kfree()</tt> on
- line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
- execute after the removal on
- line&nbsp;11, but also in the case where the thread executing the
- <tt>synchronize_rcu()</tt> migrates in the meantime.
-</ol>
-
-<p>@@QQ@@
-Given that multiple CPUs can start RCU read-side critical sections
-at any time without any ordering whatsoever, how can RCU possibly tell whether
-or not a given RCU read-side critical section starts before a
-given instance of <tt>synchronize_rcu()</tt>?
-<p>@@QQA@@
-If RCU cannot tell whether or not a given
-RCU read-side critical section starts before a
-given instance of <tt>synchronize_rcu()</tt>,
-then it must assume that the RCU read-side critical section
-started first.
-In other words, a given instance of <tt>synchronize_rcu()</tt>
-can avoid waiting on a given RCU read-side critical section only
-if it can prove that <tt>synchronize_rcu()</tt> started first.
-<p>@@QQE@@
-
-<p>@@QQ@@
-The first and second guarantees require unbelievably strict ordering!
-Are all these memory barriers <i> really</i> required?
-<p>@@QQA@@
-Yes, they really are required.
-To see why the first guarantee is required, consider the following
-sequence of events:
-
-<ol>
-<li> CPU 1: <tt>rcu_read_lock()</tt>
-<li> CPU 1: <tt>q = rcu_dereference(gp);
- /* Very likely to return p. */</tt>
-<li> CPU 0: <tt>list_del_rcu(p);</tt>
-<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
-<li> CPU 1: <tt>do_something_with(q-&gt;a);
- /* No smp_mb(), so might happen after kfree(). */</tt>
-<li> CPU 1: <tt>rcu_read_unlock()</tt>
-<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
-<li> CPU 0: <tt>kfree(p);</tt>
-</ol>
-
-<p>
-Therefore, there absolutely must be a full memory barrier between the
-end of the RCU read-side critical section and the end of the
-grace period.
-
-<p>
-The sequence of events demonstrating the necessity of the second rule
-is roughly similar:
-
-<ol>
-<li> CPU 0: <tt>list_del_rcu(p);</tt>
-<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
-<li> CPU 1: <tt>rcu_read_lock()</tt>
-<li> CPU 1: <tt>q = rcu_dereference(gp);
- /* Might return p if no memory barrier. */</tt>
-<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
-<li> CPU 0: <tt>kfree(p);</tt>
-<li> CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
-<li> CPU 1: <tt>rcu_read_unlock()</tt>
-</ol>
-
-<p>
-And similarly, without a memory barrier between the beginning of the
-grace period and the beginning of the RCU read-side critical section,
-CPU&nbsp;1 might end up accessing the freelist.
-
-<p>
-The &ldquo;as if&rdquo; rule of course applies, so that any implementation
-that acts as if the appropriate memory barriers were in place is a
-correct implementation.
-That said, it is much easier to fool yourself into believing that you have
-adhered to the as-if rule than it is to actually adhere to it!
-<p>@@QQE@@
-
-<p>
-Note that these memory-barrier requirements do not replace the fundamental
-RCU requirement that a grace period wait for all pre-existing readers.
-On the contrary, the memory barriers called out in this section must operate in
-such a way as to <i>enforce</i> this fundamental requirement.
-Of course, different implementations enforce this requirement in different
-ways, but enforce it they must.
-
-<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
-
-<p>
-The common-case RCU primitives are unconditional.
-They are invoked, they do their job, and they return, with no possibility
-of error, and no need to retry.
-This is a key RCU design philosophy.
-
-<p>
-However, this philosophy is pragmatic rather than pigheaded.
-If someone comes up with a good justification for a particular conditional
-RCU primitive, it might well be implemented and added.
-After all, this guarantee was reverse-engineered, not premeditated.
-The unconditional nature of the RCU primitives was initially an
-accident of implementation, and later experience with synchronization
-primitives with conditional primitives caused me to elevate this
-accident to a guarantee.
-Therefore, the justification for adding a conditional primitive to
-RCU would need to be based on detailed and compelling use cases.
-
-<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
-
-<p>
-As far as RCU is concerned, it is always possible to carry out an
-update within an RCU read-side critical section.
-For example, that RCU read-side critical section might search for
-a given data element, and then might acquire the update-side
-spinlock in order to update that element, all while remaining
-in that RCU read-side critical section.
-Of course, it is necessary to exit the RCU read-side critical section
-before invoking <tt>synchronize_rcu()</tt>, however, this
-inconvenience can be avoided through use of the
-<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
-described later in this document.
-
-<p>@@QQ@@
-But how does the upgrade-to-write operation exclude other readers?
-<p>@@QQA@@
-It doesn't, just like normal RCU updates, which also do not exclude
-RCU readers.
-<p>@@QQE@@
-
-<p>
-This guarantee allows lookup code to be shared between read-side
-and update-side code, and was premeditated, appearing in the earliest
-DYNIX/ptx RCU documentation.
-
-<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
-
-<p>
-RCU provides extremely lightweight readers, and its read-side guarantees,
-though quite useful, are correspondingly lightweight.
-It is therefore all too easy to assume that RCU is guaranteeing more
-than it really is.
-Of course, the list of things that RCU does not guarantee is infinitely
-long, however, the following sections list a few non-guarantees that
-have caused confusion.
-Except where otherwise noted, these non-guarantees were premeditated.
-
-<ol>
-<li> <a href="#Readers Impose Minimal Ordering">
- Readers Impose Minimal Ordering</a>
-<li> <a href="#Readers Do Not Exclude Updaters">
- Readers Do Not Exclude Updaters</a>
-<li> <a href="#Updaters Only Wait For Old Readers">
- Updaters Only Wait For Old Readers</a>
-<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections">
- Grace Periods Don't Partition Read-Side Critical Sections</a>
-<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
- Read-Side Critical Sections Don't Partition Grace Periods</a>
-<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
- Disabling Preemption Does Not Block Grace Periods</a>
-</ol>
-
-<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
-
-<p>
-Reader-side markers such as <tt>rcu_read_lock()</tt> and
-<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
-except through their interaction with the grace-period APIs such as
-<tt>synchronize_rcu()</tt>.
-To see this, consider the following pair of threads:
-
-<blockquote>
-<pre>
- 1 void thread0(void)
- 2 {
- 3 rcu_read_lock();
- 4 WRITE_ONCE(x, 1);
- 5 rcu_read_unlock();
- 6 rcu_read_lock();
- 7 WRITE_ONCE(y, 1);
- 8 rcu_read_unlock();
- 9 }
-10
-11 void thread1(void)
-12 {
-13 rcu_read_lock();
-14 r1 = READ_ONCE(y);
-15 rcu_read_unlock();
-16 rcu_read_lock();
-17 r2 = READ_ONCE(x);
-18 rcu_read_unlock();
-19 }
-</pre>
-</blockquote>
-
-<p>
-After <tt>thread0()</tt> and <tt>thread1()</tt> execute
-concurrently, it is quite possible to have
-
-<blockquote>
-<pre>
-(r1 == 1 &amp;&amp; r2 == 0)
-</pre>
-</blockquote>
-
-(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
-which would not be possible if <tt>rcu_read_lock()</tt> and
-<tt>rcu_read_unlock()</tt> had much in the way of ordering
-properties.
-But they do not, so the CPU is within its rights
-to do significant reordering.
-This is by design: Any significant ordering constraints would slow down
-these fast-path APIs.
-
-<p>@@QQ@@
-Can't the compiler also reorder this code?
-<p>@@QQA@@
-No, the volatile casts in <tt>READ_ONCE()</tt> and
-<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
-this particular case.
-<p>@@QQE@@
-
-<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
-
-<p>
-Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
-exclude updates.
-All they do is to prevent grace periods from ending.
-The following example illustrates this:
-
-<blockquote>
-<pre>
- 1 void thread0(void)
- 2 {
- 3 rcu_read_lock();
- 4 r1 = READ_ONCE(y);
- 5 if (r1) {
- 6 do_something_with_nonzero_x();
- 7 r2 = READ_ONCE(x);
- 8 WARN_ON(!r2); /* BUG!!! */
- 9 }
-10 rcu_read_unlock();
-11 }
-12
-13 void thread1(void)
-14 {
-15 spin_lock(&amp;my_lock);
-16 WRITE_ONCE(x, 1);
-17 WRITE_ONCE(y, 1);
-18 spin_unlock(&amp;my_lock);
-19 }
-</pre>
-</blockquote>
-
-<p>
-If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
-excluded the <tt>thread1()</tt> function's update,
-the <tt>WARN_ON()</tt> could never fire.
-But the fact is that <tt>rcu_read_lock()</tt> does not exclude
-much of anything aside from subsequent grace periods, of which
-<tt>thread1()</tt> has none, so the
-<tt>WARN_ON()</tt> can and does fire.
-
-<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
-
-<p>
-It might be tempting to assume that after <tt>synchronize_rcu()</tt>
-completes, there are no readers executing.
-This temptation must be avoided because
-new readers can start immediately after <tt>synchronize_rcu()</tt>
-starts, and <tt>synchronize_rcu()</tt> is under no
-obligation to wait for these new readers.
-
-<p>@@QQ@@
-Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
-Would the updater be able to rely on this?
-<p>@@QQA@@
-No.
-Even if <tt>synchronize_rcu()</tt> were to wait until
-all readers had completed, a new reader might start immediately after
-<tt>synchronize_rcu()</tt> completed.
-Therefore, the code following
-<tt>synchronize_rcu()</tt> cannot rely on there being no readers
-in any case.
-<p>@@QQE@@
-
-<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
-Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
-
-<p>
-It is tempting to assume that if any part of one RCU read-side critical
-section precedes a given grace period, and if any part of another RCU
-read-side critical section follows that same grace period, then all of
-the first RCU read-side critical section must precede all of the second.
-However, this just isn't the case: A single grace period does not
-partition the set of RCU read-side critical sections.
-An example of this situation can be illustrated as follows, where
-<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
-
-<blockquote>
-<pre>
- 1 void thread0(void)
- 2 {
- 3 rcu_read_lock();
- 4 WRITE_ONCE(a, 1);
- 5 WRITE_ONCE(b, 1);
- 6 rcu_read_unlock();
- 7 }
- 8
- 9 void thread1(void)
-10 {
-11 r1 = READ_ONCE(a);
-12 synchronize_rcu();
-13 WRITE_ONCE(c, 1);
-14 }
-15
-16 void thread2(void)
-17 {
-18 rcu_read_lock();
-19 r2 = READ_ONCE(b);
-20 r3 = READ_ONCE(c);
-21 rcu_read_unlock();
-22 }
-</pre>
-</blockquote>
-
-<p>
-It turns out that the outcome:
-
-<blockquote>
-<pre>
-(r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1)
-</pre>
-</blockquote>
-
-is entirely possible.
-The following figure shows how this can happen, with each circled
-<tt>QS</tt> indicating the point at which RCU recorded a
-<i>quiescent state</i> for each thread, that is, a state in which
-RCU knows that the thread cannot be in the midst of an RCU read-side
-critical section that started before the current grace period:
-
-<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
-
-<p>
-If it is necessary to partition RCU read-side critical sections in this
-manner, it is necessary to use two grace periods, where the first
-grace period is known to end before the second grace period starts:
-
-<blockquote>
-<pre>
- 1 void thread0(void)
- 2 {
- 3 rcu_read_lock();
- 4 WRITE_ONCE(a, 1);
- 5 WRITE_ONCE(b, 1);
- 6 rcu_read_unlock();
- 7 }
- 8
- 9 void thread1(void)
-10 {
-11 r1 = READ_ONCE(a);
-12 synchronize_rcu();
-13 WRITE_ONCE(c, 1);
-14 }
-15
-16 void thread2(void)
-17 {
-18 r2 = READ_ONCE(c);
-19 synchronize_rcu();
-20 WRITE_ONCE(d, 1);
-21 }
-22
-23 void thread3(void)
-24 {
-25 rcu_read_lock();
-26 r3 = READ_ONCE(b);
-27 r4 = READ_ONCE(d);
-28 rcu_read_unlock();
-29 }
-</pre>
-</blockquote>
-
-<p>
-Here, if <tt>(r1 == 1)</tt>, then
-<tt>thread0()</tt>'s write to <tt>b</tt> must happen
-before the end of <tt>thread1()</tt>'s grace period.
-If in addition <tt>(r4 == 1)</tt>, then
-<tt>thread3()</tt>'s read from <tt>b</tt> must happen
-after the beginning of <tt>thread2()</tt>'s grace period.
-If it is also the case that <tt>(r2 == 1)</tt>, then the
-end of <tt>thread1()</tt>'s grace period must precede the
-beginning of <tt>thread2()</tt>'s grace period.
-This means that the two RCU read-side critical sections cannot overlap,
-guaranteeing that <tt>(r3 == 1)</tt>.
-As a result, the outcome:
-
-<blockquote>
-<pre>
-(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1)
-</pre>
-</blockquote>
-
-cannot happen.
-
-<p>
-This non-requirement was also non-premeditated, but became apparent
-when studying RCU's interaction with memory ordering.
-
-<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
-Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
-
-<p>
-It is also tempting to assume that if an RCU read-side critical section
-happens between a pair of grace periods, then those grace periods cannot
-overlap.
-However, this temptation leads nowhere good, as can be illustrated by
-the following, with all variables initially zero:
-
-<blockquote>
-<pre>
- 1 void thread0(void)
- 2 {
- 3 rcu_read_lock();
- 4 WRITE_ONCE(a, 1);
- 5 WRITE_ONCE(b, 1);
- 6 rcu_read_unlock();
- 7 }
- 8
- 9 void thread1(void)
-10 {
-11 r1 = READ_ONCE(a);
-12 synchronize_rcu();
-13 WRITE_ONCE(c, 1);
-14 }
-15
-16 void thread2(void)
-17 {
-18 rcu_read_lock();
-19 WRITE_ONCE(d, 1);
-20 r2 = READ_ONCE(c);
-21 rcu_read_unlock();
-22 }
-23
-24 void thread3(void)
-25 {
-26 r3 = READ_ONCE(d);
-27 synchronize_rcu();
-28 WRITE_ONCE(e, 1);
-29 }
-30
-31 void thread4(void)
-32 {
-33 rcu_read_lock();
-34 r4 = READ_ONCE(b);
-35 r5 = READ_ONCE(e);
-36 rcu_read_unlock();
-37 }
-</pre>
-</blockquote>
-
-<p>
-In this case, the outcome:
-
-<blockquote>
-<pre>
-(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 &amp;&amp; r4 == 0 &amp;&amp; r5 == 1)
-</pre>
-</blockquote>
-
-is entirely possible, as illustrated below:
-
-<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
-
-<p>
-Again, an RCU read-side critical section can overlap almost all of a
-given grace period, just so long as it does not overlap the entire
-grace period.
-As a result, an RCU read-side critical section cannot partition a pair
-of RCU grace periods.
-
-<p>@@QQ@@
-How long a sequence of grace periods, each separated by an RCU read-side
-critical section, would be required to partition the RCU read-side
-critical sections at the beginning and end of the chain?
-<p>@@QQA@@
-In theory, an infinite number.
-In practice, an unknown number that is sensitive to both implementation
-details and timing considerations.
-Therefore, even in practice, RCU users must abide by the theoretical rather
-than the practical answer.
-<p>@@QQE@@
-
-<h3><a name="Disabling Preemption Does Not Block Grace Periods">
-Disabling Preemption Does Not Block Grace Periods</a></h3>
-
-<p>
-There was a time when disabling preemption on any given CPU would block
-subsequent grace periods.
-However, this was an accident of implementation and is not a requirement.
-And in the current Linux-kernel implementation, disabling preemption
-on a given CPU in fact does not block grace periods, as Oleg Nesterov
-<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
-
-<p>
-If you need a preempt-disable region to block grace periods, you need to add
-<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
-as follows:
-
-<blockquote>
-<pre>
- 1 preempt_disable();
- 2 rcu_read_lock();
- 3 do_something();
- 4 rcu_read_unlock();
- 5 preempt_enable();
- 6
- 7 /* Spinlocks implicitly disable preemption. */
- 8 spin_lock(&amp;mylock);
- 9 rcu_read_lock();
-10 do_something();
-11 rcu_read_unlock();
-12 spin_unlock(&amp;mylock);
-</pre>
-</blockquote>
-
-<p>
-In theory, you could enter the RCU read-side critical section first,
-but it is more efficient to keep the entire RCU read-side critical
-section contained in the preempt-disable region as shown above.
-Of course, RCU read-side critical sections that extend outside of
-preempt-disable regions will work correctly, but such critical sections
-can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
-more work.
-And no, this is <i>not</i> an invitation to enclose all of your RCU
-read-side critical sections within preempt-disable regions, because
-doing so would degrade real-time response.
-
-<p>
-This non-requirement appeared with preemptible RCU.
-If you need a grace period that waits on non-preemptible code regions, use
-<a href="#Sched Flavor">RCU-sched</a>.
-
-<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
-
-<p>
-These parallelism facts of life are by no means specific to RCU, but
-the RCU implementation must abide by them.
-They therefore bear repeating:
-
-<ol>
-<li> Any CPU or task may be delayed at any time,
- and any attempts to avoid these delays by disabling
- preemption, interrupts, or whatever are completely futile.
- This is most obvious in preemptible user-level
- environments and in virtualized environments (where
- a given guest OS's VCPUs can be preempted at any time by
- the underlying hypervisor), but can also happen in bare-metal
- environments due to ECC errors, NMIs, and other hardware
- events.
- Although a delay of more than about 20 seconds can result
- in splats, the RCU implementation is obligated to use
- algorithms that can tolerate extremely long delays, but where
- &ldquo;extremely long&rdquo; is not long enough to allow
- wrap-around when incrementing a 64-bit counter.
-<li> Both the compiler and the CPU can reorder memory accesses.
- Where it matters, RCU must use compiler directives and
- memory-barrier instructions to preserve ordering.
-<li> Conflicting writes to memory locations in any given cache line
- will result in expensive cache misses.
- Greater numbers of concurrent writes and more-frequent
- concurrent writes will result in more dramatic slowdowns.
- RCU is therefore obligated to use algorithms that have
- sufficient locality to avoid significant performance and
- scalability problems.
-<li> As a rough rule of thumb, only one CPU's worth of processing
- may be carried out under the protection of any given exclusive
- lock.
- RCU must therefore use scalable locking designs.
-<li> Counters are finite, especially on 32-bit systems.
- RCU's use of counters must therefore tolerate counter wrap,
- or be designed such that counter wrap would take way more
- time than a single system is likely to run.
- An uptime of ten years is quite possible, a runtime
- of a century much less so.
- As an example of the latter, RCU's dyntick-idle nesting counter
- allows 54 bits for interrupt nesting level (this counter
- is 64 bits even on a 32-bit system).
- Overflowing this counter requires 2<sup>54</sup>
- half-interrupts on a given CPU without that CPU ever going idle.
- If a half-interrupt happened every microsecond, it would take
- 570 years of runtime to overflow this counter, which is currently
- believed to be an acceptably long time.
-<li> Linux systems can have thousands of CPUs running a single
- Linux kernel in a single shared-memory environment.
- RCU must therefore pay close attention to high-end scalability.
-</ol>
-
-<p>
-This last parallelism fact of life means that RCU must pay special
-attention to the preceding facts of life.
-The idea that Linux might scale to systems with thousands of CPUs would
-have been met with some skepticism in the 1990s, but these requirements
-would otherwise have been unsurprising, even in the early 1990s.
-
-<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
-
-<p>
-These sections list quality-of-implementation requirements.
-Although an RCU implementation that ignores these requirements could
-still be used, it would likely be subject to limitations that would
-make it inappropriate for industrial-strength production use.
-Classes of quality-of-implementation requirements are as follows:
-
-<ol>
-<li> <a href="#Specialization">Specialization</a>
-<li> <a href="#Performance and Scalability">Performance and Scalability</a>
-<li> <a href="#Composability">Composability</a>
-<li> <a href="#Corner Cases">Corner Cases</a>
-</ol>
-
-<p>
-These classes are covered in the following sections.
-
-<h3><a name="Specialization">Specialization</a></h3>
-
-<p>
-RCU is and always has been intended primarily for read-mostly situations, as
-illustrated by the following figure.
-This means that RCU's read-side primitives are optimized, often at the
-expense of its update-side primitives.
-
-<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
-
-<p>
-This focus on read-mostly situations means that RCU must interoperate
-with other synchronization primitives.
-For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
-examples discussed earlier use RCU to protect readers and locking to
-coordinate updaters.
-However, the need extends much farther, requiring that a variety of
-synchronization primitives be legal within RCU read-side critical sections,
-including spinlocks, sequence locks, atomic operations, reference
-counters, and memory barriers.
-
-<p>@@QQ@@
-What about sleeping locks?
-<p>@@QQA@@
-These are forbidden within Linux-kernel RCU read-side critical sections
-because it is not legal to place a quiescent state (in this case,
-voluntary context switch) within an RCU read-side critical section.
-However, sleeping locks may be used within userspace RCU read-side critical
-sections, and also within Linux-kernel sleepable RCU
-<a href="#Sleepable RCU">(SRCU)</a>
-read-side critical sections.
-In addition, the -rt patchset turns spinlocks into sleeping locks so
-that the corresponding critical sections can be preempted, which
-also means that these sleeplockified spinlocks (but not other sleeping locks!)
-may be acquired within -rt-Linux-kernel RCU read-side critical sections.
-
-<p>
-Note that it <i>is</i> legal for a normal RCU read-side critical section
-to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
-but only as long as it does not loop indefinitely attempting to
-conditionally acquire that sleeping lock.
-The key point is that things like <tt>mutex_trylock()</tt>
-either return with the mutex held, or return an error indication if
-the mutex was not immediately available.
-Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
-<p>@@QQE@@
-
-<p>
-It often comes as a surprise that many algorithms do not require a
-consistent view of data, but many can function in that mode,
-with network routing being the poster child.
-Internet routing algorithms take significant time to propagate
-updates, so that by the time an update arrives at a given system,
-that system has been sending network traffic the wrong way for
-a considerable length of time.
-Having a few threads continue to send traffic the wrong way for a
-few more milliseconds is clearly not a problem: In the worst case,
-TCP retransmissions will eventually get the data where it needs to go.
-In general, when tracking the state of the universe outside of the
-computer, some level of inconsistency must be tolerated due to
-speed-of-light delays if nothing else.
-
-<p>
-Furthermore, uncertainty about external state is inherent in many cases.
-For example, a pair of veterinarians might use heartbeat to determine
-whether or not a given cat was alive.
-But how long should they wait after the last heartbeat to decide that
-the cat is in fact dead?
-Waiting less than 400 milliseconds makes no sense because this would
-mean that a relaxed cat would be considered to cycle between death
-and life more than 100 times per minute.
-Moreover, just as with human beings, a cat's heart might stop for
-some period of time, so the exact wait period is a judgment call.
-One of our pair of veterinarians might wait 30 seconds before pronouncing
-the cat dead, while the other might insist on waiting a full minute.
-The two veterinarians would then disagree on the state of the cat during
-the final 30 seconds of the minute following the last heartbeat, as
-fancifully illustrated below:
-
-<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
-
-<p>
-Interestingly enough, this same situation applies to hardware.
-When push comes to shove, how do we tell whether or not some
-external server has failed?
-We send messages to it periodically, and declare it failed if we
-don't receive a response within a given period of time.
-Policy decisions can usually tolerate short
-periods of inconsistency.
-The policy was decided some time ago, and is only now being put into
-effect, so a few milliseconds of delay is normally inconsequential.
-
-<p>
-However, there are algorithms that absolutely must see consistent data.
-For example, the translation from a user-level SystemV semaphore
-ID to the corresponding in-kernel data structure is protected by RCU,
-but it is absolutely forbidden to update a semaphore that has just been
-removed.
-In the Linux kernel, this need for consistency is accommodated by acquiring
-spinlocks located in the in-kernel data structure from within
-the RCU read-side critical section, and this is indicated by the
-green box in the figure above.
-Many other techniques may be used, and are in fact used within the
-Linux kernel.
-
-<p>
-In short, RCU is not required to maintain consistency, and other
-mechanisms may be used in concert with RCU when consistency is required.
-RCU's specialization allows it to do its job extremely well, and its
-ability to interoperate with other synchronization mechanisms allows
-the right mix of synchronization tools to be used for a given job.
-
-<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
-
-<p>
-Energy efficiency is a critical component of performance today,
-and Linux-kernel RCU implementations must therefore avoid unnecessarily
-awakening idle CPUs.
-I cannot claim that this requirement was premeditated.
-In fact, I learned of it during a telephone conversation in which I
-was given &ldquo;frank and open&rdquo; feedback on the importance
-of energy efficiency in battery-powered systems and on specific
-energy-efficiency shortcomings of the Linux-kernel RCU implementation.
-In my experience, the battery-powered embedded community will consider
-any unnecessary wakeups to be extremely unfriendly acts.
-So much so that mere Linux-kernel-mailing-list posts are
-insufficient to vent their ire.
-
-<p>
-Memory consumption is not particularly important in most
-situations, and has become decreasingly
-so as memory sizes have expanded and memory
-costs have plummeted.
-However, as I learned from Matt Mackall's
-<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
-efforts, memory footprint is critically important on single-CPU systems with
-non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
-<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
-was born.
-Josh Triplett has since taken over the small-memory banner with his
-<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
-project, which resulted in
-<a href="#Sleepable RCU">SRCU</a>
-becoming optional for those kernels not needing it.
-
-<p>
-The remaining performance requirements are, for the most part,
-unsurprising.
-For example, in keeping with RCU's read-side specialization,
-<tt>rcu_dereference()</tt> should have negligible overhead (for
-example, suppression of a few minor compiler optimizations).
-Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
-<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
-
-<p>
-In preemptible environments, in the case where the RCU read-side
-critical section was not preempted (as will be the case for the
-highest-priority real-time process), <tt>rcu_read_lock()</tt> and
-<tt>rcu_read_unlock()</tt> should have minimal overhead.
-In particular, they should not contain atomic read-modify-write
-operations, memory-barrier instructions, preemption disabling,
-interrupt disabling, or backwards branches.
-However, in the case where the RCU read-side critical section was preempted,
-<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
-This is why it is better to nest an RCU read-side critical section
-within a preempt-disable region than vice versa, at least in cases
-where that critical section is short enough to avoid unduly degrading
-real-time latencies.
-
-<p>
-The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
-optimized for throughput.
-It may therefore incur several milliseconds of latency in addition to
-the duration of the longest RCU read-side critical section.
-On the other hand, multiple concurrent invocations of
-<tt>synchronize_rcu()</tt> are required to use batching optimizations
-so that they can be satisfied by a single underlying grace-period-wait
-operation.
-For example, in the Linux kernel, it is not unusual for a single
-grace-period-wait operation to serve more than
-<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
-of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
-overhead down to nearly zero.
-However, the grace-period optimization is also required to avoid
-measurable degradation of real-time scheduling and interrupt latencies.
-
-<p>
-In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
-latencies are unacceptable.
-In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
-instead, reducing the grace-period latency down to a few tens of
-microseconds on small systems, at least in cases where the RCU read-side
-critical sections are short.
-There are currently no special latency requirements for
-<tt>synchronize_rcu_expedited()</tt> on large systems, but,
-consistent with the empirical nature of the RCU specification,
-that is subject to change.
-However, there most definitely are scalability requirements:
-A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
-CPUs should at least make reasonable forward progress.
-In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
-is permitted to impose modest degradation of real-time latency
-on non-idle online CPUs.
-That said, it will likely be necessary to take further steps to reduce this
-degradation, hopefully to roughly that of a scheduling-clock interrupt.
-
-<p>
-There are a number of situations where even
-<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
-latency is unacceptable.
-In these situations, the asynchronous <tt>call_rcu()</tt> can be
-used in place of <tt>synchronize_rcu()</tt> as follows:
-
-<blockquote>
-<pre>
- 1 struct foo {
- 2 int a;
- 3 int b;
- 4 struct rcu_head rh;
- 5 };
- 6
- 7 static void remove_gp_cb(struct rcu_head *rhp)
- 8 {
- 9 struct foo *p = container_of(rhp, struct foo, rh);
-10
-11 kfree(p);
-12 }
-13
-14 bool remove_gp_asynchronous(void)
-15 {
-16 struct foo *p;
-17
-18 spin_lock(&amp;gp_lock);
-19 p = rcu_access_pointer(gp);
-20 if (!p) {
-21 spin_unlock(&amp;gp_lock);
-22 return false;
-23 }
-24 rcu_assign_pointer(gp, NULL);
-25 call_rcu(&amp;p-&gt;rh, remove_gp_cb);
-26 spin_unlock(&amp;gp_lock);
-27 return true;
-28 }
-</pre>
-</blockquote>
-
-<p>
-A definition of <tt>struct foo</tt> is finally needed, and appears
-on lines&nbsp;1-5.
-The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
-on line&nbsp;25, and will be invoked after the end of a subsequent
-grace period.
-This gets the same effect as <tt>remove_gp_synchronous()</tt>,
-but without forcing the updater to wait for a grace period to elapse.
-The <tt>call_rcu()</tt> function may be used in a number of
-situations where neither <tt>synchronize_rcu()</tt> nor
-<tt>synchronize_rcu_expedited()</tt> would be legal,
-including within preempt-disable code, <tt>local_bh_disable()</tt> code,
-interrupt-disable code, and interrupt handlers.
-However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
-The callback function (<tt>remove_gp_cb()</tt> in this case) will be
-executed within softirq (software interrupt) environment within the
-Linux kernel,
-either within a real softirq handler or under the protection
-of <tt>local_bh_disable()</tt>.
-In both the Linux kernel and in userspace, it is bad practice to
-write an RCU callback function that takes too long.
-Long-running operations should be relegated to separate threads or
-(in the Linux kernel) workqueues.
-
-<p>@@QQ@@
-Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
-After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
-structure, which would interact badly with concurrent insertions.
-Doesn't this mean that <tt>rcu_dereference()</tt> is required?
-<p>@@QQA@@
-Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes
-any changes, including any insertions that <tt>rcu_dereference()</tt>
-would protect against.
-Therefore, any insertions will be delayed until after <tt>-&gt;gp_lock</tt>
-is released on line&nbsp;26, which in turn means that
-<tt>rcu_access_pointer()</tt> suffices.
-<p>@@QQE@@
-
-<p>
-However, all that <tt>remove_gp_cb()</tt> is doing is
-invoking <tt>kfree()</tt> on the data element.
-This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
-which allows &ldquo;fire and forget&rdquo; operation as shown below:
-
-<blockquote>
-<pre>
- 1 struct foo {
- 2 int a;
- 3 int b;
- 4 struct rcu_head rh;
- 5 };
- 6
- 7 bool remove_gp_faf(void)
- 8 {
- 9 struct foo *p;
-10
-11 spin_lock(&amp;gp_lock);
-12 p = rcu_dereference(gp);
-13 if (!p) {
-14 spin_unlock(&amp;gp_lock);
-15 return false;
-16 }
-17 rcu_assign_pointer(gp, NULL);
-18 kfree_rcu(p, rh);
-19 spin_unlock(&amp;gp_lock);
-20 return true;
-21 }
-</pre>
-</blockquote>
-
-<p>
-Note that <tt>remove_gp_faf()</tt> simply invokes
-<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
-further attention to the subsequent grace period and <tt>kfree()</tt>.
-It is permissible to invoke <tt>kfree_rcu()</tt> from the same
-environments as for <tt>call_rcu()</tt>.
-Interestingly enough, DYNIX/ptx had the equivalents of
-<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
-<tt>synchronize_rcu()</tt>.
-This was due to the fact that RCU was not heavily used within DYNIX/ptx,
-so the very few places that needed something like
-<tt>synchronize_rcu()</tt> simply open-coded it.
-
-<p>@@QQ@@
-Earlier it was claimed that <tt>call_rcu()</tt> and
-<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
-by readers.
-But how can that be correct, given that the invocation of the callback
-and the freeing of the memory (respectively) must still wait for
-a grace period to elapse?
-<p>@@QQA@@
-We could define things this way, but keep in mind that this sort of
-definition would say that updates in garbage-collected languages
-cannot complete until the next time the garbage collector runs,
-which does not seem at all reasonable.
-The key point is that in most cases, an updater using either
-<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
-next update as soon as it has invoked <tt>call_rcu()</tt> or
-<tt>kfree_rcu()</tt>, without having to wait for a subsequent
-grace period.
-<p>@@QQE@@
-
-<p>
-But what if the updater must wait for the completion of code to be
-executed after the end of the grace period, but has other tasks
-that can be carried out in the meantime?
-The polling-style <tt>get_state_synchronize_rcu()</tt> and
-<tt>cond_synchronize_rcu()</tt> functions may be used for this
-purpose, as shown below:
-
-<blockquote>
-<pre>
- 1 bool remove_gp_poll(void)
- 2 {
- 3 struct foo *p;
- 4 unsigned long s;
- 5
- 6 spin_lock(&amp;gp_lock);
- 7 p = rcu_access_pointer(gp);
- 8 if (!p) {
- 9 spin_unlock(&amp;gp_lock);
-10 return false;
-11 }
-12 rcu_assign_pointer(gp, NULL);
-13 spin_unlock(&amp;gp_lock);
-14 s = get_state_synchronize_rcu();
-15 do_something_while_waiting();
-16 cond_synchronize_rcu(s);
-17 kfree(p);
-18 return true;
-19 }
-</pre>
-</blockquote>
-
-<p>
-On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a
-&ldquo;cookie&rdquo; from RCU,
-then line&nbsp;15 carries out other tasks,
-and finally, line&nbsp;16 returns immediately if a grace period has
-elapsed in the meantime, but otherwise waits as required.
-The need for <tt>get_state_synchronize_rcu()</tt> and
-<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
-so it is too early to tell whether they will stand the test of time.
-
-<p>
-RCU thus provides a range of tools to allow updaters to strike the
-required tradeoff between latency, flexibility and CPU overhead.
-
-<h3><a name="Composability">Composability</a></h3>
-
-<p>
-Composability has received much attention in recent years, perhaps in part
-due to the collision of multicore hardware with object-oriented techniques
-designed in single-threaded environments for single-threaded use.
-And in theory, RCU read-side critical sections may be composed, and in
-fact may be nested arbitrarily deeply.
-In practice, as with all real-world implementations of composable
-constructs, there are limitations.
-
-<p>
-Implementations of RCU for which <tt>rcu_read_lock()</tt>
-and <tt>rcu_read_unlock()</tt> generate no code, such as
-Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
-nested arbitrarily deeply.
-After all, there is no overhead.
-Except that if all these instances of <tt>rcu_read_lock()</tt>
-and <tt>rcu_read_unlock()</tt> are visible to the compiler,
-compilation will eventually fail due to exhausting memory,
-mass storage, or user patience, whichever comes first.
-If the nesting is not visible to the compiler, as is the case with
-mutually recursive functions each in its own translation unit,
-stack overflow will result.
-If the nesting takes the form of loops, either the control variable
-will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
-Nevertheless, this class of RCU implementations is one
-of the most composable constructs in existence.
-
-<p>
-RCU implementations that explicitly track nesting depth
-are limited by the nesting-depth counter.
-For example, the Linux kernel's preemptible RCU limits nesting to
-<tt>INT_MAX</tt>.
-This should suffice for almost all practical purposes.
-That said, a consecutive pair of RCU read-side critical sections
-between which there is an operation that waits for a grace period
-cannot be enclosed in another RCU read-side critical section.
-This is because it is not legal to wait for a grace period within
-an RCU read-side critical section: To do so would result either
-in deadlock or
-in RCU implicitly splitting the enclosing RCU read-side critical
-section, neither of which is conducive to a long-lived and prosperous
-kernel.
-
-<p>
-It is worth noting that RCU is not alone in limiting composability.
-For example, many transactional-memory implementations prohibit
-composing a pair of transactions separated by an irrevocable
-operation (for example, a network receive operation).
-For another example, lock-based critical sections can be composed
-surprisingly freely, but only if deadlock is avoided.
-
-<p>
-In short, although RCU read-side critical sections are highly composable,
-care is required in some situations, just as is the case for any other
-composable synchronization mechanism.
-
-<h3><a name="Corner Cases">Corner Cases</a></h3>
-
-<p>
-A given RCU workload might have an endless and intense stream of
-RCU read-side critical sections, perhaps even so intense that there
-was never a point in time during which there was not at least one
-RCU read-side critical section in flight.
-RCU cannot allow this situation to block grace periods: As long as
-all the RCU read-side critical sections are finite, grace periods
-must also be finite.
-
-<p>
-That said, preemptible RCU implementations could potentially result
-in RCU read-side critical sections being preempted for long durations,
-which has the effect of creating a long-duration RCU read-side
-critical section.
-This situation can arise only in heavily loaded systems, but systems using
-real-time priorities are of course more vulnerable.
-Therefore, RCU priority boosting is provided to help deal with this
-case.
-That said, the exact requirements on RCU priority boosting will likely
-evolve as more experience accumulates.
-
-<p>
-Other workloads might have very high update rates.
-Although one can argue that such workloads should instead use
-something other than RCU, the fact remains that RCU must
-handle such workloads gracefully.
-This requirement is another factor driving batching of grace periods,
-but it is also the driving force behind the checks for large numbers
-of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
-Finally, high update rates should not delay RCU read-side critical
-sections, although some read-side delays can occur when using
-<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
-of <tt>try_stop_cpus()</tt>.
-(In the future, <tt>synchronize_rcu_expedited()</tt> will be
-converted to use lighter-weight inter-processor interrupts (IPIs),
-but this will still disturb readers, though to a much smaller degree.)
-
-<p>
-Although all three of these corner cases were understood in the early
-1990s, a simple user-level test consisting of <tt>close(open(path))</tt>
-in a tight loop
-in the early 2000s suddenly provided a much deeper appreciation of the
-high-update-rate corner case.
-This test also motivated addition of some RCU code to react to high update
-rates, for example, if a given CPU finds itself with more than 10,000
-RCU callbacks queued, it will cause RCU to take evasive action by
-more aggressively starting grace periods and more aggressively forcing
-completion of grace-period processing.
-This evasive action causes the grace period to complete more quickly,
-but at the cost of restricting RCU's batching optimizations, thus
-increasing the CPU overhead incurred by that grace period.
-
-<h2><a name="Software-Engineering Requirements">
-Software-Engineering Requirements</a></h2>
-
-<p>
-Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to
-guard against mishaps and misuse:
-
-<ol>
-<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt>
- everywhere that it is needed, so kernels built with
- <tt>CONFIG_PROVE_RCU=y</tt> will splat if
- <tt>rcu_dereference()</tt> is used outside of an
- RCU read-side critical section.
- Update-side code can use <tt>rcu_dereference_protected()</tt>,
- which takes a
- <a href="https://lwn.net/Articles/371986/">lockdep expression</a>
- to indicate what is providing the protection.
- If the indicated protection is not provided, a lockdep splat
- is emitted.
-
- <p>
- Code shared between readers and updaters can use
- <tt>rcu_dereference_check()</tt>, which also takes a
- lockdep expression, and emits a lockdep splat if neither
- <tt>rcu_read_lock()</tt> nor the indicated protection
- is in place.
- In addition, <tt>rcu_dereference_raw()</tt> is used in those
- (hopefully rare) cases where the required protection cannot
- be easily described.
- Finally, <tt>rcu_read_lock_held()</tt> is provided to
- allow a function to verify that it has been invoked within
- an RCU read-side critical section.
- I was made aware of this set of requirements shortly after Thomas
- Gleixner audited a number of RCU uses.
-<li> A given function might wish to check for RCU-related preconditions
- upon entry, before using any other RCU API.
- The <tt>rcu_lockdep_assert()</tt> does this job,
- asserting the expression in kernels having lockdep enabled
- and doing nothing otherwise.
-<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
- and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
- substituting a simple assignment.
- To catch this sort of error, a given RCU-protected pointer may be
- tagged with <tt>__rcu</tt>, after which running sparse
- with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
- about simple-assignment accesses to that pointer.
- Arnd Bergmann made me aware of this requirement, and also
- supplied the needed
- <a href="https://lwn.net/Articles/376011/">patch series</a>.
-<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
- will splat if a data element is passed to <tt>call_rcu()</tt>
- twice in a row, without a grace period in between.
- (This error is similar to a double free.)
- The corresponding <tt>rcu_head</tt> structures that are
- dynamically allocated are automatically tracked, but
- <tt>rcu_head</tt> structures allocated on the stack
- must be initialized with <tt>init_rcu_head_on_stack()</tt>
- and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
- Similarly, statically allocated non-stack <tt>rcu_head</tt>
- structures must be initialized with <tt>init_rcu_head()</tt>
- and cleaned up with <tt>destroy_rcu_head()</tt>.
- Mathieu Desnoyers made me aware of this requirement, and also
- supplied the needed
- <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
-<li> An infinite loop in an RCU read-side critical section will
- eventually trigger an RCU CPU stall warning splat, with
- the duration of &ldquo;eventually&rdquo; being controlled by the
- <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or,
- alternatively, by the
- <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs
- parameter.
- However, RCU is not obligated to produce this splat
- unless there is a grace period waiting on that particular
- RCU read-side critical section.
- <p>
- Some extreme workloads might intentionally delay
- RCU grace periods, and systems running those workloads can
- be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt>
- to suppress the splats.
- This kernel parameter may also be set via <tt>sysfs</tt>.
- Furthermore, RCU CPU stall warnings are counter-productive
- during sysrq dumps and during panics.
- RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and
- <tt>rcu_sysrq_end()</tt> API members to be called before
- and after long sysrq dumps.
- RCU also supplies the <tt>rcu_panic()</tt> notifier that is
- automatically invoked at the beginning of a panic to suppress
- further RCU CPU stall warnings.
-
- <p>
- This requirement made itself known in the early 1990s, pretty
- much the first time that it was necessary to debug a CPU stall.
- That said, the initial implementation in DYNIX/ptx was quite
- generic in comparison with that of Linux.
-<li> Although it would be very good to detect pointers leaking out
- of RCU read-side critical sections, there is currently no
- good way of doing this.
- One complication is the need to distinguish between pointers
- leaking and pointers that have been handed off from RCU to
- some other synchronization mechanism, for example, reference
- counting.
-<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
- information is provided via both debugfs and event tracing.
-<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
- <tt>rcu_dereference()</tt> to create typical linked
- data structures can be surprisingly error-prone.
- Therefore, RCU-protected
- <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
- and, more recently, RCU-protected
- <a href="https://lwn.net/Articles/612100/">hash tables</a>
- are available.
- Many other special-purpose RCU-protected data structures are
- available in the Linux kernel and the userspace RCU library.
-<li> Some linked structures are created at compile time, but still
- require <tt>__rcu</tt> checking.
- The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
- purpose.
-<li> It is not necessary to use <tt>rcu_assign_pointer()</tt>
- when creating linked structures that are to be published via
- a single external pointer.
- The <tt>RCU_INIT_POINTER()</tt> macro is provided for
- this task and also for assigning <tt>NULL</tt> pointers
- at runtime.
-</ol>
-
-<p>
-This is not a hard-and-fast list: RCU's diagnostic capabilities will
-continue to be guided by the number and type of usage bugs found
-in real-world RCU usage.
-
-<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
-
-<p>
-The Linux kernel provides an interesting environment for all kinds of
-software, including RCU.
-Some of the relevant points of interest are as follows:
-
-<ol>
-<li> <a href="#Configuration">Configuration</a>.
-<li> <a href="#Firmware Interface">Firmware Interface</a>.
-<li> <a href="#Early Boot">Early Boot</a>.
-<li> <a href="#Interrupts and NMIs">
- Interrupts and non-maskable interrupts (NMIs)</a>.
-<li> <a href="#Loadable Modules">Loadable Modules</a>.
-<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
-<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
-<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
-<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
-<li> <a href="#Memory Efficiency">Memory Efficiency</a>.
-<li> <a href="#Performance, Scalability, Response Time, and Reliability">
- Performance, Scalability, Response Time, and Reliability</a>.
-</ol>
-
-<p>
-This list is probably incomplete, but it does give a feel for the
-most notable Linux-kernel complications.
-Each of the following sections covers one of the above topics.
-
-<h3><a name="Configuration">Configuration</a></h3>
-
-<p>
-RCU's goal is automatic configuration, so that almost nobody
-needs to worry about RCU's <tt>Kconfig</tt> options.
-And for almost all users, RCU does in fact work well
-&ldquo;out of the box.&rdquo;
-
-<p>
-However, there are specialized use cases that are handled by
-kernel boot parameters and <tt>Kconfig</tt> options.
-Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
-about new <tt>Kconfig</tt> options, which requires almost all of them
-be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
-
-<p>
-This all should be quite obvious, but the fact remains that
-Linus Torvalds recently had to
-<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
-me of this requirement.
-
-<h3><a name="Firmware Interface">Firmware Interface</a></h3>
-
-<p>
-In many cases, the kernel obtains information about the system from the
-firmware, and sometimes things are lost in translation.
-Or the translation is accurate, but the original message is bogus.
-
-<p>
-For example, some systems' firmware overreports the number of CPUs,
-sometimes by a large factor.
-If RCU naively believed the firmware, as it used to do,
-it would create too many per-CPU kthreads.
-Although the resulting system will still run correctly, the extra
-kthreads needlessly consume memory and can cause confusion
-when they show up in <tt>ps</tt> listings.
-
-<p>
-RCU must therefore wait for a given CPU to actually come online before
-it can allow itself to believe that the CPU actually exists.
-The resulting &ldquo;ghost CPUs&rdquo; (which are never going to
-come online) cause a number of
-<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
-
-<h3><a name="Early Boot">Early Boot</a></h3>
-
-<p>
-The Linux kernel's boot sequence is an interesting process,
-and RCU is used early, even before <tt>rcu_init()</tt>
-is invoked.
-In fact, a number of RCU's primitives can be used as soon as the
-initial task's <tt>task_struct</tt> is available and the
-boot CPU's per-CPU variables are set up.
-The read-side primitives (<tt>rcu_read_lock()</tt>,
-<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
-and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
-as will <tt>rcu_assign_pointer()</tt>.
-
-<p>
-Although <tt>call_rcu()</tt> may be invoked at any
-time during boot, callbacks are not guaranteed to be invoked until after
-the scheduler is fully up and running.
-This delay in callback invocation is due to the fact that RCU does not
-invoke callbacks until it is fully initialized, and this full initialization
-cannot occur until after the scheduler has initialized itself to the
-point where RCU can spawn and run its kthreads.
-In theory, it would be possible to invoke callbacks earlier,
-however, this is not a panacea because there would be severe restrictions
-on what operations those callbacks could invoke.
-
-<p>
-Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
-<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
-(<a href="#Bottom-Half Flavor">discussed below</a>),
-and
-<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
-will all operate normally
-during very early boot, the reason being that there is only one CPU
-and preemption is disabled.
-This means that the call to <tt>synchronize_rcu()</tt> (or friends)
-itself is a quiescent
-state and thus a grace period, so the early-boot implementation can
-be a no-op.
-
-<p>
-Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
-continue to operate normally through the remainder of boot, courtesy
-of the fact that preemption is disabled across their RCU read-side
-critical sections and also courtesy of the fact that there is still
-only one CPU.
-However, once the scheduler starts initializing, preemption is enabled.
-There is still only a single CPU, but the fact that preemption is enabled
-means that the no-op implementation of <tt>synchronize_rcu()</tt> no
-longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
-Therefore, as soon as the scheduler starts initializing, the early-boot
-fastpath is disabled.
-This means that <tt>synchronize_rcu()</tt> switches to its runtime
-mode of operation where it posts callbacks, which in turn means that
-any call to <tt>synchronize_rcu()</tt> will block until the corresponding
-callback is invoked.
-Unfortunately, the callback cannot be invoked until RCU's runtime
-grace-period machinery is up and running, which cannot happen until
-the scheduler has initialized itself sufficiently to allow RCU's
-kthreads to be spawned.
-Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
-initialization can result in deadlock.
-
-<p>@@QQ@@
-So what happens with <tt>synchronize_rcu()</tt> during
-scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
-kernels?
-<p>@@QQA@@
-In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
-maps directly to <tt>synchronize_sched()</tt>.
-Therefore, <tt>synchronize_rcu()</tt> works normally throughout
-boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
-However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
-so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
-during scheduler initialization.
-<p>@@QQE@@
-
-<p>
-I learned of these boot-time requirements as a result of a series of
-system hangs.
-
-<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
-
-<p>
-The Linux kernel has interrupts, and RCU read-side critical sections are
-legal within interrupt handlers and within interrupt-disabled regions
-of code, as are invocations of <tt>call_rcu()</tt>.
-
-<p>
-Some Linux-kernel architectures can enter an interrupt handler from
-non-idle process context, and then just never leave it, instead stealthily
-transitioning back to process context.
-This trick is sometimes used to invoke system calls from inside the kernel.
-These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful
-about how it counts interrupt nesting levels.
-I learned of this requirement the hard way during a rewrite
-of RCU's dyntick-idle code.
-
-<p>
-The Linux kernel has non-maskable interrupts (NMIs), and
-RCU read-side critical sections are legal within NMI handlers.
-Thankfully, RCU update-side primitives, including
-<tt>call_rcu()</tt>, are prohibited within NMI handlers.
-
-<p>
-The name notwithstanding, some Linux-kernel architectures
-can have nested NMIs, which RCU must handle correctly.
-Andy Lutomirski
-<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
-with this requirement;
-he also kindly surprised me with
-<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
-that meets this requirement.
-
-<h3><a name="Loadable Modules">Loadable Modules</a></h3>
-
-<p>
-The Linux kernel has loadable modules, and these modules can
-also be unloaded.
-After a given module has been unloaded, any attempt to call
-one of its functions results in a segmentation fault.
-The module-unload functions must therefore cancel any
-delayed calls to loadable-module functions, for example,
-any outstanding <tt>mod_timer()</tt> must be dealt with
-via <tt>del_timer_sync()</tt> or similar.
-
-<p>
-Unfortunately, there is no way to cancel an RCU callback;
-once you invoke <tt>call_rcu()</tt>, the callback function is
-going to eventually be invoked, unless the system goes down first.
-Because it is normally considered socially irresponsible to crash the system
-in response to a module unload request, we need some other way
-to deal with in-flight RCU callbacks.
-
-<p>
-RCU therefore provides
-<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
-which waits until all in-flight RCU callbacks have been invoked.
-If a module uses <tt>call_rcu()</tt>, its exit function should therefore
-prevent any future invocation of <tt>call_rcu()</tt>, then invoke
-<tt>rcu_barrier()</tt>.
-In theory, the underlying module-unload code could invoke
-<tt>rcu_barrier()</tt> unconditionally, but in practice this would
-incur unacceptable latencies.
-
-<p>
-Nikita Danilov noted this requirement for an analogous filesystem-unmount
-situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
-The need for <tt>rcu_barrier()</tt> for module unloading became
-apparent later.
-
-<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
-
-<p>
-The Linux kernel supports CPU hotplug, which means that CPUs
-can come and go.
-It is of course illegal to use any RCU API member from an offline CPU.
-This requirement was present from day one in DYNIX/ptx, but
-on the other hand, the Linux kernel's CPU-hotplug implementation
-is &ldquo;interesting.&rdquo;
-
-<p>
-The Linux-kernel CPU-hotplug implementation has notifiers that
-are used to allow the various kernel subsystems (including RCU)
-to respond appropriately to a given CPU-hotplug operation.
-Most RCU operations may be invoked from CPU-hotplug notifiers,
-including even normal synchronous grace-period operations
-such as <tt>synchronize_rcu()</tt>.
-However, expedited grace-period operations such as
-<tt>synchronize_rcu_expedited()</tt> are not supported,
-due to the fact that current implementations block CPU-hotplug
-operations, which could result in deadlock.
-
-<p>
-In addition, all-callback-wait operations such as
-<tt>rcu_barrier()</tt> are also not supported, due to the
-fact that there are phases of CPU-hotplug operations where
-the outgoing CPU's callbacks will not be invoked until after
-the CPU-hotplug operation ends, which could also result in deadlock.
-
-<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
-
-<p>
-RCU depends on the scheduler, and the scheduler uses RCU to
-protect some of its data structures.
-This means the scheduler is forbidden from acquiring
-the runqueue locks and the priority-inheritance locks
-in the middle of an outermost RCU read-side critical section unless either
-(1)&nbsp;it releases them before exiting that same
-RCU read-side critical section, or
-(2)&nbsp;interrupts are disabled across
-that entire RCU read-side critical section.
-This same prohibition also applies (recursively!) to any lock that is acquired
-while holding any lock to which this prohibition applies.
-Adhering to this rule prevents preemptible RCU from invoking
-<tt>rcu_read_unlock_special()</tt> while either runqueue or
-priority-inheritance locks are held, thus avoiding deadlock.
-
-<p>
-Prior to v4.4, it was only necessary to disable preemption across
-RCU read-side critical sections that acquired scheduler locks.
-In v4.4, expedited grace periods started using IPIs, and these
-IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
-Therefore, this expedited-grace-period change required disabling of
-interrupts, not just preemption.
-
-<p>
-For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
-implementation must be written carefully to avoid similar deadlocks.
-In particular, <tt>rcu_read_unlock()</tt> must tolerate an
-interrupt where the interrupt handler invokes both
-<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
-This possibility requires <tt>rcu_read_unlock()</tt> to use
-negative nesting levels to avoid destructive recursion via
-interrupt handler's use of RCU.
-
-<p>
-This pair of mutual scheduler-RCU requirements came as a
-<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
-
-<p>
-As noted above, RCU makes use of kthreads, and it is necessary to
-avoid excessive CPU-time accumulation by these kthreads.
-This requirement was no surprise, but RCU's violation of it
-when running context-switch-heavy workloads when built with
-<tt>CONFIG_NO_HZ_FULL=y</tt>
-<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
-RCU has made good progress towards meeting this requirement, even
-for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
-but there is room for further improvement.
-
-<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
-
-<p>
-It is possible to use tracing on RCU code, but tracing itself
-uses RCU.
-For this reason, <tt>rcu_dereference_raw_notrace()</tt>
-is provided for use by tracing, which avoids the destructive
-recursion that could otherwise ensue.
-This API is also used by virtualization in some architectures,
-where RCU readers execute in environments in which tracing
-cannot be used.
-The tracing folks both located the requirement and provided the
-needed fix, so this surprise requirement was relatively painless.
-
-<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
-
-<p>
-Interrupting idle CPUs is considered socially unacceptable,
-especially by people with battery-powered embedded systems.
-RCU therefore conserves energy by detecting which CPUs are
-idle, including tracking CPUs that have been interrupted from idle.
-This is a large part of the energy-efficiency requirement,
-so I learned of this via an irate phone call.
-
-<p>
-Because RCU avoids interrupting idle CPUs, it is illegal to
-execute an RCU read-side critical section on an idle CPU.
-(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
-if you try it.)
-The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
-event tracing are provided to work around this restriction.
-In addition, <tt>rcu_is_watching()</tt> may be used to
-test whether or not it is currently legal to run RCU read-side
-critical sections on this CPU.
-I learned of the need for diagnostics on the one hand
-and <tt>RCU_NONIDLE()</tt> on the other while inspecting
-idle-loop code.
-Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
-which is used quite heavily in the idle loop.
-
-<p>
-It is similarly socially unacceptable to interrupt an
-<tt>nohz_full</tt> CPU running in userspace.
-RCU must therefore track <tt>nohz_full</tt> userspace
-execution.
-And in
-<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
-kernels, RCU must separately track idle CPUs on the one hand and
-CPUs that are either idle or executing in userspace on the other.
-In both cases, RCU must be able to sample state at two points in
-time, and be able to determine whether or not some other CPU spent
-any time idle and/or executing in userspace.
-
-<p>
-These energy-efficiency requirements have proven quite difficult to
-understand and to meet, for example, there have been more than five
-clean-sheet rewrites of RCU's energy-efficiency code, the last of
-which was finally able to demonstrate
-<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
-As noted earlier,
-I learned of many of these requirements via angry phone calls:
-Flaming me on the Linux-kernel mailing list was apparently not
-sufficient to fully vent their ire at RCU's energy-efficiency bugs!
-
-<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
-
-<p>
-Although small-memory non-realtime systems can simply use Tiny RCU,
-code size is only one aspect of memory efficiency.
-Another aspect is the size of the <tt>rcu_head</tt> structure
-used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>.
-Although this structure contains nothing more than a pair of pointers,
-it does appear in many RCU-protected data structures, including
-some that are size critical.
-The <tt>page</tt> structure is a case in point, as evidenced by
-the many occurrences of the <tt>union</tt> keyword within that structure.
-
-<p>
-This need for memory efficiency is one reason that RCU uses hand-crafted
-singly linked lists to track the <tt>rcu_head</tt> structures that
-are waiting for a grace period to elapse.
-It is also the reason why <tt>rcu_head</tt> structures do not contain
-debug information, such as fields tracking the file and line of the
-<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them.
-Although this information might appear in debug-only kernel builds at some
-point, in the meantime, the <tt>-&gt;func</tt> field will often provide
-the needed debug information.
-
-<p>
-However, in some cases, the need for memory efficiency leads to even
-more extreme measures.
-Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field
-shares storage with a great many other structures that are used at
-various points in the corresponding page's lifetime.
-In order to correctly resolve certain
-<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>,
-the Linux kernel's memory-management subsystem needs a particular bit
-to remain zero during all phases of grace-period processing,
-and that bit happens to map to the bottom bit of the
-<tt>rcu_head</tt> structure's <tt>-&gt;next</tt> field.
-RCU makes this guarantee as long as <tt>call_rcu()</tt>
-is used to post the callback, as opposed to <tt>kfree_rcu()</tt>
-or some future &ldquo;lazy&rdquo;
-variant of <tt>call_rcu()</tt> that might one day be created for
-energy-efficiency purposes.
-
-<h3><a name="Performance, Scalability, Response Time, and Reliability">
-Performance, Scalability, Response Time, and Reliability</a></h3>
-
-<p>
-Expanding on the
-<a href="#Performance and Scalability">earlier discussion</a>,
-RCU is used heavily by hot code paths in performance-critical
-portions of the Linux kernel's networking, security, virtualization,
-and scheduling code paths.
-RCU must therefore use efficient implementations, especially in its
-read-side primitives.
-To that end, it would be good if preemptible RCU's implementation
-of <tt>rcu_read_lock()</tt> could be inlined, however, doing
-this requires resolving <tt>#include</tt> issues with the
-<tt>task_struct</tt> structure.
-
-<p>
-The Linux kernel supports hardware configurations with up to
-4096 CPUs, which means that RCU must be extremely scalable.
-Algorithms that involve frequent acquisitions of global locks or
-frequent atomic operations on global variables simply cannot be
-tolerated within the RCU implementation.
-RCU therefore makes heavy use of a combining tree based on the
-<tt>rcu_node</tt> structure.
-RCU is required to tolerate all CPUs continuously invoking any
-combination of RCU's runtime primitives with minimal per-operation
-overhead.
-In fact, in many cases, increasing load must <i>decrease</i> the
-per-operation overhead, witness the batching optimizations for
-<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
-<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
-As a general rule, RCU must cheerfully accept whatever the
-rest of the Linux kernel decides to throw at it.
-
-<p>
-The Linux kernel is used for real-time workloads, especially
-in conjunction with the
-<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
-The real-time-latency response requirements are such that the
-traditional approach of disabling preemption across RCU
-read-side critical sections is inappropriate.
-Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
-use an RCU implementation that allows RCU read-side critical
-sections to be preempted.
-This requirement made its presence known after users made it
-clear that an earlier
-<a href="https://lwn.net/Articles/107930/">real-time patch</a>
-did not meet their needs, in conjunction with some
-<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
-encountered by a very early version of the -rt patchset.
-
-<p>
-In addition, RCU must make do with a sub-100-microsecond real-time latency
-budget.
-In fact, on smaller systems with the -rt patchset, the Linux kernel
-provides sub-20-microsecond real-time latencies for the whole kernel,
-including RCU.
-RCU's scalability and latency must therefore be sufficient for
-these sorts of configurations.
-To my surprise, the sub-100-microsecond real-time latency budget
-<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
-applies to even the largest systems [PDF]</a>,
-up to and including systems with 4096 CPUs.
-This real-time requirement motivated the grace-period kthread, which
-also simplified handling of a number of race conditions.
-
-<p>
-Finally, RCU's status as a synchronization primitive means that
-any RCU failure can result in arbitrary memory corruption that can be
-extremely difficult to debug.
-This means that RCU must be extremely reliable, which in
-practice also means that RCU must have an aggressive stress-test
-suite.
-This stress-test suite is called <tt>rcutorture</tt>.
-
-<p>
-Although the need for <tt>rcutorture</tt> was no surprise,
-the current immense popularity of the Linux kernel is posing
-interesting&mdash;and perhaps unprecedented&mdash;validation
-challenges.
-To see this, keep in mind that there are well over one billion
-instances of the Linux kernel running today, given Android
-smartphones, Linux-powered televisions, and servers.
-This number can be expected to increase sharply with the advent of
-the celebrated Internet of Things.
-
-<p>
-Suppose that RCU contains a race condition that manifests on average
-once per million years of runtime.
-This bug will be occurring about three times per <i>day</i> across
-the installed base.
-RCU could simply hide behind hardware error rates, given that no one
-should really expect their smartphone to last for a million years.
-However, anyone taking too much comfort from this thought should
-consider the fact that in most jurisdictions, a successful multi-year
-test of a given mechanism, which might include a Linux kernel,
-suffices for a number of types of safety-critical certifications.
-In fact, rumor has it that the Linux kernel is already being used
-in production for safety-critical applications.
-I don't know about you, but I would feel quite bad if a bug in RCU
-killed someone.
-Which might explain my recent focus on validation and verification.
-
-<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
-
-<p>
-One of the more surprising things about RCU is that there are now
-no fewer than five <i>flavors</i>, or API families.
-In addition, the primary flavor that has been the sole focus up to
-this point has two different implementations, non-preemptible and
-preemptible.
-The other four flavors are listed below, with requirements for each
-described in a separate section.
-
-<ol>
-<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
-<li> <a href="#Sched Flavor">Sched Flavor</a>
-<li> <a href="#Sleepable RCU">Sleepable RCU</a>
-<li> <a href="#Tasks RCU">Tasks RCU</a>
-</ol>
-
-<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
-
-<p>
-The softirq-disable (AKA &ldquo;bottom-half&rdquo;,
-hence the &ldquo;_bh&rdquo; abbreviations)
-flavor of RCU, or <i>RCU-bh</i>, was developed by
-Dipankar Sarma to provide a flavor of RCU that could withstand the
-network-based denial-of-service attacks researched by Robert
-Olsson.
-These attacks placed so much networking load on the system
-that some of the CPUs never exited softirq execution,
-which in turn prevented those CPUs from ever executing a context switch,
-which, in the RCU implementation of that time, prevented grace periods
-from ever ending.
-The result was an out-of-memory condition and a system hang.
-
-<p>
-The solution was the creation of RCU-bh, which does
-<tt>local_bh_disable()</tt>
-across its read-side critical sections, and which uses the transition
-from one type of softirq processing to another as a quiescent state
-in addition to context switch, idle, user mode, and offline.
-This means that RCU-bh grace periods can complete even when some of
-the CPUs execute in softirq indefinitely, thus allowing algorithms
-based on RCU-bh to withstand network-based denial-of-service attacks.
-
-<p>
-Because
-<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
-disable and re-enable softirq handlers, any attempt to start a softirq
-handler during the
-RCU-bh read-side critical section will be deferred.
-In this case, <tt>rcu_read_unlock_bh()</tt>
-will invoke softirq processing, which can take considerable time.
-One can of course argue that this softirq overhead should be associated
-with the code following the RCU-bh read-side critical section rather
-than <tt>rcu_read_unlock_bh()</tt>, but the fact
-is that most profiling tools cannot be expected to make this sort
-of fine distinction.
-For example, suppose that a three-millisecond-long RCU-bh read-side
-critical section executes during a time of heavy networking load.
-There will very likely be an attempt to invoke at least one softirq
-handler during that three milliseconds, but any such invocation will
-be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
-This can of course make it appear at first glance as if
-<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
-includes
-<tt>rcu_read_lock_bh()</tt>,
-<tt>rcu_read_unlock_bh()</tt>,
-<tt>rcu_dereference_bh()</tt>,
-<tt>rcu_dereference_bh_check()</tt>,
-<tt>synchronize_rcu_bh()</tt>,
-<tt>synchronize_rcu_bh_expedited()</tt>,
-<tt>call_rcu_bh()</tt>,
-<tt>rcu_barrier_bh()</tt>, and
-<tt>rcu_read_lock_bh_held()</tt>.
-
-<h3><a name="Sched Flavor">Sched Flavor</a></h3>
-
-<p>
-Before preemptible RCU, waiting for an RCU grace period had the
-side effect of also waiting for all pre-existing interrupt
-and NMI handlers.
-However, there are legitimate preemptible-RCU implementations that
-do not have this property, given that any point in the code outside
-of an RCU read-side critical section can be a quiescent state.
-Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo;
-RCU in that an RCU-sched grace period waits for pre-existing
-interrupt and NMI handlers.
-In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
-APIs have identical implementations, while kernels built with
-<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
-
-<p>
-Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
-<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
-disable and re-enable preemption, respectively.
-This means that if there was a preemption attempt during the
-RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
-will enter the scheduler, with all the latency and overhead entailed.
-Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
-as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
-However, the highest-priority task won't be preempted, so that task
-will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
-includes
-<tt>rcu_read_lock_sched()</tt>,
-<tt>rcu_read_unlock_sched()</tt>,
-<tt>rcu_read_lock_sched_notrace()</tt>,
-<tt>rcu_read_unlock_sched_notrace()</tt>,
-<tt>rcu_dereference_sched()</tt>,
-<tt>rcu_dereference_sched_check()</tt>,
-<tt>synchronize_sched()</tt>,
-<tt>synchronize_rcu_sched_expedited()</tt>,
-<tt>call_rcu_sched()</tt>,
-<tt>rcu_barrier_sched()</tt>, and
-<tt>rcu_read_lock_sched_held()</tt>.
-However, anything that disables preemption also marks an RCU-sched
-read-side critical section, including
-<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
-<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
-and so on.
-
-<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
-
-<p>
-For well over a decade, someone saying &ldquo;I need to block within
-an RCU read-side critical section&rdquo; was a reliable indication
-that this someone did not understand RCU.
-After all, if you are always blocking in an RCU read-side critical
-section, you can probably afford to use a higher-overhead synchronization
-mechanism.
-However, that changed with the advent of the Linux kernel's notifiers,
-whose RCU read-side critical
-sections almost never sleep, but sometimes need to.
-This resulted in the introduction of
-<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
-or <i>SRCU</i>.
-
-<p>
-SRCU allows different domains to be defined, with each such domain
-defined by an instance of an <tt>srcu_struct</tt> structure.
-A pointer to this structure must be passed in to each SRCU function,
-for example, <tt>synchronize_srcu(&amp;ss)</tt>, where
-<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
-The key benefit of these domains is that a slow SRCU reader in one
-domain does not delay an SRCU grace period in some other domain.
-That said, one consequence of these domains is that read-side code
-must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt>
-to <tt>srcu_read_unlock()</tt>, for example, as follows:
-
-<blockquote>
-<pre>
- 1 int idx;
- 2
- 3 idx = srcu_read_lock(&amp;ss);
- 4 do_something();
- 5 srcu_read_unlock(&amp;ss, idx);
-</pre>
-</blockquote>
-
-<p>
-As noted above, it is legal to block within SRCU read-side critical sections,
-however, with great power comes great responsibility.
-If you block forever in one of a given domain's SRCU read-side critical
-sections, then that domain's grace periods will also be blocked forever.
-Of course, one good way to block forever is to deadlock, which can
-happen if any operation in a given domain's SRCU read-side critical
-section can block waiting, either directly or indirectly, for that domain's
-grace period to elapse.
-For example, this results in a self-deadlock:
-
-<blockquote>
-<pre>
- 1 int idx;
- 2
- 3 idx = srcu_read_lock(&amp;ss);
- 4 do_something();
- 5 synchronize_srcu(&amp;ss);
- 6 srcu_read_unlock(&amp;ss, idx);
-</pre>
-</blockquote>
-
-<p>
-However, if line&nbsp;5 acquired a mutex that was held across
-a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
-deadlock would still be possible.
-Furthermore, if line&nbsp;5 acquired a mutex that was held across
-a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
-and if an <tt>ss1</tt>-domain SRCU read-side critical section
-acquired another mutex that was held across an <tt>ss</tt>-domain
-<tt>synchronize_srcu()</tt>,
-deadlock would again be possible.
-Such a deadlock cycle could extend across an arbitrarily large number
-of different SRCU domains.
-Again, with great power comes great responsibility.
-
-<p>
-Unlike the other RCU flavors, SRCU read-side critical sections can
-run on idle and even offline CPUs.
-This ability requires that <tt>srcu_read_lock()</tt> and
-<tt>srcu_read_unlock()</tt> contain memory barriers, which means
-that SRCU readers will run a bit slower than would RCU readers.
-It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
-API, which, in combination with <tt>srcu_read_unlock()</tt>,
-guarantees a full memory barrier.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
-includes
-<tt>srcu_read_lock()</tt>,
-<tt>srcu_read_unlock()</tt>,
-<tt>srcu_dereference()</tt>,
-<tt>srcu_dereference_check()</tt>,
-<tt>synchronize_srcu()</tt>,
-<tt>synchronize_srcu_expedited()</tt>,
-<tt>call_srcu()</tt>,
-<tt>srcu_barrier()</tt>, and
-<tt>srcu_read_lock_held()</tt>.
-It also includes
-<tt>DEFINE_SRCU()</tt>,
-<tt>DEFINE_STATIC_SRCU()</tt>, and
-<tt>init_srcu_struct()</tt>
-APIs for defining and initializing <tt>srcu_struct</tt> structures.
-
-<h3><a name="Tasks RCU">Tasks RCU</a></h3>
-
-<p>
-Some forms of tracing use &ldquo;trampolines&rdquo; to handle the
-binary rewriting required to install different types of probes.
-It would be good to be able to free old trampolines, which sounds
-like a job for some form of RCU.
-However, because it is necessary to be able to install a trace
-anywhere in the code, it is not possible to use read-side markers
-such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
-In addition, it does not work to have these markers in the trampoline
-itself, because there would need to be instructions following
-<tt>rcu_read_unlock()</tt>.
-Although <tt>synchronize_rcu()</tt> would guarantee that execution
-reached the <tt>rcu_read_unlock()</tt>, it would not be able to
-guarantee that execution had completely left the trampoline.
-
-<p>
-The solution, in the form of
-<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
-is to have implicit
-read-side critical sections that are delimited by voluntary context
-switches, that is, calls to <tt>schedule()</tt>,
-<tt>cond_resched_rcu_qs()</tt>, and
-<tt>synchronize_rcu_tasks()</tt>.
-In addition, transitions to and from userspace execution also delimit
-tasks-RCU read-side critical sections.
-
-<p>
-The tasks-RCU API is quite compact, consisting only of
-<tt>call_rcu_tasks()</tt>,
-<tt>synchronize_rcu_tasks()</tt>, and
-<tt>rcu_barrier_tasks()</tt>.
-
-<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
-
-<p>
-One of the tricks that RCU uses to attain update-side scalability is
-to increase grace-period latency with increasing numbers of CPUs.
-If this becomes a serious problem, it will be necessary to rework the
-grace-period state machine so as to avoid the need for the additional
-latency.
-
-<p>
-Expedited grace periods scan the CPUs, so their latency and overhead
-increase with increasing numbers of CPUs.
-If this becomes a serious problem on large systems, it will be necessary
-to do some redesign to avoid this scalability problem.
-
-<p>
-RCU disables CPU hotplug in a few places, perhaps most notably in the
-expedited grace-period and <tt>rcu_barrier()</tt> operations.
-If there is a strong reason to use expedited grace periods in CPU-hotplug
-notifiers, it will be necessary to avoid disabling CPU hotplug.
-This would introduce some complexity, so there had better be a <i>very</i>
-good reason.
-
-<p>
-The tradeoff between grace-period latency on the one hand and interruptions
-of other CPUs on the other hand may need to be re-examined.
-The desire is of course for zero grace-period latency as well as zero
-interprocessor interrupts undertaken during an expedited grace period
-operation.
-While this ideal is unlikely to be achievable, it is quite possible that
-further improvements can be made.
-
-<p>
-The multiprocessor implementations of RCU use a combining tree that
-groups CPUs so as to reduce lock contention and increase cache locality.
-However, this combining tree does not spread its memory across NUMA
-nodes nor does it align the CPU groups with hardware features such
-as sockets or cores.
-Such spreading and alignment is currently believed to be unnecessary
-because the hotpath read-side primitives do not access the combining
-tree, nor does <tt>call_rcu()</tt> in the common case.
-If you believe that your architecture needs such spreading and alignment,
-then your architecture should also benefit from the
-<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
-to the number of CPUs in a socket, NUMA node, or whatever.
-If the number of CPUs is too large, use a fraction of the number of
-CPUs.
-If the number of CPUs is a large prime number, well, that certainly
-is an &ldquo;interesting&rdquo; architectural choice!
-More flexible arrangements might be considered, but only if
-<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
-if the inadequacy has been demonstrated by a carefully run and
-realistic system-level workload.
-
-<p>
-Please note that arrangements that require RCU to remap CPU numbers will
-require extremely good demonstration of need and full exploration of
-alternatives.
-
-<p>
-There is an embarrassingly large number of flavors of RCU, and this
-number has been increasing over time.
-Perhaps it will be possible to combine some at some future date.
-
-<p>
-RCU's various kthreads are reasonably recent additions.
-It is quite likely that adjustments will be required to more gracefully
-handle extreme loads.
-It might also be necessary to be able to relate CPU utilization by
-RCU's kthreads and softirq handlers to the code that instigated this
-CPU utilization.
-For example, RCU callback overhead might be charged back to the
-originating <tt>call_rcu()</tt> instance, though probably not
-in production kernels.
-
-<h2><a name="Summary">Summary</a></h2>
-
-<p>
-This document has presented more than two decades' worth of RCU
-requirements.
-Given that the requirements keep changing, this will not be the last
-word on this subject, but at least it serves to get an important
-subset of the requirements set forth.
-
-<h2><a name="Acknowledgments">Acknowledgments</a></h2>
-
-I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
-Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
-Andy Lutomirski for their help in rendering
-this article human readable, and to Michelle Rankin for her support
-of this effort.
-Other contributions are acknowledged in the Linux kernel's git archive.
-The cartoon is copyright (c) 2013 by Melissa Broussard,
-and is provided
-under the terms of the Creative Commons Attribution-Share Alike 3.0
-United States license.
-
-<p>@@QQAL@@
-
-</body></html>
diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh
deleted file mode 100755
index d354f069559b..000000000000
--- a/Documentation/RCU/Design/htmlqqz.sh
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/bin/sh
-#
-# Usage: sh htmlqqz.sh file
-#
-# Extracts and converts quick quizzes in a proto-HTML document file.htmlx.
-# Commands, all of which must be on a line by themselves:
-#
-# "<p>@@QQ@@": Start of a quick quiz.
-# "<p>@@QQA@@": Start of a quick-quiz answer.
-# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz.
-# "<p>@@QQAL@@": Place to put quick-quiz answer list.
-#
-# Places the result in file.html.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, you can access it online at
-# http://www.gnu.org/licenses/gpl-2.0.html.
-#
-# Copyright (c) 2013 Paul E. McKenney, IBM Corporation.
-
-fn=$1
-if test ! -r $fn.htmlx
-then
- echo "Error: $fn.htmlx unreadable."
- exit 1
-fi
-
-echo "<!-- DO NOT HAND EDIT. -->" > $fn.html
-echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html
-awk < $fn.htmlx >> $fn.html '
-
-state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" {
- print $0;
- if ($0 ~ /^<p>@@QQ/)
- print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr"
- next;
-}
-
-state == "" && $1 == "<p>@@QQ@@" {
- qqn++;
- qqlineno = NR;
- haveqq = 1;
- state = "qq";
- print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>"
- next;
-}
-
-state == "qq" && $1 != "<p>@@QQA@@" {
- qq[qqn] = qq[qqn] $0 "\n";
- print $0
- if ($0 ~ /^<p>@@QQ/)
- print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr"
- next;
-}
-
-state == "qq" && $1 == "<p>@@QQA@@" {
- state = "qqa";
- print "<br><a href=\"#qq" qqn "answer\">Answer</a>"
- next;
-}
-
-state == "qqa" && $1 != "<p>@@QQE@@" {
- qqa[qqn] = qqa[qqn] $0 "\n";
- if ($0 ~ /^<p>@@QQ/)
- print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr"
- next;
-}
-
-state == "qqa" && $1 == "<p>@@QQE@@" {
- state = "";
- next;
-}
-
-state == "" && $1 == "<p>@@QQAL@@" {
- haveqq = "";
- print "<h3><a name=\"Answers to Quick Quizzes\">"
- print "Answers to Quick Quizzes</a></h3>"
- print "";
- for (i = 1; i <= qqn; i++) {
- print "<a name=\"qq" i "answer\"></a>"
- print "<p><b>Quick Quiz " i "</b>:"
- print qq[i];
- print "";
- print "</p><p><b>Answer</b>:"
- print qqa[i];
- print "";
- print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>"
- print "";
- }
- next;
-}
-
-END {
- if (state != "")
- print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr"
- else if (haveqq)
- print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr"
-}'
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index ec6998b1b6d0..00a3a38b375a 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -237,17 +237,17 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
-s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872
+s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872
These fields are as follows:
o "s" is the sequence number, with an odd number indicating that
an expedited grace period is in progress.
-o "wd0", "wd1", "wd2", and "wd3" are the number of times that an
- attempt to start an expedited grace period found that someone
- else had completed an expedited grace period that satisfies the
- attempted request. "Our work is done."
+o "wd1", "wd2", and "wd3" are the number of times that an attempt
+ to start an expedited grace period found that someone else had
+ completed an expedited grace period that satisfies the attempted
+ request. "Our work is done."
o "n" is number of times that a concurrent CPU-hotplug operation
forced a fallback to a normal grace period.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index dc49c6712b17..111770ffa10e 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -681,22 +681,30 @@ Although RCU can be used in many different ways, a very common use of
RCU is analogous to reader-writer locking. The following unified
diff shows how closely related RCU and reader-writer locking can be.
+ @@ -5,5 +5,5 @@ struct el {
+ int data;
+ /* Other data fields */
+ };
+ -rwlock_t listmutex;
+ +spinlock_t listmutex;
+ struct el head;
+
@@ -13,15 +14,15 @@
struct list_head *lp;
struct el *p;
- - read_lock();
+ - read_lock(&listmutex);
- list_for_each_entry(p, head, lp) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, head, lp) {
if (p->key == key) {
*result = p->data;
- - read_unlock();
+ - read_unlock(&listmutex);
+ rcu_read_unlock();
return 1;
}
}
- - read_unlock();
+ - read_unlock(&listmutex);
+ rcu_read_unlock();
return 0;
}
@@ -732,7 +740,7 @@ Or, for those who prefer a side-by-side listing:
5 int data; 5 int data;
6 /* Other data fields */ 6 /* Other data fields */
7 }; 7 };
- 8 spinlock_t listmutex; 8 spinlock_t listmutex;
+ 8 rwlock_t listmutex; 8 spinlock_t listmutex;
9 struct el head; 9 struct el head;
1 int search(long key, int *result) 1 int search(long key, int *result)
@@ -740,15 +748,15 @@ Or, for those who prefer a side-by-side listing:
3 struct list_head *lp; 3 struct list_head *lp;
4 struct el *p; 4 struct el *p;
5 5
- 6 read_lock(); 6 rcu_read_lock();
+ 6 read_lock(&listmutex); 6 rcu_read_lock();
7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
8 if (p->key == key) { 8 if (p->key == key) {
9 *result = p->data; 9 *result = p->data;
-10 read_unlock(); 10 rcu_read_unlock();
+10 read_unlock(&listmutex); 10 rcu_read_unlock();
11 return 1; 11 return 1;
12 } 12 }
13 } 13 }
-14 read_unlock(); 14 rcu_read_unlock();
+14 read_unlock(&listmutex); 14 rcu_read_unlock();
15 return 0; 15 return 0;
16 } 16 }
diff --git a/Documentation/devicetree/bindings/regmap/regmap.txt b/Documentation/devicetree/bindings/regmap/regmap.txt
index e98a9652ccc8..0127be360fe8 100644
--- a/Documentation/devicetree/bindings/regmap/regmap.txt
+++ b/Documentation/devicetree/bindings/regmap/regmap.txt
@@ -1,50 +1,29 @@
-Device-Tree binding for regmap
-
-The endianness mode of CPU & Device scenarios:
-Index Device Endianness properties
----------------------------------------------------
-1 BE 'big-endian'
-2 LE 'little-endian'
-3 Native 'native-endian'
-
-For one device driver, which will run in different scenarios above
-on different SoCs using the devicetree, we need one way to simplify
-this.
+Devicetree binding for regmap
Optional properties:
-- {big,little,native}-endian: these are boolean properties, if absent
- then the implementation will choose a default based on the device
- being controlled. These properties are for register values and all
- the buffers only. Native endian means that the CPU and device have
- the same endianness.
-Examples:
-Scenario 1 : CPU in LE mode & device in LE mode.
-dev: dev@40031000 {
- compatible = "name";
- reg = <0x40031000 0x1000>;
- ...
-};
+ little-endian,
+ big-endian,
+ native-endian: See common-properties.txt for a definition
-Scenario 2 : CPU in LE mode & device in BE mode.
-dev: dev@40031000 {
- compatible = "name";
- reg = <0x40031000 0x1000>;
- ...
- big-endian;
-};
+Note:
+Regmap defaults to little-endian register access on MMIO based
+devices, this is by far the most common setting. On CPU
+architectures that typically run big-endian operating systems
+(e.g. PowerPC), registers can be defined as big-endian and must
+be marked that way in the devicetree.
-Scenario 3 : CPU in BE mode & device in BE mode.
-dev: dev@40031000 {
- compatible = "name";
- reg = <0x40031000 0x1000>;
- ...
-};
+On SoCs that can be operated in both big-endian and little-endian
+modes, with a single hardware switch controlling both the endianness
+of the CPU and a byteswap for MMIO registers (e.g. many Broadcom MIPS
+chips), "native-endian" is used to allow using the same device tree
+blob in both cases.
-Scenario 4 : CPU in BE mode & device in LE mode.
+Examples:
+Scenario 1 : a register set in big-endian mode.
dev: dev@40031000 {
- compatible = "name";
+ compatible = "syscon";
reg = <0x40031000 0x1000>;
+ big-endian;
...
- little-endian;
};
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 0b3de80ec8f6..49673bd30b87 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3284,6 +3284,44 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Lazy RCU callbacks are those which RCU can
prove do nothing more than free memory.
+ rcuperf.gp_exp= [KNL]
+ Measure performance of expedited synchronous
+ grace-period primitives.
+
+ rcuperf.holdoff= [KNL]
+ Set test-start holdoff period. The purpose of
+ this parameter is to delay the start of the
+ test until boot completes in order to avoid
+ interference.
+
+ rcuperf.nreaders= [KNL]
+ Set number of RCU readers. The value -1 selects
+ N, where N is the number of CPUs. A value
+ "n" less than -1 selects N-n+1, where N is again
+ the number of CPUs. For example, -2 selects N
+ (the number of CPUs), -3 selects N+1, and so on.
+ A value of "n" less than or equal to -N selects
+ a single reader.
+
+ rcuperf.nwriters= [KNL]
+ Set number of RCU writers. The values operate
+ the same as for rcuperf.nreaders.
+ N, where N is the number of CPUs
+
+ rcuperf.perf_runnable= [BOOT]
+ Start rcuperf running at boot time.
+
+ rcuperf.shutdown= [KNL]
+ Shut the system down after performance tests
+ complete. This is useful for hands-off automated
+ testing.
+
+ rcuperf.perf_type= [KNL]
+ Specify the RCU implementation to test.
+
+ rcuperf.verbose= [KNL]
+ Enable additional printk() statements.
+
rcutorture.cbflood_inter_holdoff= [KNL]
Set holdoff time (jiffies) between successive
callback-flood tests.
diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt
index 5001280e9d82..9de1c158d44c 100644
--- a/Documentation/locking/lockdep-design.txt
+++ b/Documentation/locking/lockdep-design.txt
@@ -97,7 +97,7 @@ between any two lock-classes:
<hardirq-safe> -> <hardirq-unsafe>
<softirq-safe> -> <softirq-unsafe>
-The first rule comes from the fact the a hardirq-safe lock could be
+The first rule comes from the fact that a hardirq-safe lock could be
taken by a hardirq context, interrupting a hardirq-unsafe lock - and
thus could result in a lock inversion deadlock. Likewise, a softirq-safe
lock could be taken by an softirq context, interrupting a softirq-unsafe
@@ -220,7 +220,7 @@ calculated, which hash is unique for every lock chain. The hash value,
when the chain is validated for the first time, is then put into a hash
table, which hash-table can be checked in a lockfree manner. If the
locking chain occurs again later on, the hash table tells us that we
-dont have to validate the chain again.
+don't have to validate the chain again.
Troubleshooting:
----------------
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 3729cbe60e41..147ae8ec836f 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -4,8 +4,40 @@
By: David Howells <dhowells@redhat.com>
Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ Will Deacon <will.deacon@arm.com>
+ Peter Zijlstra <peterz@infradead.org>
-Contents:
+==========
+DISCLAIMER
+==========
+
+This document is not a specification; it is intentionally (for the sake of
+brevity) and unintentionally (due to being human) incomplete. This document is
+meant as a guide to using the various memory barriers provided by Linux, but
+in case of any doubt (and there are many) please ask.
+
+To repeat, this document is not a specification of what Linux expects from
+hardware.
+
+The purpose of this document is twofold:
+
+ (1) to specify the minimum functionality that one can rely on for any
+ particular barrier, and
+
+ (2) to provide a guide as to how to use the barriers that are available.
+
+Note that an architecture can provide more than the minimum requirement
+for any particular barrier, but if the architecture provides less than
+that, that architecture is incorrect.
+
+Note also that it is possible that a barrier may be a no-op for an
+architecture because the way that arch works renders an explicit barrier
+unnecessary in that case.
+
+
+========
+CONTENTS
+========
(*) Abstract memory access model.
@@ -31,15 +63,15 @@ Contents:
(*) Implicit kernel memory barriers.
- - Locking functions.
+ - Lock acquisition functions.
- Interrupt disabling functions.
- Sleep and wake-up functions.
- Miscellaneous functions.
- (*) Inter-CPU locking barrier effects.
+ (*) Inter-CPU acquiring barrier effects.
- - Locks vs memory accesses.
- - Locks vs I/O accesses.
+ - Acquires vs memory accesses.
+ - Acquires vs I/O accesses.
(*) Where are memory barriers needed?
@@ -61,6 +93,7 @@ Contents:
(*) The things CPUs get up to.
- And then there's the Alpha.
+ - Virtual Machine Guests.
(*) Example uses.
@@ -148,7 +181,7 @@ As a further example, consider this sequence of events:
CPU 1 CPU 2
=============== ===============
- { A == 1, B == 2, C = 3, P == &A, Q == &C }
+ { A == 1, B == 2, C == 3, P == &A, Q == &C }
B = 4; Q = P;
P = &B D = *Q;
@@ -430,8 +463,9 @@ And a couple of implicit varieties:
This acts as a one-way permeable barrier. It guarantees that all memory
operations after the ACQUIRE operation will appear to happen after the
ACQUIRE operation with respect to the other components of the system.
- ACQUIRE operations include LOCK operations and smp_load_acquire()
- operations.
+ ACQUIRE operations include LOCK operations and both smp_load_acquire()
+ and smp_cond_acquire() operations. The latter builds the necessary ACQUIRE
+ semantics from relying on a control dependency and smp_rmb().
Memory operations that occur before an ACQUIRE operation may appear to
happen after it completes.
@@ -464,6 +498,11 @@ And a couple of implicit varieties:
This means that ACQUIRE acts as a minimal "acquire" operation and
RELEASE acts as a minimal "release" operation.
+A subset of the atomic operations described in atomic_ops.txt have ACQUIRE
+and RELEASE variants in addition to fully-ordered and relaxed (no barrier
+semantics) definitions. For compound atomics performing both a load and a
+store, ACQUIRE semantics apply only to the load and RELEASE semantics apply
+only to the store portion of the operation.
Memory barriers are only required where there's a possibility of interaction
between two CPUs or between a CPU and a device. If it can be guaranteed that
@@ -517,7 +556,7 @@ following sequence of events:
CPU 1 CPU 2
=============== ===============
- { A == 1, B == 2, C = 3, P == &A, Q == &C }
+ { A == 1, B == 2, C == 3, P == &A, Q == &C }
B = 4;
<write barrier>
WRITE_ONCE(P, &B)
@@ -544,7 +583,7 @@ between the address load and the data load:
CPU 1 CPU 2
=============== ===============
- { A == 1, B == 2, C = 3, P == &A, Q == &C }
+ { A == 1, B == 2, C == 3, P == &A, Q == &C }
B = 4;
<write barrier>
WRITE_ONCE(P, &B);
@@ -813,9 +852,10 @@ In summary:
the same variable, then those stores must be ordered, either by
preceding both of them with smp_mb() or by using smp_store_release()
to carry out the stores. Please note that it is -not- sufficient
- to use barrier() at beginning of each leg of the "if" statement,
- as optimizing compilers do not necessarily respect barrier()
- in this case.
+ to use barrier() at beginning of each leg of the "if" statement
+ because, as shown by the example above, optimizing compilers can
+ destroy the control dependency while respecting the letter of the
+ barrier() law.
(*) Control dependencies require at least one run-time conditional
between the prior load and the subsequent store, and this
@@ -1731,15 +1771,15 @@ The Linux kernel has eight basic CPU memory barriers:
All memory barriers except the data dependency barriers imply a compiler
-barrier. Data dependencies do not impose any additional compiler ordering.
+barrier. Data dependencies do not impose any additional compiler ordering.
Aside: In the case of data dependencies, the compiler would be expected
to issue the loads in the correct order (eg. `a[b]` would have to load
the value of b before loading a[b]), however there is no guarantee in
the C specification that the compiler may not speculate the value of b
(eg. is equal to 1) and load a before b (eg. tmp = a[1]; if (b != 1)
-tmp = a[b]; ). There is also the problem of a compiler reloading b after
-having loaded a[b], thus having a newer copy of b than a[b]. A consensus
+tmp = a[b]; ). There is also the problem of a compiler reloading b after
+having loaded a[b], thus having a newer copy of b than a[b]. A consensus
has not yet been reached about these problems, however the READ_ONCE()
macro is a good place to start looking.
@@ -1794,6 +1834,7 @@ There are some more advanced barrier functions:
(*) lockless_dereference();
+
This can be thought of as a pointer-fetch wrapper around the
smp_read_barrier_depends() data-dependency barrier.
@@ -1858,7 +1899,7 @@ This is a variation on the mandatory write barrier that causes writes to weakly
ordered I/O regions to be partially ordered. Its effects may go beyond the
CPU->Hardware interface and actually affect the hardware at some level.
-See the subsection "Locks vs I/O accesses" for more information.
+See the subsection "Acquires vs I/O accesses" for more information.
===============================
@@ -1873,8 +1914,8 @@ provide more substantial guarantees, but these may not be relied upon outside
of arch specific code.
-ACQUIRING FUNCTIONS
--------------------
+LOCK ACQUISITION FUNCTIONS
+--------------------------
The Linux kernel has a number of locking constructs:
@@ -1895,7 +1936,7 @@ for each construct. These operations all imply certain barriers:
Memory operations issued before the ACQUIRE may be completed after
the ACQUIRE operation has completed. An smp_mb__before_spinlock(),
combined with a following ACQUIRE, orders prior stores against
- subsequent loads and stores. Note that this is weaker than smp_mb()!
+ subsequent loads and stores. Note that this is weaker than smp_mb()!
The smp_mb__before_spinlock() primitive is free on many architectures.
(2) RELEASE operation implication:
@@ -2090,9 +2131,9 @@ or:
event_indicated = 1;
wake_up_process(event_daemon);
-A write memory barrier is implied by wake_up() and co. if and only if they wake
-something up. The barrier occurs before the task state is cleared, and so sits
-between the STORE to indicate the event and the STORE to set TASK_RUNNING:
+A write memory barrier is implied by wake_up() and co. if and only if they
+wake something up. The barrier occurs before the task state is cleared, and so
+sits between the STORE to indicate the event and the STORE to set TASK_RUNNING:
CPU 1 CPU 2
=============================== ===============================
@@ -2206,7 +2247,7 @@ three CPUs; then should the following sequence of events occur:
Then there is no guarantee as to what order CPU 3 will see the accesses to *A
through *H occur in, other than the constraints imposed by the separate locks
-on the separate CPUs. It might, for example, see:
+on the separate CPUs. It might, for example, see:
*E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M
@@ -2486,9 +2527,9 @@ The following operations are special locking primitives:
clear_bit_unlock();
__clear_bit_unlock();
-These implement ACQUIRE-class and RELEASE-class operations. These should be used in
-preference to other operations when implementing locking primitives, because
-their implementations can be optimised on many architectures.
+These implement ACQUIRE-class and RELEASE-class operations. These should be
+used in preference to other operations when implementing locking primitives,
+because their implementations can be optimised on many architectures.
[!] Note that special memory barrier primitives are available for these
situations because on some CPUs the atomic instructions used imply full memory
@@ -2568,12 +2609,12 @@ explicit barriers are used.
Normally this won't be a problem because the I/O accesses done inside such
sections will include synchronous load operations on strictly ordered I/O
-registers that form implicit I/O barriers. If this isn't sufficient then an
+registers that form implicit I/O barriers. If this isn't sufficient then an
mmiowb() may need to be used explicitly.
A similar situation may occur between an interrupt routine and two routines
-running on separate CPUs that communicate with each other. If such a case is
+running on separate CPUs that communicate with each other. If such a case is
likely, then interrupt-disabling locks should be used to guarantee ordering.
@@ -2587,8 +2628,8 @@ functions:
(*) inX(), outX():
These are intended to talk to I/O space rather than memory space, but
- that's primarily a CPU-specific concept. The i386 and x86_64 processors do
- indeed have special I/O space access cycles and instructions, but many
+ that's primarily a CPU-specific concept. The i386 and x86_64 processors
+ do indeed have special I/O space access cycles and instructions, but many
CPUs don't have such a concept.
The PCI bus, amongst others, defines an I/O space concept which - on such
@@ -2610,7 +2651,7 @@ functions:
Whether these are guaranteed to be fully ordered and uncombined with
respect to each other on the issuing CPU depends on the characteristics
- defined for the memory window through which they're accessing. On later
+ defined for the memory window through which they're accessing. On later
i386 architecture machines, for example, this is controlled by way of the
MTRR registers.
@@ -2635,10 +2676,10 @@ functions:
(*) readX_relaxed(), writeX_relaxed()
These are similar to readX() and writeX(), but provide weaker memory
- ordering guarantees. Specifically, they do not guarantee ordering with
+ ordering guarantees. Specifically, they do not guarantee ordering with
respect to normal memory accesses (e.g. DMA buffers) nor do they guarantee
- ordering with respect to LOCK or UNLOCK operations. If the latter is
- required, an mmiowb() barrier can be used. Note that relaxed accesses to
+ ordering with respect to LOCK or UNLOCK operations. If the latter is
+ required, an mmiowb() barrier can be used. Note that relaxed accesses to
the same peripheral are guaranteed to be ordered with respect to each
other.
@@ -3040,8 +3081,9 @@ The Alpha defines the Linux kernel's memory barrier model.
See the subsection on "Cache Coherency" above.
+
VIRTUAL MACHINE GUESTS
--------------------
+----------------------
Guests running within virtual machines might be affected by SMP effects even if
the guest itself is compiled without SMP support. This is an artifact of
@@ -3050,7 +3092,7 @@ barriers for this use-case would be possible but is often suboptimal.
To handle this case optimally, low-level virt_mb() etc macros are available.
These have the same effect as smp_mb() etc when SMP is enabled, but generate
-identical code for SMP and non-SMP systems. For example, virtual machine guests
+identical code for SMP and non-SMP systems. For example, virtual machine guests
should use virt_mb() rather than smp_mb() when synchronizing against a
(possibly SMP) host.
@@ -3058,6 +3100,7 @@ These are equivalent to smp_mb() etc counterparts in all other respects,
in particular, they do not control MMIO effects: to control
MMIO effects, use mandatory barriers.
+
============
EXAMPLE USES
============
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index fcddfd5ded99..daabdd7ee543 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
- panic_on_warn
- perf_cpu_time_max_percent
- perf_event_paranoid
+- perf_event_max_stack
- pid_max
- powersave-nap [ PPC only ]
- printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 2.
==============================================================
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
pid_max:
PID allocation wrap value. When the kernel's next PID value
diff --git a/Makefile b/Makefile
index acf6155421cc..0f9cb36d45c2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 4
PATCHLEVEL = 6
SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
NAME = Charred Weasel
# *DOCUMENTATION*
diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
index a83bbea62c67..0131a7058778 100644
--- a/arch/alpha/include/asm/rwsem.h
+++ b/arch/alpha/include/asm/rwsem.h
@@ -63,7 +63,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
return res >= 0 ? 1 : 0;
}
-static inline void __down_write(struct rw_semaphore *sem)
+static inline long ___down_write(struct rw_semaphore *sem)
{
long oldcount;
#ifndef CONFIG_SMP
@@ -83,10 +83,24 @@ static inline void __down_write(struct rw_semaphore *sem)
:"=&r" (oldcount), "=m" (sem->count), "=&r" (temp)
:"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory");
#endif
- if (unlikely(oldcount))
+ return oldcount;
+}
+
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ if (unlikely(___down_write(sem)))
rwsem_down_write_failed(sem);
}
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ if (unlikely(___down_write(sem)))
+ if (IS_ERR(rwsem_down_write_failed_killable(sem)))
+ return -EINTR;
+
+ return 0;
+}
+
/*
* trylock for writing -- returns 1 if successful, 0 if contention
*/
diff --git a/arch/arm/boot/dts/at91sam9x5.dtsi b/arch/arm/boot/dts/at91sam9x5.dtsi
index 0827d594b1f0..cd0cd5fd09a3 100644
--- a/arch/arm/boot/dts/at91sam9x5.dtsi
+++ b/arch/arm/boot/dts/at91sam9x5.dtsi
@@ -106,7 +106,7 @@
pmc: pmc@fffffc00 {
compatible = "atmel,at91sam9x5-pmc", "syscon";
- reg = <0xfffffc00 0x100>;
+ reg = <0xfffffc00 0x200>;
interrupts = <1 IRQ_TYPE_LEVEL_HIGH 7>;
interrupt-controller;
#address-cells = <1>;
diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi
index 78996bdbd3df..9817090c1b73 100644
--- a/arch/arm/boot/dts/sama5d2.dtsi
+++ b/arch/arm/boot/dts/sama5d2.dtsi
@@ -280,7 +280,7 @@
status = "disabled";
nfc@c0000000 {
- compatible = "atmel,sama5d4-nfc";
+ compatible = "atmel,sama5d3-nfc";
#address-cells = <1>;
#size-cells = <1>;
reg = < /* NFC Command Registers */
diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h
index e0eea72deb87..a708fa1f0905 100644
--- a/arch/arm/include/asm/efi.h
+++ b/arch/arm/include/asm/efi.h
@@ -17,34 +17,28 @@
#include <asm/mach/map.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
+#include <asm/ptrace.h>
#ifdef CONFIG_EFI
void efi_init(void);
int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);
+int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md);
-#define efi_call_virt(f, ...) \
-({ \
- efi_##f##_t *__f; \
- efi_status_t __s; \
- \
- efi_virtmap_load(); \
- __f = efi.systab->runtime->f; \
- __s = __f(__VA_ARGS__); \
- efi_virtmap_unload(); \
- __s; \
-})
+#define arch_efi_call_virt_setup() efi_virtmap_load()
+#define arch_efi_call_virt_teardown() efi_virtmap_unload()
-#define __efi_call_virt(f, ...) \
+#define arch_efi_call_virt(f, args...) \
({ \
efi_##f##_t *__f; \
- \
- efi_virtmap_load(); \
__f = efi.systab->runtime->f; \
- __f(__VA_ARGS__); \
- efi_virtmap_unload(); \
+ __f(args); \
})
+#define ARCH_EFI_IRQ_FLAGS_MASK \
+ (PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \
+ PSR_T_BIT | MODE_MASK)
+
static inline void efi_set_pgd(struct mm_struct *mm)
{
check_and_switch_context(mm, NULL);
@@ -59,7 +53,16 @@ void efi_virtmap_unload(void);
/* arch specific definitions used by the stub code */
-#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__)
+#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__)
+#define __efi_call_early(f, ...) f(__VA_ARGS__)
+#define efi_is_64bit() (false)
+
+struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg);
+void free_screen_info(efi_system_table_t *sys_table, struct screen_info *si);
+
+static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
+{
+}
/*
* A reasonable upper bound for the uncompressed kernel size is 32 MBytes,
diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c
index ff8a9d8acfac..9f43ba012d10 100644
--- a/arch/arm/kernel/efi.c
+++ b/arch/arm/kernel/efi.c
@@ -11,6 +11,41 @@
#include <asm/mach/map.h>
#include <asm/mmu_context.h>
+static int __init set_permissions(pte_t *ptep, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ efi_memory_desc_t *md = data;
+ pte_t pte = *ptep;
+
+ if (md->attribute & EFI_MEMORY_RO)
+ pte = set_pte_bit(pte, __pgprot(L_PTE_RDONLY));
+ if (md->attribute & EFI_MEMORY_XP)
+ pte = set_pte_bit(pte, __pgprot(L_PTE_XN));
+ set_pte_ext(ptep, pte, PTE_EXT_NG);
+ return 0;
+}
+
+int __init efi_set_mapping_permissions(struct mm_struct *mm,
+ efi_memory_desc_t *md)
+{
+ unsigned long base, size;
+
+ base = md->virt_addr;
+ size = md->num_pages << EFI_PAGE_SHIFT;
+
+ /*
+ * We can only use apply_to_page_range() if we can guarantee that the
+ * entire region was mapped using pages. This should be the case if the
+ * region does not cover any naturally aligned SECTION_SIZE sized
+ * blocks.
+ */
+ if (round_down(base + size, SECTION_SIZE) <
+ round_up(base, SECTION_SIZE) + SECTION_SIZE)
+ return apply_to_page_range(mm, base, size, set_permissions, md);
+
+ return 0;
+}
+
int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
{
struct map_desc desc = {
@@ -34,5 +69,11 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
desc.type = MT_DEVICE;
create_mapping_late(mm, &desc, true);
+
+ /*
+ * If stricter permissions were specified, apply them now.
+ */
+ if (md->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP))
+ return efi_set_mapping_permissions(mm, md);
return 0;
}
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 6284779d64ee..b8df45883cf7 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
info->address &= ~alignment_mask;
info->ctrl.len <<= offset;
- if (!bp->overflow_handler) {
+ if (is_default_overflow_handler(bp)) {
/*
* Mismatch breakpoints are required for single-stepping
* breakpoints.
@@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
* mismatch breakpoint so we can single-step over the
* watchpoint trigger.
*/
- if (!wp->overflow_handler)
+ if (is_default_overflow_handler(wp))
enable_single_step(wp, instruction_pointer(regs));
unlock:
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
tail = (struct frame_tail __user *)regs->ARM_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = user_backtrace(tail, entry);
}
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 2c4bea39cf22..7d4e2850910c 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -883,7 +883,8 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
request_resource(&ioport_resource, &lp2);
}
-#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE)
+#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) || \
+ defined(CONFIG_EFI)
struct screen_info screen_info = {
.orig_video_lines = 30,
.orig_video_cols = 80,
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index efa77c146415..521b1ec59157 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -2,6 +2,7 @@ menu "Platform selection"
config ARCH_SUNXI
bool "Allwinner sunxi 64-bit SoC Family"
+ select GENERIC_IRQ_CHIP
help
This enables support for Allwinner sunxi based SoCs like the A64.
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 8e88a696c9cb..622db3c6474e 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -4,6 +4,7 @@
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/neon.h>
+#include <asm/ptrace.h>
#include <asm/tlbflush.h>
#ifdef CONFIG_EFI
@@ -14,32 +15,29 @@ extern void efi_init(void);
int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);
-#define efi_call_virt(f, ...) \
+#define efi_set_mapping_permissions efi_create_mapping
+
+#define arch_efi_call_virt_setup() \
({ \
- efi_##f##_t *__f; \
- efi_status_t __s; \
- \
kernel_neon_begin(); \
efi_virtmap_load(); \
- __f = efi.systab->runtime->f; \
- __s = __f(__VA_ARGS__); \
- efi_virtmap_unload(); \
- kernel_neon_end(); \
- __s; \
})
-#define __efi_call_virt(f, ...) \
+#define arch_efi_call_virt(f, args...) \
({ \
efi_##f##_t *__f; \
- \
- kernel_neon_begin(); \
- efi_virtmap_load(); \
__f = efi.systab->runtime->f; \
- __f(__VA_ARGS__); \
+ __f(args); \
+})
+
+#define arch_efi_call_virt_teardown() \
+({ \
efi_virtmap_unload(); \
kernel_neon_end(); \
})
+#define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT)
+
/* arch specific definitions used by the stub code */
/*
@@ -50,7 +48,16 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md);
#define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */
#define MAX_FDT_OFFSET SZ_512M
-#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__)
+#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__)
+#define __efi_call_early(f, ...) f(__VA_ARGS__)
+#define efi_is_64bit() (true)
+
+#define alloc_screen_info(x...) &screen_info
+#define free_screen_info(x...)
+
+static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
+{
+}
#define EFI_ALLOC_ALIGN SZ_64K
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index b6abc852f2a1..78f52488f9ff 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -17,22 +17,51 @@
#include <asm/efi.h>
-int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
+/*
+ * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be
+ * executable, everything else can be mapped with the XN bits
+ * set. Also take the new (optional) RO/XP bits into account.
+ */
+static __init pteval_t create_mapping_protection(efi_memory_desc_t *md)
{
- pteval_t prot_val;
+ u64 attr = md->attribute;
+ u32 type = md->type;
- /*
- * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be
- * executable, everything else can be mapped with the XN bits
- * set.
- */
- if ((md->attribute & EFI_MEMORY_WB) == 0)
- prot_val = PROT_DEVICE_nGnRE;
- else if (md->type == EFI_RUNTIME_SERVICES_CODE ||
- !PAGE_ALIGNED(md->phys_addr))
- prot_val = pgprot_val(PAGE_KERNEL_EXEC);
- else
- prot_val = pgprot_val(PAGE_KERNEL);
+ if (type == EFI_MEMORY_MAPPED_IO)
+ return PROT_DEVICE_nGnRE;
+
+ if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr),
+ "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?"))
+ /*
+ * If the region is not aligned to the page size of the OS, we
+ * can not use strict permissions, since that would also affect
+ * the mapping attributes of the adjacent regions.
+ */
+ return pgprot_val(PAGE_KERNEL_EXEC);
+
+ /* R-- */
+ if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) ==
+ (EFI_MEMORY_XP | EFI_MEMORY_RO))
+ return pgprot_val(PAGE_KERNEL_RO);
+
+ /* R-X */
+ if (attr & EFI_MEMORY_RO)
+ return pgprot_val(PAGE_KERNEL_ROX);
+
+ /* RW- */
+ if (attr & EFI_MEMORY_XP || type != EFI_RUNTIME_SERVICES_CODE)
+ return pgprot_val(PAGE_KERNEL);
+
+ /* RWX */
+ return pgprot_val(PAGE_KERNEL_EXEC);
+}
+
+/* we will fill this structure from the stub, so don't put it in .bss */
+struct screen_info screen_info __section(.data);
+
+int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
+{
+ pteval_t prot_val = create_mapping_protection(md);
create_pgd_mapping(mm, md->phys_addr, md->virt_addr,
md->num_pages << EFI_PAGE_SHIFT,
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index b45c95d34b83..4ef5373f9a76 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
perf_bp_event(bp, regs);
/* Do we need to handle the stepping? */
- if (!bp->overflow_handler)
+ if (is_default_overflow_handler(bp))
step = 1;
unlock:
rcu_read_unlock();
@@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
perf_bp_event(wp, regs);
/* Do we need to handle the stepping? */
- if (!wp->overflow_handler)
+ if (is_default_overflow_handler(wp))
step = 1;
unlock:
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index 5e360ce88f10..1428849aece8 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -112,6 +112,7 @@ __efistub___memset = KALLSYMS_HIDE(__pi_memset);
__efistub__text = KALLSYMS_HIDE(_text);
__efistub__end = KALLSYMS_HIDE(_end);
__efistub__edata = KALLSYMS_HIDE(_edata);
+__efistub_screen_info = KALLSYMS_HIDE(screen_info);
#endif
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct frame_tail __user *)regs->regs[29];
- while (entry->nr < PERF_MAX_STACK_DEPTH &&
+ while (entry->nr < sysctl_perf_event_max_stack &&
tail && !((unsigned long)tail & 0xf))
tail = user_backtrace(tail, entry);
} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+ while ((entry->nr < sysctl_perf_event_max_stack) &&
tail && !((unsigned long)tail & 0x3))
tail = compat_user_backtrace(tail, entry);
#endif
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a34420a5df9a..b405bbb54431 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -476,6 +476,7 @@ emit_cond_jmp:
case BPF_JGE:
jmp_cond = A64_COND_CS;
break;
+ case BPF_JSET:
case BPF_JNE:
jmp_cond = A64_COND_NE;
break;
diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h
index ce112472bdd6..8b23e070b844 100644
--- a/arch/ia64/include/asm/rwsem.h
+++ b/arch/ia64/include/asm/rwsem.h
@@ -49,8 +49,8 @@ __down_read (struct rw_semaphore *sem)
/*
* lock for writing
*/
-static inline void
-__down_write (struct rw_semaphore *sem)
+static inline long
+___down_write (struct rw_semaphore *sem)
{
long old, new;
@@ -59,10 +59,26 @@ __down_write (struct rw_semaphore *sem)
new = old + RWSEM_ACTIVE_WRITE_BIAS;
} while (cmpxchg_acq(&sem->count, old, new) != old);
- if (old != 0)
+ return old;
+}
+
+static inline void
+__down_write (struct rw_semaphore *sem)
+{
+ if (___down_write(sem))
rwsem_down_write_failed(sem);
}
+static inline int
+__down_write_killable (struct rw_semaphore *sem)
+{
+ if (___down_write(sem))
+ if (IS_ERR(rwsem_down_write_failed_killable(sem)))
+ return -EINTR;
+
+ return 0;
+}
+
/*
* unlock after reading
*/
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 300dac3702f1..bf0865cd438a 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -531,8 +531,6 @@ efi_init (void)
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff, vendor);
- set_bit(EFI_SYSTEM_TABLES, &efi.flags);
-
palo_phys = EFI_INVALID_TABLE_ADDR;
if (efi_config_init(arch_tables) != 0)
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
--frame;
- while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+ while ((entry->nr < sysctl_perf_event_max_stack) && frame)
frame = user_backtrace(frame, entry);
}
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
addr = *sp++;
if (__kernel_text_address(addr)) {
perf_callchain_store(entry, addr);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
}
}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
do {
perf_callchain_store(entry, pc);
- if (entry->nr >= PERF_MAX_STACK_DEPTH)
+ if (entry->nr >= sysctl_perf_event_max_stack)
break;
pc = unwind_stack(current, &sp, pc, &ra);
} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned long __user *) sp;
if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
sp = regs->gpr[1];
perf_callchain_store(entry, next_ip);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
fp = (unsigned int __user *) (unsigned long) sp;
if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
return;
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index fead491dfc28..c75e4471e618 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
/*
* lock for writing
*/
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+static inline long ___down_write(struct rw_semaphore *sem)
{
signed long old, new, tmp;
@@ -104,13 +104,23 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
: "=&d" (old), "=&d" (new), "=Q" (sem->count)
: "Q" (sem->count), "m" (tmp)
: "cc", "memory");
- if (old != 0)
- rwsem_down_write_failed(sem);
+
+ return old;
}
static inline void __down_write(struct rw_semaphore *sem)
{
- __down_write_nested(sem, 0);
+ if (___down_write(sem))
+ rwsem_down_write_failed(sem);
+}
+
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ if (___down_write(sem))
+ if (IS_ERR(rwsem_down_write_failed_killable(sem)))
+ return -EINTR;
+
+ return 0;
}
/*
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index a319745a7b63..751c3373a92c 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -26,6 +26,7 @@ generic-y += percpu.h
generic-y += poll.h
generic-y += preempt.h
generic-y += resource.h
+generic-y += rwsem.h
generic-y += sembuf.h
generic-y += serial.h
generic-y += shmbuf.h
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
deleted file mode 100644
index edab57265293..000000000000
--- a/arch/sh/include/asm/rwsem.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * include/asm-sh/rwsem.h: R/W semaphores for SH using the stuff
- * in lib/rwsem.c.
- */
-
-#ifndef _ASM_SH_RWSEM_H
-#define _ASM_SH_RWSEM_H
-
-#ifndef _LINUX_RWSEM_H
-#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
-#endif
-
-#ifdef __KERNEL__
-
-#define RWSEM_UNLOCKED_VALUE 0x00000000
-#define RWSEM_ACTIVE_BIAS 0x00000001
-#define RWSEM_ACTIVE_MASK 0x0000ffff
-#define RWSEM_WAITING_BIAS (-0x00010000)
-#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-
-/*
- * lock for reading
- */
-static inline void __down_read(struct rw_semaphore *sem)
-{
- if (atomic_inc_return((atomic_t *)(&sem->count)) > 0)
- smp_wmb();
- else
- rwsem_down_read_failed(sem);
-}
-
-static inline int __down_read_trylock(struct rw_semaphore *sem)
-{
- int tmp;
-
- while ((tmp = sem->count) >= 0) {
- if (tmp == cmpxchg(&sem->count, tmp,
- tmp + RWSEM_ACTIVE_READ_BIAS)) {
- smp_wmb();
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * lock for writing
- */
-static inline void __down_write(struct rw_semaphore *sem)
-{
- int tmp;
-
- tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS,
- (atomic_t *)(&sem->count));
- if (tmp == RWSEM_ACTIVE_WRITE_BIAS)
- smp_wmb();
- else
- rwsem_down_write_failed(sem);
-}
-
-static inline int __down_write_trylock(struct rw_semaphore *sem)
-{
- int tmp;
-
- tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
- RWSEM_ACTIVE_WRITE_BIAS);
- smp_wmb();
- return tmp == RWSEM_UNLOCKED_VALUE;
-}
-
-/*
- * unlock after reading
- */
-static inline void __up_read(struct rw_semaphore *sem)
-{
- int tmp;
-
- smp_wmb();
- tmp = atomic_dec_return((atomic_t *)(&sem->count));
- if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)
- rwsem_wake(sem);
-}
-
-/*
- * unlock after writing
- */
-static inline void __up_write(struct rw_semaphore *sem)
-{
- smp_wmb();
- if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
- (atomic_t *)(&sem->count)) < 0)
- rwsem_wake(sem);
-}
-
-/*
- * implement atomic add functionality
- */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
-{
- atomic_add(delta, (atomic_t *)(&sem->count));
-}
-
-/*
- * downgrade write lock to read lock
- */
-static inline void __downgrade_write(struct rw_semaphore *sem)
-{
- int tmp;
-
- smp_wmb();
- tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count));
- if (tmp < 0)
- rwsem_downgrade_wake(sem);
-}
-
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
-{
- __down_write(sem);
-}
-
-/*
- * implement exchange and add functionality
- */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
-{
- smp_mb();
- return atomic_add_return(delta, (atomic_t *)(&sem->count));
-}
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_SH_RWSEM_H */
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index e928618838bc..6024c26c0585 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += mm-arch-hooks.h
generic-y += module.h
generic-y += mutex.h
generic-y += preempt.h
+generic-y += rwsem.h
generic-y += serial.h
generic-y += trace_clock.h
generic-y += types.h
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
deleted file mode 100644
index 069bf4d663a1..000000000000
--- a/arch/sparc/include/asm/rwsem.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * rwsem.h: R/W semaphores implemented using CAS
- *
- * Written by David S. Miller (davem@redhat.com), 2001.
- * Derived from asm-i386/rwsem.h
- */
-#ifndef _SPARC64_RWSEM_H
-#define _SPARC64_RWSEM_H
-
-#ifndef _LINUX_RWSEM_H
-#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
-#endif
-
-#ifdef __KERNEL__
-
-#define RWSEM_UNLOCKED_VALUE 0x00000000L
-#define RWSEM_ACTIVE_BIAS 0x00000001L
-#define RWSEM_ACTIVE_MASK 0xffffffffL
-#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
-#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-
-/*
- * lock for reading
- */
-static inline void __down_read(struct rw_semaphore *sem)
-{
- if (unlikely(atomic64_inc_return((atomic64_t *)(&sem->count)) <= 0L))
- rwsem_down_read_failed(sem);
-}
-
-static inline int __down_read_trylock(struct rw_semaphore *sem)
-{
- long tmp;
-
- while ((tmp = sem->count) >= 0L) {
- if (tmp == cmpxchg(&sem->count, tmp,
- tmp + RWSEM_ACTIVE_READ_BIAS)) {
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * lock for writing
- */
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
-{
- long tmp;
-
- tmp = atomic64_add_return(RWSEM_ACTIVE_WRITE_BIAS,
- (atomic64_t *)(&sem->count));
- if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
- rwsem_down_write_failed(sem);
-}
-
-static inline void __down_write(struct rw_semaphore *sem)
-{
- __down_write_nested(sem, 0);
-}
-
-static inline int __down_write_trylock(struct rw_semaphore *sem)
-{
- long tmp;
-
- tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
- RWSEM_ACTIVE_WRITE_BIAS);
- return tmp == RWSEM_UNLOCKED_VALUE;
-}
-
-/*
- * unlock after reading
- */
-static inline void __up_read(struct rw_semaphore *sem)
-{
- long tmp;
-
- tmp = atomic64_dec_return((atomic64_t *)(&sem->count));
- if (unlikely(tmp < -1L && (tmp & RWSEM_ACTIVE_MASK) == 0L))
- rwsem_wake(sem);
-}
-
-/*
- * unlock after writing
- */
-static inline void __up_write(struct rw_semaphore *sem)
-{
- if (unlikely(atomic64_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
- (atomic64_t *)(&sem->count)) < 0L))
- rwsem_wake(sem);
-}
-
-/*
- * implement atomic add functionality
- */
-static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
-{
- atomic64_add(delta, (atomic64_t *)(&sem->count));
-}
-
-/*
- * downgrade write lock to read lock
- */
-static inline void __downgrade_write(struct rw_semaphore *sem)
-{
- long tmp;
-
- tmp = atomic64_add_return(-RWSEM_WAITING_BIAS, (atomic64_t *)(&sem->count));
- if (tmp < 0L)
- rwsem_downgrade_wake(sem);
-}
-
-/*
- * implement exchange and add functionality
- */
-static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
-{
- return atomic64_add_return(delta, (atomic64_t *)(&sem->count));
-}
-
-#endif /* __KERNEL__ */
-
-#endif /* _SPARC64_RWSEM_H */
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
}
}
#endif
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
pc = sf.callers_pc;
ufp = (unsigned long)sf.fp + STACK_BIAS;
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
ufp = (unsigned long)sf.fp;
}
perf_callchain_store(entry, pc);
- } while (entry->nr < PERF_MAX_STACK_DEPTH);
+ } while (entry->nr < sysctl_perf_event_max_stack);
}
void
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2dc18605831f..a494fa34713a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -164,10 +164,6 @@ config INSTRUCTION_DECODER
def_bool y
depends on KPROBES || PERF_EVENTS || UPROBES
-config PERF_EVENTS_INTEL_UNCORE
- def_bool y
- depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
-
config OUTPUT_FORMAT
string
default "elf32-i386" if X86_32
@@ -1046,6 +1042,8 @@ config X86_THERMAL_VECTOR
def_bool y
depends on X86_MCE_INTEL
+source "arch/x86/events/Kconfig"
+
config X86_LEGACY_VM86
bool "Legacy VM86 support"
default n
@@ -1210,15 +1208,6 @@ config MICROCODE_OLD_INTERFACE
def_bool y
depends on MICROCODE
-config PERF_EVENTS_AMD_POWER
- depends on PERF_EVENTS && CPU_SUP_AMD
- tristate "AMD Processor Power Reporting Mechanism"
- ---help---
- Provide power reporting mechanism support for AMD processors.
- Currently, it leverages X86_FEATURE_ACC_POWER
- (CPUID Fn8000_0007_EDX[12]) interface to calculate the
- average power consumption on Family 15h processors.
-
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
---help---
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 583d539a4197..52fef606bc54 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -571,312 +571,6 @@ free_handle:
efi_call_early(free_pool, pci_handle);
}
-static void
-setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
- struct efi_pixel_bitmask pixel_info, int pixel_format)
-{
- if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
- si->lfb_depth = 32;
- si->lfb_linelength = pixels_per_scan_line * 4;
- si->red_size = 8;
- si->red_pos = 0;
- si->green_size = 8;
- si->green_pos = 8;
- si->blue_size = 8;
- si->blue_pos = 16;
- si->rsvd_size = 8;
- si->rsvd_pos = 24;
- } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) {
- si->lfb_depth = 32;
- si->lfb_linelength = pixels_per_scan_line * 4;
- si->red_size = 8;
- si->red_pos = 16;
- si->green_size = 8;
- si->green_pos = 8;
- si->blue_size = 8;
- si->blue_pos = 0;
- si->rsvd_size = 8;
- si->rsvd_pos = 24;
- } else if (pixel_format == PIXEL_BIT_MASK) {
- find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
- find_bits(pixel_info.green_mask, &si->green_pos,
- &si->green_size);
- find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
- find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
- &si->rsvd_size);
- si->lfb_depth = si->red_size + si->green_size +
- si->blue_size + si->rsvd_size;
- si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
- } else {
- si->lfb_depth = 4;
- si->lfb_linelength = si->lfb_width / 2;
- si->red_size = 0;
- si->red_pos = 0;
- si->green_size = 0;
- si->green_pos = 0;
- si->blue_size = 0;
- si->blue_pos = 0;
- si->rsvd_size = 0;
- si->rsvd_pos = 0;
- }
-}
-
-static efi_status_t
-__gop_query32(struct efi_graphics_output_protocol_32 *gop32,
- struct efi_graphics_output_mode_info **info,
- unsigned long *size, u64 *fb_base)
-{
- struct efi_graphics_output_protocol_mode_32 *mode;
- efi_status_t status;
- unsigned long m;
-
- m = gop32->mode;
- mode = (struct efi_graphics_output_protocol_mode_32 *)m;
-
- status = efi_early->call(gop32->query_mode, gop32,
- mode->mode, size, info);
- if (status != EFI_SUCCESS)
- return status;
-
- *fb_base = mode->frame_buffer_base;
- return status;
-}
-
-static efi_status_t
-setup_gop32(struct screen_info *si, efi_guid_t *proto,
- unsigned long size, void **gop_handle)
-{
- struct efi_graphics_output_protocol_32 *gop32, *first_gop;
- unsigned long nr_gops;
- u16 width, height;
- u32 pixels_per_scan_line;
- u32 ext_lfb_base;
- u64 fb_base;
- struct efi_pixel_bitmask pixel_info;
- int pixel_format;
- efi_status_t status;
- u32 *handles = (u32 *)(unsigned long)gop_handle;
- int i;
-
- first_gop = NULL;
- gop32 = NULL;
-
- nr_gops = size / sizeof(u32);
- for (i = 0; i < nr_gops; i++) {
- struct efi_graphics_output_mode_info *info = NULL;
- efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
- bool conout_found = false;
- void *dummy = NULL;
- u32 h = handles[i];
- u64 current_fb_base;
-
- status = efi_call_early(handle_protocol, h,
- proto, (void **)&gop32);
- if (status != EFI_SUCCESS)
- continue;
-
- status = efi_call_early(handle_protocol, h,
- &conout_proto, &dummy);
- if (status == EFI_SUCCESS)
- conout_found = true;
-
- status = __gop_query32(gop32, &info, &size, &current_fb_base);
- if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
- /*
- * Systems that use the UEFI Console Splitter may
- * provide multiple GOP devices, not all of which are
- * backed by real hardware. The workaround is to search
- * for a GOP implementing the ConOut protocol, and if
- * one isn't found, to just fall back to the first GOP.
- */
- width = info->horizontal_resolution;
- height = info->vertical_resolution;
- pixel_format = info->pixel_format;
- pixel_info = info->pixel_information;
- pixels_per_scan_line = info->pixels_per_scan_line;
- fb_base = current_fb_base;
-
- /*
- * Once we've found a GOP supporting ConOut,
- * don't bother looking any further.
- */
- first_gop = gop32;
- if (conout_found)
- break;
- }
- }
-
- /* Did we find any GOPs? */
- if (!first_gop)
- goto out;
-
- /* EFI framebuffer */
- si->orig_video_isVGA = VIDEO_TYPE_EFI;
-
- si->lfb_width = width;
- si->lfb_height = height;
- si->lfb_base = fb_base;
-
- ext_lfb_base = (u64)(unsigned long)fb_base >> 32;
- if (ext_lfb_base) {
- si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
- si->ext_lfb_base = ext_lfb_base;
- }
-
- si->pages = 1;
-
- setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
-
- si->lfb_size = si->lfb_linelength * si->lfb_height;
-
- si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
-out:
- return status;
-}
-
-static efi_status_t
-__gop_query64(struct efi_graphics_output_protocol_64 *gop64,
- struct efi_graphics_output_mode_info **info,
- unsigned long *size, u64 *fb_base)
-{
- struct efi_graphics_output_protocol_mode_64 *mode;
- efi_status_t status;
- unsigned long m;
-
- m = gop64->mode;
- mode = (struct efi_graphics_output_protocol_mode_64 *)m;
-
- status = efi_early->call(gop64->query_mode, gop64,
- mode->mode, size, info);
- if (status != EFI_SUCCESS)
- return status;
-
- *fb_base = mode->frame_buffer_base;
- return status;
-}
-
-static efi_status_t
-setup_gop64(struct screen_info *si, efi_guid_t *proto,
- unsigned long size, void **gop_handle)
-{
- struct efi_graphics_output_protocol_64 *gop64, *first_gop;
- unsigned long nr_gops;
- u16 width, height;
- u32 pixels_per_scan_line;
- u32 ext_lfb_base;
- u64 fb_base;
- struct efi_pixel_bitmask pixel_info;
- int pixel_format;
- efi_status_t status;
- u64 *handles = (u64 *)(unsigned long)gop_handle;
- int i;
-
- first_gop = NULL;
- gop64 = NULL;
-
- nr_gops = size / sizeof(u64);
- for (i = 0; i < nr_gops; i++) {
- struct efi_graphics_output_mode_info *info = NULL;
- efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
- bool conout_found = false;
- void *dummy = NULL;
- u64 h = handles[i];
- u64 current_fb_base;
-
- status = efi_call_early(handle_protocol, h,
- proto, (void **)&gop64);
- if (status != EFI_SUCCESS)
- continue;
-
- status = efi_call_early(handle_protocol, h,
- &conout_proto, &dummy);
- if (status == EFI_SUCCESS)
- conout_found = true;
-
- status = __gop_query64(gop64, &info, &size, &current_fb_base);
- if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
- /*
- * Systems that use the UEFI Console Splitter may
- * provide multiple GOP devices, not all of which are
- * backed by real hardware. The workaround is to search
- * for a GOP implementing the ConOut protocol, and if
- * one isn't found, to just fall back to the first GOP.
- */
- width = info->horizontal_resolution;
- height = info->vertical_resolution;
- pixel_format = info->pixel_format;
- pixel_info = info->pixel_information;
- pixels_per_scan_line = info->pixels_per_scan_line;
- fb_base = current_fb_base;
-
- /*
- * Once we've found a GOP supporting ConOut,
- * don't bother looking any further.
- */
- first_gop = gop64;
- if (conout_found)
- break;
- }
- }
-
- /* Did we find any GOPs? */
- if (!first_gop)
- goto out;
-
- /* EFI framebuffer */
- si->orig_video_isVGA = VIDEO_TYPE_EFI;
-
- si->lfb_width = width;
- si->lfb_height = height;
- si->lfb_base = fb_base;
-
- ext_lfb_base = (u64)(unsigned long)fb_base >> 32;
- if (ext_lfb_base) {
- si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
- si->ext_lfb_base = ext_lfb_base;
- }
-
- si->pages = 1;
-
- setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
-
- si->lfb_size = si->lfb_linelength * si->lfb_height;
-
- si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
-out:
- return status;
-}
-
-/*
- * See if we have Graphics Output Protocol
- */
-static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
- unsigned long size)
-{
- efi_status_t status;
- void **gop_handle = NULL;
-
- status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
- size, (void **)&gop_handle);
- if (status != EFI_SUCCESS)
- return status;
-
- status = efi_call_early(locate_handle,
- EFI_LOCATE_BY_PROTOCOL,
- proto, NULL, &size, gop_handle);
- if (status != EFI_SUCCESS)
- goto free_handle;
-
- if (efi_early->is64)
- status = setup_gop64(si, proto, size, gop_handle);
- else
- status = setup_gop32(si, proto, size, gop_handle);
-
-free_handle:
- efi_call_early(free_pool, gop_handle);
- return status;
-}
-
static efi_status_t
setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height)
{
@@ -1038,7 +732,7 @@ void setup_graphics(struct boot_params *boot_params)
EFI_LOCATE_BY_PROTOCOL,
&graphics_proto, NULL, &size, gop_handle);
if (status == EFI_BUFFER_TOO_SMALL)
- status = setup_gop(si, &graphics_proto, size);
+ status = efi_setup_gop(NULL, si, &graphics_proto, size);
if (status != EFI_SUCCESS) {
size = 0;
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index d487e727f1ec..c0223f1a89d7 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -11,80 +11,6 @@
#define DESC_TYPE_CODE_DATA (1 << 0)
-#define EFI_CONSOLE_OUT_DEVICE_GUID \
- EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \
- 0x3f, 0xc1, 0x4d)
-
-#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0
-#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1
-#define PIXEL_BIT_MASK 2
-#define PIXEL_BLT_ONLY 3
-#define PIXEL_FORMAT_MAX 4
-
-struct efi_pixel_bitmask {
- u32 red_mask;
- u32 green_mask;
- u32 blue_mask;
- u32 reserved_mask;
-};
-
-struct efi_graphics_output_mode_info {
- u32 version;
- u32 horizontal_resolution;
- u32 vertical_resolution;
- int pixel_format;
- struct efi_pixel_bitmask pixel_information;
- u32 pixels_per_scan_line;
-} __packed;
-
-struct efi_graphics_output_protocol_mode_32 {
- u32 max_mode;
- u32 mode;
- u32 info;
- u32 size_of_info;
- u64 frame_buffer_base;
- u32 frame_buffer_size;
-} __packed;
-
-struct efi_graphics_output_protocol_mode_64 {
- u32 max_mode;
- u32 mode;
- u64 info;
- u64 size_of_info;
- u64 frame_buffer_base;
- u64 frame_buffer_size;
-} __packed;
-
-struct efi_graphics_output_protocol_mode {
- u32 max_mode;
- u32 mode;
- unsigned long info;
- unsigned long size_of_info;
- u64 frame_buffer_base;
- unsigned long frame_buffer_size;
-} __packed;
-
-struct efi_graphics_output_protocol_32 {
- u32 query_mode;
- u32 set_mode;
- u32 blt;
- u32 mode;
-};
-
-struct efi_graphics_output_protocol_64 {
- u64 query_mode;
- u64 set_mode;
- u64 blt;
- u64 mode;
-};
-
-struct efi_graphics_output_protocol {
- void *query_mode;
- unsigned long set_mode;
- unsigned long blt;
- struct efi_graphics_output_protocol_mode *mode;
-};
-
struct efi_uga_draw_protocol_32 {
u32 get_mode;
u32 set_mode;
diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config
index f9affcc3b9f1..9906505c998a 100644
--- a/arch/x86/configs/kvm_guest.config
+++ b/arch/x86/configs/kvm_guest.config
@@ -26,3 +26,6 @@ CONFIG_VIRTIO_NET=y
CONFIG_9P_FS=y
CONFIG_NET_9P=y
CONFIG_NET_9P_VIRTIO=y
+CONFIG_SCSI_LOWLEVEL=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_VIRTIO_INPUT=y
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index b30dd8154cc2..4cddd17153fb 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -384,5 +384,5 @@
375 i386 membarrier sys_membarrier
376 i386 mlock2 sys_mlock2
377 i386 copy_file_range sys_copy_file_range
-378 i386 preadv2 sys_preadv2
-379 i386 pwritev2 sys_pwritev2
+378 i386 preadv2 sys_preadv2 compat_sys_preadv2
+379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2
diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig
new file mode 100644
index 000000000000..98397db5ceae
--- /dev/null
+++ b/arch/x86/events/Kconfig
@@ -0,0 +1,36 @@
+menu "Performance monitoring"
+
+config PERF_EVENTS_INTEL_UNCORE
+ tristate "Intel uncore performance events"
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+ default y
+ ---help---
+ Include support for Intel uncore performance events. These are
+ available on NehalemEX and more modern processors.
+
+config PERF_EVENTS_INTEL_RAPL
+ tristate "Intel rapl performance events"
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+ default y
+ ---help---
+ Include support for Intel rapl performance events for power
+ monitoring on modern processors.
+
+config PERF_EVENTS_INTEL_CSTATE
+ tristate "Intel cstate performance events"
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+ default y
+ ---help---
+ Include support for Intel cstate performance events for power
+ monitoring on modern processors.
+
+config PERF_EVENTS_AMD_POWER
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ tristate "AMD Processor Power Reporting Mechanism"
+ ---help---
+ Provide power reporting mechanism support for AMD processors.
+ Currently, it leverages X86_FEATURE_ACC_POWER
+ (CPUID Fn8000_0007_EDX[12]) interface to calculate the
+ average power consumption on Family 15h processors.
+
+endmenu
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index f59618a39990..1d392c39fe56 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -6,9 +6,6 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o
ifdef CONFIG_AMD_IOMMU
obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o
endif
-obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o
-obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o
-obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o
+
+obj-$(CONFIG_CPU_SUP_INTEL) += msr.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 3db9569e658c..98ac57381bf9 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -263,6 +263,7 @@ static const struct attribute_group *amd_uncore_attr_groups[] = {
};
static struct pmu amd_nb_pmu = {
+ .task_ctx_nr = perf_invalid_context,
.attr_groups = amd_uncore_attr_groups,
.name = "amd_nb",
.event_init = amd_uncore_event_init,
@@ -274,6 +275,7 @@ static struct pmu amd_nb_pmu = {
};
static struct pmu amd_l2_pmu = {
+ .task_ctx_nr = perf_invalid_context,
.attr_groups = amd_uncore_attr_groups,
.name = "amd_l2",
.event_init = amd_uncore_event_init,
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index dd39fde66b54..b7080bef9137 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -360,6 +360,9 @@ int x86_add_exclusive(unsigned int what)
{
int i;
+ if (x86_pmu.lbr_pt_coexist)
+ return 0;
+
if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
mutex_lock(&pmc_reserve_mutex);
for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
@@ -380,6 +383,9 @@ fail_unlock:
void x86_del_exclusive(unsigned int what)
{
+ if (x86_pmu.lbr_pt_coexist)
+ return;
+
atomic_dec(&x86_pmu.lbr_exclusive[what]);
atomic_dec(&active_events);
}
@@ -2277,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
@@ -2337,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
pagefault_disable();
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
+ while (entry->nr < sysctl_perf_event_max_stack) {
unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
new file mode 100644
index 000000000000..3660b2cf245a
--- /dev/null
+++ b/arch/x86/events/intel/Makefile
@@ -0,0 +1,9 @@
+obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o
+obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl.o
+intel-rapl-objs := rapl.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o
+intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o
+intel-cstate-objs := cstate.o
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index b99dc9258c0f..0a6e393a2e62 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -171,18 +171,6 @@ static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
memset(page_address(phys->page) + index, 0, phys->size - index);
}
-static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
-{
- if (buf->snapshot)
- return false;
-
- if (local_read(&buf->data_size) >= bts->handle.size ||
- bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
- return true;
-
- return false;
-}
-
static void bts_update(struct bts_ctx *bts)
{
int cpu = raw_smp_processor_id();
@@ -213,18 +201,15 @@ static void bts_update(struct bts_ctx *bts)
}
}
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);
+
static void __bts_event_start(struct perf_event *event)
{
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
struct bts_buffer *buf = perf_get_aux(&bts->handle);
u64 config = 0;
- if (!buf || bts_buffer_is_full(buf, bts))
- return;
-
- event->hw.itrace_started = 1;
- event->hw.state = 0;
-
if (!buf->snapshot)
config |= ARCH_PERFMON_EVENTSEL_INT;
if (!event->attr.exclude_kernel)
@@ -241,16 +226,41 @@ static void __bts_event_start(struct perf_event *event)
wmb();
intel_pmu_enable_bts(config);
+
}
static void bts_event_start(struct perf_event *event, int flags)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf;
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ goto fail_stop;
+
+ if (bts_buffer_reset(buf, &bts->handle))
+ goto fail_end_stop;
+
+ bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+ bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+ bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+ event->hw.itrace_started = 1;
+ event->hw.state = 0;
__bts_event_start(event);
/* PMI handler: this counter is running and likely generating PMIs */
ACCESS_ONCE(bts->started) = 1;
+
+ return;
+
+fail_end_stop:
+ perf_aux_output_end(&bts->handle, 0, false);
+
+fail_stop:
+ event->hw.state = PERF_HES_STOPPED;
}
static void __bts_event_stop(struct perf_event *event)
@@ -269,15 +279,32 @@ static void __bts_event_stop(struct perf_event *event)
static void bts_event_stop(struct perf_event *event, int flags)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
/* PMI handler: don't restart this counter */
ACCESS_ONCE(bts->started) = 0;
__bts_event_stop(event);
- if (flags & PERF_EF_UPDATE)
+ if (flags & PERF_EF_UPDATE) {
bts_update(bts);
+
+ if (buf) {
+ if (buf->snapshot)
+ bts->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+ }
+
+ cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+ cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+ }
}
void intel_bts_enable_local(void)
@@ -417,34 +444,14 @@ int intel_bts_interrupt(void)
static void bts_event_del(struct perf_event *event, int mode)
{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
- struct bts_buffer *buf = perf_get_aux(&bts->handle);
-
bts_event_stop(event, PERF_EF_UPDATE);
-
- if (buf) {
- if (buf->snapshot)
- bts->handle.head =
- local_xchg(&buf->data_size,
- buf->nr_pages << PAGE_SHIFT);
- perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
- !!local_xchg(&buf->lost, 0));
- }
-
- cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
- cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
- cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
- cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
}
static int bts_event_add(struct perf_event *event, int mode)
{
- struct bts_buffer *buf;
struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- int ret = -EBUSY;
event->hw.state = PERF_HES_STOPPED;
@@ -454,26 +461,10 @@ static int bts_event_add(struct perf_event *event, int mode)
if (bts->handle.event)
return -EBUSY;
- buf = perf_aux_output_begin(&bts->handle, event);
- if (!buf)
- return -EINVAL;
-
- ret = bts_buffer_reset(buf, &bts->handle);
- if (ret) {
- perf_aux_output_end(&bts->handle, 0, false);
- return ret;
- }
-
- bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
- bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
- bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
-
if (mode & PERF_EF_START) {
bts_event_start(event, 0);
- if (hwc->state & PERF_HES_STOPPED) {
- bts_event_del(event, 0);
- return -EBUSY;
- }
+ if (hwc->state & PERF_HES_STOPPED)
+ return -EINVAL;
}
return 0;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a6fd4dbcf820..7c666958a625 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1465,6 +1465,140 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
+static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+#define GLM_DEMAND_DATA_RD BIT_ULL(0)
+#define GLM_DEMAND_RFO BIT_ULL(1)
+#define GLM_ANY_RESPONSE BIT_ULL(16)
+#define GLM_SNP_NONE_OR_MISS BIT_ULL(33)
+#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD
+#define GLM_DEMAND_WRITE GLM_DEMAND_RFO
+#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+#define GLM_LLC_ACCESS GLM_ANY_RESPONSE
+#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM)
+#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 glm_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
+ [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ },
+ [C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+ [C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+};
+
+static __initconst const u64 glm_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_READ|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_READ|
+ GLM_LLC_MISS,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_PREFETCH|
+ GLM_LLC_MISS,
+ },
+ },
+};
+
#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
#define KNL_MCDRAM_LOCAL BIT_ULL(21)
@@ -3447,7 +3581,7 @@ __init int intel_pmu_init(void)
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
sizeof(hw_cache_extra_regs));
- intel_pmu_lbr_init_atom();
+ intel_pmu_lbr_init_slm();
x86_pmu.event_constraints = intel_slm_event_constraints;
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
@@ -3456,6 +3590,30 @@ __init int intel_pmu_init(void)
pr_cont("Silvermont events, ");
break;
+ case 92: /* 14nm Atom "Goldmont" */
+ case 95: /* 14nm Atom "Goldmont Denverton" */
+ memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_glm_extra_regs;
+ /*
+ * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
+ * for precise cycles.
+ * :pp is identical to :ppp
+ */
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ pr_cont("Goldmont events, ");
+ break;
+
case 37: /* 32nm Westmere */
case 44: /* 32nm Westmere-EP */
case 47: /* 32nm Westmere-EX */
@@ -3708,7 +3866,7 @@ __init int intel_pmu_init(void)
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
}
c->idxmsk64 &=
- ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+ ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
c->weight = hweight64(c->idxmsk64);
}
}
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 7946c4231169..9ba4e4136a15 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -91,6 +91,8 @@
#include <asm/cpu_device_id.h>
#include "../perf_event.h"
+MODULE_LICENSE("GPL");
+
#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
static ssize_t __cstate_##_var##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
@@ -106,22 +108,27 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf);
+/* Model -> events mapping */
+struct cstate_model {
+ unsigned long core_events;
+ unsigned long pkg_events;
+ unsigned long quirks;
+};
+
+/* Quirk flags */
+#define SLM_PKG_C6_USE_C7_MSR (1UL << 0)
+
struct perf_cstate_msr {
u64 msr;
struct perf_pmu_events_attr *attr;
- bool (*test)(int idx);
};
/* cstate_core PMU */
-
static struct pmu cstate_core_pmu;
static bool has_cstate_core;
-enum perf_cstate_core_id {
- /*
- * cstate_core events
- */
+enum perf_cstate_core_events {
PERF_CSTATE_CORE_C1_RES = 0,
PERF_CSTATE_CORE_C3_RES,
PERF_CSTATE_CORE_C6_RES,
@@ -130,69 +137,16 @@ enum perf_cstate_core_id {
PERF_CSTATE_CORE_EVENT_MAX,
};
-bool test_core(int idx)
-{
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- boot_cpu_data.x86 != 6)
- return false;
-
- switch (boot_cpu_data.x86_model) {
- case 30: /* 45nm Nehalem */
- case 26: /* 45nm Nehalem-EP */
- case 46: /* 45nm Nehalem-EX */
-
- case 37: /* 32nm Westmere */
- case 44: /* 32nm Westmere-EP */
- case 47: /* 32nm Westmere-EX */
- if (idx == PERF_CSTATE_CORE_C3_RES ||
- idx == PERF_CSTATE_CORE_C6_RES)
- return true;
- break;
- case 42: /* 32nm SandyBridge */
- case 45: /* 32nm SandyBridge-E/EN/EP */
-
- case 58: /* 22nm IvyBridge */
- case 62: /* 22nm IvyBridge-EP/EX */
-
- case 60: /* 22nm Haswell Core */
- case 63: /* 22nm Haswell Server */
- case 69: /* 22nm Haswell ULT */
- case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
- case 61: /* 14nm Broadwell Core-M */
- case 86: /* 14nm Broadwell Xeon D */
- case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
- case 79: /* 14nm Broadwell Server */
-
- case 78: /* 14nm Skylake Mobile */
- case 94: /* 14nm Skylake Desktop */
- if (idx == PERF_CSTATE_CORE_C3_RES ||
- idx == PERF_CSTATE_CORE_C6_RES ||
- idx == PERF_CSTATE_CORE_C7_RES)
- return true;
- break;
- case 55: /* 22nm Atom "Silvermont" */
- case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
- case 76: /* 14nm Atom "Airmont" */
- if (idx == PERF_CSTATE_CORE_C1_RES ||
- idx == PERF_CSTATE_CORE_C6_RES)
- return true;
- break;
- }
-
- return false;
-}
-
PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
static struct perf_cstate_msr core_msr[] = {
- [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, },
- [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, },
- [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, },
- [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, },
+ [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1 },
+ [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3 },
+ [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6 },
+ [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7 },
};
static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
@@ -234,18 +188,11 @@ static const struct attribute_group *core_attr_groups[] = {
NULL,
};
-/* cstate_core PMU end */
-
-
/* cstate_pkg PMU */
-
static struct pmu cstate_pkg_pmu;
static bool has_cstate_pkg;
-enum perf_cstate_pkg_id {
- /*
- * cstate_pkg events
- */
+enum perf_cstate_pkg_events {
PERF_CSTATE_PKG_C2_RES = 0,
PERF_CSTATE_PKG_C3_RES,
PERF_CSTATE_PKG_C6_RES,
@@ -257,69 +204,6 @@ enum perf_cstate_pkg_id {
PERF_CSTATE_PKG_EVENT_MAX,
};
-bool test_pkg(int idx)
-{
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
- boot_cpu_data.x86 != 6)
- return false;
-
- switch (boot_cpu_data.x86_model) {
- case 30: /* 45nm Nehalem */
- case 26: /* 45nm Nehalem-EP */
- case 46: /* 45nm Nehalem-EX */
-
- case 37: /* 32nm Westmere */
- case 44: /* 32nm Westmere-EP */
- case 47: /* 32nm Westmere-EX */
- if (idx == PERF_CSTATE_CORE_C3_RES ||
- idx == PERF_CSTATE_CORE_C6_RES ||
- idx == PERF_CSTATE_CORE_C7_RES)
- return true;
- break;
- case 42: /* 32nm SandyBridge */
- case 45: /* 32nm SandyBridge-E/EN/EP */
-
- case 58: /* 22nm IvyBridge */
- case 62: /* 22nm IvyBridge-EP/EX */
-
- case 60: /* 22nm Haswell Core */
- case 63: /* 22nm Haswell Server */
- case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
-
- case 61: /* 14nm Broadwell Core-M */
- case 86: /* 14nm Broadwell Xeon D */
- case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
- case 79: /* 14nm Broadwell Server */
-
- case 78: /* 14nm Skylake Mobile */
- case 94: /* 14nm Skylake Desktop */
- if (idx == PERF_CSTATE_PKG_C2_RES ||
- idx == PERF_CSTATE_PKG_C3_RES ||
- idx == PERF_CSTATE_PKG_C6_RES ||
- idx == PERF_CSTATE_PKG_C7_RES)
- return true;
- break;
- case 55: /* 22nm Atom "Silvermont" */
- case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
- case 76: /* 14nm Atom "Airmont" */
- if (idx == PERF_CSTATE_CORE_C6_RES)
- return true;
- break;
- case 69: /* 22nm Haswell ULT */
- if (idx == PERF_CSTATE_PKG_C2_RES ||
- idx == PERF_CSTATE_PKG_C3_RES ||
- idx == PERF_CSTATE_PKG_C6_RES ||
- idx == PERF_CSTATE_PKG_C7_RES ||
- idx == PERF_CSTATE_PKG_C8_RES ||
- idx == PERF_CSTATE_PKG_C9_RES ||
- idx == PERF_CSTATE_PKG_C10_RES)
- return true;
- break;
- }
-
- return false;
-}
-
PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
@@ -329,13 +213,13 @@ PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
static struct perf_cstate_msr pkg_msr[] = {
- [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, },
- [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, },
- [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, },
- [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, },
- [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, },
- [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, },
- [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, },
+ [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2 },
+ [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3 },
+ [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6 },
+ [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7 },
+ [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8 },
+ [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9 },
+ [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10 },
};
static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
@@ -366,8 +250,6 @@ static const struct attribute_group *pkg_attr_groups[] = {
NULL,
};
-/* cstate_pkg PMU end*/
-
static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -385,7 +267,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev,
static int cstate_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config;
- int ret = 0;
+ int cpu;
if (event->attr.type != event->pmu->type)
return -ENOENT;
@@ -400,26 +282,36 @@ static int cstate_pmu_event_init(struct perf_event *event)
event->attr.sample_period) /* no sampling */
return -EINVAL;
+ if (event->cpu < 0)
+ return -EINVAL;
+
if (event->pmu == &cstate_core_pmu) {
if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
return -EINVAL;
if (!core_msr[cfg].attr)
return -EINVAL;
event->hw.event_base = core_msr[cfg].msr;
+ cpu = cpumask_any_and(&cstate_core_cpu_mask,
+ topology_sibling_cpumask(event->cpu));
} else if (event->pmu == &cstate_pkg_pmu) {
if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
return -EINVAL;
if (!pkg_msr[cfg].attr)
return -EINVAL;
event->hw.event_base = pkg_msr[cfg].msr;
- } else
+ cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
+ topology_core_cpumask(event->cpu));
+ } else {
return -ENOENT;
+ }
+
+ if (cpu >= nr_cpu_ids)
+ return -ENODEV;
- /* must be done before validate_group */
+ event->cpu = cpu;
event->hw.config = cfg;
event->hw.idx = -1;
-
- return ret;
+ return 0;
}
static inline u64 cstate_pmu_read_counter(struct perf_event *event)
@@ -469,172 +361,91 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode)
return 0;
}
+/*
+ * Check if exiting cpu is the designated reader. If so migrate the
+ * events when there is a valid target available
+ */
static void cstate_cpu_exit(int cpu)
{
- int i, id, target;
+ unsigned int target;
- /* cpu exit for cstate core */
- if (has_cstate_core) {
- id = topology_core_id(cpu);
- target = -1;
-
- for_each_online_cpu(i) {
- if (i == cpu)
- continue;
- if (id == topology_core_id(i)) {
- target = i;
- break;
- }
- }
- if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
+ if (has_cstate_core &&
+ cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) {
+
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ /* Migrate events if there is a valid target */
+ if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &cstate_core_cpu_mask);
- WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
- if (target >= 0)
perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
+ }
}
- /* cpu exit for cstate pkg */
- if (has_cstate_pkg) {
- id = topology_physical_package_id(cpu);
- target = -1;
-
- for_each_online_cpu(i) {
- if (i == cpu)
- continue;
- if (id == topology_physical_package_id(i)) {
- target = i;
- break;
- }
- }
- if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
+ if (has_cstate_pkg &&
+ cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) {
+
+ target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+ /* Migrate events if there is a valid target */
+ if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
- WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
- if (target >= 0)
perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
+ }
}
}
static void cstate_cpu_init(int cpu)
{
- int i, id;
+ unsigned int target;
- /* cpu init for cstate core */
- if (has_cstate_core) {
- id = topology_core_id(cpu);
- for_each_cpu(i, &cstate_core_cpu_mask) {
- if (id == topology_core_id(i))
- break;
- }
- if (i >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
- }
+ /*
+ * If this is the first online thread of that core, set it in
+ * the core cpu mask as the designated reader.
+ */
+ target = cpumask_any_and(&cstate_core_cpu_mask,
+ topology_sibling_cpumask(cpu));
- /* cpu init for cstate pkg */
- if (has_cstate_pkg) {
- id = topology_physical_package_id(cpu);
- for_each_cpu(i, &cstate_pkg_cpu_mask) {
- if (id == topology_physical_package_id(i))
- break;
- }
- if (i >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
- }
+ if (has_cstate_core && target >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
+
+ /*
+ * If this is the first online thread of that package, set it
+ * in the package cpu mask as the designated reader.
+ */
+ target = cpumask_any_and(&cstate_pkg_cpu_mask,
+ topology_core_cpumask(cpu));
+ if (has_cstate_pkg && target >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
}
static int cstate_cpu_notifier(struct notifier_block *self,
- unsigned long action, void *hcpu)
+ unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- break;
case CPU_STARTING:
cstate_cpu_init(cpu);
break;
- case CPU_UP_CANCELED:
- case CPU_DYING:
- break;
- case CPU_ONLINE:
- case CPU_DEAD:
- break;
case CPU_DOWN_PREPARE:
cstate_cpu_exit(cpu);
break;
default:
break;
}
-
return NOTIFY_OK;
}
-/*
- * Probe the cstate events and insert the available one into sysfs attrs
- * Return false if there is no available events.
- */
-static bool cstate_probe_msr(struct perf_cstate_msr *msr,
- struct attribute **events_attrs,
- int max_event_nr)
-{
- int i, j = 0;
- u64 val;
-
- /* Probe the cstate events. */
- for (i = 0; i < max_event_nr; i++) {
- if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
- msr[i].attr = NULL;
- }
-
- /* List remaining events in the sysfs attrs. */
- for (i = 0; i < max_event_nr; i++) {
- if (msr[i].attr)
- events_attrs[j++] = &msr[i].attr->attr.attr;
- }
- events_attrs[j] = NULL;
-
- return (j > 0) ? true : false;
-}
-
-static int __init cstate_init(void)
-{
- /* SLM has different MSR for PKG C6 */
- switch (boot_cpu_data.x86_model) {
- case 55:
- case 76:
- case 77:
- pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
- }
-
- if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
- has_cstate_core = true;
-
- if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
- has_cstate_pkg = true;
-
- return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
-}
-
-static void __init cstate_cpumask_init(void)
-{
- int cpu;
-
- cpu_notifier_register_begin();
-
- for_each_online_cpu(cpu)
- cstate_cpu_init(cpu);
-
- __perf_cpu_notifier(cstate_cpu_notifier);
-
- cpu_notifier_register_done();
-}
+static struct notifier_block cstate_cpu_nb = {
+ .notifier_call = cstate_cpu_notifier,
+ .priority = CPU_PRI_PERF + 1,
+};
static struct pmu cstate_core_pmu = {
.attr_groups = core_attr_groups,
.name = "cstate_core",
.task_ctx_nr = perf_invalid_context,
.event_init = cstate_pmu_event_init,
- .add = cstate_pmu_event_add, /* must have */
- .del = cstate_pmu_event_del, /* must have */
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
.start = cstate_pmu_event_start,
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
@@ -646,49 +457,203 @@ static struct pmu cstate_pkg_pmu = {
.name = "cstate_pkg",
.task_ctx_nr = perf_invalid_context,
.event_init = cstate_pmu_event_init,
- .add = cstate_pmu_event_add, /* must have */
- .del = cstate_pmu_event_del, /* must have */
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
.start = cstate_pmu_event_start,
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
};
-static void __init cstate_pmus_register(void)
+static const struct cstate_model nhm_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES),
+};
+
+static const struct cstate_model snb_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES),
+};
+
+static const struct cstate_model hswult_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model slm_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES),
+ .quirks = SLM_PKG_C6_USE_C7_MSR,
+};
+
+#define X86_CSTATES_MODEL(model, states) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) }
+
+static const struct x86_cpu_id intel_cstates_match[] __initconst = {
+ X86_CSTATES_MODEL(30, nhm_cstates), /* 45nm Nehalem */
+ X86_CSTATES_MODEL(26, nhm_cstates), /* 45nm Nehalem-EP */
+ X86_CSTATES_MODEL(46, nhm_cstates), /* 45nm Nehalem-EX */
+
+ X86_CSTATES_MODEL(37, nhm_cstates), /* 32nm Westmere */
+ X86_CSTATES_MODEL(44, nhm_cstates), /* 32nm Westmere-EP */
+ X86_CSTATES_MODEL(47, nhm_cstates), /* 32nm Westmere-EX */
+
+ X86_CSTATES_MODEL(42, snb_cstates), /* 32nm SandyBridge */
+ X86_CSTATES_MODEL(45, snb_cstates), /* 32nm SandyBridge-E/EN/EP */
+
+ X86_CSTATES_MODEL(58, snb_cstates), /* 22nm IvyBridge */
+ X86_CSTATES_MODEL(62, snb_cstates), /* 22nm IvyBridge-EP/EX */
+
+ X86_CSTATES_MODEL(60, snb_cstates), /* 22nm Haswell Core */
+ X86_CSTATES_MODEL(63, snb_cstates), /* 22nm Haswell Server */
+ X86_CSTATES_MODEL(70, snb_cstates), /* 22nm Haswell + GT3e */
+
+ X86_CSTATES_MODEL(69, hswult_cstates), /* 22nm Haswell ULT */
+
+ X86_CSTATES_MODEL(55, slm_cstates), /* 22nm Atom Silvermont */
+ X86_CSTATES_MODEL(77, slm_cstates), /* 22nm Atom Avoton/Rangely */
+ X86_CSTATES_MODEL(76, slm_cstates), /* 22nm Atom Airmont */
+
+ X86_CSTATES_MODEL(61, snb_cstates), /* 14nm Broadwell Core-M */
+ X86_CSTATES_MODEL(86, snb_cstates), /* 14nm Broadwell Xeon D */
+ X86_CSTATES_MODEL(71, snb_cstates), /* 14nm Broadwell + GT3e */
+ X86_CSTATES_MODEL(79, snb_cstates), /* 14nm Broadwell Server */
+
+ X86_CSTATES_MODEL(78, snb_cstates), /* 14nm Skylake Mobile */
+ X86_CSTATES_MODEL(94, snb_cstates), /* 14nm Skylake Desktop */
+ { },
+};
+MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
+
+/*
+ * Probe the cstate events and insert the available one into sysfs attrs
+ * Return false if there are no available events.
+ */
+static bool __init cstate_probe_msr(const unsigned long evmsk, int max,
+ struct perf_cstate_msr *msr,
+ struct attribute **attrs)
{
- int err;
+ bool found = false;
+ unsigned int bit;
+ u64 val;
+
+ for (bit = 0; bit < max; bit++) {
+ if (test_bit(bit, &evmsk) && !rdmsrl_safe(msr[bit].msr, &val)) {
+ *attrs++ = &msr[bit].attr->attr.attr;
+ found = true;
+ } else {
+ msr[bit].attr = NULL;
+ }
+ }
+ *attrs = NULL;
+
+ return found;
+}
+
+static int __init cstate_probe(const struct cstate_model *cm)
+{
+ /* SLM has different MSR for PKG C6 */
+ if (cm->quirks & SLM_PKG_C6_USE_C7_MSR)
+ pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
+
+ has_cstate_core = cstate_probe_msr(cm->core_events,
+ PERF_CSTATE_CORE_EVENT_MAX,
+ core_msr, core_events_attrs);
+
+ has_cstate_pkg = cstate_probe_msr(cm->pkg_events,
+ PERF_CSTATE_PKG_EVENT_MAX,
+ pkg_msr, pkg_events_attrs);
+
+ return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+}
+
+static inline void cstate_cleanup(void)
+{
+ if (has_cstate_core)
+ perf_pmu_unregister(&cstate_core_pmu);
+
+ if (has_cstate_pkg)
+ perf_pmu_unregister(&cstate_pkg_pmu);
+}
+
+static int __init cstate_init(void)
+{
+ int cpu, err;
+
+ cpu_notifier_register_begin();
+ for_each_online_cpu(cpu)
+ cstate_cpu_init(cpu);
if (has_cstate_core) {
err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
- if (WARN_ON(err))
- pr_info("Failed to register PMU %s error %d\n",
- cstate_core_pmu.name, err);
+ if (err) {
+ has_cstate_core = false;
+ pr_info("Failed to register cstate core pmu\n");
+ goto out;
+ }
}
if (has_cstate_pkg) {
err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
- if (WARN_ON(err))
- pr_info("Failed to register PMU %s error %d\n",
- cstate_pkg_pmu.name, err);
+ if (err) {
+ has_cstate_pkg = false;
+ pr_info("Failed to register cstate pkg pmu\n");
+ cstate_cleanup();
+ goto out;
+ }
}
+ __register_cpu_notifier(&cstate_cpu_nb);
+out:
+ cpu_notifier_register_done();
+ return err;
}
static int __init cstate_pmu_init(void)
{
+ const struct x86_cpu_id *id;
int err;
- if (cpu_has_hypervisor)
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return -ENODEV;
+
+ id = x86_match_cpu(intel_cstates_match);
+ if (!id)
return -ENODEV;
- err = cstate_init();
+ err = cstate_probe((const struct cstate_model *) id->driver_data);
if (err)
return err;
- cstate_cpumask_init();
-
- cstate_pmus_register();
-
- return 0;
+ return cstate_init();
}
+module_init(cstate_pmu_init);
-device_initcall(cstate_pmu_init);
+static void __exit cstate_pmu_exit(void)
+{
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&cstate_cpu_nb);
+ cstate_cleanup();
+ cpu_notifier_register_done();
+}
+module_exit(cstate_pmu_exit);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8584b90d8e0b..7ce9f3f669e6 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -645,6 +645,12 @@ struct event_constraint intel_slm_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_glm_pebs_event_constraints[] = {
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 1ca5d1e7d4f2..9e2b40cdb05f 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -14,7 +14,8 @@ enum {
LBR_FORMAT_EIP_FLAGS = 0x03,
LBR_FORMAT_EIP_FLAGS2 = 0x04,
LBR_FORMAT_INFO = 0x05,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO,
+ LBR_FORMAT_TIME = 0x06,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME,
};
static enum {
@@ -464,6 +465,16 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
abort = !!(info & LBR_INFO_ABORT);
cycles = (info & LBR_INFO_CYCLES);
}
+
+ if (lbr_format == LBR_FORMAT_TIME) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ skip = 1;
+ cycles = ((to >> 48) & LBR_INFO_CYCLES);
+
+ to = (u64)((((s64)to) << 16) >> 16);
+ }
+
if (lbr_flags & LBR_EIP_FLAGS) {
mis = !!(from & LBR_FROM_FLAG_MISPRED);
pred = !mis;
@@ -1049,6 +1060,24 @@ void __init intel_pmu_lbr_init_atom(void)
pr_cont("8-deep LBR, ");
}
+/* slm */
+void __init intel_pmu_lbr_init_slm(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+ pr_cont("8-deep LBR, ");
+}
+
/* Knights Landing */
void intel_pmu_lbr_init_knl(void)
{
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 09a77dbc73c9..04bb5fb5a8d7 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -67,11 +67,13 @@ static struct pt_cap_desc {
PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)),
+ PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)),
PT_CAP(mtc, 0, CR_EBX, BIT(3)),
PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CR_ECX, BIT(2)),
PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
+ PT_CAP(num_address_ranges, 1, CR_EAX, 0x3),
PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000),
PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff),
PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000),
@@ -125,9 +127,46 @@ static struct attribute_group pt_format_group = {
.attrs = pt_formats_attr,
};
+static ssize_t
+pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_attr, attr);
+
+ switch (pmu_attr->id) {
+ case 0:
+ return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
+ case 1:
+ return sprintf(page, "%u:%u\n",
+ pt_pmu.tsc_art_num,
+ pt_pmu.tsc_art_den);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
+ pt_timing_attr_show);
+PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
+ pt_timing_attr_show);
+
+static struct attribute *pt_timing_attr[] = {
+ &timing_attr_max_nonturbo_ratio.attr.attr,
+ &timing_attr_tsc_art_ratio.attr.attr,
+ NULL,
+};
+
+static struct attribute_group pt_timing_group = {
+ .attrs = pt_timing_attr,
+};
+
static const struct attribute_group *pt_attr_groups[] = {
&pt_cap_group,
&pt_format_group,
+ &pt_timing_group,
NULL,
};
@@ -140,6 +179,23 @@ static int __init pt_pmu_hw_init(void)
int ret;
long i;
+ rdmsrl(MSR_PLATFORM_INFO, reg);
+ pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
+
+ /*
+ * if available, read in TSC to core crystal clock ratio,
+ * otherwise, zero for numerator stands for "not enumerated"
+ * as per SDM
+ */
+ if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
+
+ pt_pmu.tsc_art_num = ebx;
+ pt_pmu.tsc_art_den = eax;
+ }
+
if (boot_cpu_has(X86_FEATURE_VMX)) {
/*
* Intel SDM, 36.5 "Tracing post-VMXON" says that
@@ -263,6 +319,75 @@ static bool pt_event_valid(struct perf_event *event)
* These all are cpu affine and operate on a local PT
*/
+/* Address ranges and their corresponding msr configuration registers */
+static const struct pt_address_range {
+ unsigned long msr_a;
+ unsigned long msr_b;
+ unsigned int reg_off;
+} pt_address_ranges[] = {
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR0_A,
+ .msr_b = MSR_IA32_RTIT_ADDR0_B,
+ .reg_off = RTIT_CTL_ADDR0_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR1_A,
+ .msr_b = MSR_IA32_RTIT_ADDR1_B,
+ .reg_off = RTIT_CTL_ADDR1_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR2_A,
+ .msr_b = MSR_IA32_RTIT_ADDR2_B,
+ .reg_off = RTIT_CTL_ADDR2_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR3_A,
+ .msr_b = MSR_IA32_RTIT_ADDR3_B,
+ .reg_off = RTIT_CTL_ADDR3_OFFSET,
+ }
+};
+
+static u64 pt_config_filters(struct perf_event *event)
+{
+ struct pt_filters *filters = event->hw.addr_filters;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ unsigned int range = 0;
+ u64 rtit_ctl = 0;
+
+ if (!filters)
+ return 0;
+
+ perf_event_addr_filters_sync(event);
+
+ for (range = 0; range < filters->nr_filters; range++) {
+ struct pt_filter *filter = &filters->filter[range];
+
+ /*
+ * Note, if the range has zero start/end addresses due
+ * to its dynamic object not being loaded yet, we just
+ * go ahead and program zeroed range, which will simply
+ * produce no data. Note^2: if executable code at 0x0
+ * is a concern, we can set up an "invalid" configuration
+ * such as msr_b < msr_a.
+ */
+
+ /* avoid redundant msr writes */
+ if (pt->filters.filter[range].msr_a != filter->msr_a) {
+ wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
+ pt->filters.filter[range].msr_a = filter->msr_a;
+ }
+
+ if (pt->filters.filter[range].msr_b != filter->msr_b) {
+ wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
+ pt->filters.filter[range].msr_b = filter->msr_b;
+ }
+
+ rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;
+ }
+
+ return rtit_ctl;
+}
+
static void pt_config(struct perf_event *event)
{
u64 reg;
@@ -272,7 +397,8 @@ static void pt_config(struct perf_event *event)
wrmsrl(MSR_IA32_RTIT_STATUS, 0);
}
- reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+ reg = pt_config_filters(event);
+ reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
if (!event->attr.exclude_kernel)
reg |= RTIT_CTL_OS;
@@ -709,6 +835,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
/* clear STOP and INT from current entry */
buf->topa_index[buf->stop_pos]->stop = 0;
+ buf->topa_index[buf->stop_pos]->intr = 0;
buf->topa_index[buf->intr_pos]->intr = 0;
/* how many pages till the STOP marker */
@@ -733,6 +860,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
buf->intr_pos = idx;
buf->topa_index[buf->stop_pos]->stop = 1;
+ buf->topa_index[buf->stop_pos]->intr = 1;
buf->topa_index[buf->intr_pos]->intr = 1;
return 0;
@@ -919,24 +1047,80 @@ static void pt_buffer_free_aux(void *data)
kfree(buf);
}
-/**
- * pt_buffer_is_full() - check if the buffer is full
- * @buf: PT buffer.
- * @pt: Per-cpu pt handle.
- *
- * If the user hasn't read data from the output region that aux_head
- * points to, the buffer is considered full: the user needs to read at
- * least this region and update aux_tail to point past it.
- */
-static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+static int pt_addr_filters_init(struct perf_event *event)
{
- if (buf->snapshot)
- return false;
+ struct pt_filters *filters;
+ int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
+
+ if (!pt_cap_get(PT_CAP_num_address_ranges))
+ return 0;
+
+ filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
+ if (!filters)
+ return -ENOMEM;
+
+ if (event->parent)
+ memcpy(filters, event->parent->hw.addr_filters,
+ sizeof(*filters));
+
+ event->hw.addr_filters = filters;
+
+ return 0;
+}
+
+static void pt_addr_filters_fini(struct perf_event *event)
+{
+ kfree(event->hw.addr_filters);
+ event->hw.addr_filters = NULL;
+}
+
+static int pt_event_addr_filters_validate(struct list_head *filters)
+{
+ struct perf_addr_filter *filter;
+ int range = 0;
+
+ list_for_each_entry(filter, filters, entry) {
+ /* PT doesn't support single address triggers */
+ if (!filter->range)
+ return -EOPNOTSUPP;
+
+ if (!filter->inode && !kernel_ip(filter->offset))
+ return -EINVAL;
+
+ if (++range > pt_cap_get(PT_CAP_num_address_ranges))
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void pt_event_addr_filters_sync(struct perf_event *event)
+{
+ struct perf_addr_filters_head *head = perf_event_addr_filters(event);
+ unsigned long msr_a, msr_b, *offs = event->addr_filters_offs;
+ struct pt_filters *filters = event->hw.addr_filters;
+ struct perf_addr_filter *filter;
+ int range = 0;
+
+ if (!filters)
+ return;
- if (local_read(&buf->data_size) >= pt->handle.size)
- return true;
+ list_for_each_entry(filter, &head->list, entry) {
+ if (filter->inode && !offs[range]) {
+ msr_a = msr_b = 0;
+ } else {
+ /* apply the offset */
+ msr_a = filter->offset + offs[range];
+ msr_b = filter->size + msr_a;
+ }
+
+ filters->filter[range].msr_a = msr_a;
+ filters->filter[range].msr_b = msr_b;
+ filters->filter[range].config = filter->filter ? 1 : 2;
+ range++;
+ }
- return false;
+ filters->nr_filters = range;
}
/**
@@ -953,7 +1137,7 @@ void intel_pt_interrupt(void)
* after PT has been disabled by pt_event_stop(). Make sure we don't
* do anything (particularly, re-enable) for this event here.
*/
- if (!ACCESS_ONCE(pt->handle_nmi))
+ if (!READ_ONCE(pt->handle_nmi))
return;
/*
@@ -1038,23 +1222,36 @@ EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
static void pt_event_start(struct perf_event *event, int mode)
{
+ struct hw_perf_event *hwc = &event->hw;
struct pt *pt = this_cpu_ptr(&pt_ctx);
- struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ struct pt_buffer *buf;
if (READ_ONCE(pt->vmx_on))
return;
- if (!buf || pt_buffer_is_full(buf, pt)) {
- event->hw.state = PERF_HES_STOPPED;
- return;
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf)
+ goto fail_stop;
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ if (!buf->snapshot) {
+ if (pt_buffer_reset_markers(buf, &pt->handle))
+ goto fail_end_stop;
}
- ACCESS_ONCE(pt->handle_nmi) = 1;
- event->hw.state = 0;
+ WRITE_ONCE(pt->handle_nmi, 1);
+ hwc->state = 0;
pt_config_buffer(buf->cur->table, buf->cur_idx,
buf->output_off);
pt_config(event);
+
+ return;
+
+fail_end_stop:
+ perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+ hwc->state = PERF_HES_STOPPED;
}
static void pt_event_stop(struct perf_event *event, int mode)
@@ -1065,7 +1262,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
* Protect against the PMI racing with disabling wrmsr,
* see comment in intel_pt_interrupt().
*/
- ACCESS_ONCE(pt->handle_nmi) = 0;
+ WRITE_ONCE(pt->handle_nmi, 0);
pt_config_stop(event);
@@ -1088,19 +1285,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
pt_handle_status(pt);
pt_update_head(pt);
- }
-}
-
-static void pt_event_del(struct perf_event *event, int mode)
-{
- struct pt *pt = this_cpu_ptr(&pt_ctx);
- struct pt_buffer *buf;
- pt_event_stop(event, PERF_EF_UPDATE);
-
- buf = perf_get_aux(&pt->handle);
-
- if (buf) {
if (buf->snapshot)
pt->handle.head =
local_xchg(&buf->data_size,
@@ -1110,9 +1295,13 @@ static void pt_event_del(struct perf_event *event, int mode)
}
}
+static void pt_event_del(struct perf_event *event, int mode)
+{
+ pt_event_stop(event, PERF_EF_UPDATE);
+}
+
static int pt_event_add(struct perf_event *event, int mode)
{
- struct pt_buffer *buf;
struct pt *pt = this_cpu_ptr(&pt_ctx);
struct hw_perf_event *hwc = &event->hw;
int ret = -EBUSY;
@@ -1120,34 +1309,18 @@ static int pt_event_add(struct perf_event *event, int mode)
if (pt->handle.event)
goto fail;
- buf = perf_aux_output_begin(&pt->handle, event);
- ret = -EINVAL;
- if (!buf)
- goto fail_stop;
-
- pt_buffer_reset_offsets(buf, pt->handle.head);
- if (!buf->snapshot) {
- ret = pt_buffer_reset_markers(buf, &pt->handle);
- if (ret)
- goto fail_end_stop;
- }
-
if (mode & PERF_EF_START) {
pt_event_start(event, 0);
- ret = -EBUSY;
+ ret = -EINVAL;
if (hwc->state == PERF_HES_STOPPED)
- goto fail_end_stop;
+ goto fail;
} else {
hwc->state = PERF_HES_STOPPED;
}
- return 0;
-
-fail_end_stop:
- perf_aux_output_end(&pt->handle, 0, true);
-fail_stop:
- hwc->state = PERF_HES_STOPPED;
+ ret = 0;
fail:
+
return ret;
}
@@ -1157,6 +1330,7 @@ static void pt_event_read(struct perf_event *event)
static void pt_event_destroy(struct perf_event *event)
{
+ pt_addr_filters_fini(event);
x86_del_exclusive(x86_lbr_exclusive_pt);
}
@@ -1171,6 +1345,11 @@ static int pt_event_init(struct perf_event *event)
if (x86_add_exclusive(x86_lbr_exclusive_pt))
return -EBUSY;
+ if (pt_addr_filters_init(event)) {
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+ return -ENOMEM;
+ }
+
event->destroy = pt_event_destroy;
return 0;
@@ -1190,7 +1369,7 @@ static __init int pt_init(void)
BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
- if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+ if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
return -ENODEV;
get_online_cpus();
@@ -1224,16 +1403,21 @@ static __init int pt_init(void)
PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
- pt_pmu.pmu.attr_groups = pt_attr_groups;
- pt_pmu.pmu.task_ctx_nr = perf_sw_context;
- pt_pmu.pmu.event_init = pt_event_init;
- pt_pmu.pmu.add = pt_event_add;
- pt_pmu.pmu.del = pt_event_del;
- pt_pmu.pmu.start = pt_event_start;
- pt_pmu.pmu.stop = pt_event_stop;
- pt_pmu.pmu.read = pt_event_read;
- pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
- pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+ pt_pmu.pmu.attr_groups = pt_attr_groups;
+ pt_pmu.pmu.task_ctx_nr = perf_sw_context;
+ pt_pmu.pmu.event_init = pt_event_init;
+ pt_pmu.pmu.add = pt_event_add;
+ pt_pmu.pmu.del = pt_event_del;
+ pt_pmu.pmu.start = pt_event_start;
+ pt_pmu.pmu.stop = pt_event_stop;
+ pt_pmu.pmu.read = pt_event_read;
+ pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
+ pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+ pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync;
+ pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
+ pt_pmu.pmu.nr_addr_filters =
+ pt_cap_get(PT_CAP_num_address_ranges);
+
ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
return ret;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 3abb5f5cccc8..efffa4a09f68 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -20,6 +20,40 @@
#define __INTEL_PT_H__
/*
+ * PT MSR bit definitions
+ */
+#define RTIT_CTL_TRACEEN BIT(0)
+#define RTIT_CTL_CYCLEACC BIT(1)
+#define RTIT_CTL_OS BIT(2)
+#define RTIT_CTL_USR BIT(3)
+#define RTIT_CTL_CR3EN BIT(7)
+#define RTIT_CTL_TOPA BIT(8)
+#define RTIT_CTL_MTC_EN BIT(9)
+#define RTIT_CTL_TSC_EN BIT(10)
+#define RTIT_CTL_DISRETC BIT(11)
+#define RTIT_CTL_BRANCH_EN BIT(13)
+#define RTIT_CTL_MTC_RANGE_OFFSET 14
+#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
+#define RTIT_CTL_CYC_THRESH_OFFSET 19
+#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
+#define RTIT_CTL_PSB_FREQ_OFFSET 24
+#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
+#define RTIT_CTL_ADDR0_OFFSET 32
+#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
+#define RTIT_CTL_ADDR1_OFFSET 36
+#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
+#define RTIT_CTL_ADDR2_OFFSET 40
+#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
+#define RTIT_CTL_ADDR3_OFFSET 44
+#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
+#define RTIT_STATUS_FILTEREN BIT(0)
+#define RTIT_STATUS_CONTEXTEN BIT(1)
+#define RTIT_STATUS_TRIGGEREN BIT(2)
+#define RTIT_STATUS_BUFFOVF BIT(3)
+#define RTIT_STATUS_ERROR BIT(4)
+#define RTIT_STATUS_STOPPED BIT(5)
+
+/*
* Single-entry ToPA: when this close to region boundary, switch
* buffers to avoid losing data.
*/
@@ -48,15 +82,20 @@ struct topa_entry {
#define PT_CPUID_LEAVES 2
#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */
+/* TSC to Core Crystal Clock Ratio */
+#define CPUID_TSC_LEAF 0x15
+
enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
PT_CAP_psb_cyc,
+ PT_CAP_ip_filtering,
PT_CAP_mtc,
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
PT_CAP_single_range_output,
PT_CAP_payloads_lip,
+ PT_CAP_num_address_ranges,
PT_CAP_mtc_periods,
PT_CAP_cycle_thresholds,
PT_CAP_psb_periods,
@@ -66,6 +105,9 @@ struct pt_pmu {
struct pmu pmu;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
bool vmx;
+ unsigned long max_nonturbo_ratio;
+ unsigned int tsc_art_num;
+ unsigned int tsc_art_den;
};
/**
@@ -104,14 +146,40 @@ struct pt_buffer {
struct topa_entry *topa_index[0];
};
+#define PT_FILTERS_NUM 4
+
+/**
+ * struct pt_filter - IP range filter configuration
+ * @msr_a: range start, goes to RTIT_ADDRn_A
+ * @msr_b: range end, goes to RTIT_ADDRn_B
+ * @config: 4-bit field in RTIT_CTL
+ */
+struct pt_filter {
+ unsigned long msr_a;
+ unsigned long msr_b;
+ unsigned long config;
+};
+
+/**
+ * struct pt_filters - IP range filtering context
+ * @filter: filters defined for this context
+ * @nr_filters: number of defined filters in the @filter array
+ */
+struct pt_filters {
+ struct pt_filter filter[PT_FILTERS_NUM];
+ unsigned int nr_filters;
+};
+
/**
* struct pt - per-cpu pt context
* @handle: perf output handle
+ * @filters: last configured filters
* @handle_nmi: do handle PT PMI on this cpu, there's an active event
* @vmx_on: 1 if VMX is ON on this cpu
*/
struct pt {
struct perf_output_handle handle;
+ struct pt_filters filters;
int handle_nmi;
int vmx_on;
};
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 1705c9d75e44..99c4bab123cd 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -27,10 +27,14 @@
* event: rapl_energy_dram
* perf code: 0x3
*
- * dram counter: consumption of the builtin-gpu domain (client only)
+ * gpu counter: consumption of the builtin-gpu domain (client only)
* event: rapl_energy_gpu
* perf code: 0x4
*
+ * psys counter: consumption of the builtin-psys domain (client only)
+ * event: rapl_energy_psys
+ * perf code: 0x5
+ *
* We manage those counters as free running (read-only). They may be
* use simultaneously by other tools, such as turbostat.
*
@@ -53,6 +57,8 @@
#include <asm/cpu_device_id.h>
#include "../perf_event.h"
+MODULE_LICENSE("GPL");
+
/*
* RAPL energy status counters
*/
@@ -64,13 +70,16 @@
#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
+#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */
+#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */
-#define NR_RAPL_DOMAINS 0x4
+#define NR_RAPL_DOMAINS 0x5
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
"pp0-core",
"package",
"dram",
"pp1-gpu",
+ "psys",
};
/* Clients have PP0, PKG */
@@ -89,6 +98,13 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
1<<RAPL_IDX_RAM_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
+/* SKL clients have PP0, PKG, RAM, PP1, PSYS */
+#define RAPL_IDX_SKL_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT|\
+ 1<<RAPL_IDX_PSYS_NRG_STAT)
+
/* Knights Landing has PKG, RAM */
#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\
1<<RAPL_IDX_RAM_NRG_STAT)
@@ -360,6 +376,10 @@ static int rapl_pmu_event_init(struct perf_event *event)
bit = RAPL_IDX_PP1_NRG_STAT;
msr = MSR_PP1_ENERGY_STATUS;
break;
+ case INTEL_RAPL_PSYS:
+ bit = RAPL_IDX_PSYS_NRG_STAT;
+ msr = MSR_PLATFORM_ENERGY_STATUS;
+ break;
default:
return -EINVAL;
}
@@ -414,11 +434,13 @@ RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
+RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
@@ -427,6 +449,7 @@ RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890
RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
static struct attribute *rapl_events_srv_attr[] = {
EVENT_PTR(rapl_cores),
@@ -476,6 +499,27 @@ static struct attribute *rapl_events_hsw_attr[] = {
NULL,
};
+static struct attribute *rapl_events_skl_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+ EVENT_PTR(rapl_ram),
+ EVENT_PTR(rapl_psys),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+ EVENT_PTR(rapl_ram_unit),
+ EVENT_PTR(rapl_psys_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ EVENT_PTR(rapl_ram_scale),
+ EVENT_PTR(rapl_psys_scale),
+ NULL,
+};
+
static struct attribute *rapl_events_knl_attr[] = {
EVENT_PTR(rapl_pkg),
EVENT_PTR(rapl_ram),
@@ -592,6 +636,11 @@ static int rapl_cpu_notifier(struct notifier_block *self,
return NOTIFY_OK;
}
+static struct notifier_block rapl_cpu_nb = {
+ .notifier_call = rapl_cpu_notifier,
+ .priority = CPU_PRI_PERF + 1,
+};
+
static int rapl_check_hw_unit(bool apply_quirk)
{
u64 msr_rapl_power_unit_bits;
@@ -660,7 +709,7 @@ static int __init rapl_prepare_cpus(void)
return 0;
}
-static void __init cleanup_rapl_pmus(void)
+static void cleanup_rapl_pmus(void)
{
int i;
@@ -691,52 +740,92 @@ static int __init init_rapl_pmus(void)
return 0;
}
+#define X86_RAPL_MODEL_MATCH(model, init) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }
+
+struct intel_rapl_init_fun {
+ bool apply_quirk;
+ int cntr_mask;
+ struct attribute **attrs;
+};
+
+static const struct intel_rapl_init_fun snb_rapl_init __initconst = {
+ .apply_quirk = false,
+ .cntr_mask = RAPL_IDX_CLN,
+ .attrs = rapl_events_cln_attr,
+};
+
+static const struct intel_rapl_init_fun hsx_rapl_init __initconst = {
+ .apply_quirk = true,
+ .cntr_mask = RAPL_IDX_SRV,
+ .attrs = rapl_events_srv_attr,
+};
+
+static const struct intel_rapl_init_fun hsw_rapl_init __initconst = {
+ .apply_quirk = false,
+ .cntr_mask = RAPL_IDX_HSW,
+ .attrs = rapl_events_hsw_attr,
+};
+
+static const struct intel_rapl_init_fun snbep_rapl_init __initconst = {
+ .apply_quirk = false,
+ .cntr_mask = RAPL_IDX_SRV,
+ .attrs = rapl_events_srv_attr,
+};
+
+static const struct intel_rapl_init_fun knl_rapl_init __initconst = {
+ .apply_quirk = true,
+ .cntr_mask = RAPL_IDX_KNL,
+ .attrs = rapl_events_knl_attr,
+};
+
+static const struct intel_rapl_init_fun skl_rapl_init __initconst = {
+ .apply_quirk = false,
+ .cntr_mask = RAPL_IDX_SKL_CLN,
+ .attrs = rapl_events_skl_attr,
+};
+
static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
- [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
- [1] = {},
+ X86_RAPL_MODEL_MATCH(42, snb_rapl_init), /* Sandy Bridge */
+ X86_RAPL_MODEL_MATCH(45, snbep_rapl_init), /* Sandy Bridge-EP */
+
+ X86_RAPL_MODEL_MATCH(58, snb_rapl_init), /* Ivy Bridge */
+ X86_RAPL_MODEL_MATCH(62, snbep_rapl_init), /* IvyTown */
+
+ X86_RAPL_MODEL_MATCH(60, hsw_rapl_init), /* Haswell */
+ X86_RAPL_MODEL_MATCH(63, hsx_rapl_init), /* Haswell-Server */
+ X86_RAPL_MODEL_MATCH(69, hsw_rapl_init), /* Haswell-Celeron */
+ X86_RAPL_MODEL_MATCH(70, hsw_rapl_init), /* Haswell GT3e */
+
+ X86_RAPL_MODEL_MATCH(61, hsw_rapl_init), /* Broadwell */
+ X86_RAPL_MODEL_MATCH(71, hsw_rapl_init), /* Broadwell-H */
+ X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */
+ X86_RAPL_MODEL_MATCH(86, hsx_rapl_init), /* Broadwell Xeon D */
+
+ X86_RAPL_MODEL_MATCH(87, knl_rapl_init), /* Knights Landing */
+
+ X86_RAPL_MODEL_MATCH(78, skl_rapl_init), /* Skylake */
+ X86_RAPL_MODEL_MATCH(94, skl_rapl_init), /* Skylake H/S */
+ {},
};
+MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match);
+
static int __init rapl_pmu_init(void)
{
- bool apply_quirk = false;
+ const struct x86_cpu_id *id;
+ struct intel_rapl_init_fun *rapl_init;
+ bool apply_quirk;
int ret;
- if (!x86_match_cpu(rapl_cpu_match))
+ id = x86_match_cpu(rapl_cpu_match);
+ if (!id)
return -ENODEV;
- switch (boot_cpu_data.x86_model) {
- case 42: /* Sandy Bridge */
- case 58: /* Ivy Bridge */
- rapl_cntr_mask = RAPL_IDX_CLN;
- rapl_pmu_events_group.attrs = rapl_events_cln_attr;
- break;
- case 63: /* Haswell-Server */
- case 79: /* Broadwell-Server */
- apply_quirk = true;
- rapl_cntr_mask = RAPL_IDX_SRV;
- rapl_pmu_events_group.attrs = rapl_events_srv_attr;
- break;
- case 60: /* Haswell */
- case 69: /* Haswell-Celeron */
- case 70: /* Haswell GT3e */
- case 61: /* Broadwell */
- case 71: /* Broadwell-H */
- rapl_cntr_mask = RAPL_IDX_HSW;
- rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
- break;
- case 45: /* Sandy Bridge-EP */
- case 62: /* IvyTown */
- rapl_cntr_mask = RAPL_IDX_SRV;
- rapl_pmu_events_group.attrs = rapl_events_srv_attr;
- break;
- case 87: /* Knights Landing */
- apply_quirk = true;
- rapl_cntr_mask = RAPL_IDX_KNL;
- rapl_pmu_events_group.attrs = rapl_events_knl_attr;
- break;
- default:
- return -ENODEV;
- }
+ rapl_init = (struct intel_rapl_init_fun *)id->driver_data;
+ apply_quirk = rapl_init->apply_quirk;
+ rapl_cntr_mask = rapl_init->cntr_mask;
+ rapl_pmu_events_group.attrs = rapl_init->attrs;
ret = rapl_check_hw_unit(apply_quirk);
if (ret)
@@ -756,7 +845,7 @@ static int __init rapl_pmu_init(void)
if (ret)
goto out;
- __perf_cpu_notifier(rapl_cpu_notifier);
+ __register_cpu_notifier(&rapl_cpu_nb);
cpu_notifier_register_done();
rapl_advertise();
return 0;
@@ -767,4 +856,14 @@ out:
cpu_notifier_register_done();
return ret;
}
-device_initcall(rapl_pmu_init);
+module_init(rapl_pmu_init);
+
+static void __exit intel_rapl_exit(void)
+{
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&rapl_cpu_nb);
+ perf_pmu_unregister(&rapl_pmus->pmu);
+ cleanup_rapl_pmus();
+ cpu_notifier_register_done();
+}
+module_exit(intel_rapl_exit);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 7012d18bb293..17734a6ef474 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1,3 +1,4 @@
+#include <asm/cpu_device_id.h>
#include "uncore.h"
static struct intel_uncore_type *empty_uncore[] = { NULL, };
@@ -21,6 +22,8 @@ static struct event_constraint uncore_constraint_fixed =
struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
+MODULE_LICENSE("GPL");
+
static int uncore_pcibus_to_physid(struct pci_bus *bus)
{
struct pci2phy_map *map;
@@ -754,7 +757,7 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu)
pmu->registered = false;
}
-static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu)
+static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu)
{
struct intel_uncore_pmu *pmu = type->pmus;
struct intel_uncore_box *box;
@@ -770,7 +773,7 @@ static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu)
}
}
-static void __init uncore_exit_boxes(void *dummy)
+static void uncore_exit_boxes(void *dummy)
{
struct intel_uncore_type **types;
@@ -787,7 +790,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
kfree(pmu->boxes);
}
-static void __init uncore_type_exit(struct intel_uncore_type *type)
+static void uncore_type_exit(struct intel_uncore_type *type)
{
struct intel_uncore_pmu *pmu = type->pmus;
int i;
@@ -804,7 +807,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type)
type->events_group = NULL;
}
-static void __init uncore_types_exit(struct intel_uncore_type **types)
+static void uncore_types_exit(struct intel_uncore_type **types)
{
for (; *types; types++)
uncore_type_exit(*types);
@@ -989,46 +992,6 @@ static int __init uncore_pci_init(void)
size_t size;
int ret;
- switch (boot_cpu_data.x86_model) {
- case 45: /* Sandy Bridge-EP */
- ret = snbep_uncore_pci_init();
- break;
- case 62: /* Ivy Bridge-EP */
- ret = ivbep_uncore_pci_init();
- break;
- case 63: /* Haswell-EP */
- ret = hswep_uncore_pci_init();
- break;
- case 79: /* BDX-EP */
- case 86: /* BDX-DE */
- ret = bdx_uncore_pci_init();
- break;
- case 42: /* Sandy Bridge */
- ret = snb_uncore_pci_init();
- break;
- case 58: /* Ivy Bridge */
- ret = ivb_uncore_pci_init();
- break;
- case 60: /* Haswell */
- case 69: /* Haswell Celeron */
- ret = hsw_uncore_pci_init();
- break;
- case 61: /* Broadwell */
- ret = bdw_uncore_pci_init();
- break;
- case 87: /* Knights Landing */
- ret = knl_uncore_pci_init();
- break;
- case 94: /* SkyLake */
- ret = skl_uncore_pci_init();
- break;
- default:
- return -ENODEV;
- }
-
- if (ret)
- return ret;
-
size = max_packages * sizeof(struct pci_extra_dev);
uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
if (!uncore_extra_pci_dev) {
@@ -1060,7 +1023,7 @@ err:
return ret;
}
-static void __init uncore_pci_exit(void)
+static void uncore_pci_exit(void)
{
if (pcidrv_registered) {
pcidrv_registered = false;
@@ -1287,46 +1250,6 @@ static int __init uncore_cpu_init(void)
{
int ret;
- switch (boot_cpu_data.x86_model) {
- case 26: /* Nehalem */
- case 30:
- case 37: /* Westmere */
- case 44:
- nhm_uncore_cpu_init();
- break;
- case 42: /* Sandy Bridge */
- case 58: /* Ivy Bridge */
- case 60: /* Haswell */
- case 69: /* Haswell */
- case 70: /* Haswell */
- case 61: /* Broadwell */
- case 71: /* Broadwell */
- snb_uncore_cpu_init();
- break;
- case 45: /* Sandy Bridge-EP */
- snbep_uncore_cpu_init();
- break;
- case 46: /* Nehalem-EX */
- case 47: /* Westmere-EX aka. Xeon E7 */
- nhmex_uncore_cpu_init();
- break;
- case 62: /* Ivy Bridge-EP */
- ivbep_uncore_cpu_init();
- break;
- case 63: /* Haswell-EP */
- hswep_uncore_cpu_init();
- break;
- case 79: /* BDX-EP */
- case 86: /* BDX-DE */
- bdx_uncore_cpu_init();
- break;
- case 87: /* Knights Landing */
- knl_uncore_cpu_init();
- break;
- default:
- return -ENODEV;
- }
-
ret = uncore_types_init(uncore_msr_uncores, true);
if (ret)
goto err;
@@ -1376,11 +1299,105 @@ static int __init uncore_cpumask_init(bool msr)
return 0;
}
+#define X86_UNCORE_MODEL_MATCH(model, init) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }
+
+struct intel_uncore_init_fun {
+ void (*cpu_init)(void);
+ int (*pci_init)(void);
+};
+
+static const struct intel_uncore_init_fun nhm_uncore_init __initconst = {
+ .cpu_init = nhm_uncore_cpu_init,
+};
+
+static const struct intel_uncore_init_fun snb_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = snb_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun ivb_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = ivb_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun hsw_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = hsw_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun bdw_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = bdw_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun snbep_uncore_init __initconst = {
+ .cpu_init = snbep_uncore_cpu_init,
+ .pci_init = snbep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun nhmex_uncore_init __initconst = {
+ .cpu_init = nhmex_uncore_cpu_init,
+};
+
+static const struct intel_uncore_init_fun ivbep_uncore_init __initconst = {
+ .cpu_init = ivbep_uncore_cpu_init,
+ .pci_init = ivbep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun hswep_uncore_init __initconst = {
+ .cpu_init = hswep_uncore_cpu_init,
+ .pci_init = hswep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun bdx_uncore_init __initconst = {
+ .cpu_init = bdx_uncore_cpu_init,
+ .pci_init = bdx_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun knl_uncore_init __initconst = {
+ .cpu_init = knl_uncore_cpu_init,
+ .pci_init = knl_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun skl_uncore_init __initconst = {
+ .pci_init = skl_uncore_pci_init,
+};
+
+static const struct x86_cpu_id intel_uncore_match[] __initconst = {
+ X86_UNCORE_MODEL_MATCH(26, nhm_uncore_init), /* Nehalem */
+ X86_UNCORE_MODEL_MATCH(30, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(37, nhm_uncore_init), /* Westmere */
+ X86_UNCORE_MODEL_MATCH(44, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(42, snb_uncore_init), /* Sandy Bridge */
+ X86_UNCORE_MODEL_MATCH(58, ivb_uncore_init), /* Ivy Bridge */
+ X86_UNCORE_MODEL_MATCH(60, hsw_uncore_init), /* Haswell */
+ X86_UNCORE_MODEL_MATCH(69, hsw_uncore_init), /* Haswell Celeron */
+ X86_UNCORE_MODEL_MATCH(70, hsw_uncore_init), /* Haswell */
+ X86_UNCORE_MODEL_MATCH(61, bdw_uncore_init), /* Broadwell */
+ X86_UNCORE_MODEL_MATCH(71, bdw_uncore_init), /* Broadwell */
+ X86_UNCORE_MODEL_MATCH(45, snbep_uncore_init), /* Sandy Bridge-EP */
+ X86_UNCORE_MODEL_MATCH(46, nhmex_uncore_init), /* Nehalem-EX */
+ X86_UNCORE_MODEL_MATCH(47, nhmex_uncore_init), /* Westmere-EX aka. Xeon E7 */
+ X86_UNCORE_MODEL_MATCH(62, ivbep_uncore_init), /* Ivy Bridge-EP */
+ X86_UNCORE_MODEL_MATCH(63, hswep_uncore_init), /* Haswell-EP */
+ X86_UNCORE_MODEL_MATCH(79, bdx_uncore_init), /* BDX-EP */
+ X86_UNCORE_MODEL_MATCH(86, bdx_uncore_init), /* BDX-DE */
+ X86_UNCORE_MODEL_MATCH(87, knl_uncore_init), /* Knights Landing */
+ X86_UNCORE_MODEL_MATCH(94, skl_uncore_init), /* SkyLake */
+ {},
+};
+
+MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
+
static int __init intel_uncore_init(void)
{
- int pret, cret, ret;
+ const struct x86_cpu_id *id;
+ struct intel_uncore_init_fun *uncore_init;
+ int pret = 0, cret = 0, ret;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ id = x86_match_cpu(intel_uncore_match);
+ if (!id)
return -ENODEV;
if (cpu_has_hypervisor)
@@ -1388,8 +1405,17 @@ static int __init intel_uncore_init(void)
max_packages = topology_max_packages();
- pret = uncore_pci_init();
- cret = uncore_cpu_init();
+ uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
+ if (uncore_init->pci_init) {
+ pret = uncore_init->pci_init();
+ if (!pret)
+ pret = uncore_pci_init();
+ }
+
+ if (uncore_init->cpu_init) {
+ uncore_init->cpu_init();
+ cret = uncore_cpu_init();
+ }
if (cret && pret)
return -ENODEV;
@@ -1409,4 +1435,14 @@ err:
cpu_notifier_register_done();
return ret;
}
-device_initcall(intel_uncore_init);
+module_init(intel_uncore_init);
+
+static void __exit intel_uncore_exit(void)
+{
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&uncore_cpu_nb);
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_pci_exit();
+ cpu_notifier_register_done();
+}
+module_exit(intel_uncore_exit);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index ab2bcaaebe38..b2625867ebd1 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -219,6 +219,9 @@
#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff
#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18)
#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE (0x1ULL << 32)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE (0x1ULL << 33)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_NNC (0x1ULL << 37)
/* KNL EDC/MC UCLK */
#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400
@@ -1902,6 +1905,10 @@ static int knl_cha_hw_config(struct intel_uncore_box *box,
reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE;
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE;
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_NNC;
reg1->idx = idx;
}
return 0;
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index ec863b9a9f78..85ef3c2e80e0 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -6,6 +6,8 @@ enum perf_msr_id {
PERF_MSR_MPERF = 2,
PERF_MSR_PPERF = 3,
PERF_MSR_SMI = 4,
+ PERF_MSR_PTSC = 5,
+ PERF_MSR_IRPERF = 6,
PERF_MSR_EVENT_MAX,
};
@@ -15,6 +17,16 @@ static bool test_aperfmperf(int idx)
return boot_cpu_has(X86_FEATURE_APERFMPERF);
}
+static bool test_ptsc(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_PTSC);
+}
+
+static bool test_irperf(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_IRPERF);
+}
+
static bool test_intel(int idx)
{
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
@@ -69,18 +81,22 @@ struct perf_msr {
bool (*test)(int idx);
};
-PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
-PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
-PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
-PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
-PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
+PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
+PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05");
+PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06");
static struct perf_msr msr[] = {
- [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
- [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
- [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
- [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
- [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
+ [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
+ [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
+ [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
+ [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
+ [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
+ [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, },
+ [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, },
};
static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
@@ -166,7 +182,7 @@ again:
if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
delta = sign_extend64(delta, 31);
- local64_add(now - prev, &event->count);
+ local64_add(delta, &event->count);
}
static void msr_event_start(struct perf_event *event, int flags)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ad4dc7ffffb5..8bd764df815d 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -601,6 +601,7 @@ struct x86_pmu {
u64 lbr_sel_mask; /* LBR_SELECT valid bits */
const int *lbr_sel_map; /* lbr_select mappings */
bool lbr_double_abort; /* duplicated lbr aborts */
+ bool lbr_pt_coexist; /* LBR may coexist with PT */
/*
* Intel PT/LBR/BTS are exclusive
@@ -859,6 +860,8 @@ extern struct event_constraint intel_atom_pebs_event_constraints[];
extern struct event_constraint intel_slm_pebs_event_constraints[];
+extern struct event_constraint intel_glm_pebs_event_constraints[];
+
extern struct event_constraint intel_nehalem_pebs_event_constraints[];
extern struct event_constraint intel_westmere_pebs_event_constraints[];
@@ -907,6 +910,8 @@ void intel_pmu_lbr_init_nhm(void);
void intel_pmu_lbr_init_atom(void);
+void intel_pmu_lbr_init_slm(void);
+
void intel_pmu_lbr_init_snb(void);
void intel_pmu_lbr_init_hsw(void);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3636ec06c887..53ac9bbf2064 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -27,6 +27,7 @@ enum cpuid_leafs
CPUID_6_EAX,
CPUID_8000_000A_EDX,
CPUID_7_ECX,
+ CPUID_8000_0007_EBX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 8f9afefd2dc5..0aee9dd1976e 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -12,7 +12,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 17 /* N 32-bit words worth of info */
+#define NCAPINTS 18 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -177,6 +177,7 @@
#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
+#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */
#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
@@ -250,6 +251,7 @@
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
+#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
@@ -280,6 +282,11 @@
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
+#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
+#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
+#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
+
/*
* BUG word(s)
*/
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 53748c45e488..78d1e7467eae 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,7 @@
#include <asm/fpu/api.h>
#include <asm/pgtable.h>
+#include <asm/processor-flags.h>
#include <asm/tlb.h>
/*
@@ -28,33 +29,22 @@
#define MAX_CMDLINE_ADDRESS UINT_MAX
-#ifdef CONFIG_X86_32
+#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF
+#ifdef CONFIG_X86_32
extern unsigned long asmlinkage efi_call_phys(void *, ...);
+#define arch_efi_call_virt_setup() kernel_fpu_begin()
+#define arch_efi_call_virt_teardown() kernel_fpu_end()
+
/*
* Wrap all the virtual calls in a way that forces the parameters on the stack.
*/
-
-/* Use this macro if your virtual returns a non-void value */
-#define efi_call_virt(f, args...) \
+#define arch_efi_call_virt(f, args...) \
({ \
- efi_status_t __s; \
- kernel_fpu_begin(); \
- __s = ((efi_##f##_t __attribute__((regparm(0)))*) \
- efi.systab->runtime->f)(args); \
- kernel_fpu_end(); \
- __s; \
-})
-
-/* Use this macro if your virtual call does not return any value */
-#define __efi_call_virt(f, args...) \
-({ \
- kernel_fpu_begin(); \
((efi_##f##_t __attribute__((regparm(0)))*) \
efi.systab->runtime->f)(args); \
- kernel_fpu_end(); \
})
#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size)
@@ -78,10 +68,8 @@ struct efi_scratch {
u64 phys_stack;
} __packed;
-#define efi_call_virt(f, ...) \
+#define arch_efi_call_virt_setup() \
({ \
- efi_status_t __s; \
- \
efi_sync_low_kernel_mappings(); \
preempt_disable(); \
__kernel_fpu_begin(); \
@@ -91,9 +79,13 @@ struct efi_scratch {
write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \
} \
- \
- __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \
- \
+})
+
+#define arch_efi_call_virt(f, args...) \
+ efi_call((void *)efi.systab->runtime->f, args) \
+
+#define arch_efi_call_virt_teardown() \
+({ \
if (efi_scratch.use_pgd) { \
write_cr3(efi_scratch.prev_cr3); \
__flush_tlb_all(); \
@@ -101,15 +93,8 @@ struct efi_scratch {
\
__kernel_fpu_end(); \
preempt_enable(); \
- __s; \
})
-/*
- * All X86_64 virt calls return non-void values. Thus, use non-void call for
- * virt calls that would be void on X86_32.
- */
-#define __efi_call_virt(f, args...) efi_call_virt(f, args)
-
extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
u32 type, u64 attribute);
@@ -180,6 +165,8 @@ static inline bool efi_runtime_supported(void)
extern struct console early_efi_console;
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
+extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
+
#ifdef CONFIG_EFI_MIXED
extern void efi_thunk_runtime_setup(void);
extern efi_status_t efi_thunk_set_virtual_address_map(
@@ -225,6 +212,11 @@ __pure const struct efi_config *__efi_early(void);
#define efi_call_early(f, ...) \
__efi_early()->call(__efi_early()->f, __VA_ARGS__);
+#define __efi_call_early(f, ...) \
+ __efi_early()->call((unsigned long)f, __VA_ARGS__);
+
+#define efi_is_64bit() __efi_early()->is64
+
extern bool efi_reboot_required(void);
#else
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 92b6f651fa4f..8bf766ef0e18 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -104,13 +104,23 @@
#define MCE_LOG_SIGNATURE "MACHINECHECK"
/* AMD Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_CTL 0xc0002000
+#define MSR_AMD64_SMCA_MC0_STATUS 0xc0002001
+#define MSR_AMD64_SMCA_MC0_ADDR 0xc0002002
#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
+#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
/*
@@ -168,9 +178,18 @@ struct mce_vendor_flags {
__reserved_0 : 61;
};
+
+struct mca_msr_regs {
+ u32 (*ctl) (int bank);
+ u32 (*status) (int bank);
+ u32 (*addr) (int bank);
+ u32 (*misc) (int bank);
+};
+
extern struct mce_vendor_flags mce_flags;
extern struct mca_config mca_cfg;
+extern struct mca_msr_regs msr_ops;
extern void mce_register_decode_chain(struct notifier_block *nb);
extern void mce_unregister_decode_chain(struct notifier_block *nb);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5b3c9a55f51c..5a73a9c62c39 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -89,27 +89,16 @@
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
#define MSR_IA32_RTIT_CTL 0x00000570
-#define RTIT_CTL_TRACEEN BIT(0)
-#define RTIT_CTL_CYCLEACC BIT(1)
-#define RTIT_CTL_OS BIT(2)
-#define RTIT_CTL_USR BIT(3)
-#define RTIT_CTL_CR3EN BIT(7)
-#define RTIT_CTL_TOPA BIT(8)
-#define RTIT_CTL_MTC_EN BIT(9)
-#define RTIT_CTL_TSC_EN BIT(10)
-#define RTIT_CTL_DISRETC BIT(11)
-#define RTIT_CTL_BRANCH_EN BIT(13)
-#define RTIT_CTL_MTC_RANGE_OFFSET 14
-#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
-#define RTIT_CTL_CYC_THRESH_OFFSET 19
-#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
-#define RTIT_CTL_PSB_FREQ_OFFSET 24
-#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
#define MSR_IA32_RTIT_STATUS 0x00000571
-#define RTIT_STATUS_CONTEXTEN BIT(1)
-#define RTIT_STATUS_TRIGGEREN BIT(2)
-#define RTIT_STATUS_ERROR BIT(4)
-#define RTIT_STATUS_STOPPED BIT(5)
+#define MSR_IA32_RTIT_STATUS 0x00000571
+#define MSR_IA32_RTIT_ADDR0_A 0x00000580
+#define MSR_IA32_RTIT_ADDR0_B 0x00000581
+#define MSR_IA32_RTIT_ADDR1_A 0x00000582
+#define MSR_IA32_RTIT_ADDR1_B 0x00000583
+#define MSR_IA32_RTIT_ADDR2_A 0x00000584
+#define MSR_IA32_RTIT_ADDR2_B 0x00000585
+#define MSR_IA32_RTIT_ADDR3_A 0x00000586
+#define MSR_IA32_RTIT_ADDR3_B 0x00000587
#define MSR_IA32_RTIT_CR3_MATCH 0x00000572
#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560
#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561
@@ -205,6 +194,8 @@
#define MSR_CONFIG_TDP_CONTROL 0x0000064B
#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
+#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D
+
#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658
#define MSR_PKG_ANY_CORE_C0_RES 0x00000659
#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A
@@ -315,6 +306,9 @@
#define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+/* Fam 17h MSRs */
+#define MSR_F17H_IRPERF 0xc00000e9
+
/* Fam 16h MSRs */
#define MSR_F16H_L2I_PERF_CTL 0xc0010230
#define MSR_F16H_L2I_PERF_CTR 0xc0010231
@@ -328,6 +322,7 @@
#define MSR_F15H_PERF_CTR 0xc0010201
#define MSR_F15H_NB_PERF_CTL 0xc0010240
#define MSR_F15H_NB_PERF_CTR 0xc0010241
+#define MSR_F15H_PTSC 0xc0010280
#define MSR_F15H_IC_CFG 0xc0011021
/* Fam 10h MSRs */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index ceec86eb68e9..453744c1d347 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -99,26 +99,36 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
/*
* lock for writing
*/
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+#define ____down_write(sem, slow_path) \
+({ \
+ long tmp; \
+ struct rw_semaphore* ret; \
+ asm volatile("# beginning down_write\n\t" \
+ LOCK_PREFIX " xadd %1,(%3)\n\t" \
+ /* adds 0xffff0001, returns the old value */ \
+ " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \
+ /* was the active mask 0 before? */\
+ " jz 1f\n" \
+ " call " slow_path "\n" \
+ "1:\n" \
+ "# ending down_write" \
+ : "+m" (sem->count), "=d" (tmp), "=a" (ret) \
+ : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \
+ : "memory", "cc"); \
+ ret; \
+})
+
+static inline void __down_write(struct rw_semaphore *sem)
{
- long tmp;
- asm volatile("# beginning down_write\n\t"
- LOCK_PREFIX " xadd %1,(%2)\n\t"
- /* adds 0xffff0001, returns the old value */
- " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t"
- /* was the active mask 0 before? */
- " jz 1f\n"
- " call call_rwsem_down_write_failed\n"
- "1:\n"
- "# ending down_write"
- : "+m" (sem->count), "=d" (tmp)
- : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS)
- : "memory", "cc");
+ ____down_write(sem, "call_rwsem_down_write_failed");
}
-static inline void __down_write(struct rw_semaphore *sem)
+static inline int __down_write_killable(struct rw_semaphore *sem)
{
- __down_write_nested(sem, 0);
+ if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable")))
+ return -EINTR;
+
+ return 0;
}
/*
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a969ae607be8..2e7513d1f1f4 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -108,6 +108,14 @@ struct exception_table_entry {
#define ARCH_HAS_RELATIVE_EXTABLE
+#define swap_ex_entry_fixup(a, b, tmp, delta) \
+ do { \
+ (a)->fixup = (b)->fixup + (delta); \
+ (b)->fixup = (tmp).fixup - (delta); \
+ (a)->handler = (b)->handler + (delta); \
+ (b)->handler = (tmp).handler - (delta); \
+ } while (0)
+
extern int fixup_exception(struct pt_regs *regs, int trapnr);
extern bool ex_has_fault_handler(unsigned long ip);
extern int early_fixup_exception(unsigned long *ip);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8394b3d1f94f..dbc6f066e231 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -717,6 +717,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
}
}
+ if (c->extended_cpuid_level >= 0x80000007) {
+ cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0007_EBX] = ebx;
+ c->x86_power = edx;
+ }
+
if (c->extended_cpuid_level >= 0x80000008) {
cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
@@ -729,9 +736,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_phys_bits = 36;
#endif
- if (c->extended_cpuid_level >= 0x80000007)
- c->x86_power = cpuid_edx(0x80000007);
-
if (c->extended_cpuid_level >= 0x8000000a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
index 2658e2af74ec..93d824ec3120 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -26,6 +26,52 @@ static struct gen_pool *mce_evt_pool;
static LLIST_HEAD(mce_event_llist);
static char gen_pool_buf[MCE_POOLSZ];
+/*
+ * Compare the record "t" with each of the records on list "l" to see if
+ * an equivalent one is present in the list.
+ */
+static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
+{
+ struct mce_evt_llist *node;
+ struct mce *m1, *m2;
+
+ m1 = &t->mce;
+
+ llist_for_each_entry(node, &l->llnode, llnode) {
+ m2 = &node->mce;
+
+ if (!mce_cmp(m1, m2))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * The system has panicked - we'd like to peruse the list of MCE records
+ * that have been queued, but not seen by anyone yet. The list is in
+ * reverse time order, so we need to reverse it. While doing that we can
+ * also drop duplicate records (these were logged because some banks are
+ * shared between cores or by all threads on a socket).
+ */
+struct llist_node *mce_gen_pool_prepare_records(void)
+{
+ struct llist_node *head;
+ LLIST_HEAD(new_head);
+ struct mce_evt_llist *node, *t;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return NULL;
+
+ /* squeeze out duplicates while reversing order */
+ llist_for_each_entry_safe(node, t, head, llnode) {
+ if (!is_duplicate_mce_record(node, t))
+ llist_add(&node->llnode, &new_head);
+ }
+
+ return new_head.first;
+}
+
void mce_gen_pool_process(void)
{
struct llist_node *head;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 547720efd923..cd74a3f00aea 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -35,6 +35,7 @@ void mce_gen_pool_process(void);
bool mce_gen_pool_empty(void);
int mce_gen_pool_add(struct mce *mce);
int mce_gen_pool_init(void);
+struct llist_node *mce_gen_pool_prepare_records(void);
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
@@ -81,3 +82,17 @@ static inline int apei_clear_mce(u64 record_id)
#endif
void mce_inject_log(struct mce *m);
+
+/*
+ * We consider records to be equivalent if bank+status+addr+misc all match.
+ * This is only used when the system is going down because of a fatal error
+ * to avoid cluttering the console log with essentially repeated information.
+ * In normal processing all errors seen are logged.
+ */
+static inline bool mce_cmp(struct mce *m1, struct mce *m2)
+{
+ return m1->bank != m2->bank ||
+ m1->status != m2->status ||
+ m1->addr != m2->addr ||
+ m1->misc != m2->misc;
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 5119766d9889..631356c8cca4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -204,6 +204,33 @@ static int error_context(struct mce *m)
return IN_KERNEL;
}
+static int mce_severity_amd_smca(struct mce *m, int err_ctx)
+{
+ u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+ u32 low, high;
+
+ /*
+ * We need to look at the following bits:
+ * - "succor" bit (data poisoning support), and
+ * - TCC bit (Task Context Corrupt)
+ * in MCi_STATUS to determine error severity.
+ */
+ if (!mce_flags.succor)
+ return MCE_PANIC_SEVERITY;
+
+ if (rdmsr_safe(addr, &low, &high))
+ return MCE_PANIC_SEVERITY;
+
+ /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
+ if ((low & MCI_CONFIG_MCAX) &&
+ (m->status & MCI_STATUS_TCC) &&
+ (err_ctx == IN_KERNEL))
+ return MCE_PANIC_SEVERITY;
+
+ /* ...otherwise invoke hwpoison handler. */
+ return MCE_AR_SEVERITY;
+}
+
/*
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
@@ -225,6 +252,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc
* to at least kill process to prolong system operation.
*/
if (mce_flags.overflow_recov) {
+ if (mce_flags.smca)
+ return mce_severity_amd_smca(m, ctx);
+
/* software can try to contain */
if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
return MCE_PANIC_SEVERITY;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index f0c921b03e42..92e5e37d97bf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -161,7 +161,6 @@ void mce_log(struct mce *mce)
if (!mce_gen_pool_add(mce))
irq_work_queue(&mce_irq_work);
- mce->finished = 0;
wmb();
for (;;) {
entry = mce_log_get_idx_check(mcelog.next);
@@ -194,7 +193,6 @@ void mce_log(struct mce *mce)
mcelog.entry[entry].finished = 1;
wmb();
- mce->finished = 1;
set_bit(0, &mce_need_notify);
}
@@ -224,6 +222,53 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+static inline u32 ctl_reg(int bank)
+{
+ return MSR_IA32_MCx_CTL(bank); /* IA32 MCx_CTL MSR of @bank; default msr_ops.ctl */
+}
+
+static inline u32 status_reg(int bank)
+{
+ return MSR_IA32_MCx_STATUS(bank); /* IA32 MCx_STATUS MSR of @bank; default msr_ops.status */
+}
+
+static inline u32 addr_reg(int bank)
+{
+ return MSR_IA32_MCx_ADDR(bank); /* IA32 MCx_ADDR MSR of @bank; default msr_ops.addr */
+}
+
+static inline u32 misc_reg(int bank)
+{
+ return MSR_IA32_MCx_MISC(bank); /* IA32 MCx_MISC MSR of @bank; default msr_ops.misc */
+}
+
+static inline u32 smca_ctl_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_CTL(bank); /* SMCA MCx_CTL MSR of @bank; msr_ops.ctl when mce_flags.smca */
+}
+
+static inline u32 smca_status_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_STATUS(bank); /* SMCA MCx_STATUS MSR of @bank; msr_ops.status when mce_flags.smca */
+}
+
+static inline u32 smca_addr_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_ADDR(bank); /* SMCA MCx_ADDR MSR of @bank; msr_ops.addr when mce_flags.smca */
+}
+
+static inline u32 smca_misc_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_MISC(bank); /* SMCA MCx_MISC MSR of @bank; msr_ops.misc when mce_flags.smca */
+}
+
+struct mca_msr_regs msr_ops = { /* bank -> MSR lookups; IA32 MCx registers by default */
+ .ctl = ctl_reg, /* each op is swapped for its smca_* twin when mce_flags.smca is set */
+ .status = status_reg,
+ .addr = addr_reg,
+ .misc = misc_reg
+};
+
static void print_mce(struct mce *m)
{
int ret = 0;
@@ -290,7 +335,9 @@ static void wait_for_panic(void)
static void mce_panic(const char *msg, struct mce *final, char *exp)
{
- int i, apei_err = 0;
+ int apei_err = 0;
+ struct llist_node *pending;
+ struct mce_evt_llist *l;
if (!fake_panic) {
/*
@@ -307,11 +354,10 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
if (atomic_inc_return(&mce_fake_panicked) > 1)
return;
}
+ pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce *m = &l->mce;
if (!(m->status & MCI_STATUS_UC)) {
print_mce(m);
if (!apei_err)
@@ -319,13 +365,11 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
}
}
/* Now print uncorrected but with the final one last */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce *m = &l->mce;
if (!(m->status & MCI_STATUS_UC))
continue;
- if (!final || memcmp(m, final, sizeof(struct mce))) {
+ if (!final || mce_cmp(m, final)) {
print_mce(m);
if (!apei_err)
apei_err = apei_write_mce(m);
@@ -356,11 +400,11 @@ static int msr_to_offset(u32 msr)
if (msr == mca_cfg.rip_msr)
return offsetof(struct mce, ip);
- if (msr == MSR_IA32_MCx_STATUS(bank))
+ if (msr == msr_ops.status(bank))
return offsetof(struct mce, status);
- if (msr == MSR_IA32_MCx_ADDR(bank))
+ if (msr == msr_ops.addr(bank))
return offsetof(struct mce, addr);
- if (msr == MSR_IA32_MCx_MISC(bank))
+ if (msr == msr_ops.misc(bank))
return offsetof(struct mce, misc);
if (msr == MSR_IA32_MCG_STATUS)
return offsetof(struct mce, mcgstatus);
@@ -523,9 +567,9 @@ static struct notifier_block mce_srao_nb = {
static void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
- m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+ m->misc = mce_rdmsrl(msr_ops.misc(i));
if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ m->addr = mce_rdmsrl(msr_ops.addr(i));
/*
* Mask the reported address by the reported granularity.
@@ -607,7 +651,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.tsc = 0;
barrier();
- m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
+ m.status = mce_rdmsrl(msr_ops.status(i));
if (!(m.status & MCI_STATUS_VAL))
continue;
@@ -654,7 +698,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
/*
* Clear state for this bank.
*/
- mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
+ mce_wrmsrl(msr_ops.status(i), 0);
}
/*
@@ -679,7 +723,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
char *tmp;
for (i = 0; i < mca_cfg.banks; i++) {
- m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
+ m->status = mce_rdmsrl(msr_ops.status(i));
if (m->status & MCI_STATUS_VAL) {
__set_bit(i, validp);
if (quirk_no_way_out)
@@ -830,9 +874,9 @@ static int mce_start(int *no_way_out)
atomic_add(*no_way_out, &global_nwo);
/*
- * global_nwo should be updated before mce_callin
+ * Rely on the implied barrier below, such that global_nwo
+ * is updated before mce_callin.
*/
- smp_wmb();
order = atomic_inc_return(&mce_callin);
/*
@@ -957,7 +1001,7 @@ static void mce_clear_state(unsigned long *toclear)
for (i = 0; i < mca_cfg.banks; i++) {
if (test_bit(i, toclear))
- mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
+ mce_wrmsrl(msr_ops.status(i), 0);
}
}
@@ -994,11 +1038,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int i;
int worst = 0;
int severity;
+
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
*/
- int order;
+ int order = -1;
/*
* If no_way_out gets set, there is no safe way to recover from this
* MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
@@ -1012,7 +1057,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
- int lmce = 0;
+
+ /*
+ * MCEs are always local on AMD. On Intel, whether an MCE is local is
+ * determined by MCG_STATUS_LMCES.
+ */
+ int lmce = 1;
/* If this CPU is offline, just bail out. */
if (cpu_is_offline(smp_processor_id())) {
@@ -1051,19 +1101,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
kill_it = 1;
/*
- * Check if this MCE is signaled to only this logical processor
+ * Check if this MCE is signaled to only this logical processor,
+ * on Intel only.
*/
- if (m.mcgstatus & MCG_STATUS_LMCES)
- lmce = 1;
- else {
- /*
- * Go through all the banks in exclusion of the other CPUs.
- * This way we don't report duplicated events on shared banks
- * because the first one to see it will clear it.
- * If this is a Local MCE, then no need to perform rendezvous.
- */
+ if (m.cpuvendor == X86_VENDOR_INTEL)
+ lmce = m.mcgstatus & MCG_STATUS_LMCES;
+
+ /*
+ * Go through all banks in exclusion of the other CPUs. This way we
+ * don't report duplicated events on shared banks because the first one
+ * to see it will clear it. If this is a Local MCE, then no need to
+ * perform rendezvous.
+ */
+ if (!lmce)
order = mce_start(&no_way_out);
- }
for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear);
@@ -1076,7 +1127,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
m.addr = 0;
m.bank = i;
- m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
+ m.status = mce_rdmsrl(msr_ops.status(i));
if ((m.status & MCI_STATUS_VAL) == 0)
continue;
@@ -1420,7 +1471,6 @@ static void __mcheck_cpu_init_generic(void)
enum mcp_flags m_fl = 0;
mce_banks_t all_banks;
u64 cap;
- int i;
if (!mca_cfg.bootlog)
m_fl = MCP_DONTLOG;
@@ -1436,14 +1486,19 @@ static void __mcheck_cpu_init_generic(void)
rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+}
+
+static void __mcheck_cpu_init_clear_banks(void)
+{
+ int i;
for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (!b->init)
continue;
- wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
- wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
+ wrmsrl(msr_ops.ctl(i), b->ctl);
+ wrmsrl(msr_ops.status(i), 0);
}
}
@@ -1495,7 +1550,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
- if (c->x86 <= 17 && cfg->bootlog < 0) {
+ if (c->x86 < 17 && cfg->bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
@@ -1628,11 +1683,19 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
break;
case X86_VENDOR_AMD: {
- u32 ebx = cpuid_ebx(0x80000007);
+ mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
+ mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
+ mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
- mce_flags.overflow_recov = !!(ebx & BIT(0));
- mce_flags.succor = !!(ebx & BIT(1));
- mce_flags.smca = !!(ebx & BIT(3));
+ /*
+ * Install proper ops for Scalable MCA enabled processors
+ */
+ if (mce_flags.smca) {
+ msr_ops.ctl = smca_ctl_reg;
+ msr_ops.status = smca_status_reg;
+ msr_ops.addr = smca_addr_reg;
+ msr_ops.misc = smca_misc_reg;
+ }
mce_amd_feature_init(c);
break;
@@ -1717,6 +1780,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
+ __mcheck_cpu_init_clear_banks();
__mcheck_cpu_init_timer();
}
@@ -2082,7 +2146,7 @@ static void mce_disable_error_reporting(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), 0);
+ wrmsrl(msr_ops.ctl(i), 0);
}
return;
}
@@ -2121,6 +2185,7 @@ static void mce_syscore_resume(void)
{
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
+ __mcheck_cpu_init_clear_banks();
}
static struct syscore_ops mce_syscore_ops = {
@@ -2138,6 +2203,7 @@ static void mce_cpu_restart(void *data)
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
__mcheck_cpu_init_generic();
+ __mcheck_cpu_init_clear_banks();
__mcheck_cpu_init_timer();
}
@@ -2413,7 +2479,7 @@ static void mce_reenable_cpu(void *h)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
+ wrmsrl(msr_ops.ctl(i), b->ctl);
}
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9d656fd436ef..10b0661651e0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -54,14 +54,6 @@
/* Threshold LVT offset is at MSR0xC0000410[15:12] */
#define SMCA_THR_LVT_OFF 0xF000
-/*
- * OS is required to set the MCAX bit to acknowledge that it is now using the
- * new MSR ranges and new registers under each bank. It also means that the OS
- * will configure deferred errors in the new MCx_CONFIG register. If the bit is
- * not set, uncorrectable errors will cause a system panic.
- */
-#define SMCA_MCAX_EN_OFF 0x1
-
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -333,7 +325,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
/* Fall back to method we used for older processors: */
switch (block) {
case 0:
- addr = MSR_IA32_MCx_MISC(bank);
+ addr = msr_ops.misc(bank);
break;
case 1:
offset = ((low & MASK_BLKPTR_LO) >> 21);
@@ -351,6 +343,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
{
unsigned int cpu = smp_processor_id();
+ u32 smca_low, smca_high, smca_addr;
struct threshold_block b;
int new;
@@ -369,24 +362,49 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
b.interrupt_enable = 1;
- if (mce_flags.smca) {
- u32 smca_low, smca_high;
- u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+ if (!mce_flags.smca) {
+ new = (misc_high & MASK_LVTOFF_HI) >> 20;
+ goto set_offset;
+ }
- if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
- smca_high |= SMCA_MCAX_EN_OFF;
- wrmsr(smca_addr, smca_low, smca_high);
- }
+ smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
- /* Gather LVT offset for thresholding: */
- if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
- goto out;
+ if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (bit 0 in the high portion of the MSR).
+ */
+ smca_high |= BIT(0);
- new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
- } else {
- new = (misc_high & MASK_LVTOFF_HI) >> 20;
+ /*
+ * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR}
+ * registers with the option of additionally logging to
+ * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set.
+ *
+ * This bit is usually set by BIOS to retain the old behavior
+ * for OSes that don't use the new registers. Linux supports the
+ * new registers so let's disable that additional logging here.
+ *
+ * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high
+ * portion of the MSR).
+ */
+ smca_high &= ~BIT(2);
+
+ wrmsr(smca_addr, smca_low, smca_high);
}
+ /* Gather LVT offset for thresholding: */
+ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
+ goto out;
+
+ new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+
+set_offset:
offset = setup_APIC_mce_threshold(offset, new);
if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt))
@@ -430,12 +448,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
-static void __log_error(unsigned int bank, bool threshold_err, u64 misc)
+static void
+__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
{
+ u32 msr_status = msr_ops.status(bank);
+ u32 msr_addr = msr_ops.addr(bank);
struct mce m;
u64 status;
- rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+ WARN_ON_ONCE(deferred_err && threshold_err);
+
+ if (deferred_err && mce_flags.smca) {
+ msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank);
+ msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
+ }
+
+ rdmsrl(msr_status, status);
+
if (!(status & MCI_STATUS_VAL))
return;
@@ -448,10 +477,11 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc)
m.misc = misc;
if (m.status & MCI_STATUS_ADDRV)
- rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr);
+ rdmsrl(msr_addr, m.addr);
mce_log(&m);
- wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+
+ wrmsrl(msr_status, 0);
}
static inline void __smp_deferred_error_interrupt(void)
@@ -479,17 +509,21 @@ asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
/* APIC interrupt handler for deferred errors */
static void amd_deferred_error_interrupt(void)
{
- u64 status;
unsigned int bank;
+ u32 msr_status;
+ u64 status;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
- rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+ msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
+ : msr_ops.status(bank);
+
+ rdmsrl(msr_status, status);
if (!(status & MCI_STATUS_VAL) ||
!(status & MCI_STATUS_DEFERRED))
continue;
- __log_error(bank, false, 0);
+ __log_error(bank, true, false, 0);
break;
}
}
@@ -544,7 +578,7 @@ static void amd_threshold_interrupt(void)
return;
log:
- __log_error(bank, true, ((u64)high << 32) | low);
+ __log_error(bank, false, true, ((u64)high << 32) | low);
}
/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index ab0adc0fa5db..a9b31eb815f2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -535,6 +535,15 @@ static void native_machine_emergency_restart(void)
mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
*((unsigned short *)__va(0x472)) = mode;
+ /*
+ * If an EFI capsule has been registered with the firmware then
+ * override the reboot= parameter.
+ */
+ if (efi_capsule_pending(NULL)) {
+ pr_info("EFI capsule is pending, forcing EFI reboot.\n");
+ reboot_type = BOOT_EFI;
+ }
+
for (;;) {
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 548ddf7d6fd2..3e84ef16f657 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -248,18 +248,17 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
if (config_enabled(CONFIG_X86_64))
sp -= 128;
- if (!onsigstack) {
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (current->sas_ss_size)
- sp = current->sas_ss_sp + current->sas_ss_size;
- } else if (config_enabled(CONFIG_X86_32) &&
- (regs->ss & 0xffff) != __USER_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer) {
- /* This is the legacy signal stack switching. */
- sp = (unsigned long) ka->sa.sa_restorer;
- }
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ if (sas_ss_flags(sp) == 0)
+ sp = current->sas_ss_sp + current->sas_ss_size;
+ } else if (config_enabled(CONFIG_X86_32) &&
+ !onsigstack &&
+ (regs->ss & 0xffff) != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer) {
+ /* This is the legacy signal stack switching. */
+ sp = (unsigned long) ka->sa.sa_restorer;
}
if (fpu->fpstate_active) {
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
index 5da924bbf0a0..623965e86b65 100644
--- a/arch/x86/kernel/sysfb_efi.c
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -68,6 +68,21 @@ struct efifb_dmi_info efifb_dmi_list[] = {
[M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
};
+void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
+{ /* Fill @si's lfb_* fields from the efifb_dmi_list[] entry whose optname equals @opt. */
+ int i;
+
+ for (i = 0; i < M_UNKNOWN; i++) { /* the [M_UNKNOWN] slot itself is not scanned */
+ if (efifb_dmi_list[i].base != 0 && /* entries with a zero base are skipped */
+ !strcmp(opt, efifb_dmi_list[i].optname)) {
+ si->lfb_base = efifb_dmi_list[i].base;
+ si->lfb_linelength = efifb_dmi_list[i].stride;
+ si->lfb_width = efifb_dmi_list[i].width;
+ si->lfb_height = efifb_dmi_list[i].height;
+ } /* no break: later entries with the same optname would overwrite @si */
+ }
+}
+
#define choose_value(dmivalue, fwvalue, field, flags) ({ \
typeof(fwvalue) _ret_ = fwvalue; \
if ((flags) & (field)) \
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index bf4db6eaec8f..bd074151bfd6 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -578,7 +578,7 @@ static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
riprel_post_xol(auprobe, regs);
}
-static struct uprobe_xol_ops default_xol_ops = {
+static const struct uprobe_xol_ops default_xol_ops = {
.pre_xol = default_pre_xol_op,
.post_xol = default_post_xol_op,
.abort = default_abort_op,
@@ -695,7 +695,7 @@ static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
0, insn->immediate.nbytes);
}
-static struct uprobe_xol_ops branch_xol_ops = {
+static const struct uprobe_xol_ops branch_xol_ops = {
.emulate = branch_emulate_op,
.post_xol = branch_post_xol_op,
};
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0f6294376fbd..a2f24af3c999 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5110,13 +5110,17 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
{
+ register void *__sp asm(_ASM_SP);
ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+
if (!(ctxt->d & ByteOp))
fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+
asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
: "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
- [fastop]"+S"(fop)
+ [fastop]"+S"(fop), "+r"(__sp)
: "c"(ctxt->src2.val));
+
ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
if (!fop) /* exception is returned in fop variable */
return emulate_de(ctxt);
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index be110efa0096..bf2c6074efd2 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -29,8 +29,10 @@
* there is contention on the semaphore.
*
* %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
+ * registers (%eax, %edx and %ecx) except %eax, which is either a return
+ * value or just gets clobbered. The same is true for %edx, so make sure GCC
+ * reloads it after the slow path by making it hold a temporary; see
+ * ____down_write() for an example.
*/
#define save_common_regs \
@@ -106,6 +108,16 @@ ENTRY(call_rwsem_down_write_failed)
ret
ENDPROC(call_rwsem_down_write_failed)
+ENTRY(call_rwsem_down_write_failed_killable) /* register-preserving wrapper around rwsem_down_write_failed_killable() */
+ FRAME_BEGIN
+ save_common_regs /* C-clobbered registers other than the return register, see save_common_regs */
+ movq %rax,%rdi /* semaphore pointer arrives in %rax; hand it to the C function in %rdi */
+ call rwsem_down_write_failed_killable
+ restore_common_regs
+ FRAME_END
+ ret
+ENDPROC(call_rwsem_down_write_failed_killable)
+
ENTRY(call_rwsem_wake)
FRAME_BEGIN
/* do nothing if still outstanding active readers */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 01be9ec3bf79..a1f0e1d0ddc2 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1125,8 +1125,14 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
int primary)
{
- if (cpa->pgd)
+ if (cpa->pgd) {
+ /*
+ * Right now, we only execute this code path when mapping
+ * the EFI virtual memory map regions, no other users
+ * provide a ->pgd value. This may change in the future.
+ */
return populate_pgd(cpa, vaddr);
+ }
/*
* Ignore all non primary paths.
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 994a7df84a7b..f93545e7dc54 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -54,10 +54,6 @@
#include <asm/rtc.h>
#include <asm/uv/uv.h>
-#define EFI_DEBUG
-
-struct efi_memory_map memmap;
-
static struct efi efi_phys __initdata;
static efi_system_table_t efi_systab __initdata;
@@ -119,11 +115,10 @@ void efi_get_time(struct timespec *now)
void __init efi_find_mirror(void)
{
- void *p;
+ efi_memory_desc_t *md;
u64 mirror_size = 0, total_size = 0;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
+ for_each_efi_memory_desc(md) {
unsigned long long start = md->phys_addr;
unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
@@ -146,10 +141,9 @@ void __init efi_find_mirror(void)
static void __init do_add_efi_memmap(void)
{
- void *p;
+ efi_memory_desc_t *md;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
+ for_each_efi_memory_desc(md) {
unsigned long long start = md->phys_addr;
unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
int e820_type;
@@ -209,47 +203,47 @@ int __init efi_memblock_x86_reserve_range(void)
#else
pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif
- memmap.phys_map = pmap;
- memmap.nr_map = e->efi_memmap_size /
+ efi.memmap.phys_map = pmap;
+ efi.memmap.nr_map = e->efi_memmap_size /
e->efi_memdesc_size;
- memmap.desc_size = e->efi_memdesc_size;
- memmap.desc_version = e->efi_memdesc_version;
+ efi.memmap.desc_size = e->efi_memdesc_size;
+ efi.memmap.desc_version = e->efi_memdesc_version;
- memblock_reserve(pmap, memmap.nr_map * memmap.desc_size);
+ WARN(efi.memmap.desc_version != 1,
+ "Unexpected EFI_MEMORY_DESCRIPTOR version %ld",
+ efi.memmap.desc_version);
- efi.memmap = &memmap;
+ memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size);
return 0;
}
void __init efi_print_memmap(void)
{
-#ifdef EFI_DEBUG
efi_memory_desc_t *md;
- void *p;
- int i;
+ int i = 0;
- for (p = memmap.map, i = 0;
- p < memmap.map_end;
- p += memmap.desc_size, i++) {
+ for_each_efi_memory_desc(md) {
char buf[64];
- md = p;
pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n",
- i, efi_md_typeattr_format(buf, sizeof(buf), md),
+ i++, efi_md_typeattr_format(buf, sizeof(buf), md),
md->phys_addr,
md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1,
(md->num_pages >> (20 - EFI_PAGE_SHIFT)));
}
-#endif /* EFI_DEBUG */
}
void __init efi_unmap_memmap(void)
{
+ unsigned long size;
+
clear_bit(EFI_MEMMAP, &efi.flags);
- if (memmap.map) {
- early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size);
- memmap.map = NULL;
+
+ size = efi.memmap.nr_map * efi.memmap.desc_size;
+ if (efi.memmap.map) {
+ early_memunmap(efi.memmap.map, size);
+ efi.memmap.map = NULL;
}
}
@@ -352,8 +346,6 @@ static int __init efi_systab_init(void *phys)
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff);
- set_bit(EFI_SYSTEM_TABLES, &efi.flags);
-
return 0;
}
@@ -440,17 +432,22 @@ static int __init efi_runtime_init(void)
static int __init efi_memmap_init(void)
{
+ unsigned long addr, size;
+
if (efi_enabled(EFI_PARAVIRT))
return 0;
/* Map the EFI memory map */
- memmap.map = early_memremap((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size);
- if (memmap.map == NULL) {
+ size = efi.memmap.nr_map * efi.memmap.desc_size;
+ addr = (unsigned long)efi.memmap.phys_map;
+
+ efi.memmap.map = early_memremap(addr, size);
+ if (efi.memmap.map == NULL) {
pr_err("Could not map the memory map!\n");
return -ENOMEM;
}
- memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
+ efi.memmap.map_end = efi.memmap.map + size;
if (add_efi_memmap)
do_add_efi_memmap();
@@ -552,12 +549,9 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
void __init runtime_code_page_mkexec(void)
{
efi_memory_desc_t *md;
- void *p;
/* Make EFI runtime service code area executable */
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
-
+ for_each_efi_memory_desc(md) {
if (md->type != EFI_RUNTIME_SERVICES_CODE)
continue;
@@ -604,12 +598,10 @@ void __init old_map_region(efi_memory_desc_t *md)
/* Merge contiguous regions of the same type and attribute */
static void __init efi_merge_regions(void)
{
- void *p;
efi_memory_desc_t *md, *prev_md = NULL;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ for_each_efi_memory_desc(md) {
u64 prev_size;
- md = p;
if (!prev_md) {
prev_md = md;
@@ -651,30 +643,31 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
static void __init save_runtime_map(void)
{
#ifdef CONFIG_KEXEC_CORE
+ unsigned long desc_size;
efi_memory_desc_t *md;
- void *tmp, *p, *q = NULL;
+ void *tmp, *q = NULL;
int count = 0;
if (efi_enabled(EFI_OLD_MEMMAP))
return;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
+ desc_size = efi.memmap.desc_size;
+ for_each_efi_memory_desc(md) {
if (!(md->attribute & EFI_MEMORY_RUNTIME) ||
(md->type == EFI_BOOT_SERVICES_CODE) ||
(md->type == EFI_BOOT_SERVICES_DATA))
continue;
- tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL);
+ tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL);
if (!tmp)
goto out;
q = tmp;
- memcpy(q + count * memmap.desc_size, md, memmap.desc_size);
+ memcpy(q + count * desc_size, md, desc_size);
count++;
}
- efi_runtime_map_setup(q, count, memmap.desc_size);
+ efi_runtime_map_setup(q, count, desc_size);
return;
out:
@@ -714,10 +707,10 @@ static inline void *efi_map_next_entry_reverse(void *entry)
{
/* Initial call */
if (!entry)
- return memmap.map_end - memmap.desc_size;
+ return efi.memmap.map_end - efi.memmap.desc_size;
- entry -= memmap.desc_size;
- if (entry < memmap.map)
+ entry -= efi.memmap.desc_size;
+ if (entry < efi.memmap.map)
return NULL;
return entry;
@@ -759,10 +752,10 @@ static void *efi_map_next_entry(void *entry)
/* Initial call */
if (!entry)
- return memmap.map;
+ return efi.memmap.map;
- entry += memmap.desc_size;
- if (entry >= memmap.map_end)
+ entry += efi.memmap.desc_size;
+ if (entry >= efi.memmap.map_end)
return NULL;
return entry;
@@ -776,8 +769,11 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
{
void *p, *new_memmap = NULL;
unsigned long left = 0;
+ unsigned long desc_size;
efi_memory_desc_t *md;
+ desc_size = efi.memmap.desc_size;
+
p = NULL;
while ((p = efi_map_next_entry(p))) {
md = p;
@@ -792,7 +788,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
efi_map_region(md);
get_systab_virt_addr(md);
- if (left < memmap.desc_size) {
+ if (left < desc_size) {
new_memmap = realloc_pages(new_memmap, *pg_shift);
if (!new_memmap)
return NULL;
@@ -801,10 +797,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
(*pg_shift)++;
}
- memcpy(new_memmap + (*count * memmap.desc_size), md,
- memmap.desc_size);
+ memcpy(new_memmap + (*count * desc_size), md, desc_size);
- left -= memmap.desc_size;
+ left -= desc_size;
(*count)++;
}
@@ -816,7 +811,6 @@ static void __init kexec_enter_virtual_mode(void)
#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
unsigned int num_pages;
- void *p;
efi.systab = NULL;
@@ -840,8 +834,7 @@ static void __init kexec_enter_virtual_mode(void)
* Map efi regions which were passed via setup_data. The virt_addr is a
* fixed addr which was used in first kernel of a kexec boot.
*/
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
+ for_each_efi_memory_desc(md) {
efi_map_region_fixed(md); /* FIXME: add error handling */
get_systab_virt_addr(md);
}
@@ -850,10 +843,10 @@ static void __init kexec_enter_virtual_mode(void)
BUG_ON(!efi.systab);
- num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE);
+ num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE);
num_pages >>= PAGE_SHIFT;
- if (efi_setup_page_tables(memmap.phys_map, num_pages)) {
+ if (efi_setup_page_tables(efi.memmap.phys_map, num_pages)) {
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -937,16 +930,16 @@ static void __init __efi_enter_virtual_mode(void)
if (efi_is_native()) {
status = phys_efi_set_virtual_address_map(
- memmap.desc_size * count,
- memmap.desc_size,
- memmap.desc_version,
+ efi.memmap.desc_size * count,
+ efi.memmap.desc_size,
+ efi.memmap.desc_version,
(efi_memory_desc_t *)__pa(new_memmap));
} else {
status = efi_thunk_set_virtual_address_map(
efi_phys.set_virtual_address_map,
- memmap.desc_size * count,
- memmap.desc_size,
- memmap.desc_version,
+ efi.memmap.desc_size * count,
+ efi.memmap.desc_size,
+ efi.memmap.desc_version,
(efi_memory_desc_t *)__pa(new_memmap));
}
@@ -1011,13 +1004,11 @@ void __init efi_enter_virtual_mode(void)
u32 efi_mem_type(unsigned long phys_addr)
{
efi_memory_desc_t *md;
- void *p;
if (!efi_enabled(EFI_MEMMAP))
return 0;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
+ for_each_efi_memory_desc(md) {
if ((md->phys_addr <= phys_addr) &&
(phys_addr < (md->phys_addr +
(md->num_pages << EFI_PAGE_SHIFT))))
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 49e4dd4a1f58..6e7242be1c87 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -55,14 +55,12 @@ struct efi_scratch efi_scratch;
static void __init early_code_mapping_set_exec(int executable)
{
efi_memory_desc_t *md;
- void *p;
if (!(__supported_pte_mask & _PAGE_NX))
return;
/* Make EFI service code area executable */
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
+ for_each_efi_memory_desc(md) {
if (md->type == EFI_RUNTIME_SERVICES_CODE ||
md->type == EFI_BOOT_SERVICES_CODE)
efi_set_executable(md, executable);
@@ -253,7 +251,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* Map all of RAM so that we can access arguments in the 1:1
* mapping when making EFI runtime calls.
*/
- for_each_efi_memory_desc(&memmap, md) {
+ for_each_efi_memory_desc(md) {
if (md->type != EFI_CONVENTIONAL_MEMORY &&
md->type != EFI_LOADER_DATA &&
md->type != EFI_LOADER_CODE)
@@ -398,7 +396,6 @@ void __init efi_runtime_update_mappings(void)
unsigned long pfn;
pgd_t *pgd = efi_pgd;
efi_memory_desc_t *md;
- void *p;
if (efi_enabled(EFI_OLD_MEMMAP)) {
if (__supported_pte_mask & _PAGE_NX)
@@ -409,9 +406,8 @@ void __init efi_runtime_update_mappings(void)
if (!efi_enabled(EFI_NX_PE_DATA))
return;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ for_each_efi_memory_desc(md) {
unsigned long pf = 0;
- md = p;
if (!(md->attribute & EFI_MEMORY_RUNTIME))
continue;
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index ab50ada1d56e..097cb09d917b 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -195,10 +195,9 @@ static bool can_free_region(u64 start, u64 size)
*/
void __init efi_reserve_boot_services(void)
{
- void *p;
+ efi_memory_desc_t *md;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
+ for_each_efi_memory_desc(md) {
u64 start = md->phys_addr;
u64 size = md->num_pages << EFI_PAGE_SHIFT;
bool already_reserved;
@@ -250,10 +249,9 @@ void __init efi_reserve_boot_services(void)
void __init efi_free_boot_services(void)
{
- void *p;
+ efi_memory_desc_t *md;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
+ for_each_efi_memory_desc(md) {
unsigned long long start = md->phys_addr;
unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index df280da34825..d957d5f21a86 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -1,4 +1,4 @@
-config AMD_MCE_INJ
+config MCE_AMD_INJ
tristate "Simple MCE injection interface for AMD processors"
depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB
default n
diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile
index dd2c98b84037..5f94546db280 100644
--- a/arch/x86/ras/Makefile
+++ b/arch/x86/ras/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o
+obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index 9e02dcaef683..e69f4701a076 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -290,14 +290,33 @@ static void do_inject(void)
wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS,
(u32)mcg_status, (u32)(mcg_status >> 32));
- wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b),
- (u32)i_mce.status, (u32)(i_mce.status >> 32));
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (inj_type == DFR_INT_INJ) {
+ wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b),
+ (u32)i_mce.status, (u32)(i_mce.status >> 32));
+
+ wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b),
+ (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
+ } else {
+ wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b),
+ (u32)i_mce.status, (u32)(i_mce.status >> 32));
+
+ wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b),
+ (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
+ }
+
+ wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b),
+ (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
+ } else {
+ wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b),
+ (u32)i_mce.status, (u32)(i_mce.status >> 32));
- wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b),
- (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
+ wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b),
+ (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
- wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b),
- (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
+ wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b),
+ (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
+ }
toggle_hw_mce_inject(cpu, false);
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index b56855a1382a..28cf4c5d65ef 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -22,6 +22,7 @@ generic-y += mm-arch-hooks.h
generic-y += percpu.h
generic-y += preempt.h
generic-y += resource.h
+generic-y += rwsem.h
generic-y += sections.h
generic-y += siginfo.h
generic-y += statfs.h
diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h
deleted file mode 100644
index 249619e7e7f2..000000000000
--- a/arch/xtensa/include/asm/rwsem.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * include/asm-xtensa/rwsem.h
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Largely copied from include/asm-ppc/rwsem.h
- *
- * Copyright (C) 2001 - 2005 Tensilica Inc.
- */
-
-#ifndef _XTENSA_RWSEM_H
-#define _XTENSA_RWSEM_H
-
-#ifndef _LINUX_RWSEM_H
-#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
-#endif
-
-#define RWSEM_UNLOCKED_VALUE 0x00000000
-#define RWSEM_ACTIVE_BIAS 0x00000001
-#define RWSEM_ACTIVE_MASK 0x0000ffff
-#define RWSEM_WAITING_BIAS (-0x00010000)
-#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-
-/*
- * lock for reading
- */
-static inline void __down_read(struct rw_semaphore *sem)
-{
- if (atomic_add_return(1,(atomic_t *)(&sem->count)) > 0)
- smp_wmb();
- else
- rwsem_down_read_failed(sem);
-}
-
-static inline int __down_read_trylock(struct rw_semaphore *sem)
-{
- int tmp;
-
- while ((tmp = sem->count) >= 0) {
- if (tmp == cmpxchg(&sem->count, tmp,
- tmp + RWSEM_ACTIVE_READ_BIAS)) {
- smp_wmb();
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * lock for writing
- */
-static inline void __down_write(struct rw_semaphore *sem)
-{
- int tmp;
-
- tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS,
- (atomic_t *)(&sem->count));
- if (tmp == RWSEM_ACTIVE_WRITE_BIAS)
- smp_wmb();
- else
- rwsem_down_write_failed(sem);
-}
-
-static inline int __down_write_trylock(struct rw_semaphore *sem)
-{
- int tmp;
-
- tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
- RWSEM_ACTIVE_WRITE_BIAS);
- smp_wmb();
- return tmp == RWSEM_UNLOCKED_VALUE;
-}
-
-/*
- * unlock after reading
- */
-static inline void __up_read(struct rw_semaphore *sem)
-{
- int tmp;
-
- smp_wmb();
- tmp = atomic_sub_return(1,(atomic_t *)(&sem->count));
- if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)
- rwsem_wake(sem);
-}
-
-/*
- * unlock after writing
- */
-static inline void __up_write(struct rw_semaphore *sem)
-{
- smp_wmb();
- if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
- (atomic_t *)(&sem->count)) < 0)
- rwsem_wake(sem);
-}
-
-/*
- * implement atomic add functionality
- */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
-{
- atomic_add(delta, (atomic_t *)(&sem->count));
-}
-
-/*
- * downgrade write lock to read lock
- */
-static inline void __downgrade_write(struct rw_semaphore *sem)
-{
- int tmp;
-
- smp_wmb();
- tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count));
- if (tmp < 0)
- rwsem_downgrade_wake(sem);
-}
-
-/*
- * implement exchange and add functionality
- */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
-{
- smp_mb();
- return atomic_add_return(delta, (atomic_t *)(&sem->count));
-}
-
-#endif /* _XTENSA_RWSEM_H */
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
callchain_trace, NULL, entry);
}
void perf_callchain_user(struct perf_callchain_entry *entry,
struct pt_regs *regs)
{
- xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+ xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
callchain_trace, entry);
}
diff --git a/block/blk-map.c b/block/blk-map.c
index a54f0543b956..b9f88b7751fb 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -9,24 +9,6 @@
#include "blk.h"
-static bool iovec_gap_to_prv(struct request_queue *q,
- struct iovec *prv, struct iovec *cur)
-{
- unsigned long prev_end;
-
- if (!queue_virt_boundary(q))
- return false;
-
- if (prv->iov_base == NULL && prv->iov_len == 0)
- /* prv is not set - don't check */
- return false;
-
- prev_end = (unsigned long)(prv->iov_base + prv->iov_len);
-
- return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) ||
- prev_end & queue_virt_boundary(q));
-}
-
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio)
{
@@ -125,31 +107,18 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data,
const struct iov_iter *iter, gfp_t gfp_mask)
{
- struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
- bool copy = (q->dma_pad_mask & iter->count) || map_data;
+ bool copy = false;
+ unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
int ret;
- if (!iter || !iter->count)
- return -EINVAL;
-
- iov_for_each(iov, i, *iter) {
- unsigned long uaddr = (unsigned long) iov.iov_base;
-
- if (!iov.iov_len)
- return -EINVAL;
-
- /*
- * Keep going so we check length of all segments
- */
- if ((uaddr & queue_dma_alignment(q)) ||
- iovec_gap_to_prv(q, &prv, &iov))
- copy = true;
-
- prv.iov_base = iov.iov_base;
- prv.iov_len = iov.iov_len;
- }
+ if (map_data)
+ copy = true;
+ else if (iov_iter_alignment(iter) & align)
+ copy = true;
+ else if (queue_virt_boundary(q))
+ copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);
i = *iter;
do {
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index b86883aedca1..7d4acc449233 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1776,6 +1776,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver,
static int do_test_rsa(struct crypto_akcipher *tfm,
struct akcipher_testvec *vecs)
{
+ char *xbuf[XBUFSIZE];
struct akcipher_request *req;
void *outbuf_enc = NULL;
void *outbuf_dec = NULL;
@@ -1784,9 +1785,12 @@ static int do_test_rsa(struct crypto_akcipher *tfm,
int err = -ENOMEM;
struct scatterlist src, dst, src_tab[2];
+ if (testmgr_alloc_buf(xbuf))
+ return err;
+
req = akcipher_request_alloc(tfm, GFP_KERNEL);
if (!req)
- return err;
+ goto free_xbuf;
init_completion(&result.completion);
@@ -1804,9 +1808,14 @@ static int do_test_rsa(struct crypto_akcipher *tfm,
if (!outbuf_enc)
goto free_req;
+ if (WARN_ON(vecs->m_size > PAGE_SIZE))
+ goto free_all;
+
+ memcpy(xbuf[0], vecs->m, vecs->m_size);
+
sg_init_table(src_tab, 2);
- sg_set_buf(&src_tab[0], vecs->m, 8);
- sg_set_buf(&src_tab[1], vecs->m + 8, vecs->m_size - 8);
+ sg_set_buf(&src_tab[0], xbuf[0], 8);
+ sg_set_buf(&src_tab[1], xbuf[0] + 8, vecs->m_size - 8);
sg_init_one(&dst, outbuf_enc, out_len_max);
akcipher_request_set_crypt(req, src_tab, &dst, vecs->m_size,
out_len_max);
@@ -1825,7 +1834,7 @@ static int do_test_rsa(struct crypto_akcipher *tfm,
goto free_all;
}
/* verify that encrypted message is equal to expected */
- if (memcmp(vecs->c, sg_virt(req->dst), vecs->c_size)) {
+ if (memcmp(vecs->c, outbuf_enc, vecs->c_size)) {
pr_err("alg: rsa: encrypt test failed. Invalid output\n");
err = -EINVAL;
goto free_all;
@@ -1840,7 +1849,13 @@ static int do_test_rsa(struct crypto_akcipher *tfm,
err = -ENOMEM;
goto free_all;
}
- sg_init_one(&src, vecs->c, vecs->c_size);
+
+ if (WARN_ON(vecs->c_size > PAGE_SIZE))
+ goto free_all;
+
+ memcpy(xbuf[0], vecs->c, vecs->c_size);
+
+ sg_init_one(&src, xbuf[0], vecs->c_size);
sg_init_one(&dst, outbuf_dec, out_len_max);
init_completion(&result.completion);
akcipher_request_set_crypt(req, &src, &dst, vecs->c_size, out_len_max);
@@ -1867,6 +1882,8 @@ free_all:
kfree(outbuf_enc);
free_req:
akcipher_request_free(req);
+free_xbuf:
+ testmgr_free_buf(xbuf);
return err;
}
diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 5c79526245c2..a0380338946a 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -13,6 +13,7 @@
#ifndef _REGMAP_INTERNAL_H
#define _REGMAP_INTERNAL_H
+#include <linux/device.h>
#include <linux/regmap.h>
#include <linux/fs.h>
#include <linux/list.h>
diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c
index 7526906ca080..5189fd6182f6 100644
--- a/drivers/base/regmap/regmap-mmio.c
+++ b/drivers/base/regmap/regmap-mmio.c
@@ -23,6 +23,8 @@
#include <linux/regmap.h>
#include <linux/slab.h>
+#include "internal.h"
+
struct regmap_mmio_context {
void __iomem *regs;
unsigned val_bytes;
@@ -212,6 +214,7 @@ static const struct regmap_bus regmap_mmio = {
.reg_write = regmap_mmio_write,
.reg_read = regmap_mmio_read,
.free_context = regmap_mmio_free_context,
+ .val_format_endian_default = REGMAP_ENDIAN_LITTLE,
};
static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev,
@@ -245,7 +248,7 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev,
ctx->val_bytes = config->val_bits / 8;
ctx->clk = ERR_PTR(-ENODEV);
- switch (config->reg_format_endian) {
+ switch (regmap_get_val_endian(dev, &regmap_mmio, config)) {
case REGMAP_ENDIAN_DEFAULT:
case REGMAP_ENDIAN_LITTLE:
#ifdef __LITTLE_ENDIAN
diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c
index 7e58f6560399..4a36e415e938 100644
--- a/drivers/base/regmap/regmap-spmi.c
+++ b/drivers/base/regmap/regmap-spmi.c
@@ -142,7 +142,7 @@ static int regmap_spmi_ext_read(void *context,
while (val_size) {
len = min_t(size_t, val_size, 8);
- err = spmi_ext_register_readl(context, addr, val, val_size);
+ err = spmi_ext_register_readl(context, addr, val, len);
if (err)
goto err_out;
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 49768c08ac07..9b6800a79c7f 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1052,7 +1052,6 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
struct mce *m = (struct mce *)data;
struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
int ecc;
- u32 ebx = cpuid_ebx(0x80000007);
if (amd_filter_mce(m))
return NOTIFY_STOP;
@@ -1075,7 +1074,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));
- if (!!(ebx & BIT(3))) {
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
u32 low, high;
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
@@ -1094,7 +1093,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (m->status & MCI_STATUS_ADDRV)
pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);
- if (!!(ebx & BIT(3))) {
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
decode_smca_errors(m);
goto err_code;
}
@@ -1149,7 +1148,6 @@ static struct notifier_block amd_mce_dec_nb = {
static int __init mce_amd_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- u32 ebx;
if (c->x86_vendor != X86_VENDOR_AMD)
return -ENODEV;
@@ -1205,9 +1203,8 @@ static int __init mce_amd_init(void)
break;
case 0x17:
- ebx = cpuid_ebx(0x80000007);
xec_mask = 0x3f;
- if (!(ebx & BIT(3))) {
+ if (!boot_cpu_has(X86_FEATURE_SMCA)) {
printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
goto err_out;
}
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index e1670d533f97..6394152f648f 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -87,6 +87,31 @@ config EFI_RUNTIME_WRAPPERS
config EFI_ARMSTUB
bool
+config EFI_BOOTLOADER_CONTROL
+ tristate "EFI Bootloader Control"
+ depends on EFI_VARS
+ default n
+ ---help---
+ This module installs a reboot hook, such that if reboot() is
+ invoked with a string argument NNN, "NNN" is copied to the
+ "LoaderEntryOneShot" EFI variable, to be read by the
+ bootloader. If the string matches one of the boot labels
+ defined in its configuration, the bootloader will boot once
+ to that label. The "LoaderEntryRebootReason" EFI variable is
+ set with the reboot reason: "reboot" or "shutdown". The
+ bootloader reads this reboot reason and takes particular
+ action according to its policy.
+
+config EFI_CAPSULE_LOADER
+ tristate "EFI capsule loader"
+ depends on EFI
+ help
+ This option exposes a loader interface "/dev/efi_capsule_loader" for
+ users to load EFI capsules. This driver requires working runtime
+ capsule support in the firmware, which many OEMs do not provide.
+
+ Most users should say N.
+
endmenu
config UEFI_CPER
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index 62e654f255f4..a219640f881f 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -9,7 +9,8 @@
#
KASAN_SANITIZE_runtime-wrappers.o := n
-obj-$(CONFIG_EFI) += efi.o vars.o reboot.o
+obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o
+obj-$(CONFIG_EFI) += capsule.o
obj-$(CONFIG_EFI_VARS) += efivars.o
obj-$(CONFIG_EFI_ESRT) += esrt.o
obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o
@@ -18,7 +19,9 @@ obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o
obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o
obj-$(CONFIG_EFI_STUB) += libstub/
obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o
+obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o
arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o
obj-$(CONFIG_ARM) += $(arm-obj-y)
obj-$(CONFIG_ARM64) += $(arm-obj-y)
+obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o
diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c
index 8714f8c271ba..ef90f0c4b70a 100644
--- a/drivers/firmware/efi/arm-init.c
+++ b/drivers/firmware/efi/arm-init.c
@@ -11,17 +11,19 @@
*
*/
+#define pr_fmt(fmt) "efi: " fmt
+
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mm_types.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
#include <asm/efi.h>
-struct efi_memory_map memmap;
-
u64 efi_system_table;
static int __init is_normal_ram(efi_memory_desc_t *md)
@@ -40,7 +42,7 @@ static phys_addr_t efi_to_phys(unsigned long addr)
{
efi_memory_desc_t *md;
- for_each_efi_memory_desc(&memmap, md) {
+ for_each_efi_memory_desc(md) {
if (!(md->attribute & EFI_MEMORY_RUNTIME))
continue;
if (md->virt_addr == 0)
@@ -53,6 +55,36 @@ static phys_addr_t efi_to_phys(unsigned long addr)
return addr;
}
+static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR;
+
+static __initdata efi_config_table_type_t arch_tables[] = {
+ {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, NULL, &screen_info_table},
+ {NULL_GUID, NULL, NULL}
+};
+
+static void __init init_screen_info(void)
+{
+ struct screen_info *si;
+
+ if (screen_info_table != EFI_INVALID_TABLE_ADDR) {
+ si = early_memremap_ro(screen_info_table, sizeof(*si));
+ if (!si) {
+ pr_err("Could not map screen_info config table\n");
+ return;
+ }
+ screen_info = *si;
+ early_memunmap(si, sizeof(*si));
+
+ /* dummycon on ARM needs non-zero values for columns/lines */
+ screen_info.orig_video_cols = 80;
+ screen_info.orig_video_lines = 25;
+ }
+
+ if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI &&
+ memblock_is_map_memory(screen_info.lfb_base))
+ memblock_mark_nomap(screen_info.lfb_base, screen_info.lfb_size);
+}
+
static int __init uefi_init(void)
{
efi_char16_t *c16;
@@ -85,6 +117,8 @@ static int __init uefi_init(void)
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff);
+ efi.runtime_version = efi.systab->hdr.revision;
+
/* Show what we know for posterity */
c16 = early_memremap_ro(efi_to_phys(efi.systab->fw_vendor),
sizeof(vendor) * sizeof(efi_char16_t));
@@ -108,7 +142,8 @@ static int __init uefi_init(void)
goto out;
}
retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables,
- sizeof(efi_config_table_t), NULL);
+ sizeof(efi_config_table_t),
+ arch_tables);
early_memunmap(config_tables, table_size);
out:
@@ -143,7 +178,7 @@ static __init void reserve_regions(void)
if (efi_enabled(EFI_DBG))
pr_info("Processing EFI memory map:\n");
- for_each_efi_memory_desc(&memmap, md) {
+ for_each_efi_memory_desc(md) {
paddr = md->phys_addr;
npages = md->num_pages;
@@ -184,9 +219,9 @@ void __init efi_init(void)
efi_system_table = params.system_table;
- memmap.phys_map = params.mmap;
- memmap.map = early_memremap_ro(params.mmap, params.mmap_size);
- if (memmap.map == NULL) {
+ efi.memmap.phys_map = params.mmap;
+ efi.memmap.map = early_memremap_ro(params.mmap, params.mmap_size);
+ if (efi.memmap.map == NULL) {
/*
* If we are booting via UEFI, the UEFI memory map is the only
* description of memory we have, so there is little point in
@@ -194,28 +229,37 @@ void __init efi_init(void)
*/
panic("Unable to map EFI memory map.\n");
}
- memmap.map_end = memmap.map + params.mmap_size;
- memmap.desc_size = params.desc_size;
- memmap.desc_version = params.desc_ver;
+ efi.memmap.map_end = efi.memmap.map + params.mmap_size;
+ efi.memmap.desc_size = params.desc_size;
+ efi.memmap.desc_version = params.desc_ver;
+
+ WARN(efi.memmap.desc_version != 1,
+ "Unexpected EFI_MEMORY_DESCRIPTOR version %ld",
+ efi.memmap.desc_version);
if (uefi_init() < 0)
return;
reserve_regions();
- early_memunmap(memmap.map, params.mmap_size);
+ efi_memattr_init();
+ early_memunmap(efi.memmap.map, params.mmap_size);
- if (IS_ENABLED(CONFIG_ARM)) {
- /*
- * ARM currently does not allow ioremap_cache() to be called on
- * memory regions that are covered by struct page. So remove the
- * UEFI memory map from the linear mapping.
- */
- memblock_mark_nomap(params.mmap & PAGE_MASK,
- PAGE_ALIGN(params.mmap_size +
- (params.mmap & ~PAGE_MASK)));
- } else {
- memblock_reserve(params.mmap & PAGE_MASK,
- PAGE_ALIGN(params.mmap_size +
- (params.mmap & ~PAGE_MASK)));
- }
+ memblock_reserve(params.mmap & PAGE_MASK,
+ PAGE_ALIGN(params.mmap_size +
+ (params.mmap & ~PAGE_MASK)));
+
+ init_screen_info();
+}
+
+static int __init register_gop_device(void)
+{
+ void *pd;
+
+ if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI)
+ return 0;
+
+ pd = platform_device_register_data(NULL, "efi-framebuffer", 0,
+ &screen_info, sizeof(screen_info));
+ return PTR_ERR_OR_ZERO(pd);
}
+subsys_initcall(register_gop_device);
diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c
index 6ae21e41a429..17ccf0a8787a 100644
--- a/drivers/firmware/efi/arm-runtime.c
+++ b/drivers/firmware/efi/arm-runtime.c
@@ -42,11 +42,13 @@ static struct mm_struct efi_mm = {
static bool __init efi_virtmap_init(void)
{
efi_memory_desc_t *md;
+ bool systab_found;
efi_mm.pgd = pgd_alloc(&efi_mm);
init_new_context(NULL, &efi_mm);
- for_each_efi_memory_desc(&memmap, md) {
+ systab_found = false;
+ for_each_efi_memory_desc(md) {
phys_addr_t phys = md->phys_addr;
int ret;
@@ -64,7 +66,25 @@ static bool __init efi_virtmap_init(void)
&phys, ret);
return false;
}
+ /*
+ * If this entry covers the address of the UEFI system table,
+ * calculate and record its virtual address.
+ */
+ if (efi_system_table >= phys &&
+ efi_system_table < phys + (md->num_pages * EFI_PAGE_SIZE)) {
+ efi.systab = (void *)(unsigned long)(efi_system_table -
+ phys + md->virt_addr);
+ systab_found = true;
+ }
+ }
+ if (!systab_found) {
+ pr_err("No virtual mapping found for the UEFI System Table\n");
+ return false;
}
+
+ if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions))
+ return false;
+
return true;
}
@@ -89,26 +109,17 @@ static int __init arm_enable_runtime_services(void)
pr_info("Remapping and enabling EFI services.\n");
- mapsize = memmap.map_end - memmap.map;
- memmap.map = (__force void *)ioremap_cache(memmap.phys_map,
- mapsize);
- if (!memmap.map) {
- pr_err("Failed to remap EFI memory map\n");
- return -ENOMEM;
- }
- memmap.map_end = memmap.map + mapsize;
- efi.memmap = &memmap;
+ mapsize = efi.memmap.map_end - efi.memmap.map;
- efi.systab = (__force void *)ioremap_cache(efi_system_table,
- sizeof(efi_system_table_t));
- if (!efi.systab) {
- pr_err("Failed to remap EFI System Table\n");
+ efi.memmap.map = memremap(efi.memmap.phys_map, mapsize, MEMREMAP_WB);
+ if (!efi.memmap.map) {
+ pr_err("Failed to remap EFI memory map\n");
return -ENOMEM;
}
- set_bit(EFI_SYSTEM_TABLES, &efi.flags);
+ efi.memmap.map_end = efi.memmap.map + mapsize;
if (!efi_virtmap_init()) {
- pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n");
+ pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n");
return -ENOMEM;
}
@@ -116,8 +127,6 @@ static int __init arm_enable_runtime_services(void)
efi_native_runtime_setup();
set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
- efi.runtime_version = efi.systab->hdr.revision;
-
return 0;
}
early_initcall(arm_enable_runtime_services);
diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c
new file mode 100644
index 000000000000..c99c24bc79b0
--- /dev/null
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -0,0 +1,343 @@
+/*
+ * EFI capsule loader driver.
+ *
+ * Copyright 2015 Intel Corporation
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) "efi: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/efi.h>
+
+#define NO_FURTHER_WRITE_ACTION -1
+
+struct capsule_info {
+ bool header_obtained;
+ int reset_type;
+ long index;
+ size_t count;
+ size_t total_size;
+ struct page **pages;
+ size_t page_bytes_remain;
+};
+
+/**
+ * efi_free_all_buff_pages - free all previously allocated buffer pages
+ * @cap_info: pointer to current instance of capsule_info structure
+ *
+ * In addition to freeing buffer pages, it flags NO_FURTHER_WRITE_ACTION
+ * to cease processing data in subsequent write(2) calls until close(2)
+ * is called.
+ **/
+static void efi_free_all_buff_pages(struct capsule_info *cap_info)
+{
+ while (cap_info->index > 0)
+ __free_page(cap_info->pages[--cap_info->index]);
+
+ cap_info->index = NO_FURTHER_WRITE_ACTION;
+}
+
+/**
+ * efi_capsule_setup_info - obtain the efi capsule header in the binary and
+ * setup capsule_info structure
+ * @cap_info: pointer to current instance of capsule_info structure
+ * @kbuff: a mapped first page buffer pointer
+ * @hdr_bytes: the total received number of bytes for efi header
+ **/
+static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info,
+ void *kbuff, size_t hdr_bytes)
+{
+ efi_capsule_header_t *cap_hdr;
+ size_t pages_needed;
+ int ret;
+ void *temp_page;
+
+ /* Only process a data block that is at least as large as the efi header */
+ if (hdr_bytes < sizeof(efi_capsule_header_t))
+ return 0;
+
+ /* Reset back to the correct offset of header */
+ cap_hdr = kbuff - cap_info->count;
+ pages_needed = ALIGN(cap_hdr->imagesize, PAGE_SIZE) >> PAGE_SHIFT;
+
+ if (pages_needed == 0) {
+ pr_err("%s: pages count invalid\n", __func__);
+ return -EINVAL;
+ }
+
+ /* Check if the capsule binary is supported */
+ ret = efi_capsule_supported(cap_hdr->guid, cap_hdr->flags,
+ cap_hdr->imagesize,
+ &cap_info->reset_type);
+ if (ret) {
+ pr_err("%s: efi_capsule_supported() failed\n",
+ __func__);
+ return ret;
+ }
+
+ cap_info->total_size = cap_hdr->imagesize;
+ temp_page = krealloc(cap_info->pages,
+ pages_needed * sizeof(void *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!temp_page) {
+ pr_debug("%s: krealloc() failed\n", __func__);
+ return -ENOMEM;
+ }
+
+ cap_info->pages = temp_page;
+ cap_info->header_obtained = true;
+
+ return 0;
+}
+
+/**
+ * efi_capsule_submit_update - invoke the efi_capsule_update API once binary
+ * upload done
+ * @cap_info: pointer to current instance of capsule_info structure
+ **/
+static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info)
+{
+ int ret;
+ void *cap_hdr_temp;
+
+ cap_hdr_temp = kmap(cap_info->pages[0]);
+ if (!cap_hdr_temp) {
+ pr_debug("%s: kmap() failed\n", __func__);
+ return -EFAULT;
+ }
+
+ ret = efi_capsule_update(cap_hdr_temp, cap_info->pages);
+ kunmap(cap_info->pages[0]);
+ if (ret) {
+ pr_err("%s: efi_capsule_update() failed\n", __func__);
+ return ret;
+ }
+
+ /* Indicate capsule binary uploading is done */
+ cap_info->index = NO_FURTHER_WRITE_ACTION;
+ pr_info("%s: Successfully upload capsule file with reboot type '%s'\n",
+ __func__, !cap_info->reset_type ? "RESET_COLD" :
+ cap_info->reset_type == 1 ? "RESET_WARM" :
+ "RESET_SHUTDOWN");
+ return 0;
+}
+
+/**
+ * efi_capsule_write - store the capsule binary and pass it to
+ * efi_capsule_update() API
+ * @file: file pointer
+ * @buff: buffer pointer
+ * @count: number of bytes in @buff
+ * @offp: not used
+ *
+ * Expectation:
+ * - A user space tool should start at the beginning of capsule binary and
+ * pass data in sequentially.
+ * - Users should close and re-open this file node in order to upload more
+ * capsules.
+ * - After an error is returned, the user should close the file and restart
+ * the operation for the next try; otherwise -EIO will be returned until
+ * the file is closed.
+ * - An EFI capsule header must be located at the beginning of capsule
+ * binary file and passed in as first block data of write operation.
+ **/
+static ssize_t efi_capsule_write(struct file *file, const char __user *buff,
+ size_t count, loff_t *offp)
+{
+ int ret = 0;
+ struct capsule_info *cap_info = file->private_data;
+ struct page *page;
+ void *kbuff = NULL;
+ size_t write_byte;
+
+ if (count == 0)
+ return 0;
+
+ /* Return error while NO_FURTHER_WRITE_ACTION is flagged */
+ if (cap_info->index < 0)
+ return -EIO;
+
+ /* Only alloc a new page when previous page is full */
+ if (!cap_info->page_bytes_remain) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ pr_debug("%s: alloc_page() failed\n", __func__);
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ cap_info->pages[cap_info->index++] = page;
+ cap_info->page_bytes_remain = PAGE_SIZE;
+ }
+
+ page = cap_info->pages[cap_info->index - 1];
+
+ kbuff = kmap(page);
+ if (!kbuff) {
+ pr_debug("%s: kmap() failed\n", __func__);
+ ret = -EFAULT;
+ goto failed;
+ }
+ kbuff += PAGE_SIZE - cap_info->page_bytes_remain;
+
+ /* Copy capsule binary data from user space to kernel space buffer */
+ write_byte = min_t(size_t, count, cap_info->page_bytes_remain);
+ if (copy_from_user(kbuff, buff, write_byte)) {
+ pr_debug("%s: copy_from_user() failed\n", __func__);
+ ret = -EFAULT;
+ goto fail_unmap;
+ }
+ cap_info->page_bytes_remain -= write_byte;
+
+ /* Setup capsule binary info structure */
+ if (!cap_info->header_obtained) {
+ ret = efi_capsule_setup_info(cap_info, kbuff,
+ cap_info->count + write_byte);
+ if (ret)
+ goto fail_unmap;
+ }
+
+ cap_info->count += write_byte;
+ kunmap(page);
+
+ /* Submit the full binary to efi_capsule_update() API */
+ if (cap_info->header_obtained &&
+ cap_info->count >= cap_info->total_size) {
+ if (cap_info->count > cap_info->total_size) {
+ pr_err("%s: upload size exceeded header defined size\n",
+ __func__);
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ ret = efi_capsule_submit_update(cap_info);
+ if (ret)
+ goto failed;
+ }
+
+ return write_byte;
+
+fail_unmap:
+ kunmap(page);
+failed:
+ efi_free_all_buff_pages(cap_info);
+ return ret;
+}
+
+/**
+ * efi_capsule_flush - called by file close or file flush
+ * @file: file pointer
+ * @id: not used
+ *
+ * If a capsule is being partially uploaded then calling this function
+ * will be treated as upload termination and will free those completed
+ * buffer pages and -ECANCELED will be returned.
+ **/
+static int efi_capsule_flush(struct file *file, fl_owner_t id)
+{
+ int ret = 0;
+ struct capsule_info *cap_info = file->private_data;
+
+ if (cap_info->index > 0) {
+ pr_err("%s: capsule upload not complete\n", __func__);
+ efi_free_all_buff_pages(cap_info);
+ ret = -ECANCELED;
+ }
+
+ return ret;
+}
+
+/**
+ * efi_capsule_release - called by file close
+ * @inode: not used
+ * @file: file pointer
+ *
+ * We will not free successfully submitted pages since efi update
+ * requires data to be maintained across system reboot.
+ **/
+static int efi_capsule_release(struct inode *inode, struct file *file)
+{
+ struct capsule_info *cap_info = file->private_data;
+
+ kfree(cap_info->pages);
+ kfree(file->private_data);
+ file->private_data = NULL;
+ return 0;
+}
+
+/**
+ * efi_capsule_open - called by file open
+ * @inode: not used
+ * @file: file pointer
+ *
+ * Allocates a separate capsule_info for each file open call. This makes
+ * it possible to support multiple simultaneous opens of the file, so
+ * that a user does not need to wait for others to finish in order to
+ * upload their capsule binary.
+ **/
+static int efi_capsule_open(struct inode *inode, struct file *file)
+{
+ struct capsule_info *cap_info;
+
+ cap_info = kzalloc(sizeof(*cap_info), GFP_KERNEL);
+ if (!cap_info)
+ return -ENOMEM;
+
+ cap_info->pages = kzalloc(sizeof(void *), GFP_KERNEL);
+ if (!cap_info->pages) {
+ kfree(cap_info);
+ return -ENOMEM;
+ }
+
+ file->private_data = cap_info;
+
+ return 0;
+}
+
+static const struct file_operations efi_capsule_fops = {
+ .owner = THIS_MODULE,
+ .open = efi_capsule_open,
+ .write = efi_capsule_write,
+ .flush = efi_capsule_flush,
+ .release = efi_capsule_release,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice efi_capsule_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "efi_capsule_loader",
+ .fops = &efi_capsule_fops,
+};
+
+static int __init efi_capsule_loader_init(void)
+{
+ int ret;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ return -ENODEV;
+
+ ret = misc_register(&efi_capsule_misc);
+ if (ret)
+ pr_err("%s: Failed to register misc char file note\n",
+ __func__);
+
+ return ret;
+}
+module_init(efi_capsule_loader_init);
+
+static void __exit efi_capsule_loader_exit(void)
+{
+ misc_deregister(&efi_capsule_misc);
+}
+module_exit(efi_capsule_loader_exit);
+
+MODULE_DESCRIPTION("EFI capsule firmware binary loader");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c
new file mode 100644
index 000000000000..53b9fd2293ee
--- /dev/null
+++ b/drivers/firmware/efi/capsule.c
@@ -0,0 +1,308 @@
+/*
+ * EFI capsule support.
+ *
+ * Copyright 2013 Intel Corporation; author Matt Fleming
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ */
+
+#define pr_fmt(fmt) "efi: " fmt
+
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/highmem.h>
+#include <linux/efi.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+
+/*
+ * One entry of the scatter gather list built by efi_capsule_update().
+ * A data entry carries the number of bytes in @length and the physical
+ * address of the data page in @data. The entry that closes each list page
+ * has @length == 0 and holds in @data the physical address of the next
+ * list page, or 0 at the end of the chain.
+ */
+typedef struct {
+ u64 length;
+ u64 data;
+} efi_capsule_block_desc_t;
+
+/* Set once efi.update_capsule() has accepted a capsule. */
+static bool capsule_pending;
+/* Set by the reboot notifier; further capsule submissions are refused. */
+static bool stop_capsules;
+/* Reset type recorded for the accepted capsule(s); -1 until one is accepted. */
+static int efi_reset_type = -1;
+
+/*
+ * capsule_mutex serialises access to capsule_pending, efi_reset_type
+ * and stop_capsules.
+ */
+static DEFINE_MUTEX(capsule_mutex);
+
+/**
+ * efi_capsule_pending - has a capsule been passed to the firmware?
+ * @reset_type: if non-NULL, receives the EFI reset type when a capsule
+ *              is pending
+ *
+ * A capsule that has been handed to the firmware is only processed
+ * correctly if the system then performs a specific type of reset. When a
+ * capsule is pending, that reset type is reported through @reset_type.
+ *
+ * The state is read without taking capsule_mutex, so this function races
+ * with callers of efi_capsule_update(): if it runs while somebody else is
+ * in efi_capsule_update() but has not yet reached
+ * efi_capsule_update_locked(), it will miss the updates to
+ * capsule_pending and efi_reset_type made once
+ * efi_capsule_update_locked() completes.
+ *
+ * A non-racy use is from platform reboot code: the reboot notifier sets
+ * stop_capsules, after which efi_capsule_update_locked() refuses to send
+ * any further capsule to the firmware.
+ *
+ * Return: true if a capsule is pending, false otherwise.
+ */
+bool efi_capsule_pending(int *reset_type)
+{
+	if (capsule_pending) {
+		if (reset_type)
+			*reset_type = efi_reset_type;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Whitelist of EFI capsule flags that we support.
+ *
+ * We do not handle EFI_CAPSULE_INITIATE_RESET because that would
+ * require us to prepare the kernel for reboot. Refuse to load any
+ * capsules with that flag and any other flags that we do not know how
+ * to handle.
+ */
+#define EFI_CAPSULE_SUPPORTED_FLAG_MASK \
+ (EFI_CAPSULE_PERSIST_ACROSS_RESET | EFI_CAPSULE_POPULATE_SYSTEM_TABLE)
+
+/**
+ * efi_capsule_supported - does the firmware support the capsule?
+ * @guid: vendor guid of capsule
+ * @flags: capsule flags
+ * @size: size of capsule data
+ * @reset: the reset type required for this capsule
+ *
+ * Builds a header-only capsule description from @guid and @flags and asks
+ * the firmware, via the QueryCapsuleCapabilities() runtime service,
+ * whether it is supported, then checks that @size does not exceed the
+ * maximum capsule size the firmware reported.
+ *
+ * No attempt is made to check @reset against the reset type required
+ * by any pending capsules because of the races involved.
+ *
+ * Return: 0 if the capsule is supported, -EINVAL if @flags contains a
+ * flag outside EFI_CAPSULE_SUPPORTED_FLAG_MASK, -ENOSPC if @size is too
+ * large, or a converted EFI status code if the firmware query fails.
+ */
+int efi_capsule_supported(efi_guid_t guid, u32 flags, size_t size, int *reset)
+{
+	efi_capsule_header_t hdr;
+	efi_capsule_header_t *hdr_list[] = { &hdr };
+	efi_status_t status;
+	u64 max_size;
+
+	if (flags & ~EFI_CAPSULE_SUPPORTED_FLAG_MASK)
+		return -EINVAL;
+
+	/* Describe a capsule that consists of nothing but its header. */
+	hdr.headersize = sizeof(hdr);
+	hdr.imagesize = sizeof(hdr);
+	hdr.flags = flags;
+	memcpy(&hdr.guid, &guid, sizeof(efi_guid_t));
+
+	status = efi.query_capsule_caps(hdr_list, 1, &max_size, reset);
+	if (status != EFI_SUCCESS)
+		return efi_status_to_err(status);
+
+	return size > max_size ? -ENOSPC : 0;
+}
+EXPORT_SYMBOL_GPL(efi_capsule_supported);
+
+/*
+ * Every scatter gather list (block descriptor) page must end with a
+ * continuation pointer. The last continuation pointer of the last
+ * page must be zero to mark the end of the chain.
+ */
+#define SGLIST_PER_PAGE ((PAGE_SIZE / sizeof(efi_capsule_block_desc_t)) - 1)
+
+/*
+ * How many scatter gather list (block descriptor) pages do we need
+ * to map @count pages?
+ *
+ * Each list page describes at most SGLIST_PER_PAGE data pages, so the
+ * result is @count divided by SGLIST_PER_PAGE, rounded up.
+ */
+static inline unsigned int sg_pages_num(unsigned int count)
+{
+ return DIV_ROUND_UP(count, SGLIST_PER_PAGE);
+}
+
+/**
+ * efi_capsule_update_locked - pass a single capsule to the firmware
+ * @capsule: capsule to send to the firmware
+ * @sg_pages: array of scatter gather (block descriptor) pages
+ * @reset: the reset type required for @capsule
+ *
+ * Must be called with capsule_mutex held. Holding the mutex lets us
+ * check @reset against efi_reset_type and, if the firmware accepts the
+ * capsule, update efi_reset_type and capsule_pending atomically with
+ * respect to other submitters.
+ *
+ * The capsule is also rejected if the system is about to restart, which
+ * avoids races between efi_capsule_update() and efi_capsule_pending().
+ *
+ * Return: 0 on success, -EINVAL on a reset type conflict or when racing
+ * with reboot, otherwise the converted EFI status of UpdateCapsule().
+ */
+static int
+efi_capsule_update_locked(efi_capsule_header_t *capsule,
+			  struct page **sg_pages, int reset)
+{
+	efi_physical_addr_t sglist_phys;
+	efi_status_t status;
+
+	lockdep_assert_held(&capsule_mutex);
+
+	/*
+	 * A capsule registered earlier may require a different reset
+	 * type. Both cannot be honoured, so give up on this one.
+	 */
+	if (efi_reset_type >= 0 && efi_reset_type != reset) {
+		pr_err("Conflicting capsule reset type %d (%d).\n",
+		       reset, efi_reset_type);
+		return -EINVAL;
+	}
+
+	/*
+	 * Reboot code may already have consulted efi_capsule_pending()
+	 * (for instance to decide whether to force an EFI reboot).
+	 * Submitting a capsule now would race with that decision.
+	 */
+	if (unlikely(stop_capsules)) {
+		pr_warn("Capsule update raced with reboot, aborting.\n");
+		return -EINVAL;
+	}
+
+	/* The firmware is handed the physical address of the first list page. */
+	sglist_phys = page_to_phys(sg_pages[0]);
+
+	status = efi.update_capsule(&capsule, 1, sglist_phys);
+	if (status != EFI_SUCCESS)
+		return efi_status_to_err(status);
+
+	capsule_pending = true;
+	efi_reset_type = reset;
+
+	return 0;
+}
+
+/**
+ * efi_capsule_update - send a capsule to the firmware
+ * @capsule: capsule to send to firmware
+ * @pages: an array of capsule data pages
+ *
+ * Build a scatter gather list with EFI capsule block descriptors to
+ * map the capsule described by @capsule with its data in @pages and
+ * send it to the firmware via the UpdateCapsule() runtime service.
+ *
+ * @capsule must be a virtual mapping of the first page in @pages
+ * (@pages[0]) in the kernel address space. That is, a
+ * capsule_header_t that describes the entire contents of the capsule
+ * must be at the start of the first data page.
+ *
+ * Even though this function will validate that the firmware supports
+ * the capsule guid, users will likely want to check that
+ * efi_capsule_supported() returns 0 before calling this function
+ * because it makes it easier to print helpful error messages.
+ *
+ * If the capsule is successfully submitted to the firmware, any
+ * subsequent calls to efi_capsule_pending() will return true. @pages
+ * must not be released or modified if this function returns
+ * successfully.
+ *
+ * Callers must be prepared for this function to fail, which can
+ * happen if we raced with system reboot or if there is already a
+ * pending capsule that has a reset type that conflicts with the one
+ * required by @capsule. Do NOT use efi_capsule_pending() to detect
+ * this conflict since that would be racy. Instead, submit the capsule
+ * to efi_capsule_update() and check the return value.
+ *
+ * Return 0 on success, -ENOMEM if the scatter gather list cannot be
+ * allocated or mapped, otherwise the error from efi_capsule_supported()
+ * or a converted EFI status code.
+ */
+int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages)
+{
+	u32 imagesize = capsule->imagesize;
+	efi_guid_t guid = capsule->guid;
+	unsigned int count, sg_count;
+	u32 flags = capsule->flags;
+	struct page **sg_pages;
+	int rv, reset_type;
+	int i, j;
+
+	rv = efi_capsule_supported(guid, flags, imagesize, &reset_type);
+	if (rv)
+		return rv;
+
+	count = DIV_ROUND_UP(imagesize, PAGE_SIZE);
+	sg_count = sg_pages_num(count);
+
+	/*
+	 * kcalloc() checks the count * size multiplication for overflow.
+	 * The array must start out zeroed: the cleanup loop below relies
+	 * on NULL entries to tell which pages were never allocated.
+	 */
+	sg_pages = kcalloc(sg_count, sizeof(*sg_pages), GFP_KERNEL);
+	if (!sg_pages)
+		return -ENOMEM;
+
+	for (i = 0; i < sg_count; i++) {
+		sg_pages[i] = alloc_page(GFP_KERNEL);
+		if (!sg_pages[i]) {
+			rv = -ENOMEM;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < sg_count; i++) {
+		efi_capsule_block_desc_t *sglist;
+
+		sglist = kmap(sg_pages[i]);
+		if (!sglist) {
+			rv = -ENOMEM;
+			goto out;
+		}
+
+		/* One descriptor per data page; the final one may be short. */
+		for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) {
+			u64 sz = min_t(u64, imagesize, PAGE_SIZE);
+
+			sglist[j].length = sz;
+			sglist[j].data = page_to_phys(*pages++);
+
+			imagesize -= sz;
+			count--;
+		}
+
+		/* Continuation pointer */
+		sglist[j].length = 0;
+
+		if (i + 1 == sg_count)
+			sglist[j].data = 0;
+		else
+			sglist[j].data = page_to_phys(sg_pages[i + 1]);
+
+		kunmap(sg_pages[i]);
+	}
+
+	mutex_lock(&capsule_mutex);
+	rv = efi_capsule_update_locked(capsule, sg_pages, reset_type);
+	mutex_unlock(&capsule_mutex);
+
+out:
+	/*
+	 * The list pages are released only on failure (rv != 0). After a
+	 * successful submission they are deliberately left allocated.
+	 */
+	for (i = 0; rv && i < sg_count; i++) {
+		if (sg_pages[i])
+			__free_page(sg_pages[i]);
+	}
+
+	kfree(sg_pages);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(efi_capsule_update);
+
+/*
+ * Reboot notifier callback: set stop_capsules under capsule_mutex so that
+ * efi_capsule_update_locked() rejects any capsule submitted from now on.
+ * The @nb, @event and @cmd arguments are not used.
+ */
+static int capsule_reboot_notify(struct notifier_block *nb, unsigned long event, void *cmd)
+{
+ mutex_lock(&capsule_mutex);
+ stop_capsules = true;
+ mutex_unlock(&capsule_mutex);
+
+ return NOTIFY_DONE;
+}
+
+/* Notifier block that hooks capsule_reboot_notify() into the reboot chain. */
+static struct notifier_block capsule_reboot_nb = {
+ .notifier_call = capsule_reboot_notify,
+};
+
+/* Core initcall: register capsule_reboot_nb as a reboot notifier. */
+static int __init capsule_reboot_register(void)
+{
+ return register_reboot_notifier(&capsule_reboot_nb);
+}
+core_initcall(capsule_reboot_register);
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 3a69ed5ecfcb..05509f3aaee8 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -43,6 +43,7 @@ struct efi __read_mostly efi = {
.config_table = EFI_INVALID_TABLE_ADDR,
.esrt = EFI_INVALID_TABLE_ADDR,
.properties_table = EFI_INVALID_TABLE_ADDR,
+ .mem_attr_table = EFI_INVALID_TABLE_ADDR,
};
EXPORT_SYMBOL(efi);
@@ -256,7 +257,7 @@ subsys_initcall(efisubsys_init);
*/
int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md)
{
- struct efi_memory_map *map = efi.memmap;
+ struct efi_memory_map *map = &efi.memmap;
phys_addr_t p, e;
if (!efi_enabled(EFI_MEMMAP)) {
@@ -338,6 +339,7 @@ static __initdata efi_config_table_type_t common_tables[] = {
{UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga},
{EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt},
{EFI_PROPERTIES_TABLE_GUID, "PROP", &efi.properties_table},
+ {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi.mem_attr_table},
{NULL_GUID, NULL, NULL},
};
@@ -351,8 +353,9 @@ static __init int match_config_table(efi_guid_t *guid,
for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) {
if (!efi_guidcmp(*guid, table_types[i].guid)) {
*(table_types[i].ptr) = table;
- pr_cont(" %s=0x%lx ",
- table_types[i].name, table);
+ if (table_types[i].name)
+ pr_cont(" %s=0x%lx ",
+ table_types[i].name, table);
return 1;
}
}
@@ -620,16 +623,12 @@ char * __init efi_md_typeattr_format(char *buf, size_t size,
*/
u64 __weak efi_mem_attributes(unsigned long phys_addr)
{
- struct efi_memory_map *map;
efi_memory_desc_t *md;
- void *p;
if (!efi_enabled(EFI_MEMMAP))
return 0;
- map = efi.memmap;
- for (p = map->map; p < map->map_end; p += map->desc_size) {
- md = p;
+ for_each_efi_memory_desc(md) {
if ((md->phys_addr <= phys_addr) &&
(phys_addr < (md->phys_addr +
(md->num_pages << EFI_PAGE_SHIFT))))
@@ -637,3 +636,36 @@ u64 __weak efi_mem_attributes(unsigned long phys_addr)
}
return 0;
}
+
+/*
+ * Translate an EFI status code into a kernel error number: 0 for
+ * EFI_SUCCESS, a negative errno for the failures listed below, and
+ * -EINVAL for any status without a dedicated mapping.
+ */
+int efi_status_to_err(efi_status_t status)
+{
+	switch (status) {
+	case EFI_SUCCESS:
+		return 0;
+	case EFI_INVALID_PARAMETER:
+		return -EINVAL;
+	case EFI_OUT_OF_RESOURCES:
+		return -ENOSPC;
+	case EFI_DEVICE_ERROR:
+		return -EIO;
+	case EFI_WRITE_PROTECTED:
+		return -EROFS;
+	case EFI_SECURITY_VIOLATION:
+		return -EACCES;
+	case EFI_NOT_FOUND:
+		return -ENOENT;
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/drivers/firmware/efi/efibc.c b/drivers/firmware/efi/efibc.c
new file mode 100644
index 000000000000..8dd0c7085e59
--- /dev/null
+++ b/drivers/firmware/efi/efibc.c
@@ -0,0 +1,113 @@
+/*
+ * efibc: control EFI bootloaders which obey LoaderEntryOneShot var
+ * Copyright (c) 2013-2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt) "efibc: " fmt
+
+#include <linux/efi.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+
+/*
+ * Widen the NUL-terminated string @str into @str16, storing one
+ * efi_char16_t per input char, and NUL-terminate the result.
+ *
+ * No bounds checking is done here: @str16 must have room for
+ * strlen(@str) + 1 elements.
+ */
+static void efibc_str_to_str16(const char *str, efi_char16_t *str16)
+{
+	size_t i;
+
+	/* Stop at the terminator rather than re-running strlen() each step. */
+	for (i = 0; str[i] != '\0'; i++)
+		str16[i] = str[i];
+
+	str16[i] = '\0';
+}
+
+/*
+ * Store @value, converted to 16-bit characters, in the EFI variable @name
+ * under LINUX_EFI_LOADER_ENTRY_GUID. The variable is written as
+ * non-volatile with boot service and runtime access.
+ *
+ * Return: 0 on success, -EINVAL if the converted value does not fit in an
+ * efivar entry's data buffer, -ENOMEM if the temporary entry cannot be
+ * allocated, otherwise the error returned by efivar_entry_set().
+ */
+static int efibc_set_variable(const char *name, const char *value)
+{
+	int ret;
+	efi_guid_t guid = LINUX_EFI_LOADER_ENTRY_GUID;
+	struct efivar_entry *entry;
+	/* Size in bytes of the converted value, including its terminator. */
+	size_t size = (strlen(value) + 1) * sizeof(efi_char16_t);
+
+	/* sizeof() does not evaluate entry, so using it unset is fine here. */
+	if (size > sizeof(entry->var.Data)) {
+		pr_err("value is too large\n");
+		return -EINVAL;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		pr_err("failed to allocate efivar entry\n");
+		return -ENOMEM;
+	}
+
+	efibc_str_to_str16(name, entry->var.VariableName);
+	efibc_str_to_str16(value, (efi_char16_t *)entry->var.Data);
+	memcpy(&entry->var.VendorGuid, &guid, sizeof(guid));
+
+	ret = efivar_entry_set(entry,
+			       EFI_VARIABLE_NON_VOLATILE
+			       | EFI_VARIABLE_BOOTSERVICE_ACCESS
+			       | EFI_VARIABLE_RUNTIME_ACCESS,
+			       size, entry->var.Data, NULL);
+	if (ret)
+		pr_err("failed to set %s EFI variable: 0x%x\n",
+		       name, ret);
+
+	kfree(entry);
+	return ret;
+}
+
+/*
+ * Reboot notifier: record why the system is going down in the
+ * LoaderEntryRebootReason variable ("reboot" for SYS_RESTART, "shutdown"
+ * for every other event). If that succeeded and @data is non-NULL, @data
+ * is treated as a string and stored in LoaderEntryOneShot.
+ *
+ * Always returns NOTIFY_DONE, whether or not the variables were written.
+ */
+static int efibc_reboot_notifier_call(struct notifier_block *notifier,
+				      unsigned long event, void *data)
+{
+	const char *reason = (event == SYS_RESTART) ? "reboot" : "shutdown";
+
+	if (efibc_set_variable("LoaderEntryRebootReason", reason))
+		return NOTIFY_DONE;
+
+	if (data)
+		efibc_set_variable("LoaderEntryOneShot", (char *)data);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that hooks efibc_reboot_notifier_call() into the reboot chain. */
+static struct notifier_block efibc_reboot_notifier = {
+ .notifier_call = efibc_reboot_notifier_call,
+};
+
+/*
+ * Module init: register the reboot notifier, provided EFI runtime
+ * services are enabled.
+ *
+ * Return: 0 on success, -ENODEV without EFI runtime services, otherwise
+ * the error from register_reboot_notifier().
+ */
+static int __init efibc_init(void)
+{
+	int err;
+
+	if (!efi_enabled(EFI_RUNTIME_SERVICES))
+		return -ENODEV;
+
+	err = register_reboot_notifier(&efibc_reboot_notifier);
+	if (!err)
+		return 0;
+
+	pr_err("unable to register reboot notifier\n");
+	return err;
+}
+module_init(efibc_init);
+
+/* Module exit: unregister the reboot notifier registered by efibc_init(). */
+static void __exit efibc_exit(void)
+{
+ unregister_reboot_notifier(&efibc_reboot_notifier);
+}
+module_exit(efibc_exit);
+
+/* Module metadata. */
+MODULE_AUTHOR("Jeremy Compostella <jeremy.compostella@intel.com>");
+MODULE_AUTHOR("Matt Gumbel <matthew.k.gumbel@intel.com>");
+MODULE_DESCRIPTION("EFI Bootloader Control");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c
index 096adcbcb5a9..116b244dee68 100644
--- a/drivers/firmware/efi/efivars.c
+++ b/drivers/firmware/efi/efivars.c
@@ -661,7 +661,7 @@ static void efivar_update_sysfs_entries(struct work_struct *work)
return;
err = efivar_init(efivar_update_sysfs_entry, entry,
- true, false, &efivar_sysfs_list);
+ false, &efivar_sysfs_list);
if (!err)
break;
@@ -730,8 +730,7 @@ int efivars_sysfs_init(void)
return -ENOMEM;
}
- efivar_init(efivars_sysfs_callback, NULL, false,
- true, &efivar_sysfs_list);
+ efivar_init(efivars_sysfs_callback, NULL, true, &efivar_sysfs_list);
error = create_efivars_bin_attributes();
if (error) {
diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index ed3a854950cc..48430aba13c1 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -57,7 +57,7 @@ static int __init cmp_fake_mem(const void *x1, const void *x2)
void __init efi_fake_memmap(void)
{
u64 start, end, m_start, m_end, m_attr;
- int new_nr_map = memmap.nr_map;
+ int new_nr_map = efi.memmap.nr_map;
efi_memory_desc_t *md;
phys_addr_t new_memmap_phy;
void *new_memmap;
@@ -68,8 +68,7 @@ void __init efi_fake_memmap(void)
return;
/* count up the number of EFI memory descriptor */
- for (old = memmap.map; old < memmap.map_end; old += memmap.desc_size) {
- md = old;
+ for_each_efi_memory_desc(md) {
start = md->phys_addr;
end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1;
@@ -95,25 +94,25 @@ void __init efi_fake_memmap(void)
}
/* allocate memory for new EFI memmap */
- new_memmap_phy = memblock_alloc(memmap.desc_size * new_nr_map,
+ new_memmap_phy = memblock_alloc(efi.memmap.desc_size * new_nr_map,
PAGE_SIZE);
if (!new_memmap_phy)
return;
/* create new EFI memmap */
new_memmap = early_memremap(new_memmap_phy,
- memmap.desc_size * new_nr_map);
+ efi.memmap.desc_size * new_nr_map);
if (!new_memmap) {
- memblock_free(new_memmap_phy, memmap.desc_size * new_nr_map);
+ memblock_free(new_memmap_phy, efi.memmap.desc_size * new_nr_map);
return;
}
- for (old = memmap.map, new = new_memmap;
- old < memmap.map_end;
- old += memmap.desc_size, new += memmap.desc_size) {
+ for (old = efi.memmap.map, new = new_memmap;
+ old < efi.memmap.map_end;
+ old += efi.memmap.desc_size, new += efi.memmap.desc_size) {
/* copy original EFI memory descriptor */
- memcpy(new, old, memmap.desc_size);
+ memcpy(new, old, efi.memmap.desc_size);
md = new;
start = md->phys_addr;
end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1;
@@ -134,8 +133,8 @@ void __init efi_fake_memmap(void)
md->num_pages = (m_end - md->phys_addr + 1) >>
EFI_PAGE_SHIFT;
/* latter part */
- new += memmap.desc_size;
- memcpy(new, old, memmap.desc_size);
+ new += efi.memmap.desc_size;
+ memcpy(new, old, efi.memmap.desc_size);
md = new;
md->phys_addr = m_end + 1;
md->num_pages = (end - md->phys_addr + 1) >>
@@ -147,16 +146,16 @@ void __init efi_fake_memmap(void)
md->num_pages = (m_start - md->phys_addr) >>
EFI_PAGE_SHIFT;
/* middle part */
- new += memmap.desc_size;
- memcpy(new, old, memmap.desc_size);
+ new += efi.memmap.desc_size;
+ memcpy(new, old, efi.memmap.desc_size);
md = new;
md->attribute |= m_attr;
md->phys_addr = m_start;
md->num_pages = (m_end - m_start + 1) >>
EFI_PAGE_SHIFT;
/* last part */
- new += memmap.desc_size;
- memcpy(new, old, memmap.desc_size);
+ new += efi.memmap.desc_size;
+ memcpy(new, old, efi.memmap.desc_size);
md = new;
md->phys_addr = m_end + 1;
md->num_pages = (end - m_end) >>
@@ -169,8 +168,8 @@ void __init efi_fake_memmap(void)
md->num_pages = (m_start - md->phys_addr) >>
EFI_PAGE_SHIFT;
/* latter part */
- new += memmap.desc_size;
- memcpy(new, old, memmap.desc_size);
+ new += efi.memmap.desc_size;
+ memcpy(new, old, efi.memmap.desc_size);
md = new;
md->phys_addr = m_start;
md->num_pages = (end - md->phys_addr + 1) >>
@@ -182,10 +181,10 @@ void __init efi_fake_memmap(void)
/* swap into new EFI memmap */
efi_unmap_memmap();
- memmap.map = new_memmap;
- memmap.phys_map = new_memmap_phy;
- memmap.nr_map = new_nr_map;
- memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size;
+ efi.memmap.map = new_memmap;
+ efi.memmap.phys_map = new_memmap_phy;
+ efi.memmap.nr_map = new_nr_map;
+ efi.memmap.map_end = efi.memmap.map + efi.memmap.nr_map * efi.memmap.desc_size;
set_bit(EFI_MEMMAP, &efi.flags);
/* print new EFI memmap */
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index da99bbb74aeb..c06945160a41 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,7 @@ OBJECT_FILES_NON_STANDARD := y
# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
KCOV_INSTRUMENT := n
-lib-y := efi-stub-helper.o
+lib-y := efi-stub-helper.o gop.o
# include the stub's generic dependencies from lib/ when building for ARM/arm64
arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c
diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index 414deb85c2e5..993aa56755f6 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -20,27 +20,49 @@
bool __nokaslr;
-static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg)
+static int efi_get_secureboot(efi_system_table_t *sys_table_arg)
{
- static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID;
- static efi_char16_t const var_name[] = {
+ static efi_char16_t const sb_var_name[] = {
'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 };
+ static efi_char16_t const sm_var_name[] = {
+ 'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 };
+ efi_guid_t var_guid = EFI_GLOBAL_VARIABLE_GUID;
efi_get_variable_t *f_getvar = sys_table_arg->runtime->get_variable;
- unsigned long size = sizeof(u8);
- efi_status_t status;
u8 val;
+ unsigned long size = sizeof(val);
+ efi_status_t status;
- status = f_getvar((efi_char16_t *)var_name, (efi_guid_t *)&var_guid,
+ status = f_getvar((efi_char16_t *)sb_var_name, (efi_guid_t *)&var_guid,
NULL, &size, &val);
+ if (status != EFI_SUCCESS)
+ goto out_efi_err;
+
+ if (val == 0)
+ return 0;
+
+ status = f_getvar((efi_char16_t *)sm_var_name, (efi_guid_t *)&var_guid,
+ NULL, &size, &val);
+
+ if (status != EFI_SUCCESS)
+ goto out_efi_err;
+
+ if (val == 1)
+ return 0;
+
+ return 1;
+
+out_efi_err:
switch (status) {
- case EFI_SUCCESS:
- return val;
case EFI_NOT_FOUND:
return 0;
+ case EFI_DEVICE_ERROR:
+ return -EIO;
+ case EFI_SECURITY_VIOLATION:
+ return -EACCES;
default:
- return 1;
+ return -EINVAL;
}
}
@@ -147,6 +169,25 @@ void efi_char16_printk(efi_system_table_t *sys_table_arg,
out->output_string(out, str);
}
+/*
+ * Probe for Graphics Output Protocol handles and, if there are any,
+ * allocate a screen_info and let efi_setup_gop() fill it in.
+ *
+ * Returns the screen_info, or NULL if the probe did not report
+ * EFI_BUFFER_TOO_SMALL or the screen_info could not be allocated. The
+ * result of efi_setup_gop() itself is not checked.
+ */
+static struct screen_info *setup_graphics(efi_system_table_t *sys_table_arg)
+{
+	efi_guid_t gop_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
+	void **gop_handle = NULL;
+	unsigned long size = 0;
+	struct screen_info *si;
+	efi_status_t status;
+
+	/*
+	 * Size query only: with a zero-sized NULL buffer the call cannot
+	 * return any handles; EFI_BUFFER_TOO_SMALL leaves the required
+	 * size in @size.
+	 */
+	status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL,
+				&gop_proto, NULL, &size, gop_handle);
+	if (status != EFI_BUFFER_TOO_SMALL)
+		return NULL;
+
+	si = alloc_screen_info(sys_table_arg);
+	if (si)
+		efi_setup_gop(sys_table_arg, si, &gop_proto, size);
+
+	return si;
+}
/*
* This function handles the architcture specific differences between arm and
@@ -185,6 +226,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID;
unsigned long reserve_addr = 0;
unsigned long reserve_size = 0;
+ int secure_boot = 0;
+ struct screen_info *si;
/* Check if we were booted by the EFI firmware */
if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
@@ -237,6 +280,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
__nokaslr = true;
}
+ si = setup_graphics(sys_table);
+
status = handle_kernel_image(sys_table, image_addr, &image_size,
&reserve_addr,
&reserve_size,
@@ -250,12 +295,21 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
if (status != EFI_SUCCESS)
pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n");
+ secure_boot = efi_get_secureboot(sys_table);
+ if (secure_boot > 0)
+ pr_efi(sys_table, "UEFI Secure Boot is enabled.\n");
+
+ if (secure_boot < 0) {
+ pr_efi_err(sys_table,
+ "could not determine UEFI Secure Boot status.\n");
+ }
+
/*
* Unauthenticated device tree data is a security hazard, so
* ignore 'dtb=' unless UEFI Secure Boot is disabled.
*/
- if (efi_secureboot_enabled(sys_table)) {
- pr_efi(sys_table, "UEFI Secure Boot is enabled.\n");
+ if (secure_boot != 0 && strstr(cmdline_ptr, "dtb=")) {
+ pr_efi(sys_table, "Ignoring DTB from command line.\n");
} else {
status = handle_cmdline_files(sys_table, image, cmdline_ptr,
"dtb=",
@@ -309,6 +363,7 @@ fail_free_image:
efi_free(sys_table, image_size, *image_addr);
efi_free(sys_table, reserve_size, reserve_addr);
fail_free_cmdline:
+ free_screen_info(sys_table, si);
efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr);
fail:
return EFI_ERROR;
diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
index 6f42be4d0084..e1f0b28e1dcb 100644
--- a/drivers/firmware/efi/libstub/arm32-stub.c
+++ b/drivers/firmware/efi/libstub/arm32-stub.c
@@ -26,6 +26,43 @@ efi_status_t check_platform_features(efi_system_table_t *sys_table_arg)
return EFI_SUCCESS;
}
+/* GUID under which the screen_info buffer is installed as a configuration table. */
+static efi_guid_t screen_info_guid = LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID;
+
+/*
+ * Allocate a screen_info buffer from the EFI pool and publish it as a
+ * configuration table under screen_info_guid.
+ *
+ * Returns the buffer, or NULL if the allocation or the table installation
+ * failed (the buffer is freed again in the latter case).
+ */
+struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg)
+{
+	struct screen_info *si;
+	efi_status_t status;
+
+	/*
+	 * On arm64 the stub can fill out the screen_info structure
+	 * directly. Here we cannot, so a separate buffer has to carry its
+	 * contents across the hand-over from the decompressor to the
+	 * kernel proper.
+	 */
+	status = efi_call_early(allocate_pool, EFI_RUNTIME_SERVICES_DATA,
+				sizeof(*si), (void **)&si);
+	if (status != EFI_SUCCESS)
+		return NULL;
+
+	status = efi_call_early(install_configuration_table,
+				&screen_info_guid, si);
+	if (status != EFI_SUCCESS) {
+		efi_call_early(free_pool, si);
+		return NULL;
+	}
+
+	return si;
+}
+
+/*
+ * Undo alloc_screen_info(): install NULL for screen_info_guid and return
+ * @si to the EFI pool. A NULL @si is accepted and ignored.
+ */
+void free_screen_info(efi_system_table_t *sys_table_arg, struct screen_info *si)
+{
+ if (!si)
+ return;
+
+ efi_call_early(install_configuration_table, &screen_info_guid, NULL);
+ efi_call_early(free_pool, si);
+}
+
efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
unsigned long *image_addr,
unsigned long *image_size,
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 29ed2f9b218c..3bd127f95315 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -125,10 +125,12 @@ unsigned long get_dram_base(efi_system_table_t *sys_table_arg)
map.map_end = map.map + map_size;
- for_each_efi_memory_desc(&map, md)
- if (md->attribute & EFI_MEMORY_WB)
+ for_each_efi_memory_desc_in_map(&map, md) {
+ if (md->attribute & EFI_MEMORY_WB) {
if (membase > md->phys_addr)
membase = md->phys_addr;
+ }
+ }
efi_call_early(free_pool, map.map);
diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c
new file mode 100644
index 000000000000..932742e4cf23
--- /dev/null
+++ b/drivers/firmware/efi/libstub/gop.c
@@ -0,0 +1,354 @@
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2011 Intel Corporation; author Matt Fleming
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include <linux/efi.h>
+#include <linux/screen_info.h>
+#include <asm/efi.h>
+#include <asm/setup.h>
+
+/*
+ * Locate the lowest run of consecutive set bits in @mask. The index of
+ * its first bit is stored in @pos and its length in @size; both are 0
+ * when @mask is 0.
+ */
+static void find_bits(unsigned long mask, u8 *pos, u8 *size)
+{
+	u8 first = 0;
+	u8 len = 0;
+
+	if (mask) {
+		/* Skip the clear bits below the run ... */
+		for (; !(mask & 0x1); mask >>= 1)
+			first++;
+
+		/* ... then count the set bits that form it. */
+		for (; mask & 0x1; mask >>= 1)
+			len++;
+	}
+
+	*pos = first;
+	*size = len;
+}
+
+/*
+ * Fill in the colour layout, depth and line length of @si for the given
+ * GOP @pixel_format. The two fixed 8-bit-per-colour formats use hard
+ * coded positions, PIXEL_BIT_MASK derives them from @pixel_info, and any
+ * other format is set up as 4 bits per pixel with no colour fields; that
+ * last case reads si->lfb_width, which must already be set.
+ */
+static void
+setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line,
+		 struct efi_pixel_bitmask pixel_info, int pixel_format)
+{
+	switch (pixel_format) {
+	case PIXEL_RGB_RESERVED_8BIT_PER_COLOR:
+		si->lfb_depth = 32;
+		si->lfb_linelength = pixels_per_scan_line * 4;
+		si->red_size = 8;
+		si->red_pos = 0;
+		si->green_size = 8;
+		si->green_pos = 8;
+		si->blue_size = 8;
+		si->blue_pos = 16;
+		si->rsvd_size = 8;
+		si->rsvd_pos = 24;
+		break;
+	case PIXEL_BGR_RESERVED_8BIT_PER_COLOR:
+		si->lfb_depth = 32;
+		si->lfb_linelength = pixels_per_scan_line * 4;
+		si->red_size = 8;
+		si->red_pos = 16;
+		si->green_size = 8;
+		si->green_pos = 8;
+		si->blue_size = 8;
+		si->blue_pos = 0;
+		si->rsvd_size = 8;
+		si->rsvd_pos = 24;
+		break;
+	case PIXEL_BIT_MASK:
+		find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
+		find_bits(pixel_info.green_mask, &si->green_pos,
+			  &si->green_size);
+		find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
+		find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
+			  &si->rsvd_size);
+		/* Depth is the sum of all four channel widths. */
+		si->lfb_depth = si->red_size + si->green_size +
+				si->blue_size + si->rsvd_size;
+		si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
+		break;
+	default:
+		si->lfb_depth = 4;
+		si->lfb_linelength = si->lfb_width / 2;
+		si->red_size = 0;
+		si->red_pos = 0;
+		si->green_size = 0;
+		si->green_pos = 0;
+		si->blue_size = 0;
+		si->blue_pos = 0;
+		si->rsvd_size = 0;
+		si->rsvd_pos = 0;
+		break;
+	}
+}
+
+/*
+ * Query the current mode of a GOP instance described by the 32-bit
+ * protocol layout. Calls the protocol's query_mode function for the mode
+ * number found in its mode structure; the results are returned through
+ * @info and @size. On success the frame buffer base of that mode
+ * structure is stored in @fb_base; on failure @fb_base is left untouched
+ * and the EFI status is returned.
+ */
+static efi_status_t
+__gop_query32(efi_system_table_t *sys_table_arg,
+ struct efi_graphics_output_protocol_32 *gop32,
+ struct efi_graphics_output_mode_info **info,
+ unsigned long *size, u64 *fb_base)
+{
+ struct efi_graphics_output_protocol_mode_32 *mode;
+ efi_graphics_output_protocol_query_mode query_mode;
+ efi_status_t status;
+ unsigned long m;
+
+ /* The protocol stores pointers as integers; convert them via unsigned long. */
+ m = gop32->mode;
+ mode = (struct efi_graphics_output_protocol_mode_32 *)m;
+ query_mode = (void *)(unsigned long)gop32->query_mode;
+
+ status = __efi_call_early(query_mode, (void *)gop32, mode->mode, size,
+ info);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ *fb_base = mode->frame_buffer_base;
+ return status;
+}
+
+/*
+ * Pick a Graphics Output Protocol instance from the handles in
+ * @gop_handle, read as an array of u32 values @size bytes long, and
+ * describe its frame buffer in @si. Used by efi_setup_gop() when
+ * efi_is_64bit() is false.
+ *
+ * A GOP whose handle also implements the ConOut protocol is preferred;
+ * otherwise the first GOP whose mode query succeeds is used.
+ *
+ * Return: EFI_SUCCESS if a GOP was selected and @si filled in. If none
+ * was usable, EFI_NOT_FOUND when there were no handles at all, otherwise
+ * the status of the last failed call.
+ */
+static efi_status_t
+setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si,
+	    efi_guid_t *proto, unsigned long size, void **gop_handle)
+{
+	struct efi_graphics_output_protocol_32 *gop32, *first_gop;
+	unsigned long nr_gops;
+	u16 width, height;
+	u32 pixels_per_scan_line;
+	u32 ext_lfb_base;
+	u64 fb_base;
+	struct efi_pixel_bitmask pixel_info;
+	int pixel_format;
+	efi_status_t status = EFI_NOT_FOUND;
+	u32 *handles = (u32 *)(unsigned long)gop_handle;
+	int i;
+
+	first_gop = NULL;
+	gop32 = NULL;
+
+	nr_gops = size / sizeof(u32);
+	for (i = 0; i < nr_gops; i++) {
+		struct efi_graphics_output_mode_info *info = NULL;
+		efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
+		bool conout_found = false;
+		void *dummy = NULL;
+		efi_handle_t h = (efi_handle_t)(unsigned long)handles[i];
+		u64 current_fb_base;
+
+		status = efi_call_early(handle_protocol, h,
+					proto, (void **)&gop32);
+		if (status != EFI_SUCCESS)
+			continue;
+
+		status = efi_call_early(handle_protocol, h,
+					&conout_proto, &dummy);
+		if (status == EFI_SUCCESS)
+			conout_found = true;
+
+		status = __gop_query32(sys_table_arg, gop32, &info, &size,
+				       &current_fb_base);
+		if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
+			/*
+			 * Systems that use the UEFI Console Splitter may
+			 * provide multiple GOP devices, not all of which are
+			 * backed by real hardware. The workaround is to search
+			 * for a GOP implementing the ConOut protocol, and if
+			 * one isn't found, to just fall back to the first GOP.
+			 */
+			width = info->horizontal_resolution;
+			height = info->vertical_resolution;
+			pixel_format = info->pixel_format;
+			pixel_info = info->pixel_information;
+			pixels_per_scan_line = info->pixels_per_scan_line;
+			fb_base = current_fb_base;
+
+			/*
+			 * Once we've found a GOP supporting ConOut,
+			 * don't bother looking any further.
+			 */
+			first_gop = gop32;
+			if (conout_found)
+				break;
+		}
+	}
+
+	/* Did we find any GOPs? */
+	if (!first_gop)
+		goto out;
+
+	/* EFI framebuffer */
+	si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	si->lfb_width = width;
+	si->lfb_height = height;
+	si->lfb_base = fb_base;
+
+	/*
+	 * NOTE(review): the (unsigned long) cast discards the upper half of
+	 * fb_base wherever unsigned long is 32 bits wide, in which case
+	 * ext_lfb_base is always 0 - confirm this is intended.
+	 */
+	ext_lfb_base = (u64)(unsigned long)fb_base >> 32;
+	if (ext_lfb_base) {
+		si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+		si->ext_lfb_base = ext_lfb_base;
+	}
+
+	si->pages = 1;
+
+	setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
+
+	si->lfb_size = si->lfb_linelength * si->lfb_height;
+
+	si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
+
+	/*
+	 * A usable GOP was recorded and @si is filled in. status may still
+	 * hold a failure from a handle probed afterwards while searching
+	 * for ConOut; do not report that stale error to the caller.
+	 */
+	status = EFI_SUCCESS;
+out:
+	return status;
+}
+
+/*
+ * Query the current mode of a GOP instance described by the 64-bit
+ * protocol layout. Calls the protocol's query_mode function for the mode
+ * number found in its mode structure; the results are returned through
+ * @info and @size. On success the frame buffer base of that mode
+ * structure is stored in @fb_base; on failure @fb_base is left untouched
+ * and the EFI status is returned.
+ */
+static efi_status_t
+__gop_query64(efi_system_table_t *sys_table_arg,
+ struct efi_graphics_output_protocol_64 *gop64,
+ struct efi_graphics_output_mode_info **info,
+ unsigned long *size, u64 *fb_base)
+{
+ struct efi_graphics_output_protocol_mode_64 *mode;
+ efi_graphics_output_protocol_query_mode query_mode;
+ efi_status_t status;
+ unsigned long m;
+
+ /* The protocol stores pointers as integers; convert them via unsigned long. */
+ m = gop64->mode;
+ mode = (struct efi_graphics_output_protocol_mode_64 *)m;
+ query_mode = (void *)(unsigned long)gop64->query_mode;
+
+ status = __efi_call_early(query_mode, (void *)gop64, mode->mode, size,
+ info);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ *fb_base = mode->frame_buffer_base;
+ return status;
+}
+
+/*
+ * Pick a Graphics Output Protocol instance from the handles in
+ * @gop_handle, read as an array of u64 values @size bytes long, and
+ * describe its frame buffer in @si. Used by efi_setup_gop() when
+ * efi_is_64bit() is true.
+ *
+ * A GOP whose handle also implements the ConOut protocol is preferred;
+ * otherwise the first GOP whose mode query succeeds is used.
+ *
+ * Return: EFI_SUCCESS if a GOP was selected and @si filled in. If none
+ * was usable, EFI_NOT_FOUND when there were no handles at all, otherwise
+ * the status of the last failed call.
+ */
+static efi_status_t
+setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si,
+	    efi_guid_t *proto, unsigned long size, void **gop_handle)
+{
+	struct efi_graphics_output_protocol_64 *gop64, *first_gop;
+	unsigned long nr_gops;
+	u16 width, height;
+	u32 pixels_per_scan_line;
+	u32 ext_lfb_base;
+	u64 fb_base;
+	struct efi_pixel_bitmask pixel_info;
+	int pixel_format;
+	efi_status_t status = EFI_NOT_FOUND;
+	u64 *handles = (u64 *)(unsigned long)gop_handle;
+	int i;
+
+	first_gop = NULL;
+	gop64 = NULL;
+
+	nr_gops = size / sizeof(u64);
+	for (i = 0; i < nr_gops; i++) {
+		struct efi_graphics_output_mode_info *info = NULL;
+		efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
+		bool conout_found = false;
+		void *dummy = NULL;
+		efi_handle_t h = (efi_handle_t)(unsigned long)handles[i];
+		u64 current_fb_base;
+
+		status = efi_call_early(handle_protocol, h,
+					proto, (void **)&gop64);
+		if (status != EFI_SUCCESS)
+			continue;
+
+		status = efi_call_early(handle_protocol, h,
+					&conout_proto, &dummy);
+		if (status == EFI_SUCCESS)
+			conout_found = true;
+
+		status = __gop_query64(sys_table_arg, gop64, &info, &size,
+				       &current_fb_base);
+		if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
+			/*
+			 * Systems that use the UEFI Console Splitter may
+			 * provide multiple GOP devices, not all of which are
+			 * backed by real hardware. The workaround is to search
+			 * for a GOP implementing the ConOut protocol, and if
+			 * one isn't found, to just fall back to the first GOP.
+			 */
+			width = info->horizontal_resolution;
+			height = info->vertical_resolution;
+			pixel_format = info->pixel_format;
+			pixel_info = info->pixel_information;
+			pixels_per_scan_line = info->pixels_per_scan_line;
+			fb_base = current_fb_base;
+
+			/*
+			 * Once we've found a GOP supporting ConOut,
+			 * don't bother looking any further.
+			 */
+			first_gop = gop64;
+			if (conout_found)
+				break;
+		}
+	}
+
+	/* Did we find any GOPs? */
+	if (!first_gop)
+		goto out;
+
+	/* EFI framebuffer */
+	si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	si->lfb_width = width;
+	si->lfb_height = height;
+	si->lfb_base = fb_base;
+
+	/*
+	 * NOTE(review): the (unsigned long) cast discards the upper half of
+	 * fb_base wherever unsigned long is 32 bits wide, in which case
+	 * ext_lfb_base is always 0 - confirm this is intended.
+	 */
+	ext_lfb_base = (u64)(unsigned long)fb_base >> 32;
+	if (ext_lfb_base) {
+		si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+		si->ext_lfb_base = ext_lfb_base;
+	}
+
+	si->pages = 1;
+
+	setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format);
+
+	si->lfb_size = si->lfb_linelength * si->lfb_height;
+
+	si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
+
+	/*
+	 * A usable GOP was recorded and @si is filled in. status may still
+	 * hold a failure from a handle probed afterwards while searching
+	 * for ConOut; do not report that stale error to the caller.
+	 */
+	status = EFI_SUCCESS;
+out:
+	return status;
+}
+
+/*
+ * See if we have Graphics Output Protocol.
+ *
+ * Allocates a temporary buffer of @size bytes, fetches the handles that
+ * support @proto into it and passes them to setup_gop64() or
+ * setup_gop32(), depending on efi_is_64bit(), to fill in @si. The buffer
+ * is freed again before returning.
+ *
+ * Returns the status of the first step that failed, otherwise the status
+ * of the setup_gop*() call.
+ */
+efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
+			   struct screen_info *si, efi_guid_t *proto,
+			   unsigned long size)
+{
+	void **handles = NULL;
+	efi_status_t status;
+
+	status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+				size, (void **)&handles);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	status = efi_call_early(locate_handle,
+				EFI_LOCATE_BY_PROTOCOL,
+				proto, NULL, &size, handles);
+	if (status == EFI_SUCCESS) {
+		if (efi_is_64bit())
+			status = setup_gop64(sys_table_arg, si, proto, size,
+					     handles);
+		else
+			status = setup_gop32(sys_table_arg, si, proto, size,
+					     handles);
+	}
+
+	efi_call_early(free_pool, handles);
+	return status;
+}
diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c
new file mode 100644
index 000000000000..236004b9a50d
--- /dev/null
+++ b/drivers/firmware/efi/memattr.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) "efi: memattr: " fmt
+
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
+
+#include <asm/early_ioremap.h>
+
+static int __initdata tbl_size;
+
+/*
+ * Reserve the memory associated with the Memory Attributes configuration
+ * table, if it exists.
+ */
+int __init efi_memattr_init(void)
+{
+ efi_memory_attributes_table_t *tbl;
+
+ if (efi.mem_attr_table == EFI_INVALID_TABLE_ADDR)
+ return 0;
+
+ tbl = early_memremap(efi.mem_attr_table, sizeof(*tbl));
+ if (!tbl) {
+ pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n",
+ efi.mem_attr_table);
+ return -ENOMEM;
+ }
+
+ if (tbl->version > 1) {
+ pr_warn("Unexpected EFI Memory Attributes table version %d\n",
+ tbl->version);
+ goto unmap;
+ }
+
+ tbl_size = sizeof(*tbl) + tbl->num_entries * tbl->desc_size;
+ memblock_reserve(efi.mem_attr_table, tbl_size);
+
+unmap:
+ early_memunmap(tbl, sizeof(*tbl));
+ return 0;
+}
+
+/*
+ * Returns a copy @out of the UEFI memory descriptor @in if it is covered
+ * entirely by a UEFI memory map entry with matching attributes. The virtual
+ * address of @out is set according to the matching entry that was found.
+ */
+static bool entry_is_valid(const efi_memory_desc_t *in, efi_memory_desc_t *out)
+{
+ u64 in_paddr = in->phys_addr;
+ u64 in_size = in->num_pages << EFI_PAGE_SHIFT;
+ efi_memory_desc_t *md;
+
+ *out = *in;
+
+ if (in->type != EFI_RUNTIME_SERVICES_CODE &&
+ in->type != EFI_RUNTIME_SERVICES_DATA) {
+ pr_warn("Entry type should be RuntimeServiceCode/Data\n");
+ return false;
+ }
+
+ if (!(in->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP))) {
+ pr_warn("Entry attributes invalid: RO and XP bits both cleared\n");
+ return false;
+ }
+
+ if (PAGE_SIZE > EFI_PAGE_SIZE &&
+ (!PAGE_ALIGNED(in->phys_addr) ||
+ !PAGE_ALIGNED(in->num_pages << EFI_PAGE_SHIFT))) {
+ /*
+ * Since arm64 may execute with page sizes of up to 64 KB, the
+ * UEFI spec mandates that RuntimeServices memory regions must
+ * be 64 KB aligned. We need to validate this here since we will
+ * not be able to tighten permissions on such regions without
+ * affecting adjacent regions.
+ */
+ pr_warn("Entry address region misaligned\n");
+ return false;
+ }
+
+ for_each_efi_memory_desc(md) {
+ u64 md_paddr = md->phys_addr;
+ u64 md_size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (!(md->attribute & EFI_MEMORY_RUNTIME))
+ continue;
+ if (md->virt_addr == 0) {
+ /* no virtual mapping has been installed by the stub */
+ break;
+ }
+
+ if (md_paddr > in_paddr || (in_paddr - md_paddr) >= md_size)
+ continue;
+
+ /*
+ * This entry covers the start of @in, check whether
+ * it covers the end as well.
+ */
+ if (md_paddr + md_size < in_paddr + in_size) {
+ pr_warn("Entry covers multiple EFI memory map regions\n");
+ return false;
+ }
+
+ if (md->type != in->type) {
+ pr_warn("Entry type deviates from EFI memory map region type\n");
+ return false;
+ }
+
+ out->virt_addr = in_paddr + (md->virt_addr - md_paddr);
+
+ return true;
+ }
+
+ pr_warn("No matching entry found in the EFI memory map\n");
+ return false;
+}
+
+/*
+ * To be called after the EFI page tables have been populated. If a memory
+ * attributes table is available, its contents will be used to update the
+ * mappings with tightened permissions as described by the table.
+ * This requires the UEFI memory map to have already been populated with
+ * virtual addresses.
+ */
+int __init efi_memattr_apply_permissions(struct mm_struct *mm,
+ efi_memattr_perm_setter fn)
+{
+ efi_memory_attributes_table_t *tbl;
+ int i, ret;
+
+ if (tbl_size <= sizeof(*tbl))
+ return 0;
+
+ /*
+ * We need the EFI memory map to be setup so we can use it to
+ * lookup the virtual addresses of all entries in the EFI
+ * Memory Attributes table. If it isn't available, this
+ * function should not be called.
+ */
+ if (WARN_ON(!efi_enabled(EFI_MEMMAP)))
+ return 0;
+
+ tbl = memremap(efi.mem_attr_table, tbl_size, MEMREMAP_WB);
+ if (!tbl) {
+ pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n",
+ efi.mem_attr_table);
+ return -ENOMEM;
+ }
+
+ if (efi_enabled(EFI_DBG))
+ pr_info("Processing EFI Memory Attributes table:\n");
+
+ for (i = ret = 0; ret == 0 && i < tbl->num_entries; i++) {
+ efi_memory_desc_t md;
+ unsigned long size;
+ bool valid;
+ char buf[64];
+
+ valid = entry_is_valid((void *)tbl->entry + i * tbl->desc_size,
+ &md);
+ size = md.num_pages << EFI_PAGE_SHIFT;
+ if (efi_enabled(EFI_DBG) || !valid)
+ pr_info("%s 0x%012llx-0x%012llx %s\n",
+ valid ? "" : "!", md.phys_addr,
+ md.phys_addr + size - 1,
+ efi_md_typeattr_format(buf, sizeof(buf), &md));
+
+ if (valid)
+ ret = fn(mm, &md);
+ }
+ memunmap(tbl);
+ return ret;
+}
diff --git a/drivers/firmware/efi/reboot.c b/drivers/firmware/efi/reboot.c
index 9c59d1c795d1..62ead9b9d871 100644
--- a/drivers/firmware/efi/reboot.c
+++ b/drivers/firmware/efi/reboot.c
@@ -9,7 +9,8 @@ int efi_reboot_quirk_mode = -1;
void efi_reboot(enum reboot_mode reboot_mode, const char *__unused)
{
- int efi_mode;
+ const char *str[] = { "cold", "warm", "shutdown", "platform" };
+ int efi_mode, cap_reset_mode;
if (!efi_enabled(EFI_RUNTIME_SERVICES))
return;
@@ -30,6 +31,15 @@ void efi_reboot(enum reboot_mode reboot_mode, const char *__unused)
if (efi_reboot_quirk_mode != -1)
efi_mode = efi_reboot_quirk_mode;
+ if (efi_capsule_pending(&cap_reset_mode)) {
+ if (efi_mode != cap_reset_mode)
+ printk(KERN_CRIT "efi: %s reset requested but pending "
+ "capsule update requires %s reset... Performing "
+ "%s reset.\n", str[efi_mode], str[cap_reset_mode],
+ str[cap_reset_mode]);
+ efi_mode = cap_reset_mode;
+ }
+
efi.reset_system(efi_mode, EFI_SUCCESS, 0, NULL);
}
diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c
index de6953039af6..23bef6bb73ee 100644
--- a/drivers/firmware/efi/runtime-wrappers.c
+++ b/drivers/firmware/efi/runtime-wrappers.c
@@ -16,10 +16,70 @@
#include <linux/bug.h>
#include <linux/efi.h>
+#include <linux/irqflags.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
+#include <linux/stringify.h>
#include <asm/efi.h>
+static void efi_call_virt_check_flags(unsigned long flags, const char *call)
+{
+ unsigned long cur_flags, mismatch;
+
+ local_save_flags(cur_flags);
+
+ mismatch = flags ^ cur_flags;
+ if (!WARN_ON_ONCE(mismatch & ARCH_EFI_IRQ_FLAGS_MASK))
+ return;
+
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_NOW_UNRELIABLE);
+ pr_err_ratelimited(FW_BUG "IRQ flags corrupted (0x%08lx=>0x%08lx) by EFI %s\n",
+ flags, cur_flags, call);
+ local_irq_restore(flags);
+}
+
+/*
+ * Arch code can implement the following three template macros, avoiding
+ * repetition for the void/non-void return cases of {__,}efi_call_virt:
+ *
+ * * arch_efi_call_virt_setup
+ *
+ * Sets up the environment for the call (e.g. switching page tables,
+ * allowing kernel-mode use of floating point, if required).
+ *
+ * * arch_efi_call_virt
+ *
+ * Performs the call. The last expression in the macro must be the call
+ * itself, allowing the logic to be shared by the void and non-void
+ * cases.
+ *
+ * * arch_efi_call_virt_teardown
+ *
+ * Restores the usual kernel environment once the call has returned.
+ */
+
+#define efi_call_virt(f, args...) \
+({ \
+ efi_status_t __s; \
+ unsigned long flags; \
+ arch_efi_call_virt_setup(); \
+ local_save_flags(flags); \
+ __s = arch_efi_call_virt(f, args); \
+ efi_call_virt_check_flags(flags, __stringify(f)); \
+ arch_efi_call_virt_teardown(); \
+ __s; \
+})
+
+#define __efi_call_virt(f, args...) \
+({ \
+ unsigned long flags; \
+ arch_efi_call_virt_setup(); \
+ local_save_flags(flags); \
+ arch_efi_call_virt(f, args); \
+ efi_call_virt_check_flags(flags, __stringify(f)); \
+ arch_efi_call_virt_teardown(); \
+})
+
/*
* According to section 7.1 of the UEFI spec, Runtime Services are not fully
* reentrant, and there are particular combinations of calls that need to be
diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index 34b741940494..d3b751383286 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -329,39 +329,6 @@ check_var_size_nonblocking(u32 attributes, unsigned long size)
return fops->query_variable_store(attributes, size, true);
}
-static int efi_status_to_err(efi_status_t status)
-{
- int err;
-
- switch (status) {
- case EFI_SUCCESS:
- err = 0;
- break;
- case EFI_INVALID_PARAMETER:
- err = -EINVAL;
- break;
- case EFI_OUT_OF_RESOURCES:
- err = -ENOSPC;
- break;
- case EFI_DEVICE_ERROR:
- err = -EIO;
- break;
- case EFI_WRITE_PROTECTED:
- err = -EROFS;
- break;
- case EFI_SECURITY_VIOLATION:
- err = -EACCES;
- break;
- case EFI_NOT_FOUND:
- err = -ENOENT;
- break;
- default:
- err = -EINVAL;
- }
-
- return err;
-}
-
static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor,
struct list_head *head)
{
@@ -452,8 +419,7 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
* Returns 0 on success, or a kernel error code on failure.
*/
int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
- void *data, bool atomic, bool duplicates,
- struct list_head *head)
+ void *data, bool duplicates, struct list_head *head)
{
const struct efivar_operations *ops = __efivars->ops;
unsigned long variable_name_size = 1024;
@@ -483,7 +449,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
&vendor_guid);
switch (status) {
case EFI_SUCCESS:
- if (!atomic)
+ if (duplicates)
spin_unlock_irq(&__efivars->lock);
variable_name_size = var_name_strnsize(variable_name,
@@ -498,21 +464,19 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
* and may end up looping here forever.
*/
if (duplicates &&
- variable_is_present(variable_name, &vendor_guid, head)) {
+ variable_is_present(variable_name, &vendor_guid,
+ head)) {
dup_variable_bug(variable_name, &vendor_guid,
variable_name_size);
- if (!atomic)
- spin_lock_irq(&__efivars->lock);
-
status = EFI_NOT_FOUND;
- break;
+ } else {
+ err = func(variable_name, vendor_guid,
+ variable_name_size, data);
+ if (err)
+ status = EFI_NOT_FOUND;
}
- err = func(variable_name, vendor_guid, variable_name_size, data);
- if (err)
- status = EFI_NOT_FOUND;
-
- if (!atomic)
+ if (duplicates)
spin_lock_irq(&__efivars->lock);
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
index bf731e9f643e..7f85c2c1d681 100644
--- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
+++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
@@ -276,8 +276,8 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector
}
}
} else {
- for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
- for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+ for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+ for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
if (max_pix_clock >= pix_clock) {
*dp_lanes = lane_num;
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a0f1bd711b53..e3f4c725a1c6 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2872,20 +2872,6 @@ static void intel_dp_info(struct seq_file *m,
intel_panel_info(m, &intel_connector->panel);
}
-static void intel_dp_mst_info(struct seq_file *m,
- struct intel_connector *intel_connector)
-{
- struct intel_encoder *intel_encoder = intel_connector->encoder;
- struct intel_dp_mst_encoder *intel_mst =
- enc_to_mst(&intel_encoder->base);
- struct intel_digital_port *intel_dig_port = intel_mst->primary;
- struct intel_dp *intel_dp = &intel_dig_port->dp;
- bool has_audio = drm_dp_mst_port_has_audio(&intel_dp->mst_mgr,
- intel_connector->port);
-
- seq_printf(m, "\taudio support: %s\n", yesno(has_audio));
-}
-
static void intel_hdmi_info(struct seq_file *m,
struct intel_connector *intel_connector)
{
@@ -2929,8 +2915,6 @@ static void intel_connector_info(struct seq_file *m,
intel_hdmi_info(m, intel_connector);
else if (intel_encoder->type == INTEL_OUTPUT_LVDS)
intel_lvds_info(m, intel_connector);
- else if (intel_encoder->type == INTEL_OUTPUT_DP_MST)
- intel_dp_mst_info(m, intel_connector);
}
seq_printf(m, "\tmodes:\n");
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index fffdac801d3b..363bd79dea2e 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7444,6 +7444,8 @@ enum skl_disp_power_wells {
#define TRANS_CLK_SEL_DISABLED (0x0<<29)
#define TRANS_CLK_SEL_PORT(x) (((x)+1)<<29)
+#define CDCLK_FREQ _MMIO(0x46200)
+
#define _TRANSA_MSA_MISC 0x60410
#define _TRANSB_MSA_MISC 0x61410
#define _TRANSC_MSA_MISC 0x62410
diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c
index 30f921421b0c..7d281b40064a 100644
--- a/drivers/gpu/drm/i915/intel_audio.c
+++ b/drivers/gpu/drm/i915/intel_audio.c
@@ -262,8 +262,7 @@ static void hsw_audio_codec_disable(struct intel_encoder *encoder)
tmp |= AUD_CONFIG_N_PROG_ENABLE;
tmp &= ~AUD_CONFIG_UPPER_N_MASK;
tmp &= ~AUD_CONFIG_LOWER_N_MASK;
- if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) ||
- intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST))
+ if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT))
tmp |= AUD_CONFIG_N_VALUE_INDEX;
I915_WRITE(HSW_AUD_CFG(pipe), tmp);
@@ -476,8 +475,7 @@ static void ilk_audio_codec_enable(struct drm_connector *connector,
tmp &= ~AUD_CONFIG_N_VALUE_INDEX;
tmp &= ~AUD_CONFIG_N_PROG_ENABLE;
tmp &= ~AUD_CONFIG_PIXEL_CLOCK_HDMI_MASK;
- if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) ||
- intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST))
+ if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT))
tmp |= AUD_CONFIG_N_VALUE_INDEX;
else
tmp |= audio_config_hdmi_pixel_clock(adjusted_mode);
@@ -515,8 +513,7 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder)
/* ELD Conn_Type */
connector->eld[5] &= ~(3 << 2);
- if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT) ||
- intel_pipe_has_type(crtc, INTEL_OUTPUT_DP_MST))
+ if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT))
connector->eld[5] |= (1 << 2);
connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2;
diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c
index 505fc5cf26f8..0364292367b1 100644
--- a/drivers/gpu/drm/i915/intel_crt.c
+++ b/drivers/gpu/drm/i915/intel_crt.c
@@ -257,8 +257,14 @@ static bool intel_crt_compute_config(struct intel_encoder *encoder,
pipe_config->has_pch_encoder = true;
/* LPT FDI RX only supports 8bpc. */
- if (HAS_PCH_LPT(dev))
+ if (HAS_PCH_LPT(dev)) {
+ if (pipe_config->bw_constrained && pipe_config->pipe_bpp < 24) {
+ DRM_DEBUG_KMS("LPT only supports 24bpp\n");
+ return false;
+ }
+
pipe_config->pipe_bpp = 24;
+ }
/* FDI must always be 2.7 GHz */
if (HAS_DDI(dev)) {
diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c
index 3b57bf06abe8..96ffcc541e17 100644
--- a/drivers/gpu/drm/i915/intel_ddi.c
+++ b/drivers/gpu/drm/i915/intel_ddi.c
@@ -3106,23 +3106,6 @@ void intel_ddi_fdi_disable(struct drm_crtc *crtc)
I915_WRITE(FDI_RX_CTL(PIPE_A), val);
}
-bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv,
- struct intel_crtc *intel_crtc)
-{
- u32 temp;
-
- if (intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_AUDIO)) {
- temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD);
-
- intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO);
-
- if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe))
- return true;
- }
-
- return false;
-}
-
void intel_ddi_get_config(struct intel_encoder *encoder,
struct intel_crtc_state *pipe_config)
{
@@ -3183,8 +3166,11 @@ void intel_ddi_get_config(struct intel_encoder *encoder,
break;
}
- pipe_config->has_audio =
- intel_ddi_is_audio_enabled(dev_priv, intel_crtc);
+ if (intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_AUDIO)) {
+ temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD);
+ if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe))
+ pipe_config->has_audio = true;
+ }
if (encoder->type == INTEL_OUTPUT_EDP && dev_priv->vbt.edp_bpp &&
pipe_config->pipe_bpp > dev_priv->vbt.edp_bpp) {
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 182f84937345..0104a06d01fd 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -7988,9 +7988,6 @@ static void i9xx_get_pfit_config(struct intel_crtc *crtc,
pipe_config->gmch_pfit.control = tmp;
pipe_config->gmch_pfit.pgm_ratios = I915_READ(PFIT_PGM_RATIOS);
- if (INTEL_INFO(dev)->gen < 5)
- pipe_config->gmch_pfit.lvds_border_bits =
- I915_READ(LVDS) & LVDS_BORDER_ENABLE;
}
static void vlv_crtc_clock_get(struct intel_crtc *crtc,
@@ -9752,6 +9749,8 @@ static void broadwell_set_cdclk(struct drm_device *dev, int cdclk)
sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ, data);
mutex_unlock(&dev_priv->rps.hw_lock);
+ I915_WRITE(CDCLK_FREQ, DIV_ROUND_CLOSEST(cdclk, 1000) - 1);
+
intel_update_cdclk(dev);
WARN(cdclk != dev_priv->cdclk_freq,
diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c
index 937e77228466..2c999725b3d4 100644
--- a/drivers/gpu/drm/i915/intel_dp_mst.c
+++ b/drivers/gpu/drm/i915/intel_dp_mst.c
@@ -78,8 +78,6 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder,
return false;
}
- if (drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, found->port))
- pipe_config->has_audio = true;
mst_pbn = drm_dp_calc_pbn_mode(adjusted_mode->crtc_clock, bpp);
pipe_config->pbn = mst_pbn;
@@ -104,11 +102,6 @@ static void intel_mst_disable_dp(struct intel_encoder *encoder)
struct intel_dp_mst_encoder *intel_mst = enc_to_mst(&encoder->base);
struct intel_digital_port *intel_dig_port = intel_mst->primary;
struct intel_dp *intel_dp = &intel_dig_port->dp;
- struct drm_device *dev = encoder->base.dev;
- struct drm_i915_private *dev_priv = dev->dev_private;
- struct drm_crtc *crtc = encoder->base.crtc;
- struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
-
int ret;
DRM_DEBUG_KMS("%d\n", intel_dp->active_mst_links);
@@ -119,10 +112,6 @@ static void intel_mst_disable_dp(struct intel_encoder *encoder)
if (ret) {
DRM_ERROR("failed to update payload %d\n", ret);
}
- if (intel_crtc->config->has_audio) {
- intel_audio_codec_disable(encoder);
- intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO);
- }
}
static void intel_mst_post_disable_dp(struct intel_encoder *encoder)
@@ -221,7 +210,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder)
struct intel_dp *intel_dp = &intel_dig_port->dp;
struct drm_device *dev = intel_dig_port->base.base.dev;
struct drm_i915_private *dev_priv = dev->dev_private;
- struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc);
enum port port = intel_dig_port->port;
int ret;
@@ -234,13 +222,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder)
ret = drm_dp_check_act_status(&intel_dp->mst_mgr);
ret = drm_dp_update_payload_part2(&intel_dp->mst_mgr);
-
- if (crtc->config->has_audio) {
- DRM_DEBUG_DRIVER("Enabling DP audio on pipe %c\n",
- pipe_name(crtc->pipe));
- intel_display_power_get(dev_priv, POWER_DOMAIN_AUDIO);
- intel_audio_codec_enable(encoder);
- }
}
static bool intel_dp_mst_enc_get_hw_state(struct intel_encoder *encoder,
@@ -266,9 +247,6 @@ static void intel_dp_mst_enc_get_config(struct intel_encoder *encoder,
pipe_config->has_dp_encoder = true;
- pipe_config->has_audio =
- intel_ddi_is_audio_enabled(dev_priv, crtc);
-
temp = I915_READ(TRANS_DDI_FUNC_CTL(cpu_transcoder));
if (temp & TRANS_DDI_PHSYNC)
flags |= DRM_MODE_FLAG_PHSYNC;
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 7d3af3a72abe..9d0770c23fde 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -1019,8 +1019,6 @@ void intel_ddi_set_pipe_settings(struct drm_crtc *crtc);
void intel_ddi_prepare_link_retrain(struct intel_dp *intel_dp);
bool intel_ddi_connector_get_hw_state(struct intel_connector *intel_connector);
void intel_ddi_fdi_disable(struct drm_crtc *crtc);
-bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv,
- struct intel_crtc *intel_crtc);
void intel_ddi_get_config(struct intel_encoder *encoder,
struct intel_crtc_state *pipe_config);
struct intel_encoder *
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index cd9fe609aefb..10dc3517b63b 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -123,6 +123,10 @@ static void intel_lvds_get_config(struct intel_encoder *encoder,
pipe_config->base.adjusted_mode.flags |= flags;
+ if (INTEL_INFO(dev)->gen < 5)
+ pipe_config->gmch_pfit.lvds_border_bits =
+ tmp & LVDS_BORDER_ENABLE;
+
/* gen2/3 store dither state in pfit control, needs to match */
if (INTEL_INFO(dev)->gen < 4) {
tmp = I915_READ(PFIT_CONTROL);
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 8ed3cf34f82d..3425d8e737b3 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -6646,6 +6646,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
misccpctl = I915_READ(GEN7_MISCCPCTL);
I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE);
I915_WRITE(GEN8_L3SQCREG1, BDW_WA_L3SQCREG1_DEFAULT);
+ /*
+ * Wait at least 100 clocks before re-enabling clock gating. See
+ * the definition of L3SQCREG1 in BSpec.
+ */
+ POSTING_READ(GEN8_L3SQCREG1);
+ udelay(1);
I915_WRITE(GEN7_MISCCPCTL, misccpctl);
/*
diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c
index b80b08f71cb4..532127c55de6 100644
--- a/drivers/gpu/drm/radeon/atombios_crtc.c
+++ b/drivers/gpu/drm/radeon/atombios_crtc.c
@@ -1742,6 +1742,7 @@ static u32 radeon_get_pll_use_mask(struct drm_crtc *crtc)
static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc)
{
struct drm_device *dev = crtc->dev;
+ struct radeon_device *rdev = dev->dev_private;
struct drm_crtc *test_crtc;
struct radeon_crtc *test_radeon_crtc;
@@ -1751,6 +1752,10 @@ static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc)
test_radeon_crtc = to_radeon_crtc(test_crtc);
if (test_radeon_crtc->encoder &&
ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) {
+ /* PPLL2 is exclusive to UNIPHYA on DCE61 */
+ if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) &&
+ test_radeon_crtc->pll_id == ATOM_PPLL2)
+ continue;
/* for DP use the same PLL for all */
if (test_radeon_crtc->pll_id != ATOM_PPLL_INVALID)
return test_radeon_crtc->pll_id;
@@ -1772,6 +1777,7 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc)
{
struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc);
struct drm_device *dev = crtc->dev;
+ struct radeon_device *rdev = dev->dev_private;
struct drm_crtc *test_crtc;
struct radeon_crtc *test_radeon_crtc;
u32 adjusted_clock, test_adjusted_clock;
@@ -1787,6 +1793,10 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc)
test_radeon_crtc = to_radeon_crtc(test_crtc);
if (test_radeon_crtc->encoder &&
!ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) {
+ /* PPLL2 is exclusive to UNIPHYA on DCE61 */
+ if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) &&
+ test_radeon_crtc->pll_id == ATOM_PPLL2)
+ continue;
/* check if we are already driving this connector with another crtc */
if (test_radeon_crtc->connector == radeon_crtc->connector) {
/* if we are, return that pll */
diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c
index afa9db1dc0e3..cead089a9e7d 100644
--- a/drivers/gpu/drm/radeon/atombios_dp.c
+++ b/drivers/gpu/drm/radeon/atombios_dp.c
@@ -326,8 +326,8 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector,
}
}
} else {
- for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
- for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+ for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) {
+ for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) {
max_pix_clock = (lane_num * link_rates[i] * 8) / bpp;
if (max_pix_clock >= pix_clock) {
*dp_lanes = lane_num;
diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
index 3b0c229d7dcd..db64e0062689 100644
--- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c
+++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
@@ -105,7 +105,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg
tmp &= AUX_HPD_SEL(0x7);
tmp |= AUX_HPD_SEL(chan->rec.hpd);
- tmp |= AUX_EN | AUX_LS_READ_EN;
+ tmp |= AUX_EN | AUX_LS_READ_EN | AUX_HPD_DISCON(0x1);
WREG32(AUX_CONTROL + aux_offset[instance], tmp);
diff --git a/drivers/input/misc/max8997_haptic.c b/drivers/input/misc/max8997_haptic.c
index a806ba3818f7..8d6326d7e7be 100644
--- a/drivers/input/misc/max8997_haptic.c
+++ b/drivers/input/misc/max8997_haptic.c
@@ -255,12 +255,14 @@ static int max8997_haptic_probe(struct platform_device *pdev)
struct max8997_dev *iodev = dev_get_drvdata(pdev->dev.parent);
const struct max8997_platform_data *pdata =
dev_get_platdata(iodev->dev);
- const struct max8997_haptic_platform_data *haptic_pdata =
- pdata->haptic_pdata;
+ const struct max8997_haptic_platform_data *haptic_pdata = NULL;
struct max8997_haptic *chip;
struct input_dev *input_dev;
int error;
+ if (pdata)
+ haptic_pdata = pdata->haptic_pdata;
+
if (!haptic_pdata) {
dev_err(&pdev->dev, "no haptic platform data\n");
return -EINVAL;
diff --git a/drivers/input/misc/twl6040-vibra.c b/drivers/input/misc/twl6040-vibra.c
index df3581f60628..42de34b92996 100644
--- a/drivers/input/misc/twl6040-vibra.c
+++ b/drivers/input/misc/twl6040-vibra.c
@@ -257,6 +257,7 @@ static int twl6040_vibra_probe(struct platform_device *pdev)
int vddvibr_uV = 0;
int error;
+ of_node_get(twl6040_core_dev->of_node);
twl6040_core_node = of_find_node_by_name(twl6040_core_dev->of_node,
"vibra");
if (!twl6040_core_node) {
diff --git a/drivers/input/mouse/byd.c b/drivers/input/mouse/byd.c
index fdc243ca93ed..e583f8b50454 100644
--- a/drivers/input/mouse/byd.c
+++ b/drivers/input/mouse/byd.c
@@ -2,6 +2,10 @@
* BYD TouchPad PS/2 mouse driver
*
* Copyright (C) 2015 Chris Diamand <chris@diamand.org>
+ * Copyright (C) 2015 Richard Pospesel
+ * Copyright (C) 2015 Tai Chi Minh Ralph Eastwood
+ * Copyright (C) 2015 Martin Wimpress
+ * Copyright (C) 2015 Jay Kuri
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
diff --git a/drivers/media/v4l2-core/videobuf2-v4l2.c b/drivers/media/v4l2-core/videobuf2-v4l2.c
index 7f366f1b0377..0b1b8c7b6ce5 100644
--- a/drivers/media/v4l2-core/videobuf2-v4l2.c
+++ b/drivers/media/v4l2-core/videobuf2-v4l2.c
@@ -74,11 +74,6 @@ static int __verify_planes_array(struct vb2_buffer *vb, const struct v4l2_buffer
return 0;
}
-static int __verify_planes_array_core(struct vb2_buffer *vb, const void *pb)
-{
- return __verify_planes_array(vb, pb);
-}
-
/**
* __verify_length() - Verify that the bytesused value for each plane fits in
* the plane length and that the data offset doesn't exceed the bytesused value.
@@ -442,7 +437,6 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb,
}
static const struct vb2_buf_ops v4l2_buf_ops = {
- .verify_planes_array = __verify_planes_array_core,
.fill_user_buffer = __fill_v4l2_buffer,
.fill_vb2_buffer = __fill_vb2_buffer,
.copy_timestamp = __copy_timestamp,
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
index b212488606da..11be8044e0d7 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
@@ -43,6 +43,7 @@ static void xgene_cle_idt_to_hw(u32 dstqid, u32 fpsel,
static void xgene_cle_dbptr_to_hw(struct xgene_enet_pdata *pdata,
struct xgene_cle_dbptr *dbptr, u32 *buf)
{
+ buf[0] = SET_VAL(CLE_DROP, dbptr->drop);
buf[4] = SET_VAL(CLE_FPSEL, dbptr->fpsel) |
SET_VAL(CLE_DSTQIDL, dbptr->dstqid);
@@ -412,7 +413,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
.branch = {
{
/* IPV4 */
- .valid = 0,
+ .valid = 1,
.next_packet_pointer = 22,
.jump_bw = JMP_FW,
.jump_rel = JMP_ABS,
@@ -420,7 +421,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
.next_node = PKT_PROT_NODE,
.next_branch = 0,
.data = 0x8,
- .mask = 0xffff
+ .mask = 0x0
},
{
.valid = 0,
@@ -456,7 +457,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
.next_node = RSS_IPV4_TCP_NODE,
.next_branch = 0,
.data = 0x0600,
- .mask = 0xffff
+ .mask = 0x00ff
},
{
/* UDP */
@@ -468,7 +469,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
.next_node = RSS_IPV4_UDP_NODE,
.next_branch = 0,
.data = 0x1100,
- .mask = 0xffff
+ .mask = 0x00ff
},
{
.valid = 0,
@@ -642,7 +643,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
{
/* TCP DST Port */
.valid = 0,
- .next_packet_pointer = 256,
+ .next_packet_pointer = 258,
.jump_bw = JMP_FW,
.jump_rel = JMP_ABS,
.operation = EQT,
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
index 29a17abdd828..3bf90683240e 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
@@ -83,6 +83,8 @@
#define CLE_TYPE_POS 0
#define CLE_TYPE_LEN 2
+#define CLE_DROP_POS 28
+#define CLE_DROP_LEN 1
#define CLE_DSTQIDL_POS 25
#define CLE_DSTQIDL_LEN 7
#define CLE_DSTQIDH_POS 0
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 39e081a70f5b..513d2a62ee6d 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -219,27 +219,30 @@ void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring,
struct xgene_enet_pdata *pdata,
enum xgene_enet_err_code status)
{
- struct rtnl_link_stats64 *stats = &pdata->stats;
-
switch (status) {
case INGRESS_CRC:
- stats->rx_crc_errors++;
+ ring->rx_crc_errors++;
+ ring->rx_dropped++;
break;
case INGRESS_CHECKSUM:
case INGRESS_CHECKSUM_COMPUTE:
- stats->rx_errors++;
+ ring->rx_errors++;
+ ring->rx_dropped++;
break;
case INGRESS_TRUNC_FRAME:
- stats->rx_frame_errors++;
+ ring->rx_frame_errors++;
+ ring->rx_dropped++;
break;
case INGRESS_PKT_LEN:
- stats->rx_length_errors++;
+ ring->rx_length_errors++;
+ ring->rx_dropped++;
break;
case INGRESS_PKT_UNDER:
- stats->rx_frame_errors++;
+ ring->rx_frame_errors++;
+ ring->rx_dropped++;
break;
case INGRESS_FIFO_OVERRUN:
- stats->rx_fifo_errors++;
+ ring->rx_fifo_errors++;
break;
default:
break;
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
index ba7da98af2ef..45220be3122f 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
@@ -86,7 +86,7 @@ enum xgene_enet_rm {
#define RINGADDRL_POS 5
#define RINGADDRL_LEN 27
#define RINGADDRH_POS 0
-#define RINGADDRH_LEN 6
+#define RINGADDRH_LEN 7
#define RINGSIZE_POS 23
#define RINGSIZE_LEN 3
#define RINGTYPE_POS 19
@@ -94,9 +94,9 @@ enum xgene_enet_rm {
#define RINGMODE_POS 20
#define RINGMODE_LEN 3
#define RECOMTIMEOUTL_POS 28
-#define RECOMTIMEOUTL_LEN 3
+#define RECOMTIMEOUTL_LEN 4
#define RECOMTIMEOUTH_POS 0
-#define RECOMTIMEOUTH_LEN 2
+#define RECOMTIMEOUTH_LEN 3
#define NUMMSGSINQ_POS 1
#define NUMMSGSINQ_LEN 16
#define ACCEPTLERR BIT(19)
@@ -201,6 +201,8 @@ enum xgene_enet_rm {
#define USERINFO_LEN 32
#define FPQNUM_POS 32
#define FPQNUM_LEN 12
+#define ELERR_POS 46
+#define ELERR_LEN 2
#define NV_POS 50
#define NV_LEN 1
#define LL_POS 51
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index 99d7e580e166..fd200883d228 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -443,8 +443,8 @@ static netdev_tx_t xgene_enet_start_xmit(struct sk_buff *skb,
skb_tx_timestamp(skb);
- pdata->stats.tx_packets++;
- pdata->stats.tx_bytes += skb->len;
+ tx_ring->tx_packets++;
+ tx_ring->tx_bytes += skb->len;
pdata->ring_ops->wr_cmd(tx_ring, count);
return NETDEV_TX_OK;
@@ -483,12 +483,12 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring,
skb = buf_pool->rx_skb[skb_index];
/* checking for error */
- status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0));
+ status = (GET_VAL(ELERR, le64_to_cpu(raw_desc->m0)) << LERR_LEN) |
+ GET_VAL(LERR, le64_to_cpu(raw_desc->m0));
if (unlikely(status > 2)) {
dev_kfree_skb_any(skb);
xgene_enet_parse_error(rx_ring, netdev_priv(rx_ring->ndev),
status);
- pdata->stats.rx_dropped++;
ret = -EIO;
goto out;
}
@@ -506,8 +506,8 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring,
xgene_enet_skip_csum(skb);
}
- pdata->stats.rx_packets++;
- pdata->stats.rx_bytes += datalen;
+ rx_ring->rx_packets++;
+ rx_ring->rx_bytes += datalen;
napi_gro_receive(&rx_ring->napi, skb);
out:
if (--rx_ring->nbufpool == 0) {
@@ -630,7 +630,7 @@ static int xgene_enet_register_irq(struct net_device *ndev)
ring = pdata->rx_ring[i];
irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY);
ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq,
- IRQF_SHARED, ring->irq_name, ring);
+ 0, ring->irq_name, ring);
if (ret) {
netdev_err(ndev, "Failed to request irq %s\n",
ring->irq_name);
@@ -641,7 +641,7 @@ static int xgene_enet_register_irq(struct net_device *ndev)
ring = pdata->tx_ring[i]->cp_ring;
irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY);
ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq,
- IRQF_SHARED, ring->irq_name, ring);
+ 0, ring->irq_name, ring);
if (ret) {
netdev_err(ndev, "Failed to request irq %s\n",
ring->irq_name);
@@ -1114,12 +1114,31 @@ static struct rtnl_link_stats64 *xgene_enet_get_stats64(
{
struct xgene_enet_pdata *pdata = netdev_priv(ndev);
struct rtnl_link_stats64 *stats = &pdata->stats;
+ struct xgene_enet_desc_ring *ring;
+ int i;
- stats->rx_errors += stats->rx_length_errors +
- stats->rx_crc_errors +
- stats->rx_frame_errors +
- stats->rx_fifo_errors;
- memcpy(storage, &pdata->stats, sizeof(struct rtnl_link_stats64));
+ memset(stats, 0, sizeof(struct rtnl_link_stats64));
+ for (i = 0; i < pdata->txq_cnt; i++) {
+ ring = pdata->tx_ring[i];
+ if (ring) {
+ stats->tx_packets += ring->tx_packets;
+ stats->tx_bytes += ring->tx_bytes;
+ }
+ }
+
+ for (i = 0; i < pdata->rxq_cnt; i++) {
+ ring = pdata->rx_ring[i];
+ if (ring) {
+ stats->rx_packets += ring->rx_packets;
+ stats->rx_bytes += ring->rx_bytes;
+ stats->rx_errors += ring->rx_errors +
+ ring->rx_length_errors + ring->rx_crc_errors +
+ ring->rx_frame_errors +
+ ring->rx_fifo_errors;
+ stats->rx_dropped += ring->rx_dropped;
+ }
+ }
+ memcpy(storage, stats, sizeof(struct rtnl_link_stats64));
return storage;
}
@@ -1234,6 +1253,13 @@ static int xgene_enet_get_irqs(struct xgene_enet_pdata *pdata)
for (i = 0; i < max_irqs; i++) {
ret = platform_get_irq(pdev, i);
if (ret <= 0) {
+ if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) {
+ max_irqs = i;
+ pdata->rxq_cnt = max_irqs / 2;
+ pdata->txq_cnt = max_irqs / 2;
+ pdata->cq_cnt = max_irqs / 2;
+ break;
+ }
dev_err(dev, "Unable to get ENET IRQ\n");
ret = ret ? : -ENXIO;
return ret;
@@ -1437,19 +1463,28 @@ static void xgene_enet_setup_ops(struct xgene_enet_pdata *pdata)
pdata->port_ops = &xgene_xgport_ops;
pdata->cle_ops = &xgene_cle3in_ops;
pdata->rm = RM0;
- pdata->rxq_cnt = XGENE_NUM_RX_RING;
- pdata->txq_cnt = XGENE_NUM_TX_RING;
- pdata->cq_cnt = XGENE_NUM_TXC_RING;
+ if (!pdata->rxq_cnt) {
+ pdata->rxq_cnt = XGENE_NUM_RX_RING;
+ pdata->txq_cnt = XGENE_NUM_TX_RING;
+ pdata->cq_cnt = XGENE_NUM_TXC_RING;
+ }
break;
}
if (pdata->enet_id == XGENE_ENET1) {
switch (pdata->port_id) {
case 0:
- pdata->cpu_bufnum = START_CPU_BUFNUM_0;
- pdata->eth_bufnum = START_ETH_BUFNUM_0;
- pdata->bp_bufnum = START_BP_BUFNUM_0;
- pdata->ring_num = START_RING_NUM_0;
+ if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) {
+ pdata->cpu_bufnum = X2_START_CPU_BUFNUM_0;
+ pdata->eth_bufnum = X2_START_ETH_BUFNUM_0;
+ pdata->bp_bufnum = X2_START_BP_BUFNUM_0;
+ pdata->ring_num = START_RING_NUM_0;
+ } else {
+ pdata->cpu_bufnum = START_CPU_BUFNUM_0;
+ pdata->eth_bufnum = START_ETH_BUFNUM_0;
+ pdata->bp_bufnum = START_BP_BUFNUM_0;
+ pdata->ring_num = START_RING_NUM_0;
+ }
break;
case 1:
if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) {
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
index 175d18890c7a..9d9cf445148c 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
@@ -49,10 +49,10 @@
#define XGENE_ENET_MSS 1448
#define XGENE_MIN_ENET_FRAME_SIZE 60
-#define XGENE_MAX_ENET_IRQ 8
-#define XGENE_NUM_RX_RING 4
-#define XGENE_NUM_TX_RING 4
-#define XGENE_NUM_TXC_RING 4
+#define XGENE_MAX_ENET_IRQ 16
+#define XGENE_NUM_RX_RING 8
+#define XGENE_NUM_TX_RING 8
+#define XGENE_NUM_TXC_RING 8
#define START_CPU_BUFNUM_0 0
#define START_ETH_BUFNUM_0 2
@@ -121,6 +121,16 @@ struct xgene_enet_desc_ring {
struct xgene_enet_raw_desc16 *raw_desc16;
};
__le64 *exp_bufs;
+ u64 tx_packets;
+ u64 tx_bytes;
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 rx_dropped;
+ u64 rx_errors;
+ u64 rx_length_errors;
+ u64 rx_crc_errors;
+ u64 rx_frame_errors;
+ u64 rx_fifo_errors;
};
struct xgene_mac_ops {
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h
index 29a71b4dcc44..002df5a6756e 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h
@@ -33,7 +33,7 @@
#define LINK_STATUS BIT(2)
#define LINK_UP BIT(15)
#define MPA_IDLE_WITH_QMI_EMPTY BIT(12)
-#define SG_RX_DV_GATE_REG_0_ADDR 0x0dfc
+#define SG_RX_DV_GATE_REG_0_ADDR 0x05fc
extern const struct xgene_mac_ops xgene_sgmac_ops;
extern const struct xgene_port_ops xgene_sgport_ops;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9d4e8e113fe1..c39a7f5c6a01 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -813,6 +813,46 @@ static inline struct sk_buff *bnxt_copy_skb(struct bnxt_napi *bnapi, u8 *data,
return skb;
}
+static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_napi *bnapi,
+ u32 *raw_cons, void *cmp)
+{
+ struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
+ struct rx_cmp *rxcmp = cmp;
+ u32 tmp_raw_cons = *raw_cons;
+ u8 cmp_type, agg_bufs = 0;
+
+ cmp_type = RX_CMP_TYPE(rxcmp);
+
+ if (cmp_type == CMP_TYPE_RX_L2_CMP) {
+ agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) &
+ RX_CMP_AGG_BUFS) >>
+ RX_CMP_AGG_BUFS_SHIFT;
+ } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) {
+ struct rx_tpa_end_cmp *tpa_end = cmp;
+
+ agg_bufs = (le32_to_cpu(tpa_end->rx_tpa_end_cmp_misc_v1) &
+ RX_TPA_END_CMP_AGG_BUFS) >>
+ RX_TPA_END_CMP_AGG_BUFS_SHIFT;
+ }
+
+ if (agg_bufs) {
+ if (!bnxt_agg_bufs_valid(bp, cpr, agg_bufs, &tmp_raw_cons))
+ return -EBUSY;
+ }
+ *raw_cons = tmp_raw_cons;
+ return 0;
+}
+
+static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
+{
+ if (!rxr->bnapi->in_reset) {
+ rxr->bnapi->in_reset = true;
+ set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event);
+ schedule_work(&bp->sp_task);
+ }
+ rxr->rx_next_cons = 0xffff;
+}
+
static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
struct rx_tpa_start_cmp *tpa_start,
struct rx_tpa_start_cmp_ext *tpa_start1)
@@ -830,6 +870,11 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
prod_rx_buf = &rxr->rx_buf_ring[prod];
tpa_info = &rxr->rx_tpa[agg_id];
+ if (unlikely(cons != rxr->rx_next_cons)) {
+ bnxt_sched_reset(bp, rxr);
+ return;
+ }
+
prod_rx_buf->data = tpa_info->data;
mapping = tpa_info->mapping;
@@ -867,6 +912,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
rxr->rx_prod = NEXT_RX(prod);
cons = NEXT_RX(cons);
+ rxr->rx_next_cons = NEXT_RX(cons);
cons_rx_buf = &rxr->rx_buf_ring[cons];
bnxt_reuse_rx_data(rxr, cons, cons_rx_buf->data);
@@ -980,6 +1026,14 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp,
dma_addr_t mapping;
struct sk_buff *skb;
+ if (unlikely(bnapi->in_reset)) {
+ int rc = bnxt_discard_rx(bp, bnapi, raw_cons, tpa_end);
+
+ if (rc < 0)
+ return ERR_PTR(-EBUSY);
+ return NULL;
+ }
+
tpa_info = &rxr->rx_tpa[agg_id];
data = tpa_info->data;
prefetch(data);
@@ -1146,6 +1200,12 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons,
cons = rxcmp->rx_cmp_opaque;
rx_buf = &rxr->rx_buf_ring[cons];
data = rx_buf->data;
+ if (unlikely(cons != rxr->rx_next_cons)) {
+ int rc1 = bnxt_discard_rx(bp, bnapi, raw_cons, rxcmp);
+
+ bnxt_sched_reset(bp, rxr);
+ return rc1;
+ }
prefetch(data);
agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & RX_CMP_AGG_BUFS) >>
@@ -1245,6 +1305,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons,
next_rx:
rxr->rx_prod = NEXT_RX(prod);
+ rxr->rx_next_cons = NEXT_RX(cons);
next_rx_no_prod:
*raw_cons = tmp_raw_cons;
@@ -2486,6 +2547,7 @@ static void bnxt_clear_ring_indices(struct bnxt *bp)
rxr->rx_prod = 0;
rxr->rx_agg_prod = 0;
rxr->rx_sw_agg_prod = 0;
+ rxr->rx_next_cons = 0;
}
}
}
@@ -4462,6 +4524,7 @@ static void bnxt_enable_napi(struct bnxt *bp)
int i;
for (i = 0; i < bp->cp_nr_rings; i++) {
+ bp->bnapi[i]->in_reset = false;
bnxt_enable_poll(bp->bnapi[i]);
napi_enable(&bp->bnapi[i]->napi);
}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 8b823ff558ff..de9d53eee3dd 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -584,6 +584,7 @@ struct bnxt_rx_ring_info {
u16 rx_prod;
u16 rx_agg_prod;
u16 rx_sw_agg_prod;
+ u16 rx_next_cons;
void __iomem *rx_doorbell;
void __iomem *rx_agg_doorbell;
@@ -636,6 +637,7 @@ struct bnxt_napi {
#ifdef CONFIG_NET_RX_BUSY_POLL
atomic_t poll_state;
#endif
+ bool in_reset;
};
#ifdef CONFIG_NET_RX_BUSY_POLL
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index fa05e347262f..06b819db51b1 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -533,6 +533,7 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
nicvf_config_vlan_stripping(nic, nic->netdev->features);
/* Enable Receive queue */
+ memset(&rq_cfg, 0, sizeof(struct rq_cfg));
rq_cfg.ena = 1;
rq_cfg.tcp_ena = 0;
nicvf_queue_reg_write(nic, NIC_QSET_RQ_0_7_CFG, qidx, *(u64 *)&rq_cfg);
@@ -565,6 +566,7 @@ void nicvf_cmp_queue_config(struct nicvf *nic, struct queue_set *qs,
qidx, (u64)(cq->dmem.phys_base));
/* Enable Completion queue */
+ memset(&cq_cfg, 0, sizeof(struct cq_cfg));
cq_cfg.ena = 1;
cq_cfg.reset = 0;
cq_cfg.caching = 0;
@@ -613,6 +615,7 @@ static void nicvf_snd_queue_config(struct nicvf *nic, struct queue_set *qs,
qidx, (u64)(sq->dmem.phys_base));
/* Enable send queue & set queue size */
+ memset(&sq_cfg, 0, sizeof(struct sq_cfg));
sq_cfg.ena = 1;
sq_cfg.reset = 0;
sq_cfg.ldwb = 0;
@@ -649,6 +652,7 @@ static void nicvf_rbdr_config(struct nicvf *nic, struct queue_set *qs,
/* Enable RBDR & set queue size */
/* Buffer size should be in multiples of 128 bytes */
+ memset(&rbdr_cfg, 0, sizeof(struct rbdr_cfg));
rbdr_cfg.ena = 1;
rbdr_cfg.reset = 0;
rbdr_cfg.ldwb = 0;
diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c
index 1f23845a0694..085f9125cf42 100644
--- a/drivers/net/ethernet/ezchip/nps_enet.c
+++ b/drivers/net/ethernet/ezchip/nps_enet.c
@@ -145,7 +145,7 @@ static void nps_enet_tx_handler(struct net_device *ndev)
u32 tx_ctrl_nt = (tx_ctrl_value & TX_CTL_NT_MASK) >> TX_CTL_NT_SHIFT;
/* Check if we got TX */
- if (!priv->tx_packet_sent || tx_ctrl_ct)
+ if (!priv->tx_skb || tx_ctrl_ct)
return;
/* Ack Tx ctrl register */
@@ -160,7 +160,7 @@ static void nps_enet_tx_handler(struct net_device *ndev)
}
dev_kfree_skb(priv->tx_skb);
- priv->tx_packet_sent = false;
+ priv->tx_skb = NULL;
if (netif_queue_stopped(ndev))
netif_wake_queue(ndev);
@@ -183,6 +183,9 @@ static int nps_enet_poll(struct napi_struct *napi, int budget)
work_done = nps_enet_rx_handler(ndev);
if (work_done < budget) {
u32 buf_int_enable_value = 0;
+ u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL);
+ u32 tx_ctrl_ct =
+ (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT;
napi_complete(napi);
@@ -192,6 +195,18 @@ static int nps_enet_poll(struct napi_struct *napi, int budget)
nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE,
buf_int_enable_value);
+
+ /* The tx interrupt is edge-triggered, so a tx interrupt that
+ * fires while interrupts are masked is lost. Specifically, if
+ * it fires while executing the code section above, between
+ * nps_enet_tx_handler and the interrupts enable, all tx
+ * requests will be stuck until we get an rx interrupt.
+ * The two code lines below solve this situation by
+ * re-adding ourselves to the poll list.
+ */
+
+ if (priv->tx_skb && !tx_ctrl_ct)
+ napi_reschedule(napi);
}
return work_done;
@@ -217,7 +232,7 @@ static irqreturn_t nps_enet_irq_handler(s32 irq, void *dev_instance)
u32 tx_ctrl_ct = (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT;
u32 rx_ctrl_cr = (rx_ctrl_value & RX_CTL_CR_MASK) >> RX_CTL_CR_SHIFT;
- if ((!tx_ctrl_ct && priv->tx_packet_sent) || rx_ctrl_cr)
+ if ((!tx_ctrl_ct && priv->tx_skb) || rx_ctrl_cr)
if (likely(napi_schedule_prep(&priv->napi))) {
nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0);
__napi_schedule(&priv->napi);
@@ -387,8 +402,6 @@ static void nps_enet_send_frame(struct net_device *ndev,
/* Write the length of the Frame */
tx_ctrl_value |= length << TX_CTL_NT_SHIFT;
- /* Indicate SW is done */
- priv->tx_packet_sent = true;
tx_ctrl_value |= NPS_ENET_ENABLE << TX_CTL_CT_SHIFT;
/* Send Frame */
nps_enet_reg_set(priv, NPS_ENET_REG_TX_CTL, tx_ctrl_value);
@@ -465,7 +478,7 @@ static s32 nps_enet_open(struct net_device *ndev)
s32 err;
/* Reset private variables */
- priv->tx_packet_sent = false;
+ priv->tx_skb = NULL;
priv->ge_mac_cfg_2_value = 0;
priv->ge_mac_cfg_3_value = 0;
@@ -534,6 +547,11 @@ static netdev_tx_t nps_enet_start_xmit(struct sk_buff *skb,
priv->tx_skb = skb;
+ /* make sure tx_skb is actually written to the memory
+ * before the HW is informed and the IRQ is fired.
+ */
+ wmb();
+
nps_enet_send_frame(ndev, skb);
return NETDEV_TX_OK;
diff --git a/drivers/net/ethernet/ezchip/nps_enet.h b/drivers/net/ethernet/ezchip/nps_enet.h
index d0cab600bce8..3939ca20cc9f 100644
--- a/drivers/net/ethernet/ezchip/nps_enet.h
+++ b/drivers/net/ethernet/ezchip/nps_enet.h
@@ -165,14 +165,12 @@
* struct nps_enet_priv - Storage of ENET's private information.
* @regs_base: Base address of ENET memory-mapped control registers.
* @irq: For RX/TX IRQ number.
- * @tx_packet_sent: SW indication if frame is being sent.
* @tx_skb: socket buffer of sent frame.
* @napi: Structure for NAPI.
*/
struct nps_enet_priv {
void __iomem *regs_base;
s32 irq;
- bool tx_packet_sent;
struct sk_buff *tx_skb;
struct napi_struct napi;
u32 ge_mac_cfg_2_value;
diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig
index b5c6d42daa12..2664827ddecd 100644
--- a/drivers/net/ethernet/marvell/Kconfig
+++ b/drivers/net/ethernet/marvell/Kconfig
@@ -68,7 +68,7 @@ config MVNETA
config MVNETA_BM
tristate
- default y if MVNETA=y && MVNETA_BM_ENABLE
+ default y if MVNETA=y && MVNETA_BM_ENABLE!=n
default MVNETA_BM_ENABLE
select HWBM
help
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c
index cda9e604a95f..0844b7c75767 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c
@@ -1417,6 +1417,7 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter)
struct qlcnic_fw_dump *fw_dump = &ahw->fw_dump;
struct pci_dev *pdev = adapter->pdev;
bool extended = false;
+ int ret;
prev_version = adapter->fw_version;
current_version = qlcnic_83xx_get_fw_version(adapter);
@@ -1427,8 +1428,11 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter)
if (qlcnic_83xx_md_check_extended_dump_capability(adapter))
extended = !qlcnic_83xx_extend_md_capab(adapter);
- if (!qlcnic_fw_cmd_get_minidump_temp(adapter))
- dev_info(&pdev->dev, "Supports FW dump capability\n");
+ ret = qlcnic_fw_cmd_get_minidump_temp(adapter);
+ if (ret)
+ return;
+
+ dev_info(&pdev->dev, "Supports FW dump capability\n");
/* Once we have minidump template with extended iSCSI dump
* capability, update the minidump capture mask to 0x1f as
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 9e2a0bd8f5a8..4277d0c12101 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1506,6 +1506,8 @@ static int ravb_close(struct net_device *ndev)
priv->phydev = NULL;
}
+ if (priv->chip_id == RCAR_GEN3)
+ free_irq(priv->emac_irq, ndev);
free_irq(ndev->irq, ndev);
napi_disable(&priv->napi[RAVB_NC]);
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 5590b9c182c9..445fc5aef308 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -790,9 +790,11 @@ void phy_start(struct phy_device *phydev)
break;
case PHY_HALTED:
/* make sure interrupts are re-enabled for the PHY */
- err = phy_enable_interrupts(phydev);
- if (err < 0)
- break;
+ if (phydev->irq != PHY_POLL) {
+ err = phy_enable_interrupts(phydev);
+ if (err < 0)
+ break;
+ }
phydev->state = PHY_RESUMING;
do_resume = true;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 75870e68a7c3..34731e29c589 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -105,6 +105,7 @@ void iwl_mvm_set_tx_cmd(struct iwl_mvm *mvm, struct sk_buff *skb,
struct iwl_tx_cmd *tx_cmd,
struct ieee80211_tx_info *info, u8 sta_id)
{
+ struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb);
struct ieee80211_hdr *hdr = (void *)skb->data;
__le16 fc = hdr->frame_control;
u32 tx_flags = le32_to_cpu(tx_cmd->tx_flags);
@@ -185,7 +186,7 @@ void iwl_mvm_set_tx_cmd(struct iwl_mvm *mvm, struct sk_buff *skb,
tx_cmd->tx_flags = cpu_to_le32(tx_flags);
/* Total # bytes to be transmitted */
tx_cmd->len = cpu_to_le16((u16)skb->len +
- (uintptr_t)info->driver_data[0]);
+ (uintptr_t)skb_info->driver_data[0]);
tx_cmd->next_frame_len = 0;
tx_cmd->life_time = cpu_to_le32(TX_CMD_LIFE_TIME_INFINITE);
tx_cmd->sta_id = sta_id;
@@ -327,10 +328,11 @@ static void iwl_mvm_set_tx_cmd_crypto(struct iwl_mvm *mvm,
*/
static struct iwl_device_cmd *
iwl_mvm_set_tx_params(struct iwl_mvm *mvm, struct sk_buff *skb,
- int hdrlen, struct ieee80211_sta *sta, u8 sta_id)
+ struct ieee80211_tx_info *info, int hdrlen,
+ struct ieee80211_sta *sta, u8 sta_id)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb);
struct iwl_device_cmd *dev_cmd;
struct iwl_tx_cmd *tx_cmd;
@@ -350,10 +352,10 @@ iwl_mvm_set_tx_params(struct iwl_mvm *mvm, struct sk_buff *skb,
iwl_mvm_set_tx_cmd_rate(mvm, tx_cmd, info, sta, hdr->frame_control);
- memset(&info->status, 0, sizeof(info->status));
- memset(info->driver_data, 0, sizeof(info->driver_data));
+ memset(&skb_info->status, 0, sizeof(skb_info->status));
+ memset(skb_info->driver_data, 0, sizeof(skb_info->driver_data));
- info->driver_data[1] = dev_cmd;
+ skb_info->driver_data[1] = dev_cmd;
return dev_cmd;
}
@@ -361,22 +363,25 @@ iwl_mvm_set_tx_params(struct iwl_mvm *mvm, struct sk_buff *skb,
int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_info info;
struct iwl_device_cmd *dev_cmd;
struct iwl_tx_cmd *tx_cmd;
u8 sta_id;
int hdrlen = ieee80211_hdrlen(hdr->frame_control);
- if (WARN_ON_ONCE(info->flags & IEEE80211_TX_CTL_AMPDU))
+ memcpy(&info, skb->cb, sizeof(info));
+
+ if (WARN_ON_ONCE(info.flags & IEEE80211_TX_CTL_AMPDU))
return -1;
- if (WARN_ON_ONCE(info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM &&
- (!info->control.vif ||
- info->hw_queue != info->control.vif->cab_queue)))
+ if (WARN_ON_ONCE(info.flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM &&
+ (!info.control.vif ||
+ info.hw_queue != info.control.vif->cab_queue)))
return -1;
/* This holds the amsdu headers length */
- info->driver_data[0] = (void *)(uintptr_t)0;
+ skb_info->driver_data[0] = (void *)(uintptr_t)0;
/*
* IWL_MVM_OFFCHANNEL_QUEUE is used for ROC packets that can be used
@@ -385,7 +390,7 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
* and hence needs to be sent on the aux queue
*/
if (IEEE80211_SKB_CB(skb)->hw_queue == IWL_MVM_OFFCHANNEL_QUEUE &&
- info->control.vif->type == NL80211_IFTYPE_STATION)
+ info.control.vif->type == NL80211_IFTYPE_STATION)
IEEE80211_SKB_CB(skb)->hw_queue = mvm->aux_queue;
/*
@@ -398,14 +403,14 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
* AUX station.
*/
sta_id = mvm->aux_sta.sta_id;
- if (info->control.vif) {
+ if (info.control.vif) {
struct iwl_mvm_vif *mvmvif =
- iwl_mvm_vif_from_mac80211(info->control.vif);
+ iwl_mvm_vif_from_mac80211(info.control.vif);
- if (info->control.vif->type == NL80211_IFTYPE_P2P_DEVICE ||
- info->control.vif->type == NL80211_IFTYPE_AP)
+ if (info.control.vif->type == NL80211_IFTYPE_P2P_DEVICE ||
+ info.control.vif->type == NL80211_IFTYPE_AP)
sta_id = mvmvif->bcast_sta.sta_id;
- else if (info->control.vif->type == NL80211_IFTYPE_STATION &&
+ else if (info.control.vif->type == NL80211_IFTYPE_STATION &&
is_multicast_ether_addr(hdr->addr1)) {
u8 ap_sta_id = ACCESS_ONCE(mvmvif->ap_sta_id);
@@ -414,19 +419,18 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
}
}
- IWL_DEBUG_TX(mvm, "station Id %d, queue=%d\n", sta_id, info->hw_queue);
+ IWL_DEBUG_TX(mvm, "station Id %d, queue=%d\n", sta_id, info.hw_queue);
- dev_cmd = iwl_mvm_set_tx_params(mvm, skb, hdrlen, NULL, sta_id);
+ dev_cmd = iwl_mvm_set_tx_params(mvm, skb, &info, hdrlen, NULL, sta_id);
if (!dev_cmd)
return -1;
- /* From now on, we cannot access info->control */
tx_cmd = (struct iwl_tx_cmd *)dev_cmd->payload;
/* Copy MAC header from skb into command buffer */
memcpy(tx_cmd->hdr, hdr, hdrlen);
- if (iwl_trans_tx(mvm->trans, skb, dev_cmd, info->hw_queue)) {
+ if (iwl_trans_tx(mvm->trans, skb, dev_cmd, info.hw_queue)) {
iwl_trans_free_tx_cmd(mvm->trans, dev_cmd);
return -1;
}
@@ -445,11 +449,11 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
#ifdef CONFIG_INET
static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
+ struct ieee80211_tx_info *info,
struct ieee80211_sta *sta,
struct sk_buff_head *mpdus_skb)
{
struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct ieee80211_hdr *hdr = (void *)skb->data;
unsigned int mss = skb_shinfo(skb)->gso_size;
struct sk_buff *tmp, *next;
@@ -544,6 +548,8 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
/* This skb fits in one single A-MSDU */
if (num_subframes * mss >= tcp_payload_len) {
+ struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb);
+
/*
* Compute the length of all the data added for the A-MSDU.
* This will be used to compute the length to write in the TX
@@ -552,11 +558,10 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
* already had one set of SNAP / IP / TCP headers.
*/
num_subframes = DIV_ROUND_UP(tcp_payload_len, mss);
- info = IEEE80211_SKB_CB(skb);
amsdu_add = num_subframes * sizeof(struct ethhdr) +
(num_subframes - 1) * (snap_ip_tcp + pad);
/* This holds the amsdu headers length */
- info->driver_data[0] = (void *)(uintptr_t)amsdu_add;
+ skb_info->driver_data[0] = (void *)(uintptr_t)amsdu_add;
__skb_queue_tail(mpdus_skb, skb);
return 0;
@@ -596,11 +601,14 @@ segment:
ip_hdr(tmp)->id = htons(ip_base_id + i * num_subframes);
if (tcp_payload_len > mss) {
+ struct ieee80211_tx_info *skb_info =
+ IEEE80211_SKB_CB(tmp);
+
num_subframes = DIV_ROUND_UP(tcp_payload_len, mss);
- info = IEEE80211_SKB_CB(tmp);
amsdu_add = num_subframes * sizeof(struct ethhdr) +
(num_subframes - 1) * (snap_ip_tcp + pad);
- info->driver_data[0] = (void *)(uintptr_t)amsdu_add;
+ skb_info->driver_data[0] =
+ (void *)(uintptr_t)amsdu_add;
skb_shinfo(tmp)->gso_size = mss;
} else {
qc = ieee80211_get_qos_ctl((void *)tmp->data);
@@ -622,6 +630,7 @@ segment:
}
#else /* CONFIG_INET */
static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
+ struct ieee80211_tx_info *info,
struct ieee80211_sta *sta,
struct sk_buff_head *mpdus_skb)
{
@@ -636,10 +645,10 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
* Sets the fields in the Tx cmd that are crypto related
*/
static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb,
+ struct ieee80211_tx_info *info,
struct ieee80211_sta *sta)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct iwl_mvm_sta *mvmsta;
struct iwl_device_cmd *dev_cmd;
struct iwl_tx_cmd *tx_cmd;
@@ -660,7 +669,8 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb,
if (WARN_ON_ONCE(mvmsta->sta_id == IWL_MVM_STATION_COUNT))
return -1;
- dev_cmd = iwl_mvm_set_tx_params(mvm, skb, hdrlen, sta, mvmsta->sta_id);
+ dev_cmd = iwl_mvm_set_tx_params(mvm, skb, info, hdrlen,
+ sta, mvmsta->sta_id);
if (!dev_cmd)
goto drop;
@@ -736,7 +746,8 @@ int iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb,
struct ieee80211_sta *sta)
{
struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
- struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_tx_info info;
struct sk_buff_head mpdus_skbs;
unsigned int payload_len;
int ret;
@@ -747,21 +758,23 @@ int iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb,
if (WARN_ON_ONCE(mvmsta->sta_id == IWL_MVM_STATION_COUNT))
return -1;
+ memcpy(&info, skb->cb, sizeof(info));
+
/* This holds the amsdu headers length */
- info->driver_data[0] = (void *)(uintptr_t)0;
+ skb_info->driver_data[0] = (void *)(uintptr_t)0;
if (!skb_is_gso(skb))
- return iwl_mvm_tx_mpdu(mvm, skb, sta);
+ return iwl_mvm_tx_mpdu(mvm, skb, &info, sta);
payload_len = skb_tail_pointer(skb) - skb_transport_header(skb) -
tcp_hdrlen(skb) + skb->data_len;
if (payload_len <= skb_shinfo(skb)->gso_size)
- return iwl_mvm_tx_mpdu(mvm, skb, sta);
+ return iwl_mvm_tx_mpdu(mvm, skb, &info, sta);
__skb_queue_head_init(&mpdus_skbs);
- ret = iwl_mvm_tx_tso(mvm, skb, sta, &mpdus_skbs);
+ ret = iwl_mvm_tx_tso(mvm, skb, &info, sta, &mpdus_skbs);
if (ret)
return ret;
@@ -771,7 +784,7 @@ int iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb,
while (!skb_queue_empty(&mpdus_skbs)) {
skb = __skb_dequeue(&mpdus_skbs);
- ret = iwl_mvm_tx_mpdu(mvm, skb, sta);
+ ret = iwl_mvm_tx_mpdu(mvm, skb, &info, sta);
if (ret) {
__skb_queue_purge(&mpdus_skbs);
return ret;
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index b42f26029225..4412a57ec862 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -711,6 +711,7 @@ static void xenvif_tx_err(struct xenvif_queue *queue,
if (cons == end)
break;
RING_COPY_REQUEST(&queue->tx, cons++, txp);
+ extra_count = 0; /* only the first frag can have extras */
} while (1);
queue->tx.req_cons = cons;
}
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index f70090897fdf..f2d01d4d9364 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -847,6 +847,14 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
if (!platform_get_irq(cpu_pmu->plat_device, 0))
cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+ /*
+ * This is a CPU PMU potentially in a heterogeneous configuration (e.g.
+ * big.LITTLE). This is not an uncore PMU, and we have taken ctx
+ * sharing into account (e.g. with our pmu::filter_match callback and
+ * pmu::event_init group validation).
+ */
+ cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+
return 0;
out_unregister:
diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c
index 4429312e848d..2c447130b954 100644
--- a/drivers/pinctrl/pinctrl-at91-pio4.c
+++ b/drivers/pinctrl/pinctrl-at91-pio4.c
@@ -722,9 +722,11 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev,
break;
case PIN_CONFIG_BIAS_PULL_UP:
conf |= ATMEL_PIO_PUEN_MASK;
+ conf &= (~ATMEL_PIO_PDEN_MASK);
break;
case PIN_CONFIG_BIAS_PULL_DOWN:
conf |= ATMEL_PIO_PDEN_MASK;
+ conf &= (~ATMEL_PIO_PUEN_MASK);
break;
case PIN_CONFIG_DRIVE_OPEN_DRAIN:
if (arg == 0)
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 8fad0a7044d3..f2201d42a9cd 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -34,6 +34,9 @@
#include <asm/processor.h>
#include <asm/cpu_device_id.h>
+/* Local defines */
+#define MSR_PLATFORM_POWER_LIMIT 0x0000065C
+
/* bitmasks for RAPL MSRs, used by primitive access functions */
#define ENERGY_STATUS_MASK 0xffffffff
@@ -86,6 +89,7 @@ enum rapl_domain_type {
RAPL_DOMAIN_PP0, /* core power plane */
RAPL_DOMAIN_PP1, /* graphics uncore */
RAPL_DOMAIN_DRAM,/* DRAM control_type */
+ RAPL_DOMAIN_PLATFORM, /* PSys control_type */
RAPL_DOMAIN_MAX,
};
@@ -251,9 +255,11 @@ static const char * const rapl_domain_names[] = {
"core",
"uncore",
"dram",
+ "psys",
};
static struct powercap_control_type *control_type; /* PowerCap Controller */
+static struct rapl_domain *platform_rapl_domain; /* Platform (PSys) domain */
/* caller to ensure CPU hotplug lock is held */
static struct rapl_package *find_package_by_id(int id)
@@ -409,6 +415,14 @@ static const struct powercap_zone_ops zone_ops[] = {
.set_enable = set_domain_enable,
.get_enable = get_domain_enable,
},
+ /* RAPL_DOMAIN_PLATFORM */
+ {
+ .get_energy_uj = get_energy_counter,
+ .get_max_energy_range_uj = get_max_energy_counter,
+ .release = release_zone,
+ .set_enable = set_domain_enable,
+ .get_enable = get_domain_enable,
+ },
};
static int set_power_limit(struct powercap_zone *power_zone, int id,
@@ -1160,6 +1174,13 @@ static int rapl_unregister_powercap(void)
powercap_unregister_zone(control_type,
&rd_package->power_zone);
}
+
+ if (platform_rapl_domain) {
+ powercap_unregister_zone(control_type,
+ &platform_rapl_domain->power_zone);
+ kfree(platform_rapl_domain);
+ }
+
powercap_unregister_control_type(control_type);
return 0;
@@ -1239,6 +1260,47 @@ err_cleanup:
return ret;
}
+static int rapl_register_psys(void)
+{
+ struct rapl_domain *rd;
+ struct powercap_zone *power_zone;
+ u64 val;
+
+ if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS, &val) || !val)
+ return -ENODEV;
+
+ if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT, &val) || !val)
+ return -ENODEV;
+
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return -ENOMEM;
+
+ rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
+ rd->id = RAPL_DOMAIN_PLATFORM;
+ rd->msrs[0] = MSR_PLATFORM_POWER_LIMIT;
+ rd->msrs[1] = MSR_PLATFORM_ENERGY_STATUS;
+ rd->rpl[0].prim_id = PL1_ENABLE;
+ rd->rpl[0].name = pl1_name;
+ rd->rpl[1].prim_id = PL2_ENABLE;
+ rd->rpl[1].name = pl2_name;
+ rd->rp = find_package_by_id(0);
+
+ power_zone = powercap_register_zone(&rd->power_zone, control_type,
+ "psys", NULL,
+ &zone_ops[RAPL_DOMAIN_PLATFORM],
+ 2, &constraint_ops);
+
+ if (IS_ERR(power_zone)) {
+ kfree(rd);
+ return PTR_ERR(power_zone);
+ }
+
+ platform_rapl_domain = rd;
+
+ return 0;
+}
+
static int rapl_register_powercap(void)
{
struct rapl_domain *rd;
@@ -1255,6 +1317,10 @@ static int rapl_register_powercap(void)
list_for_each_entry(rp, &rapl_packages, plist)
if (rapl_package_register_powercap(rp))
goto err_cleanup_package;
+
+ /* Don't bail out if PSys is not supported */
+ rapl_register_psys();
+
return ret;
err_cleanup_package:
@@ -1289,6 +1355,9 @@ static int rapl_check_domain(int cpu, int domain)
case RAPL_DOMAIN_DRAM:
msr = MSR_DRAM_ENERGY_STATUS;
break;
+ case RAPL_DOMAIN_PLATFORM:
+ /* PSYS(PLATFORM) is not a CPU domain, so avoid printing error */
+ return -EINVAL;
default:
pr_err("invalid domain id %d\n", domain);
return -EINVAL;
diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c
index 40cd894e4df5..514a5e8fdbab 100644
--- a/drivers/regulator/axp20x-regulator.c
+++ b/drivers/regulator/axp20x-regulator.c
@@ -157,7 +157,9 @@ static struct regulator_ops axp20x_ops_sw = {
static const struct regulator_linear_range axp20x_ldo4_ranges[] = {
REGULATOR_LINEAR_RANGE(1250000, 0x0, 0x0, 0),
REGULATOR_LINEAR_RANGE(1300000, 0x1, 0x8, 100000),
- REGULATOR_LINEAR_RANGE(2500000, 0x9, 0xf, 100000),
+ REGULATOR_LINEAR_RANGE(2500000, 0x9, 0x9, 0),
+ REGULATOR_LINEAR_RANGE(2700000, 0xa, 0xb, 100000),
+ REGULATOR_LINEAR_RANGE(3000000, 0xc, 0xf, 100000),
};
static const struct regulator_desc axp20x_regulators[] = {
@@ -215,10 +217,14 @@ static const struct regulator_desc axp22x_regulators[] = {
AXP22X_ELDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(1)),
AXP_DESC(AXP22X, ELDO3, "eldo3", "eldoin", 700, 3300, 100,
AXP22X_ELDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(2)),
- AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 1800, 3300, 100,
+ /* Note the datasheet only guarantees reliable operation up to
+ * 3.3V, this needs to be enforced via dts provided constraints */
+ AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 700, 3800, 100,
AXP22X_LDO_IO0_V_OUT, 0x1f, AXP20X_GPIO0_CTRL, 0x07,
AXP22X_IO_ENABLED, AXP22X_IO_DISABLED),
- AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 1800, 3300, 100,
+ /* Note the datasheet only guarantees reliable operation up to
+ * 3.3V, this needs to be enforced via dts provided constraints */
+ AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 700, 3800, 100,
AXP22X_LDO_IO1_V_OUT, 0x1f, AXP20X_GPIO1_CTRL, 0x07,
AXP22X_IO_ENABLED, AXP22X_IO_DISABLED),
AXP_DESC_FIXED(AXP22X, RTC_LDO, "rtc_ldo", "ips", 3000),
diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c
index ed9e7e96f877..c6af343f54ea 100644
--- a/drivers/regulator/da9063-regulator.c
+++ b/drivers/regulator/da9063-regulator.c
@@ -900,4 +900,4 @@ module_exit(da9063_regulator_cleanup);
MODULE_AUTHOR("Krystian Garbaciak <krystian.garbaciak@diasemi.com>");
MODULE_DESCRIPTION("DA9063 regulators driver");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("paltform:" DA9063_DRVNAME_REGULATORS);
+MODULE_ALIAS("platform:" DA9063_DRVNAME_REGULATORS);
diff --git a/drivers/regulator/gpio-regulator.c b/drivers/regulator/gpio-regulator.c
index a8718e98674a..83e89e5d4752 100644
--- a/drivers/regulator/gpio-regulator.c
+++ b/drivers/regulator/gpio-regulator.c
@@ -162,6 +162,8 @@ of_get_gpio_regulator_config(struct device *dev, struct device_node *np,
of_property_read_u32(np, "startup-delay-us", &config->startup_delay);
config->enable_gpio = of_get_named_gpio(np, "enable-gpio", 0);
+ if (config->enable_gpio == -EPROBE_DEFER)
+ return ERR_PTR(-EPROBE_DEFER);
/* Fetch GPIOs. - optional property*/
ret = of_gpio_count(np);
diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c
index d24e2c783dc5..6dfa3502e1f1 100644
--- a/drivers/regulator/s2mps11.c
+++ b/drivers/regulator/s2mps11.c
@@ -308,7 +308,7 @@ static struct regulator_ops s2mps11_buck_ops = {
.enable_mask = S2MPS11_ENABLE_MASK \
}
-#define regulator_desc_s2mps11_buck6_10(num, min, step) { \
+#define regulator_desc_s2mps11_buck67810(num, min, step) { \
.name = "BUCK"#num, \
.id = S2MPS11_BUCK##num, \
.ops = &s2mps11_buck_ops, \
@@ -324,6 +324,22 @@ static struct regulator_ops s2mps11_buck_ops = {
.enable_mask = S2MPS11_ENABLE_MASK \
}
+#define regulator_desc_s2mps11_buck9 { \
+ .name = "BUCK9", \
+ .id = S2MPS11_BUCK9, \
+ .ops = &s2mps11_buck_ops, \
+ .type = REGULATOR_VOLTAGE, \
+ .owner = THIS_MODULE, \
+ .min_uV = MIN_3000_MV, \
+ .uV_step = STEP_25_MV, \
+ .n_voltages = S2MPS11_BUCK9_N_VOLTAGES, \
+ .ramp_delay = S2MPS11_RAMP_DELAY, \
+ .vsel_reg = S2MPS11_REG_B9CTRL2, \
+ .vsel_mask = S2MPS11_BUCK9_VSEL_MASK, \
+ .enable_reg = S2MPS11_REG_B9CTRL1, \
+ .enable_mask = S2MPS11_ENABLE_MASK \
+}
+
static const struct regulator_desc s2mps11_regulators[] = {
regulator_desc_s2mps11_ldo(1, STEP_25_MV),
regulator_desc_s2mps11_ldo(2, STEP_50_MV),
@@ -368,11 +384,11 @@ static const struct regulator_desc s2mps11_regulators[] = {
regulator_desc_s2mps11_buck1_4(3),
regulator_desc_s2mps11_buck1_4(4),
regulator_desc_s2mps11_buck5,
- regulator_desc_s2mps11_buck6_10(6, MIN_600_MV, STEP_6_25_MV),
- regulator_desc_s2mps11_buck6_10(7, MIN_600_MV, STEP_6_25_MV),
- regulator_desc_s2mps11_buck6_10(8, MIN_600_MV, STEP_6_25_MV),
- regulator_desc_s2mps11_buck6_10(9, MIN_3000_MV, STEP_25_MV),
- regulator_desc_s2mps11_buck6_10(10, MIN_750_MV, STEP_12_5_MV),
+ regulator_desc_s2mps11_buck67810(6, MIN_600_MV, STEP_6_25_MV),
+ regulator_desc_s2mps11_buck67810(7, MIN_600_MV, STEP_6_25_MV),
+ regulator_desc_s2mps11_buck67810(8, MIN_600_MV, STEP_6_25_MV),
+ regulator_desc_s2mps11_buck9,
+ regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV),
};
static struct regulator_ops s2mps14_reg_ops;
diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
index 8eaed0522aa3..a655cf29c16f 100644
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
@@ -532,6 +532,7 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
return SCSI_DH_DEV_TEMP_BUSY;
retry:
+ err = 0;
retval = submit_rtpg(sdev, buff, bufflen, &sense_hdr, pg->flags);
if (retval) {
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index 5d0ec42a9317..634254a52301 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4214,7 +4214,7 @@ static struct scsi_host_template qla1280_driver_template = {
.eh_bus_reset_handler = qla1280_eh_bus_reset,
.eh_host_reset_handler = qla1280_eh_adapter_reset,
.bios_param = qla1280_biosparam,
- .can_queue = 0xfffff,
+ .can_queue = MAX_OUTSTANDING_COMMANDS,
.this_id = -1,
.sg_tablesize = SG_ALL,
.use_clustering = ENABLE_CLUSTERING,
diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c
index 39412c9097c6..c1a2d747b246 100644
--- a/drivers/spi/spi-fsl-dspi.c
+++ b/drivers/spi/spi-fsl-dspi.c
@@ -385,8 +385,8 @@ static int dspi_transfer_one_message(struct spi_master *master,
dspi->cur_chip = spi_get_ctldata(spi);
dspi->cs = spi->chip_select;
dspi->cs_change = 0;
- if (dspi->cur_transfer->transfer_list.next
- == &dspi->cur_msg->transfers)
+ if (list_is_last(&dspi->cur_transfer->transfer_list,
+ &dspi->cur_msg->transfers) || transfer->cs_change)
dspi->cs_change = 1;
dspi->void_write_data = dspi->cur_chip->void_write_data;
diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c
index 43a02e377b3b..0caa3c8bef46 100644
--- a/drivers/spi/spi-omap2-mcspi.c
+++ b/drivers/spi/spi-omap2-mcspi.c
@@ -423,12 +423,16 @@ static void omap2_mcspi_tx_dma(struct spi_device *spi,
if (mcspi_dma->dma_tx) {
struct dma_async_tx_descriptor *tx;
+ struct scatterlist sg;
dmaengine_slave_config(mcspi_dma->dma_tx, &cfg);
- tx = dmaengine_prep_slave_sg(mcspi_dma->dma_tx, xfer->tx_sg.sgl,
- xfer->tx_sg.nents, DMA_MEM_TO_DEV,
- DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+ sg_init_table(&sg, 1);
+ sg_dma_address(&sg) = xfer->tx_dma;
+ sg_dma_len(&sg) = xfer->len;
+
+ tx = dmaengine_prep_slave_sg(mcspi_dma->dma_tx, &sg, 1,
+ DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
if (tx) {
tx->callback = omap2_mcspi_tx_callback;
tx->callback_param = spi;
@@ -474,15 +478,20 @@ omap2_mcspi_rx_dma(struct spi_device *spi, struct spi_transfer *xfer,
if (mcspi_dma->dma_rx) {
struct dma_async_tx_descriptor *tx;
+ struct scatterlist sg;
dmaengine_slave_config(mcspi_dma->dma_rx, &cfg);
if ((l & OMAP2_MCSPI_CHCONF_TURBO) && mcspi->fifo_depth == 0)
dma_count -= es;
- tx = dmaengine_prep_slave_sg(mcspi_dma->dma_rx, xfer->rx_sg.sgl,
- xfer->rx_sg.nents, DMA_DEV_TO_MEM,
- DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+ sg_init_table(&sg, 1);
+ sg_dma_address(&sg) = xfer->rx_dma;
+ sg_dma_len(&sg) = dma_count;
+
+ tx = dmaengine_prep_slave_sg(mcspi_dma->dma_rx, &sg, 1,
+ DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT |
+ DMA_CTRL_ACK);
if (tx) {
tx->callback = omap2_mcspi_rx_callback;
tx->callback_param = spi;
@@ -496,6 +505,8 @@ omap2_mcspi_rx_dma(struct spi_device *spi, struct spi_transfer *xfer,
omap2_mcspi_set_dma_req(spi, 1, 1);
wait_for_completion(&mcspi_dma->dma_rx_completion);
+ dma_unmap_single(mcspi->dev, xfer->rx_dma, count,
+ DMA_FROM_DEVICE);
if (mcspi->fifo_depth > 0)
return count;
@@ -608,6 +619,8 @@ omap2_mcspi_txrx_dma(struct spi_device *spi, struct spi_transfer *xfer)
if (tx != NULL) {
wait_for_completion(&mcspi_dma->dma_tx_completion);
+ dma_unmap_single(mcspi->dev, xfer->tx_dma, xfer->len,
+ DMA_TO_DEVICE);
if (mcspi->fifo_depth > 0) {
irqstat_reg = mcspi->base + OMAP2_MCSPI_IRQSTATUS;
@@ -1074,16 +1087,6 @@ static void omap2_mcspi_cleanup(struct spi_device *spi)
gpio_free(spi->cs_gpio);
}
-static bool omap2_mcspi_can_dma(struct spi_master *master,
- struct spi_device *spi,
- struct spi_transfer *xfer)
-{
- if (xfer->len < DMA_MIN_BYTES)
- return false;
-
- return true;
-}
-
static int omap2_mcspi_work_one(struct omap2_mcspi *mcspi,
struct spi_device *spi, struct spi_transfer *t)
{
@@ -1265,6 +1268,32 @@ static int omap2_mcspi_transfer_one(struct spi_master *master,
return -EINVAL;
}
+ if (len < DMA_MIN_BYTES)
+ goto skip_dma_map;
+
+ if (mcspi_dma->dma_tx && tx_buf != NULL) {
+ t->tx_dma = dma_map_single(mcspi->dev, (void *) tx_buf,
+ len, DMA_TO_DEVICE);
+ if (dma_mapping_error(mcspi->dev, t->tx_dma)) {
+ dev_dbg(mcspi->dev, "dma %cX %d bytes error\n",
+ 'T', len);
+ return -EINVAL;
+ }
+ }
+ if (mcspi_dma->dma_rx && rx_buf != NULL) {
+ t->rx_dma = dma_map_single(mcspi->dev, rx_buf, t->len,
+ DMA_FROM_DEVICE);
+ if (dma_mapping_error(mcspi->dev, t->rx_dma)) {
+ dev_dbg(mcspi->dev, "dma %cX %d bytes error\n",
+ 'R', len);
+ if (tx_buf != NULL)
+ dma_unmap_single(mcspi->dev, t->tx_dma,
+ len, DMA_TO_DEVICE);
+ return -EINVAL;
+ }
+ }
+
+skip_dma_map:
return omap2_mcspi_work_one(mcspi, spi, t);
}
@@ -1348,7 +1377,6 @@ static int omap2_mcspi_probe(struct platform_device *pdev)
master->transfer_one = omap2_mcspi_transfer_one;
master->set_cs = omap2_mcspi_set_cs;
master->cleanup = omap2_mcspi_cleanup;
- master->can_dma = omap2_mcspi_can_dma;
master->dev.of_node = node;
master->max_speed_hz = OMAP2_MCSPI_MAX_FREQ;
master->min_speed_hz = OMAP2_MCSPI_MAX_FREQ >> 15;
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index 85e59a406a4c..86138e4101b0 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -126,7 +126,7 @@ static const struct lpss_config lpss_platforms[] = {
.reg_general = -1,
.reg_ssp = 0x20,
.reg_cs_ctrl = 0x24,
- .reg_capabilities = 0xfc,
+ .reg_capabilities = -1,
.rx_threshold = 1,
.tx_threshold_lo = 32,
.tx_threshold_hi = 56,
diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c
index eac3c960b2de..443f664534e1 100644
--- a/drivers/spi/spi-ti-qspi.c
+++ b/drivers/spi/spi-ti-qspi.c
@@ -94,6 +94,7 @@ struct ti_qspi {
#define QSPI_FLEN(n) ((n - 1) << 0)
#define QSPI_WLEN_MAX_BITS 128
#define QSPI_WLEN_MAX_BYTES 16
+#define QSPI_WLEN_MASK QSPI_WLEN(QSPI_WLEN_MAX_BITS)
/* STATUS REGISTER */
#define BUSY 0x01
@@ -235,16 +236,16 @@ static inline int ti_qspi_poll_wc(struct ti_qspi *qspi)
return -ETIMEDOUT;
}
-static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t)
+static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t,
+ int count)
{
- int wlen, count, xfer_len;
+ int wlen, xfer_len;
unsigned int cmd;
const u8 *txbuf;
u32 data;
txbuf = t->tx_buf;
cmd = qspi->cmd | QSPI_WR_SNGL;
- count = t->len;
wlen = t->bits_per_word >> 3; /* in bytes */
xfer_len = wlen;
@@ -304,9 +305,10 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t)
return 0;
}
-static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t)
+static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t,
+ int count)
{
- int wlen, count;
+ int wlen;
unsigned int cmd;
u8 *rxbuf;
@@ -323,7 +325,6 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t)
cmd |= QSPI_RD_SNGL;
break;
}
- count = t->len;
wlen = t->bits_per_word >> 3; /* in bytes */
while (count) {
@@ -354,12 +355,13 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t)
return 0;
}
-static int qspi_transfer_msg(struct ti_qspi *qspi, struct spi_transfer *t)
+static int qspi_transfer_msg(struct ti_qspi *qspi, struct spi_transfer *t,
+ int count)
{
int ret;
if (t->tx_buf) {
- ret = qspi_write_msg(qspi, t);
+ ret = qspi_write_msg(qspi, t, count);
if (ret) {
dev_dbg(qspi->dev, "Error while writing\n");
return ret;
@@ -367,7 +369,7 @@ static int qspi_transfer_msg(struct ti_qspi *qspi, struct spi_transfer *t)
}
if (t->rx_buf) {
- ret = qspi_read_msg(qspi, t);
+ ret = qspi_read_msg(qspi, t, count);
if (ret) {
dev_dbg(qspi->dev, "Error while reading\n");
return ret;
@@ -450,7 +452,8 @@ static int ti_qspi_start_transfer_one(struct spi_master *master,
struct spi_device *spi = m->spi;
struct spi_transfer *t;
int status = 0, ret;
- int frame_length;
+ unsigned int frame_len_words, transfer_len_words;
+ int wlen;
/* setup device control reg */
qspi->dc = 0;
@@ -462,14 +465,15 @@ static int ti_qspi_start_transfer_one(struct spi_master *master,
if (spi->mode & SPI_CS_HIGH)
qspi->dc |= QSPI_CSPOL(spi->chip_select);
- frame_length = (m->frame_length << 3) / spi->bits_per_word;
-
- frame_length = clamp(frame_length, 0, QSPI_FRAME);
+ frame_len_words = 0;
+ list_for_each_entry(t, &m->transfers, transfer_list)
+ frame_len_words += t->len / (t->bits_per_word >> 3);
+ frame_len_words = min_t(unsigned int, frame_len_words, QSPI_FRAME);
/* setup command reg */
qspi->cmd = 0;
qspi->cmd |= QSPI_EN_CS(spi->chip_select);
- qspi->cmd |= QSPI_FLEN(frame_length);
+ qspi->cmd |= QSPI_FLEN(frame_len_words);
ti_qspi_write(qspi, qspi->dc, QSPI_SPI_DC_REG);
@@ -479,16 +483,23 @@ static int ti_qspi_start_transfer_one(struct spi_master *master,
ti_qspi_disable_memory_map(spi);
list_for_each_entry(t, &m->transfers, transfer_list) {
- qspi->cmd |= QSPI_WLEN(t->bits_per_word);
+ qspi->cmd = ((qspi->cmd & ~QSPI_WLEN_MASK) |
+ QSPI_WLEN(t->bits_per_word));
+
+ wlen = t->bits_per_word >> 3;
+ transfer_len_words = min(t->len / wlen, frame_len_words);
- ret = qspi_transfer_msg(qspi, t);
+ ret = qspi_transfer_msg(qspi, t, transfer_len_words * wlen);
if (ret) {
dev_dbg(qspi->dev, "transfer message failed\n");
mutex_unlock(&qspi->list_lock);
return -EINVAL;
}
- m->actual_length += t->len;
+ m->actual_length += transfer_len_words * wlen;
+ frame_len_words -= transfer_len_words;
+ if (frame_len_words == 0)
+ break;
}
mutex_unlock(&qspi->list_lock);
diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index 983280e8d93f..e5a391aecde1 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -761,7 +761,7 @@ config FB_VESA
config FB_EFI
bool "EFI-based Framebuffer Support"
- depends on (FB = y) && X86 && EFI
+ depends on (FB = y) && !IA64 && EFI
select FB_CFB_FILLRECT
select FB_CFB_COPYAREA
select FB_CFB_IMAGEBLIT
diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c
index 95d293b7445a..f4c045c0051c 100644
--- a/drivers/video/fbdev/efifb.c
+++ b/drivers/video/fbdev/efifb.c
@@ -6,16 +6,14 @@
*
*/
-#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/efi.h>
#include <linux/errno.h>
#include <linux/fb.h>
#include <linux/platform_device.h>
#include <linux/screen_info.h>
-#include <linux/dmi.h>
-#include <linux/pci.h>
#include <video/vga.h>
-#include <asm/sysfb.h>
+#include <asm/efi.h>
static bool request_mem_succeeded = false;
@@ -85,21 +83,13 @@ static struct fb_ops efifb_ops = {
static int efifb_setup(char *options)
{
char *this_opt;
- int i;
if (options && *options) {
while ((this_opt = strsep(&options, ",")) != NULL) {
if (!*this_opt) continue;
- for (i = 0; i < M_UNKNOWN; i++) {
- if (efifb_dmi_list[i].base != 0 &&
- !strcmp(this_opt, efifb_dmi_list[i].optname)) {
- screen_info.lfb_base = efifb_dmi_list[i].base;
- screen_info.lfb_linelength = efifb_dmi_list[i].stride;
- screen_info.lfb_width = efifb_dmi_list[i].width;
- screen_info.lfb_height = efifb_dmi_list[i].height;
- }
- }
+ efifb_setup_from_dmi(&screen_info, this_opt);
+
if (!strncmp(this_opt, "base:", 5))
screen_info.lfb_base = simple_strtoul(this_opt+5, NULL, 0);
else if (!strncmp(this_opt, "stride:", 7))
@@ -338,5 +328,4 @@ static struct platform_driver efifb_driver = {
.remove = efifb_remove,
};
-module_platform_driver(efifb_driver);
-MODULE_LICENSE("GPL");
+builtin_platform_driver(efifb_driver);
diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c
index be7e56a338e8..e9d2135445c1 100644
--- a/drivers/xen/efi.c
+++ b/drivers/xen/efi.c
@@ -316,7 +316,6 @@ static const struct efi efi_xen __initconst = {
.get_next_high_mono_count = xen_efi_get_next_high_mono_count,
.reset_system = NULL, /* Functionality provided by Xen. */
.set_virtual_address_map = NULL, /* Not used under Xen. */
- .memmap = NULL, /* Not used under Xen. */
.flags = 0 /* Initialized later. */
};
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index feef8a9c4de7..f02404052b7b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
.sb = inode->i_sb,
};
lower_file = ecryptfs_file_to_lower(file);
- lower_file->f_pos = ctx->pos;
rc = iterate_dir(lower_file, &buf.ctx);
ctx->pos = buf.ctx.pos;
if (rc < 0)
@@ -223,14 +222,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
}
ecryptfs_set_file_lower(
file, ecryptfs_inode_to_private(inode)->lower_file);
- if (d_is_dir(ecryptfs_dentry)) {
- ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
- mutex_lock(&crypt_stat->cs_mutex);
- crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
- mutex_unlock(&crypt_stat->cs_mutex);
- rc = 0;
- goto out;
- }
rc = read_or_initialize_metadata(ecryptfs_dentry);
if (rc)
goto out_put;
@@ -247,6 +238,45 @@ out:
return rc;
}
+/**
+ * ecryptfs_dir_open
+ * @inode: inode specifying file to open
+ * @file: Structure to return filled in
+ *
+ * Opens the lower file backing the directory specified by inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_dir_open(struct inode *inode, struct file *file)
+{
+ struct dentry *ecryptfs_dentry = file->f_path.dentry;
+ /* Private value of ecryptfs_dentry allocated in
+ * ecryptfs_lookup() */
+ struct ecryptfs_file_info *file_info;
+ struct file *lower_file;
+
+ /* Released in ecryptfs_dir_release or end of function if failure */
+ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
+ ecryptfs_set_file_private(file, file_info);
+ if (unlikely(!file_info)) {
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to allocate memory\n");
+ return -ENOMEM;
+ }
+ lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry),
+ file->f_flags, current_cred());
+ if (IS_ERR(lower_file)) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the lower file for the dentry with name "
+ "[%pd]; rc = [%ld]\n", __func__,
+ ecryptfs_dentry, PTR_ERR(lower_file));
+ kmem_cache_free(ecryptfs_file_info_cache, file_info);
+ return PTR_ERR(lower_file);
+ }
+ ecryptfs_set_file_lower(file, lower_file);
+ return 0;
+}
+
static int ecryptfs_flush(struct file *file, fl_owner_t td)
{
struct file *lower_file = ecryptfs_file_to_lower(file);
@@ -267,6 +297,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
return 0;
}
+static int ecryptfs_dir_release(struct inode *inode, struct file *file)
+{
+ fput(ecryptfs_file_to_lower(file));
+ kmem_cache_free(ecryptfs_file_info_cache,
+ ecryptfs_file_to_private(file));
+ return 0;
+}
+
+static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence);
+}
+
static int
ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
@@ -346,20 +389,16 @@ const struct file_operations ecryptfs_dir_fops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
#endif
- .open = ecryptfs_open,
- .flush = ecryptfs_flush,
- .release = ecryptfs_release,
+ .open = ecryptfs_dir_open,
+ .release = ecryptfs_dir_release,
.fsync = ecryptfs_fsync,
- .fasync = ecryptfs_fasync,
- .splice_read = generic_file_splice_read,
- .llseek = default_llseek,
+ .llseek = ecryptfs_dir_llseek,
};
const struct file_operations ecryptfs_main_fops = {
.llseek = generic_file_llseek,
.read_iter = ecryptfs_read_update_atime,
.write_iter = generic_file_write_iter,
- .iterate = ecryptfs_readdir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index d48e0d261d78..5f22e74bbade 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -157,7 +157,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
return 0;
}
-long
+static long
efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p)
{
void __user *arg = (void __user *)p;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 553c5d2db4a4..9cb54a38832d 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -216,8 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&efivarfs_list);
- err = efivar_init(efivarfs_callback, (void *)sb, false,
- true, &efivarfs_list);
+ err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list);
if (err)
__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 5384ceb35b1c..98b3eb7d8eaf 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -203,6 +203,8 @@ int get_rock_ridge_filename(struct iso_directory_record *de,
int retnamlen = 0;
int truncate = 0;
int ret = 0;
+ char *p;
+ int len;
if (!ISOFS_SB(inode->i_sb)->s_rock)
return 0;
@@ -267,12 +269,17 @@ repeat:
rr->u.NM.flags);
break;
}
- if ((strlen(retname) + rr->len - 5) >= 254) {
+ len = rr->len - 5;
+ if (retnamlen + len >= 254) {
truncate = 1;
break;
}
- strncat(retname, rr->u.NM.name, rr->len - 5);
- retnamlen += rr->len - 5;
+ p = memchr(rr->u.NM.name, '\0', len);
+ if (unlikely(p))
+ len = p - rr->u.NM.name;
+ memcpy(retname + retnamlen, rr->u.NM.name, len);
+ retnamlen += len;
+ retname[retnamlen] = '\0';
break;
case SIG('R', 'E'):
kfree(rs.buffer);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 03b688d19f69..37f9678ae4df 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -153,9 +153,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
p = buf + len + nlen;
*p = '\0';
for (kn = kn_to; kn != common; kn = kn->parent) {
- nlen = strlen(kn->name);
- p -= nlen;
- memcpy(p, kn->name, nlen);
+ size_t tmp = strlen(kn->name);
+ p -= tmp;
+ memcpy(p, kn->name, tmp);
*(--p) = '/';
}
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f73541fbe7af..3d670a3678f2 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
+#include <linux/seq_file.h>
#include "kernfs-internal.h"
@@ -40,6 +41,19 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
return 0;
}
+static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
+{
+ struct kernfs_node *node = dentry->d_fsdata;
+ struct kernfs_root *root = kernfs_root(node);
+ struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+ if (scops && scops->show_path)
+ return scops->show_path(sf, node, root);
+
+ seq_dentry(sf, dentry, " \t\n\\");
+ return 0;
+}
+
const struct super_operations kernfs_sops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
@@ -47,6 +61,7 @@ const struct super_operations kernfs_sops = {
.remount_fs = kernfs_sop_remount_fs,
.show_options = kernfs_sop_show_options,
+ .show_path = kernfs_sop_show_path,
};
/**
diff --git a/fs/namei.c b/fs/namei.c
index 1d9ca2d5dff6..42f8ca038254 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1794,30 +1794,49 @@ static inline unsigned int fold_hash(unsigned long hash)
return hash_64(hash, 32);
}
+/*
+ * This is George Marsaglia's XORSHIFT generator.
+ * It implements a maximum-period LFSR in only a few
+ * instructions. It also has the property (required
+ * by hash_name()) that mix_hash(0) = 0.
+ */
+static inline unsigned long mix_hash(unsigned long hash)
+{
+ hash ^= hash << 13;
+ hash ^= hash >> 7;
+ hash ^= hash << 17;
+ return hash;
+}
+
#else /* 32-bit case */
#define fold_hash(x) (x)
+static inline unsigned long mix_hash(unsigned long hash)
+{
+ hash ^= hash << 13;
+ hash ^= hash >> 17;
+ hash ^= hash << 5;
+ return hash;
+}
+
#endif
unsigned int full_name_hash(const unsigned char *name, unsigned int len)
{
- unsigned long a, mask;
- unsigned long hash = 0;
+ unsigned long a, hash = 0;
for (;;) {
a = load_unaligned_zeropad(name);
if (len < sizeof(unsigned long))
break;
- hash += a;
- hash *= 9;
+ hash = mix_hash(hash + a);
name += sizeof(unsigned long);
len -= sizeof(unsigned long);
if (!len)
goto done;
}
- mask = bytemask_from_count(len);
- hash += mask & a;
+ hash += a & bytemask_from_count(len);
done:
return fold_hash(hash);
}
@@ -1835,7 +1854,7 @@ static inline u64 hash_name(const char *name)
hash = a = 0;
len = -sizeof(unsigned long);
do {
- hash = (hash + a) * 9;
+ hash = mix_hash(hash + a);
len += sizeof(unsigned long);
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
@@ -2267,6 +2286,33 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
EXPORT_SYMBOL(vfs_path_lookup);
/**
+ * lookup_hash - lookup single pathname component on already hashed name
+ * @name: name and hash to lookup
+ * @base: base directory to lookup from
+ *
+ * The name must have been verified and hashed (see lookup_one_len()). Using
+ * this after just full_name_hash() is unsafe.
+ *
+ * This function also doesn't check for search permission on base directory.
+ *
+ * Use lookup_one_len_unlocked() instead, unless you really know what you are
+ * doing.
+ *
+ * Do not hold i_mutex; this helper takes i_mutex if necessary.
+ */
+struct dentry *lookup_hash(const struct qstr *name, struct dentry *base)
+{
+ struct dentry *ret;
+
+ ret = lookup_dcache(name, base, 0);
+ if (!ret)
+ ret = lookup_slow(name, base, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(lookup_hash);
+
+/**
* lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
* @base: base directory to lookup from
@@ -2337,7 +2383,6 @@ struct dentry *lookup_one_len_unlocked(const char *name,
struct qstr this;
unsigned int c;
int err;
- struct dentry *ret;
this.name = name;
this.len = len;
@@ -2369,10 +2414,7 @@ struct dentry *lookup_one_len_unlocked(const char *name,
if (err)
return ERR_PTR(err);
- ret = lookup_dcache(&this, base, 0);
- if (!ret)
- ret = lookup_slow(&this, base, 0);
- return ret;
+ return lookup_hash(&this, base);
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
@@ -2942,22 +2984,10 @@ no_open:
dentry = lookup_real(dir, dentry, nd->flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
-
- if (create_error) {
- int open_flag = op->open_flag;
-
- error = create_error;
- if ((open_flag & O_EXCL)) {
- if (!dentry->d_inode)
- goto out;
- } else if (!dentry->d_inode) {
- goto out;
- } else if ((open_flag & O_TRUNC) &&
- d_is_reg(dentry)) {
- goto out;
- }
- /* will fail later, go on to get the right error */
- }
+ }
+ if (create_error && !dentry->d_inode) {
+ error = create_error;
+ goto out;
}
looked_up:
path->dentry = dentry;
@@ -4213,7 +4243,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
- if (source == target)
+ /*
+ * Check source == target.
+ * On overlayfs need to look at underlying inodes.
+ */
+ if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0))
return 0;
error = may_delete(old_dir, old_dentry, is_dir);
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0cdf497c91ef..2162434728c0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
brelse(di_bh);
return acl;
}
+
+int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl;
+ int ret;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return 0;
+
+ acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (ret)
+ return ret;
+ ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+ acl, NULL, NULL);
+ posix_acl_release(acl);
+ return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+ struct inode *inode,
+ struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dir_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl = NULL;
+ int ret = 0, ret2;
+ umode_t mode;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+ dir_bh);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl) {
+ mode = inode->i_mode & ~current_umask();
+ ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+ }
+ if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+ if (S_ISDIR(inode->i_mode)) {
+ ret = ocfs2_set_acl(handle, inode, di_bh,
+ ACL_TYPE_DEFAULT, acl,
+ meta_ac, data_ac);
+ if (ret)
+ goto cleanup;
+ }
+ mode = inode->i_mode;
+ ret = __posix_acl_create(&acl, GFP_NOFS, &mode);
+ if (ret < 0)
+ return ret;
+
+ ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret2) {
+ mlog_errno(ret2);
+ ret = ret2;
+ goto cleanup;
+ }
+ if (ret > 0) {
+ ret = ocfs2_set_acl(handle, inode,
+ di_bh, ACL_TYPE_ACCESS,
+ acl, meta_ac, data_ac);
+ }
+ }
+cleanup:
+ posix_acl_release(acl);
+ return ret;
+}
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 3fce68d08625..2783a75b3999 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle,
struct posix_acl *acl,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac);
+extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+ struct buffer_head *, struct buffer_head *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5308841756be..59cce53c91d8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1268,20 +1268,20 @@ bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
- brelse(bh);
/* Release quota pointers in case we acquired them */
for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
dqput(transfer_to[qtype]);
if (!status && attr->ia_valid & ATTR_MODE) {
- status = posix_acl_chmod(inode, inode->i_mode);
+ status = ocfs2_acl_chmod(inode, bh);
if (status < 0)
mlog_errno(status);
}
if (inode_locked)
ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
return status;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6b3e87189a64..a8f1225e6d9b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
- struct posix_acl *default_acl = NULL, *acl = NULL;
struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
@@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (status) {
- mlog_errno(status);
- goto leave;
- }
-
handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
S_ISDIR(mode),
xattr_credits));
@@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir,
inc_nlink(dir);
}
- if (default_acl) {
- status = ocfs2_set_acl(handle, inode, new_fe_bh,
- ACL_TYPE_DEFAULT, default_acl,
- meta_ac, data_ac);
- }
- if (!status && acl) {
- status = ocfs2_set_acl(handle, inode, new_fe_bh,
- ACL_TYPE_ACCESS, acl,
- meta_ac, data_ac);
- }
+ status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+ meta_ac, data_ac);
if (status < 0) {
mlog_errno(status);
@@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir,
d_instantiate(dentry, inode);
status = 0;
leave:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
if (handle)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 744d5d90c363..92bbe93bfe10 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
- struct posix_acl *default_acl, *acl;
- umode_t mode;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
- mode = inode->i_mode;
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
- if (error) {
- mlog_errno(error);
- return error;
- }
- error = ocfs2_create_inode_in_orphan(dir, mode,
+ error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
@@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
- &new_dentry->d_name,
- default_acl, acl);
+ &new_dentry->d_name);
if (error)
mlog_errno(error);
}
out:
- if (default_acl)
- posix_acl_release(default_acl);
- if (acl)
- posix_acl_release(acl);
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 7d3d979f57d9..f19b7381a998 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7216,12 +7216,10 @@ out:
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr,
- struct posix_acl *default_acl,
- struct posix_acl *acl)
+ const struct qstr *qstr)
{
- struct buffer_head *dir_bh = NULL;
int ret = 0;
+ struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7234,11 +7232,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
mlog_errno(ret);
goto leave;
}
-
- if (!ret && default_acl)
- ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
- if (!ret && acl)
- ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
+ if (ret)
+ mlog_errno(ret);
ocfs2_inode_unlock(dir, 0);
brelse(dir_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index f10d5b93c366..1633cc15ea1f 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr,
- struct posix_acl *default_acl,
- struct posix_acl *acl);
+ const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */
diff --git a/fs/open.c b/fs/open.c
index 17cb6b1dab75..081d3d6df74b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -840,16 +840,12 @@ EXPORT_SYMBOL(file_path);
int vfs_open(const struct path *path, struct file *file,
const struct cred *cred)
{
- struct dentry *dentry = path->dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = vfs_select_inode(path->dentry, file->f_flags);
- file->f_path = *path;
- if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
- inode = dentry->d_op->d_select_inode(dentry, file->f_flags);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- }
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ file->f_path = *path;
return do_dentry_open(file, inode, NULL, cred);
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 5d972e6cd3fe..791235e03d17 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -411,9 +411,7 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
{
struct dentry *dentry;
- inode_lock(dir->d_inode);
- dentry = lookup_one_len(name->name, dir, name->len);
- inode_unlock(dir->d_inode);
+ dentry = lookup_hash(name, dir);
if (IS_ERR(dentry)) {
if (PTR_ERR(dentry) == -ENOENT)
diff --git a/fs/splice.c b/fs/splice.c
index b018eb485019..dd9bf7e410d2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1143,6 +1143,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
if (unlikely(ret < 0))
return ret;
+ if (unlikely(len > MAX_RW_COUNT))
+ len = MAX_RW_COUNT;
+
if (in->f_op->splice_read)
splice_read = in->f_op->splice_read;
else
diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index d6d5dc98d7da..3fc94a046bf5 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -53,7 +53,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
/*
* lock for writing
*/
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+static inline void __down_write(struct rw_semaphore *sem)
{
long tmp;
@@ -63,9 +63,16 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
rwsem_down_write_failed(sem);
}
-static inline void __down_write(struct rw_semaphore *sem)
+static inline int __down_write_killable(struct rw_semaphore *sem)
{
- __down_write_nested(sem, 0);
+ long tmp;
+
+ tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
+ (atomic_long_t *)&sem->count);
+ if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
+ if (IS_ERR(rwsem_down_write_failed_killable(sem)))
+ return -EINTR;
+ return 0;
}
static inline int __down_write_trylock(struct rw_semaphore *sem)
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 506c3531832e..e451534fe54d 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -560,11 +560,11 @@ static inline int atomic_dec_if_positive(atomic_t *v)
/**
* atomic_fetch_or - perform *p |= mask and return old value of *p
- * @p: pointer to atomic_t
* @mask: mask to OR on the atomic_t
+ * @p: pointer to atomic_t
*/
#ifndef atomic_fetch_or
-static inline int atomic_fetch_or(atomic_t *p, int mask)
+static inline int atomic_fetch_or(int mask, atomic_t *p)
{
int old, val = atomic_read(p);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 4bb4de8d95ea..7e9422cb5989 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -565,4 +565,16 @@ static inline struct dentry *d_real(struct dentry *dentry)
return dentry;
}
+static inline struct inode *vfs_select_inode(struct dentry *dentry,
+ unsigned open_flags)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (inode && unlikely(dentry->d_flags & DCACHE_OP_SELECT_INODE))
+ inode = dentry->d_op->d_select_inode(dentry, open_flags);
+
+ return inode;
+}
+
+
#endif /* __LINUX_DCACHE_H */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 1626474567ac..df7acb51f3cc 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -21,6 +21,7 @@
#include <linux/pfn.h>
#include <linux/pstore.h>
#include <linux/reboot.h>
+#include <linux/screen_info.h>
#include <asm/page.h>
@@ -124,6 +125,13 @@ typedef struct {
} efi_capsule_header_t;
/*
+ * EFI capsule flags
+ */
+#define EFI_CAPSULE_PERSIST_ACROSS_RESET 0x00010000
+#define EFI_CAPSULE_POPULATE_SYSTEM_TABLE 0x00020000
+#define EFI_CAPSULE_INITIATE_RESET 0x00040000
+
+/*
* Allocation types for calls to boottime->allocate_pages.
*/
#define EFI_ALLOCATE_ANY_PAGES 0
@@ -282,9 +290,10 @@ typedef struct {
efi_status_t (*handle_protocol)(efi_handle_t, efi_guid_t *, void **);
void *__reserved;
void *register_protocol_notify;
- void *locate_handle;
+ efi_status_t (*locate_handle)(int, efi_guid_t *, void *,
+ unsigned long *, efi_handle_t *);
void *locate_device_path;
- void *install_configuration_table;
+ efi_status_t (*install_configuration_table)(efi_guid_t *, void *);
void *load_image;
void *start_image;
void *exit;
@@ -623,6 +632,27 @@ void efi_native_runtime_setup(void);
EFI_GUID(0x3152bca5, 0xeade, 0x433d, \
0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44)
+#define EFI_MEMORY_ATTRIBUTES_TABLE_GUID \
+ EFI_GUID(0xdcfa911d, 0x26eb, 0x469f, \
+ 0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20)
+
+#define EFI_CONSOLE_OUT_DEVICE_GUID \
+ EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, \
+ 0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
+
+/*
+ * This GUID is used to pass to the kernel proper the struct screen_info
+ * structure that was populated by the stub based on the GOP protocol instance
+ * associated with ConOut
+ */
+#define LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID \
+ EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, \
+ 0xb9, 0xe, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95)
+
+#define LINUX_EFI_LOADER_ENTRY_GUID \
+ EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, \
+ 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f)
+
typedef struct {
efi_guid_t guid;
u64 table;
@@ -847,6 +877,14 @@ typedef struct {
#define EFI_INVALID_TABLE_ADDR (~0UL)
+typedef struct {
+ u32 version;
+ u32 num_entries;
+ u32 desc_size;
+ u32 reserved;
+ efi_memory_desc_t entry[0];
+} efi_memory_attributes_table_t;
+
/*
* All runtime access to EFI goes through this structure:
*/
@@ -868,6 +906,7 @@ extern struct efi {
unsigned long config_table; /* config tables */
unsigned long esrt; /* ESRT table */
unsigned long properties_table; /* properties table */
+ unsigned long mem_attr_table; /* memory attributes table */
efi_get_time_t *get_time;
efi_set_time_t *set_time;
efi_get_wakeup_time_t *get_wakeup_time;
@@ -883,7 +922,7 @@ extern struct efi {
efi_get_next_high_mono_count_t *get_next_high_mono_count;
efi_reset_system_t *reset_system;
efi_set_virtual_address_map_t *set_virtual_address_map;
- struct efi_memory_map *memmap;
+ struct efi_memory_map memmap;
unsigned long flags;
} efi;
@@ -945,7 +984,6 @@ extern void efi_initialize_iomem_resources(struct resource *code_resource,
extern void efi_get_time(struct timespec *now);
extern void efi_reserve_boot_services(void);
extern int efi_get_fdt_params(struct efi_fdt_params *params);
-extern struct efi_memory_map memmap;
extern struct kobject *efi_kobj;
extern int efi_reboot_quirk_mode;
@@ -957,12 +995,34 @@ extern void __init efi_fake_memmap(void);
static inline void efi_fake_memmap(void) { }
#endif
+/*
+ * efi_memattr_perm_setter - arch specific callback function passed into
+ * efi_memattr_apply_permissions() that updates the
+ * mapping permissions described by the second
+ * argument in the page tables referred to by the
+ * first argument.
+ */
+typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *);
+
+extern int efi_memattr_init(void);
+extern int efi_memattr_apply_permissions(struct mm_struct *mm,
+ efi_memattr_perm_setter fn);
+
/* Iterate through an efi_memory_map */
-#define for_each_efi_memory_desc(m, md) \
+#define for_each_efi_memory_desc_in_map(m, md) \
for ((md) = (m)->map; \
(md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \
(md) = (void *)(md) + (m)->desc_size)
+/**
+ * for_each_efi_memory_desc - iterate over descriptors in efi.memmap
+ * @md: the efi_memory_desc_t * iterator
+ *
+ * Once the loop finishes @md must not be accessed.
+ */
+#define for_each_efi_memory_desc(md) \
+ for_each_efi_memory_desc_in_map(&efi.memmap, md)
+
/*
* Format an EFI memory descriptor's type and attributes to a user-provided
* character buffer, as per snprintf(), and return the buffer.
@@ -1000,7 +1060,6 @@ extern int __init efi_setup_pcdp_console(char *);
* possible, remove EFI-related code altogether.
*/
#define EFI_BOOT 0 /* Were we booted from EFI? */
-#define EFI_SYSTEM_TABLES 1 /* Can we use EFI system tables? */
#define EFI_CONFIG_TABLES 2 /* Can we use EFI config tables? */
#define EFI_RUNTIME_SERVICES 3 /* Can we use runtime services? */
#define EFI_MEMMAP 4 /* Can we use EFI memory map? */
@@ -1026,8 +1085,16 @@ static inline bool efi_enabled(int feature)
}
static inline void
efi_reboot(enum reboot_mode reboot_mode, const char *__unused) {}
+
+static inline bool
+efi_capsule_pending(int *reset_type)
+{
+ return false;
+}
#endif
+extern int efi_status_to_err(efi_status_t status);
+
/*
* Variable Attributes
*/
@@ -1180,6 +1247,80 @@ struct efi_simple_text_output_protocol {
void *test_string;
};
+#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0
+#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1
+#define PIXEL_BIT_MASK 2
+#define PIXEL_BLT_ONLY 3
+#define PIXEL_FORMAT_MAX 4
+
+struct efi_pixel_bitmask {
+ u32 red_mask;
+ u32 green_mask;
+ u32 blue_mask;
+ u32 reserved_mask;
+};
+
+struct efi_graphics_output_mode_info {
+ u32 version;
+ u32 horizontal_resolution;
+ u32 vertical_resolution;
+ int pixel_format;
+ struct efi_pixel_bitmask pixel_information;
+ u32 pixels_per_scan_line;
+} __packed;
+
+struct efi_graphics_output_protocol_mode_32 {
+ u32 max_mode;
+ u32 mode;
+ u32 info;
+ u32 size_of_info;
+ u64 frame_buffer_base;
+ u32 frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol_mode_64 {
+ u32 max_mode;
+ u32 mode;
+ u64 info;
+ u64 size_of_info;
+ u64 frame_buffer_base;
+ u64 frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol_mode {
+ u32 max_mode;
+ u32 mode;
+ unsigned long info;
+ unsigned long size_of_info;
+ u64 frame_buffer_base;
+ unsigned long frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol_32 {
+ u32 query_mode;
+ u32 set_mode;
+ u32 blt;
+ u32 mode;
+};
+
+struct efi_graphics_output_protocol_64 {
+ u64 query_mode;
+ u64 set_mode;
+ u64 blt;
+ u64 mode;
+};
+
+struct efi_graphics_output_protocol {
+ unsigned long query_mode;
+ unsigned long set_mode;
+ unsigned long blt;
+ struct efi_graphics_output_protocol_mode *mode;
+};
+
+typedef efi_status_t (*efi_graphics_output_protocol_query_mode)(
+ struct efi_graphics_output_protocol *, u32, unsigned long *,
+ struct efi_graphics_output_mode_info **);
+
extern struct list_head efivar_sysfs_list;
static inline void
@@ -1195,8 +1336,7 @@ int efivars_unregister(struct efivars *efivars);
struct kobject *efivars_kobject(void);
int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
- void *data, bool atomic, bool duplicates,
- struct list_head *head);
+ void *data, bool duplicates, struct list_head *head);
void efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
void efivar_entry_remove(struct efivar_entry *entry);
@@ -1242,6 +1382,13 @@ int efivars_sysfs_init(void);
#define EFIVARS_DATA_SIZE_MAX 1024
#endif /* CONFIG_EFI_VARS */
+extern bool efi_capsule_pending(int *reset_type);
+
+extern int efi_capsule_supported(efi_guid_t guid, u32 flags,
+ size_t size, int *reset);
+
+extern int efi_capsule_update(efi_capsule_header_t *capsule,
+ struct page **pages);
#ifdef CONFIG_EFI_RUNTIME_MAP
int efi_runtime_map_init(struct kobject *);
@@ -1319,5 +1466,9 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
efi_status_t efi_parse_options(char *cmdline);
+efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
+ struct screen_info *si, efi_guid_t *proto,
+ unsigned long size);
+
bool efi_runtime_disabled(void);
#endif /* _LINUX_EFI_H */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index c06c44242f39..30f089ebe0a4 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -152,6 +152,8 @@ struct kernfs_syscall_ops {
int (*rmdir)(struct kernfs_node *kn);
int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name);
+ int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
+ struct kernfs_root *root);
};
struct kernfs_root {
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index fb7d87e45fbe..eabe0138eb06 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -457,6 +457,18 @@ do { \
lock_acquired(&(_lock)->dep_map, _RET_IP_); \
} while (0)
+#define LOCK_CONTENDED_RETURN(_lock, try, lock) \
+({ \
+ int ____err = 0; \
+ if (!try(_lock)) { \
+ lock_contended(&(_lock)->dep_map, _RET_IP_); \
+ ____err = lock(_lock); \
+ } \
+ if (!____err) \
+ lock_acquired(&(_lock)->dep_map, _RET_IP_); \
+ ____err; \
+})
+
#else /* CONFIG_LOCK_STAT */
#define lock_contended(lockdep_map, ip) do {} while (0)
@@ -465,6 +477,9 @@ do { \
#define LOCK_CONTENDED(_lock, try, lock) \
lock(_lock)
+#define LOCK_CONTENDED_RETURN(_lock, try, lock) \
+ lock(_lock)
+
#endif /* CONFIG_LOCK_STAT */
#ifdef CONFIG_LOCKDEP
diff --git a/include/linux/mfd/samsung/s2mps11.h b/include/linux/mfd/samsung/s2mps11.h
index b288965e8101..2c14eeca46f0 100644
--- a/include/linux/mfd/samsung/s2mps11.h
+++ b/include/linux/mfd/samsung/s2mps11.h
@@ -173,10 +173,12 @@ enum s2mps11_regulators {
#define S2MPS11_LDO_VSEL_MASK 0x3F
#define S2MPS11_BUCK_VSEL_MASK 0xFF
+#define S2MPS11_BUCK9_VSEL_MASK 0x1F
#define S2MPS11_ENABLE_MASK (0x03 << S2MPS11_ENABLE_SHIFT)
#define S2MPS11_ENABLE_SHIFT 0x06
#define S2MPS11_LDO_N_VOLTAGES (S2MPS11_LDO_VSEL_MASK + 1)
#define S2MPS11_BUCK_N_VOLTAGES (S2MPS11_BUCK_VSEL_MASK + 1)
+#define S2MPS11_BUCK9_N_VOLTAGES (S2MPS11_BUCK9_VSEL_MASK + 1)
#define S2MPS11_RAMP_DELAY 25000 /* uV/us */
#define S2MPS11_CTRL1_PWRHOLD_MASK BIT(4)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 864d7221de84..8f468e0d2534 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int total_mapcount(struct page *page);
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
#else
static inline int total_mapcount(struct page *page)
{
return page_mapcount(page);
}
+static inline int page_trans_huge_mapcount(struct page *page,
+ int *total_mapcount)
+{
+ int mapcount = page_mapcount(page);
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ return mapcount;
+}
#endif
static inline struct page *virt_to_head_page(const void *x)
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 77d01700daf7..ec5ec2818a28 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -79,6 +79,8 @@ extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
+struct qstr;
+extern struct dentry *lookup_hash(const struct qstr *, struct dentry *);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f291275ffd71..9e1c3ada91c4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
struct perf_callchain_entry {
__u64 nr;
- __u64 ip[PERF_MAX_STACK_DEPTH];
+ __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
};
struct perf_raw_record {
@@ -151,6 +151,15 @@ struct hw_perf_event {
*/
struct task_struct *target;
+ /*
+ * PMU would store hardware filter configuration
+ * here.
+ */
+ void *addr_filters;
+
+ /* Last sync'ed generation of filters */
+ unsigned long addr_filters_gen;
+
/*
* hw_perf_event::state flags; used to track the PERF_EF_* state.
*/
@@ -216,6 +225,7 @@ struct perf_event;
#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08
#define PERF_PMU_CAP_EXCLUSIVE 0x10
#define PERF_PMU_CAP_ITRACE 0x20
+#define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40
/**
* struct pmu - generic performance monitoring unit
@@ -240,6 +250,9 @@ struct pmu {
int task_ctx_nr;
int hrtimer_interval_ms;
+ /* number of address filters this PMU can do */
+ unsigned int nr_addr_filters;
+
/*
* Fully disable/enable this PMU, can be used to protect from the PMI
* as well as for lazy/batch writing of the MSRs.
@@ -393,12 +406,71 @@ struct pmu {
void (*free_aux) (void *aux); /* optional */
/*
+ * Validate address range filters: make sure the HW supports the
+ * requested configuration and number of filters; return 0 if the
+ * supplied filters are valid, -errno otherwise.
+ *
+ * Runs in the context of the ioctl()ing process and is not serialized
+ * with the rest of the PMU callbacks.
+ */
+ int (*addr_filters_validate) (struct list_head *filters);
+ /* optional */
+
+ /*
+ * Synchronize address range filter configuration:
+ * translate hw-agnostic filters into hardware configuration in
+ * event::hw::addr_filters.
+ *
+ * Runs as a part of filter sync sequence that is done in ->start()
+ * callback by calling perf_event_addr_filters_sync().
+ *
+ * May (and should) traverse event::addr_filters::list, for which its
+ * caller provides necessary serialization.
+ */
+ void (*addr_filters_sync) (struct perf_event *event);
+ /* optional */
+
+ /*
* Filter events for PMU-specific reasons.
*/
int (*filter_match) (struct perf_event *event); /* optional */
};
/**
+ * struct perf_addr_filter - address range filter definition
+ * @entry: event's filter list linkage
+ * @inode: object file's inode for file-based filters
+ * @offset: filter range offset
+ * @size: filter range size
+ * @range: 1: range, 0: address
+ * @filter: 1: filter/start, 0: stop
+ *
+ * This is a hardware-agnostic filter configuration as specified by the user.
+ */
+struct perf_addr_filter {
+ struct list_head entry;
+ struct inode *inode;
+ unsigned long offset;
+ unsigned long size;
+ unsigned int range : 1,
+ filter : 1;
+};
+
+/**
+ * struct perf_addr_filters_head - container for address range filters
+ * @list: list of filters for this event
+ * @lock: spinlock that serializes accesses to the @list and event's
+ * (and its children's) filter generations.
+ *
+ * A child event will use parent's @list (and therefore @lock), so they are
+ * bundled together; see perf_event_addr_filters().
+ */
+struct perf_addr_filters_head {
+ struct list_head list;
+ raw_spinlock_t lock;
+};
+
+/**
* enum perf_event_active_state - the states of a event
*/
enum perf_event_active_state {
@@ -566,6 +638,12 @@ struct perf_event {
atomic_t event_limit;
+ /* address range filters */
+ struct perf_addr_filters_head addr_filters;
+ /* vma address array for file-based filters */
+ unsigned long *addr_filters_offs;
+ unsigned long addr_filters_gen;
+
void (*destroy)(struct perf_event *);
struct rcu_head rcu_head;
@@ -834,9 +912,25 @@ extern int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs);
+extern void perf_event_output_forward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
+extern void perf_event_output_backward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
extern void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs);
+ struct perf_sample_data *data,
+ struct pt_regs *regs);
+
+static inline bool
+is_default_overflow_handler(struct perf_event *event)
+{
+ if (likely(event->overflow_handler == perf_event_output_forward))
+ return true;
+ if (unlikely(event->overflow_handler == perf_event_output_backward))
+ return true;
+ return false;
+}
extern void
perf_event_header__init_id(struct perf_event_header *header,
@@ -977,9 +1071,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
+extern int sysctl_perf_event_max_stack;
+
static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
{
- if (entry->nr < PERF_MAX_STACK_DEPTH) {
+ if (entry->nr < sysctl_perf_event_max_stack) {
entry->ip[entry->nr++] = ip;
return 0;
} else {
@@ -1001,6 +1097,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
static inline bool perf_paranoid_tracepoint_raw(void)
{
@@ -1045,8 +1143,41 @@ static inline bool has_aux(struct perf_event *event)
return event->pmu->setup_aux;
}
+static inline bool is_write_backward(struct perf_event *event)
+{
+ return !!event->attr.write_backward;
+}
+
+static inline bool has_addr_filter(struct perf_event *event)
+{
+ return event->pmu->nr_addr_filters;
+}
+
+/*
+ * An inherited event uses parent's filters
+ */
+static inline struct perf_addr_filters_head *
+perf_event_addr_filters(struct perf_event *event)
+{
+ struct perf_addr_filters_head *ifh = &event->addr_filters;
+
+ if (event->parent)
+ ifh = &event->parent->addr_filters;
+
+ return ifh;
+}
+
+extern void perf_event_addr_filters_sync(struct perf_event *event);
+
extern int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size);
+extern int perf_output_begin_forward(struct perf_output_handle *handle,
+ struct perf_event *event,
+ unsigned int size);
+extern int perf_output_begin_backward(struct perf_output_handle *handle,
+ struct perf_event *event,
+ unsigned int size);
+
extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len);
diff --git a/include/linux/proportions.h b/include/linux/proportions.h
deleted file mode 100644
index 21221338ad18..000000000000
--- a/include/linux/proportions.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * FLoating proportions
- *
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
- *
- * This file contains the public data structure and API definitions.
- */
-
-#ifndef _LINUX_PROPORTIONS_H
-#define _LINUX_PROPORTIONS_H
-
-#include <linux/percpu_counter.h>
-#include <linux/spinlock.h>
-#include <linux/mutex.h>
-#include <linux/gfp.h>
-
-struct prop_global {
- /*
- * The period over which we differentiate
- *
- * period = 2^shift
- */
- int shift;
- /*
- * The total event counter aka 'time'.
- *
- * Treated as an unsigned long; the lower 'shift - 1' bits are the
- * counter bits, the remaining upper bits the period counter.
- */
- struct percpu_counter events;
-};
-
-/*
- * global proportion descriptor
- *
- * this is needed to consistently flip prop_global structures.
- */
-struct prop_descriptor {
- int index;
- struct prop_global pg[2];
- struct mutex mutex; /* serialize the prop_global switch */
-};
-
-int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp);
-void prop_change_shift(struct prop_descriptor *pd, int new_shift);
-
-/*
- * ----- PERCPU ------
- */
-
-struct prop_local_percpu {
- /*
- * the local events counter
- */
- struct percpu_counter events;
-
- /*
- * snapshot of the last seen global state
- */
- int shift;
- unsigned long period;
- raw_spinlock_t lock; /* protect the snapshot state */
-};
-
-int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp);
-void prop_local_destroy_percpu(struct prop_local_percpu *pl);
-void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl);
-void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl,
- long *numerator, long *denominator);
-
-static inline
-void prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __prop_inc_percpu(pd, pl);
- local_irq_restore(flags);
-}
-
-/*
- * Limit the time part in order to ensure there are some bits left for the
- * cycle counter and fraction multiply.
- */
-#if BITS_PER_LONG == 32
-#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
-#else
-#define PROP_MAX_SHIFT (BITS_PER_LONG/2)
-#endif
-
-#define PROP_FRAC_SHIFT (BITS_PER_LONG - PROP_MAX_SHIFT - 1)
-#define PROP_FRAC_BASE (1UL << PROP_FRAC_SHIFT)
-
-void __prop_inc_percpu_max(struct prop_descriptor *pd,
- struct prop_local_percpu *pl, long frac);
-
-
-/*
- * ----- SINGLE ------
- */
-
-struct prop_local_single {
- /*
- * the local events counter
- */
- unsigned long events;
-
- /*
- * snapshot of the last seen global state
- * and a lock protecting this state
- */
- unsigned long period;
- int shift;
- raw_spinlock_t lock; /* protect the snapshot state */
-};
-
-#define INIT_PROP_LOCAL_SINGLE(name) \
-{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
-}
-
-int prop_local_init_single(struct prop_local_single *pl);
-void prop_local_destroy_single(struct prop_local_single *pl);
-void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl);
-void prop_fraction_single(struct prop_descriptor *pd, struct prop_local_single *pl,
- long *numerator, long *denominator);
-
-static inline
-void prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __prop_inc_single(pd, pl);
- local_irq_restore(flags);
-}
-
-#endif /* _LINUX_PROPORTIONS_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 2657aff2725b..5f1533e3d032 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -508,14 +508,7 @@ int rcu_read_lock_bh_held(void);
* CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
* critical section unless it can prove otherwise.
*/
-#ifdef CONFIG_PREEMPT_COUNT
int rcu_read_lock_sched_held(void);
-#else /* #ifdef CONFIG_PREEMPT_COUNT */
-static inline int rcu_read_lock_sched_held(void)
-{
- return 1;
-}
-#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
@@ -532,18 +525,10 @@ static inline int rcu_read_lock_bh_held(void)
return 1;
}
-#ifdef CONFIG_PREEMPT_COUNT
static inline int rcu_read_lock_sched_held(void)
{
- return preempt_count() != 0 || irqs_disabled();
-}
-#else /* #ifdef CONFIG_PREEMPT_COUNT */
-static inline int rcu_read_lock_sched_held(void)
-{
- return 1;
+ return !preemptible();
}
-#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */
-
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#ifdef CONFIG_PROVE_RCU
@@ -1144,4 +1129,17 @@ static inline void rcu_sysidle_force_exit(void)
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+/*
+ * Dump the ftrace buffer, but only one time per callsite per boot.
+ */
+#define rcu_ftrace_dump(oops_dump_mode) \
+do { \
+ static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
+ \
+ if (!atomic_read(&___rfd_beenhere) && \
+ !atomic_xchg(&___rfd_beenhere, 1)) \
+ ftrace_dump(oops_dump_mode); \
+} while (0)
+
+
#endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 64809aea661c..93aea75029fb 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -149,6 +149,22 @@ static inline unsigned long rcu_batches_completed_sched(void)
return 0;
}
+/*
+ * Return the number of expedited grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed(void)
+{
+ return 0;
+}
+
+/*
+ * Return the number of expedited sched grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed_sched(void)
+{
+ return 0;
+}
+
static inline void rcu_force_quiescent_state(void)
{
}
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index ad1eda9fa4da..5043cb823fb2 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -87,6 +87,8 @@ unsigned long rcu_batches_started_sched(void);
unsigned long rcu_batches_completed(void);
unsigned long rcu_batches_completed_bh(void);
unsigned long rcu_batches_completed_sched(void);
+unsigned long rcu_exp_batches_completed(void);
+unsigned long rcu_exp_batches_completed_sched(void);
void show_rcu_gp_kthreads(void);
void rcu_force_quiescent_state(void);
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index 561e8615528d..ae0528b834cd 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -34,7 +34,7 @@ struct rw_semaphore {
extern void __down_read(struct rw_semaphore *sem);
extern int __down_read_trylock(struct rw_semaphore *sem);
extern void __down_write(struct rw_semaphore *sem);
-extern void __down_write_nested(struct rw_semaphore *sem, int subclass);
+extern int __must_check __down_write_killable(struct rw_semaphore *sem);
extern int __down_write_trylock(struct rw_semaphore *sem);
extern void __up_read(struct rw_semaphore *sem);
extern void __up_write(struct rw_semaphore *sem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8f498cdde280..d1c12d160ace 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -14,6 +14,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
+#include <linux/err.h>
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif
@@ -43,6 +44,7 @@ struct rw_semaphore {
extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
@@ -116,6 +118,7 @@ extern int down_read_trylock(struct rw_semaphore *sem);
* lock for writing
*/
extern void down_write(struct rw_semaphore *sem);
+extern int __must_check down_write_killable(struct rw_semaphore *sem);
/*
* trylock for writing -- returns 1 if successful, 0 if contention
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 38526b67e787..6cc0df970f1a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -40,7 +40,6 @@ struct sched_param {
#include <linux/pid.h>
#include <linux/percpu.h>
#include <linux/topology.h>
-#include <linux/proportions.h>
#include <linux/seccomp.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
@@ -1655,6 +1654,7 @@ struct task_struct {
unsigned long sas_ss_sp;
size_t sas_ss_size;
+ unsigned sas_ss_flags;
struct callback_head *task_works;
@@ -2667,6 +2667,18 @@ static inline int kill_cad_pid(int sig, int priv)
*/
static inline int on_sig_stack(unsigned long sp)
{
+ /*
+ * If the signal stack is SS_AUTODISARM then, by construction, we
+ * can't be on the signal stack unless user code deliberately set
+ * SS_AUTODISARM when we were already on it.
+ *
+ * This improves reliability: if user state gets corrupted such that
+ * the stack pointer points very close to the end of the signal stack,
+ * then this check will enable the signal to be handled anyway.
+ */
+ if (current->sas_ss_flags & SS_AUTODISARM)
+ return 0;
+
#ifdef CONFIG_STACK_GROWSUP
return sp >= current->sas_ss_sp &&
sp - current->sas_ss_sp < current->sas_ss_size;
@@ -2684,6 +2696,13 @@ static inline int sas_ss_flags(unsigned long sp)
return on_sig_stack(sp) ? SS_ONSTACK : 0;
}
+static inline void sas_ss_reset(struct task_struct *p)
+{
+ p->sas_ss_sp = 0;
+ p->sas_ss_size = 0;
+ p->sas_ss_flags = SS_DISABLE;
+}
+
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 92557bbce7e7..3fbe81444d31 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -432,8 +432,10 @@ int __save_altstack(stack_t __user *, unsigned long);
stack_t __user *__uss = uss; \
struct task_struct *t = current; \
put_user_ex((void __user *)t->sas_ss_sp, &__uss->ss_sp); \
- put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \
+ put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \
put_user_ex(t->sas_ss_size, &__uss->ss_size); \
+ if (t->sas_ss_flags & SS_AUTODISARM) \
+ sas_ss_reset(t); \
} while (0);
#ifdef CONFIG_PROC_FS
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0a4cd4703f40..ad220359f1b0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
-extern int reuse_swap_page(struct page *);
+extern bool reuse_swap_page(struct page *, int *);
extern int try_to_free_swap(struct page *);
struct backing_dev_info;
@@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-#define reuse_swap_page(page) \
- (!PageTransCompound(page) && page_mapcount(page) == 1)
+#define reuse_swap_page(page, total_mapcount) \
+ (page_trans_huge_mapcount(page, total_mapcount) == 1)
static inline int try_to_free_swap(struct page *page)
{
diff --git a/include/linux/uio.h b/include/linux/uio.h
index fd9bcfedad42..1b5d1cd796e2 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -87,6 +87,7 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
unsigned long iov_iter_alignment(const struct iov_iter *i);
+unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
unsigned long nr_segs, size_t count);
void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec,
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index ef72c4aada56..d3e756539d44 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -172,6 +172,77 @@ TRACE_EVENT(rcu_grace_period_init,
);
/*
+ * Tracepoint for expedited grace-period events. Takes a string identifying
+ * the RCU flavor, the expedited grace-period sequence number, and a string
+ * identifying the grace-period-related event as follows:
+ *
+ * "snap": Captured snapshot of expedited grace period sequence number.
+ * "start": Started a real expedited grace period.
+ * "end": Ended a real expedited grace period.
+ * "endwake": Woke piggybackers up.
+ * "done": Someone else did the expedited grace period for us.
+ */
+TRACE_EVENT(rcu_exp_grace_period,
+
+ TP_PROTO(const char *rcuname, unsigned long gpseq, const char *gpevent),
+
+ TP_ARGS(rcuname, gpseq, gpevent),
+
+ TP_STRUCT__entry(
+ __field(const char *, rcuname)
+ __field(unsigned long, gpseq)
+ __field(const char *, gpevent)
+ ),
+
+ TP_fast_assign(
+ __entry->rcuname = rcuname;
+ __entry->gpseq = gpseq;
+ __entry->gpevent = gpevent;
+ ),
+
+ TP_printk("%s %lu %s",
+ __entry->rcuname, __entry->gpseq, __entry->gpevent)
+);
+
+/*
+ * Tracepoint for expedited grace-period funnel-locking events. Takes a
+ * string identifying the RCU flavor, an integer identifying the rcu_node
+ * combining-tree level, another pair of integers identifying the lowest-
+ * and highest-numbered CPU associated with the current rcu_node structure,
+ * and a string identifying the grace-period-related event as follows:
+ *
+ * "nxtlvl": Advance to next level of rcu_node funnel
+ * "wait": Wait for someone else to do expedited GP
+ */
+TRACE_EVENT(rcu_exp_funnel_lock,
+
+ TP_PROTO(const char *rcuname, u8 level, int grplo, int grphi,
+ const char *gpevent),
+
+ TP_ARGS(rcuname, level, grplo, grphi, gpevent),
+
+ TP_STRUCT__entry(
+ __field(const char *, rcuname)
+ __field(u8, level)
+ __field(int, grplo)
+ __field(int, grphi)
+ __field(const char *, gpevent)
+ ),
+
+ TP_fast_assign(
+ __entry->rcuname = rcuname;
+ __entry->level = level;
+ __entry->grplo = grplo;
+ __entry->grphi = grphi;
+ __entry->gpevent = gpevent;
+ ),
+
+ TP_printk("%s %d %d %d %s",
+ __entry->rcuname, __entry->level, __entry->grplo,
+ __entry->grphi, __entry->gpevent)
+);
+
+/*
* Tracepoint for RCU no-CBs CPU callback handoffs. This event is intended
* to assist debugging of these handoffs.
*
@@ -704,11 +775,15 @@ TRACE_EVENT(rcu_barrier,
#else /* #ifdef CONFIG_RCU_TRACE */
#define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0)
-#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \
- qsmask) do { } while (0)
#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \
level, grplo, grphi, event) \
do { } while (0)
+#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \
+ qsmask) do { } while (0)
+#define trace_rcu_exp_grace_period(rcuname, gqseq, gpevent) \
+ do { } while (0)
+#define trace_rcu_exp_funnel_lock(rcuname, level, grplo, grphi, gpevent) \
+ do { } while (0)
#define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0)
#define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index f80277569f24..e601c8c3bdc7 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -19,14 +19,20 @@
#ifndef _LINUX_IF_H
#define _LINUX_IF_H
+#include <linux/libc-compat.h> /* for compatibility with glibc */
#include <linux/types.h> /* for "__kernel_caddr_t" et al */
#include <linux/socket.h> /* for "struct sockaddr" et al */
#include <linux/compiler.h> /* for "__user" et al */
+#if __UAPI_DEF_IF_IFNAMSIZ
#define IFNAMSIZ 16
+#endif /* __UAPI_DEF_IF_IFNAMSIZ */
#define IFALIASZ 256
#include <linux/hdlc/ioctl.h>
+/* For glibc compatibility. An empty enum does not compile. */
+#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO != 0 && \
+ __UAPI_DEF_IF_NET_DEVICE_FLAGS != 0
/**
* enum net_device_flags - &struct net_device flags
*
@@ -68,6 +74,8 @@
* @IFF_ECHO: echo sent packets. Volatile.
*/
enum net_device_flags {
+/* for compatibility with glibc net/if.h */
+#if __UAPI_DEF_IF_NET_DEVICE_FLAGS
IFF_UP = 1<<0, /* sysfs */
IFF_BROADCAST = 1<<1, /* volatile */
IFF_DEBUG = 1<<2, /* sysfs */
@@ -84,11 +92,17 @@ enum net_device_flags {
IFF_PORTSEL = 1<<13, /* sysfs */
IFF_AUTOMEDIA = 1<<14, /* sysfs */
IFF_DYNAMIC = 1<<15, /* sysfs */
+#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS */
+#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO
IFF_LOWER_UP = 1<<16, /* volatile */
IFF_DORMANT = 1<<17, /* volatile */
IFF_ECHO = 1<<18, /* volatile */
+#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */
};
+#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO != 0 && __UAPI_DEF_IF_NET_DEVICE_FLAGS != 0 */
+/* for compatibility with glibc net/if.h */
+#if __UAPI_DEF_IF_NET_DEVICE_FLAGS
#define IFF_UP IFF_UP
#define IFF_BROADCAST IFF_BROADCAST
#define IFF_DEBUG IFF_DEBUG
@@ -105,9 +119,13 @@ enum net_device_flags {
#define IFF_PORTSEL IFF_PORTSEL
#define IFF_AUTOMEDIA IFF_AUTOMEDIA
#define IFF_DYNAMIC IFF_DYNAMIC
+#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS */
+
+#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO
#define IFF_LOWER_UP IFF_LOWER_UP
#define IFF_DORMANT IFF_DORMANT
#define IFF_ECHO IFF_ECHO
+#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */
#define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
@@ -166,6 +184,8 @@ enum {
* being very small might be worth keeping for clean configuration.
*/
+/* for compatibility with glibc net/if.h */
+#if __UAPI_DEF_IF_IFMAP
struct ifmap {
unsigned long mem_start;
unsigned long mem_end;
@@ -175,6 +195,7 @@ struct ifmap {
unsigned char port;
/* 3 bytes spare */
};
+#endif /* __UAPI_DEF_IF_IFMAP */
struct if_settings {
unsigned int type; /* Type of physical device or protocol */
@@ -200,6 +221,8 @@ struct if_settings {
* remainder may be interface specific.
*/
+/* for compatibility with glibc net/if.h */
+#if __UAPI_DEF_IF_IFREQ
struct ifreq {
#define IFHWADDRLEN 6
union
@@ -223,6 +246,7 @@ struct ifreq {
struct if_settings ifru_settings;
} ifr_ifru;
};
+#endif /* __UAPI_DEF_IF_IFREQ */
#define ifr_name ifr_ifrn.ifrn_name /* interface name */
#define ifr_hwaddr ifr_ifru.ifru_hwaddr /* MAC address */
@@ -249,6 +273,8 @@ struct ifreq {
* must know all networks accessible).
*/
+/* for compatibility with glibc net/if.h */
+#if __UAPI_DEF_IF_IFCONF
struct ifconf {
int ifc_len; /* size of buffer */
union {
@@ -256,6 +282,8 @@ struct ifconf {
struct ifreq __user *ifcu_req;
} ifc_ifcu;
};
+#endif /* __UAPI_DEF_IF_IFCONF */
+
#define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */
#define ifc_req ifc_ifcu.ifcu_req /* array of structures */
diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
index 7d024ceb075d..d5e38c73377c 100644
--- a/include/uapi/linux/libc-compat.h
+++ b/include/uapi/linux/libc-compat.h
@@ -51,6 +51,40 @@
/* We have included glibc headers... */
#if defined(__GLIBC__)
+/* Coordinate with glibc net/if.h header. */
+#if defined(_NET_IF_H)
+
+/* GLIBC headers included first so don't define anything
+ * that would already be defined. */
+
+#define __UAPI_DEF_IF_IFCONF 0
+#define __UAPI_DEF_IF_IFMAP 0
+#define __UAPI_DEF_IF_IFNAMSIZ 0
+#define __UAPI_DEF_IF_IFREQ 0
+/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */
+#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 0
+/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */
+#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO
+#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1
+#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */
+
+#else /* _NET_IF_H */
+
+/* Linux headers included first, and we must define everything
+ * we need. The expectation is that glibc will check the
+ * __UAPI_DEF_* defines and adjust appropriately. */
+
+#define __UAPI_DEF_IF_IFCONF 1
+#define __UAPI_DEF_IF_IFMAP 1
+#define __UAPI_DEF_IF_IFNAMSIZ 1
+#define __UAPI_DEF_IF_IFREQ 1
+/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */
+#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1
+/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */
+#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1
+
+#endif /* _NET_IF_H */
+
/* Coordinate with glibc netinet/in.h header. */
#if defined(_NETINET_IN_H)
@@ -117,6 +151,16 @@
* that we need. */
#else /* !defined(__GLIBC__) */
+/* Definitions for if.h */
+#define __UAPI_DEF_IF_IFCONF 1
+#define __UAPI_DEF_IF_IFMAP 1
+#define __UAPI_DEF_IF_IFNAMSIZ 1
+#define __UAPI_DEF_IF_IFREQ 1
+/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */
+#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1
+/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */
+#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1
+
/* Definitions for in.h */
#define __UAPI_DEF_IN_ADDR 1
#define __UAPI_DEF_IN_IPPROTO 1
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe9623c1a7..43fc8d213472 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -340,7 +340,8 @@ struct perf_event_attr {
comm_exec : 1, /* flag comm events that are due to an exec */
use_clockid : 1, /* use @clockid for time fields */
context_switch : 1, /* context switch data */
- __reserved_1 : 37;
+ write_backward : 1, /* Write ring buffer from end to beginning */
+ __reserved_1 : 36;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -401,6 +402,7 @@ struct perf_event_attr {
#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *)
#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *)
#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
+#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32)
enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
diff --git a/include/uapi/linux/signal.h b/include/uapi/linux/signal.h
index e1bd50c29ded..cd0804b6bfa2 100644
--- a/include/uapi/linux/signal.h
+++ b/include/uapi/linux/signal.h
@@ -7,4 +7,9 @@
#define SS_ONSTACK 1
#define SS_DISABLE 2
+/* bit-flags */
+#define SS_AUTODISARM (1U << 31) /* disable sas during sighandling */
+/* mask for all SS_xxx flags */
+#define SS_FLAG_BITS SS_AUTODISARM
+
#endif /* _UAPI_LINUX_SIGNAL_H */
diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild
index 242cf0c6e33d..e3969bd939e4 100644
--- a/include/uapi/linux/tc_act/Kbuild
+++ b/include/uapi/linux/tc_act/Kbuild
@@ -10,3 +10,4 @@ header-y += tc_skbedit.h
header-y += tc_vlan.h
header-y += tc_bpf.h
header-y += tc_connmark.h
+header-y += tc_ife.h
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 ||
- value_size / 8 > PERF_MAX_STACK_DEPTH)
+ value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / 8;
- /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
- u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
return -EFAULT;
/* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+ * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
trace_nr = trace->nr - init_nr;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a7d31ffd3..86cb5c6e8932 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_free_root(root);
}
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ rcu_read_lock();
+
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ if (cset == &init_css_set) {
+ res = &root->cgrp;
+ } else {
+ struct cgrp_cset_link *link;
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ if (c->root == root) {
+ res = c;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ BUG_ON(!res);
+ return res;
+}
+
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
@@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
return 0;
}
+static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root)
+{
+ int len = 0;
+ char *buf = NULL;
+ struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
+ struct cgroup *ns_cgroup;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ spin_lock_bh(&css_set_lock);
+ ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
+ len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+ spin_unlock_bh(&css_set_lock);
+
+ if (len >= PATH_MAX)
+ len = -ERANGE;
+ else if (len > 0) {
+ seq_escape(sf, buf, " \t\n\\");
+ len = 0;
+ }
+ kfree(buf);
+ return len;
+}
+
static int cgroup_show_options(struct seq_file *seq,
struct kernfs_root *kf_root)
{
@@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.rename = cgroup_rename,
+ .show_path = cgroup_show_path,
};
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+ return (sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
cpu = smp_processor_id();
- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)entries->cpu_entries[cpu]) +
+ (*rctx * perf_callchain_entry__sizeof()));
}
static void
@@ -215,3 +224,25 @@ exit_put:
return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c0ded2416615..050a290c72c7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -44,6 +44,8 @@
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/namei.h>
+#include <linux/parser.h>
#include "internal.h"
@@ -1927,8 +1929,13 @@ event_sched_in(struct perf_event *event,
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
- event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = smp_processor_id();
+ WRITE_ONCE(event->oncpu, smp_processor_id());
+ /*
+ * Order event::oncpu write to happen before the ACTIVE state
+ * is visible.
+ */
+ smp_wmb();
+ WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
@@ -2360,6 +2367,112 @@ void perf_event_enable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_enable);
+struct stop_event_data {
+ struct perf_event *event;
+ unsigned int restart;
+};
+
+static int __perf_event_stop(void *info)
+{
+ struct stop_event_data *sd = info;
+ struct perf_event *event = sd->event;
+
+ /* if it's already INACTIVE, do nothing */
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * There is a window with interrupts enabled before we get here,
+ * so we need to check again lest we try to stop another CPU's event.
+ */
+ if (READ_ONCE(event->oncpu) != smp_processor_id())
+ return -EAGAIN;
+
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ /*
+ * May race with the actual stop (through perf_pmu_output_stop()),
+ * but it is only used for events with AUX ring buffer, and such
+ * events will refuse to restart because of rb::aux_mmap_count==0,
+ * see comments in perf_aux_output_begin().
+ *
+ * Since this is happening on an event-local CPU, no trace is lost
+ * while restarting.
+ */
+ if (sd->restart)
+ event->pmu->start(event, PERF_EF_START);
+
+ return 0;
+}
+
+static int perf_event_restart(struct perf_event *event)
+{
+ struct stop_event_data sd = {
+ .event = event,
+ .restart = 1,
+ };
+ int ret = 0;
+
+ do {
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * We only want to restart ACTIVE events, so if the event goes
+ * inactive here (event->oncpu==-1), there's nothing more to do;
+ * fall through with ret==-ENXIO.
+ */
+ ret = cpu_function_call(READ_ONCE(event->oncpu),
+ __perf_event_stop, &sd);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+/*
+ * In order to contain the amount of racy and tricky code in the address filter
+ * configuration management, it is a two part process:
+ *
+ * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
+ * we update the addresses of corresponding vmas in
+ * event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * (p2) when an event is scheduled in (pmu::add), it calls
+ * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
+ * if the generation has changed since the previous call.
+ *
+ * If (p1) happens while the event is active, we restart it to force (p2).
+ *
+ * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
+ * pre-existing mappings, called once when new filters arrive via SET_FILTER
+ * ioctl;
+ * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
+ * registered mapping, called for every new mmap(), with mm::mmap_sem down
+ * for reading;
+ * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
+ * of exec.
+ */
+void perf_event_addr_filters_sync(struct perf_event *event)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ if (!has_addr_filter(event))
+ return;
+
+ raw_spin_lock(&ifh->lock);
+ if (event->addr_filters_gen != event->hw.addr_filters_gen) {
+ event->pmu->addr_filters_sync(event);
+ event->hw.addr_filters_gen = event->addr_filters_gen;
+ }
+ raw_spin_unlock(&ifh->lock);
+}
+EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
+
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
@@ -3209,16 +3322,6 @@ out:
put_ctx(clone_ctx);
}
-void perf_event_exec(void)
-{
- int ctxn;
-
- rcu_read_lock();
- for_each_task_context_nr(ctxn)
- perf_event_enable_on_exec(ctxn);
- rcu_read_unlock();
-}
-
struct perf_read_data {
struct perf_event *event;
bool group;
@@ -3720,6 +3823,9 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
+static void perf_addr_filters_splice(struct perf_event *event,
+ struct list_head *head);
+
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3747,6 +3853,8 @@ static void _free_event(struct perf_event *event)
}
perf_event_free_bpf_prog(event);
+ perf_addr_filters_splice(event, NULL);
+ kfree(event->addr_filters_offs);
if (event->destroy)
event->destroy(event);
@@ -4343,6 +4451,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
+ case PERF_EVENT_IOC_PAUSE_OUTPUT: {
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb || !rb->nr_pages) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ rb_toggle_paused(rb, !!arg);
+ rcu_read_unlock();
+ return 0;
+ }
default:
return -ENOTTY;
}
@@ -4659,6 +4780,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
event->pmu->event_mapped(event);
}
+static void perf_pmu_output_stop(struct perf_event *event);
+
/*
* A buffer can be mmap()ed multiple times; either directly through the same
* event, or through other events by use of perf_event_set_output().
@@ -4686,10 +4809,22 @@ static void perf_mmap_close(struct vm_area_struct *vma)
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ /*
+ * Stop all AUX events that are writing to this buffer,
+ * so that we can free its AUX pages and corresponding PMU
+ * data. Note that after rb::aux_mmap_count dropped to zero,
+ * they won't start any more (see perf_aux_output_begin()).
+ */
+ perf_pmu_output_stop(event);
+
+ /* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ /* this has to be the last one */
rb_free_aux(rb);
+ WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
mutex_unlock(&event->mmap_mutex);
}
@@ -5630,9 +5765,13 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}
-void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void __always_inline
+__perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs,
+ int (*output_begin)(struct perf_output_handle *,
+ struct perf_event *,
+ unsigned int))
{
struct perf_output_handle handle;
struct perf_event_header header;
@@ -5642,7 +5781,7 @@ void perf_event_output(struct perf_event *event,
perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ if (output_begin(&handle, event, header.size))
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -5653,6 +5792,30 @@ exit:
rcu_read_unlock();
}
+void
+perf_event_output_forward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_forward);
+}
+
+void
+perf_event_output_backward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_backward);
+}
+
+void
+perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin);
+}
+
/*
* read event_id
*/
@@ -5698,15 +5861,18 @@ typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
static void
perf_event_aux_ctx(struct perf_event_context *ctx,
perf_event_aux_output_cb output,
- void *data)
+ void *data, bool all)
{
struct perf_event *event;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- continue;
- if (!event_filter_match(event))
- continue;
+ if (!all) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ }
+
output(event, data);
}
}
@@ -5717,7 +5883,7 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
{
rcu_read_lock();
preempt_disable();
- perf_event_aux_ctx(task_ctx, output, data);
+ perf_event_aux_ctx(task_ctx, output, data, false);
preempt_enable();
rcu_read_unlock();
}
@@ -5747,13 +5913,13 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
if (cpuctx->unique_pmu != pmu)
goto next;
- perf_event_aux_ctx(&cpuctx->ctx, output, data);
+ perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
ctxn = pmu->task_ctx_nr;
if (ctxn < 0)
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
- perf_event_aux_ctx(ctx, output, data);
+ perf_event_aux_ctx(ctx, output, data, false);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
@@ -5761,6 +5927,134 @@ next:
}
/*
+ * Clear all file-based filters at exec, they'll have to be
+ * re-instated when/if these objects are mmapped again.
+ */
+static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
+{
+	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+	struct perf_addr_filter *filter;
+	unsigned int idx = 0;
+	bool cleared = false;
+	unsigned long flags;
+
+	if (!has_addr_filter(event))
+		return;
+
+	raw_spin_lock_irqsave(&ifh->lock, flags);
+	list_for_each_entry(filter, &ifh->list, entry) {
+		/* only filters that carry an inode get their offset zeroed */
+		if (filter->inode) {
+			event->addr_filters_offs[idx] = 0;
+			cleared = true;
+		}
+
+		/* idx tracks the filter's position in the list */
+		idx++;
+	}
+
+	/* bump the generation only when at least one offset was zeroed */
+	if (cleared)
+		event->addr_filters_gen++;
+	raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+	/* the restart is issued after the filter lock has been dropped */
+	if (cleared)
+		perf_event_restart(event);
+}
+
+/*
+ * For every task context of the current task: run
+ * perf_event_enable_on_exec() and then reset the file-based address
+ * filters of all events in that context.
+ */
+void perf_event_exec(void)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	rcu_read_lock();
+	for_each_task_context_nr(ctxn) {
+		/*
+		 * Read the context pointer with rcu_dereference(), like the
+		 * other readers of current->perf_event_ctxp[] that run
+		 * inside an RCU read-side section.
+		 */
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (!ctx)
+			continue;
+
+		perf_event_enable_on_exec(ctxn);
+
+		/* all == true: visit every event, skipping state/filter checks */
+		perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+				   true);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Argument block passed as @data to __perf_event_output_stop() through
+ * perf_event_aux_ctx().
+ */
+struct remote_output {
+ /* ring buffer that each visited event's (or its parent's) rb is compared to */
+ struct ring_buffer *rb;
+ /* return value of __perf_event_stop() for the last matching event */
+ int err;
+};
+
+/*
+ * perf_event_aux_ctx() callback: stop @event if it has AUX output and it
+ * (or its parent) is attached to the ring buffer named in @data.
+ */
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+	struct remote_output *ro = data;
+	struct perf_event *owner = event->parent ? event->parent : event;
+	struct stop_event_data sd = {
+		.event	= event,
+	};
+
+	if (!has_aux(event))
+		return;
+
+	/*
+	 * In case of inheritance, it will be the parent that links to the
+	 * ring-buffer, but it will be the child that's actually using it:
+	 */
+	if (rcu_dereference(owner->rb) != ro->rb)
+		return;
+
+	ro->err = __perf_event_stop(&sd);
+}
+
+/*
+ * Cross-call target used by perf_pmu_output_stop(): walk this CPU's
+ * context for @info's PMU, plus its current task context if any, and stop
+ * every AUX event writing to the same ring buffer as @info.
+ *
+ * Returns the error recorded by the last __perf_event_stop() call, or 0 if
+ * nothing was stopped.
+ */
+static int __perf_pmu_output_stop(void *info)
+{
+	struct perf_event *event = info;
+	struct pmu *pmu = event->pmu;
+	struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+	struct remote_output ro = {
+		.rb	= event->rb,
+	};
+
+	rcu_read_lock();
+	perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+	if (cpuctx->task_ctx)
+		perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+				   &ro, false);
+	rcu_read_unlock();
+
+	/* pair with get_cpu_ptr() above so the preempt count stays balanced */
+	put_cpu_ptr(pmu->pmu_cpu_context);
+
+	return ro.err;
+}
+
+/*
+ * Walk every event attached to @event's ring buffer and run
+ * __perf_pmu_output_stop() on the CPU each one is bound to or currently
+ * running on. The whole walk is restarted when a cross-call returns
+ * -EAGAIN.
+ */
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+ struct perf_event *iter;
+ int err, cpu;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+ /*
+ * For per-CPU events, we need to make sure that neither they
+ * nor their children are running; for cpu==-1 events it's
+ * sufficient to stop the event itself if it's active, since
+ * it can't have children.
+ */
+ cpu = iter->cpu;
+ if (cpu == -1)
+ cpu = READ_ONCE(iter->oncpu);
+
+ /* still -1: no CPU to target for this event, skip it */
+ if (cpu == -1)
+ continue;
+
+ err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+ if (err == -EAGAIN) {
+ /* leave the RCU read section before starting the walk over */
+ rcu_read_unlock();
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
* task tracking -- fork/exit
*
* enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
@@ -6169,6 +6463,87 @@ got_name:
kfree(buf);
}
+/*
+ * Whether this @filter depends on a dynamic object which is not loaded
+ * yet or its load addresses are not known: true only when both
+ * filter->filter and filter->inode are set.
+ */
+static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
+{
+	if (!filter->filter)
+		return false;
+
+	return filter->inode != NULL;
+}
+
+/*
+ * Check whether inode and address range match filter criteria: @file must
+ * be backed by the filter's inode, and the filter's
+ * [offset, offset + size] span must touch [@offset, @offset + @size].
+ */
+static bool perf_addr_filter_match(struct perf_addr_filter *filter,
+				   struct file *file, unsigned long offset,
+				   unsigned long size)
+{
+	return filter->inode == file->f_inode &&
+	       filter->offset <= offset + size &&
+	       filter->offset + filter->size >= offset;
+}
+
+/*
+ * perf_event_aux_ctx() callback: @data is the vma that triggered the
+ * update. Every filter of @event that matches the vma's file and range has
+ * its offset slot set to the vma's start address.
+ */
+static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
+{
+	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+	struct vm_area_struct *vma = data;
+	struct file *file = vma->vm_file;
+	unsigned long pgoff_bytes = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long vma_len = vma->vm_end - vma->vm_start;
+	struct perf_addr_filter *filter;
+	unsigned long flags;
+	unsigned int idx = 0;
+	bool adjusted = false;
+
+	/* nothing to do for events without filters or vmas without a file */
+	if (!has_addr_filter(event) || !file)
+		return;
+
+	raw_spin_lock_irqsave(&ifh->lock, flags);
+	list_for_each_entry(filter, &ifh->list, entry) {
+		if (perf_addr_filter_match(filter, file, pgoff_bytes,
+					   vma_len)) {
+			event->addr_filters_offs[idx] = vma->vm_start;
+			adjusted = true;
+		}
+
+		/* idx tracks the filter's position in the list */
+		idx++;
+	}
+
+	/* bump the generation only when at least one offset changed */
+	if (adjusted)
+		event->addr_filters_gen++;
+	raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+	/* the restart is issued after the filter lock has been dropped */
+	if (adjusted)
+		perf_event_restart(event);
+}
+
+/*
+ * Adjust all task's events' filters to the new vma: every task context of
+ * the current task is walked with all == true, so no event is skipped by
+ * the state/filter-match checks in perf_event_aux_ctx().
+ */
+static void perf_addr_filters_adjust(struct vm_area_struct *vma)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	rcu_read_lock();
+	for_each_task_context_nr(ctxn) {
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx)
+			perf_event_aux_ctx(ctx, __perf_addr_filters_adjust,
+					   vma, true);
+	}
+	rcu_read_unlock();
+}
+
void perf_event_mmap(struct vm_area_struct *vma)
{
struct perf_mmap_event mmap_event;
@@ -6200,6 +6575,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
/* .flags (attr_mmap2 only) */
};
+ perf_addr_filters_adjust(vma);
perf_event_mmap_event(&mmap_event);
}
@@ -6491,10 +6867,7 @@ static int __perf_event_overflow(struct perf_event *event,
irq_work_queue(&event->pending);
}
- if (event->overflow_handler)
- event->overflow_handler(event, data, regs);
- else
- perf_event_output(event, data, regs);
+ event->overflow_handler(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
@@ -7081,24 +7454,6 @@ static inline void perf_tp_register(void)
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- char *filter_str;
- int ret;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
-
- filter_str = strndup_user(arg, PAGE_SIZE);
- if (IS_ERR(filter_str))
- return PTR_ERR(filter_str);
-
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-
- kfree(filter_str);
- return ret;
-}
-
static void perf_event_free_filter(struct perf_event *event)
{
ftrace_profile_free_filter(event);
@@ -7153,11 +7508,6 @@ static inline void perf_tp_register(void)
{
}
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- return -ENOENT;
-}
-
static void perf_event_free_filter(struct perf_event *event)
{
}
@@ -7186,6 +7536,387 @@ void perf_bp_event(struct perf_event *bp, void *data)
#endif
/*
+ * Allocate a new address filter
+ */
+static struct perf_addr_filter *
+perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
+{
+	struct perf_addr_filter *new_filter;
+	int cpu = event->cpu;
+	int node;
+
+	/* events with cpu == -1 allocate from CPU 0's node */
+	if (cpu == -1)
+		cpu = 0;
+	node = cpu_to_node(cpu);
+
+	new_filter = kzalloc_node(sizeof(*new_filter), GFP_KERNEL, node);
+	if (!new_filter)
+		return NULL;
+
+	/* the zeroed filter is appended to the tail of @filters */
+	INIT_LIST_HEAD(&new_filter->entry);
+	list_add_tail(&new_filter->entry, filters);
+
+	return new_filter;
+}
+
+/*
+ * Unlink and free every filter on @filters, dropping the inode reference
+ * of those that hold one.
+ */
+static void free_filters_list(struct list_head *filters)
+{
+	struct perf_addr_filter *cur, *next;
+
+	list_for_each_entry_safe(cur, next, filters, entry) {
+		if (cur->inode)
+			iput(cur->inode);
+		list_del(&cur->entry);
+		kfree(cur);
+	}
+}
+
+/*
+ * Free existing address filters and optionally install new ones: the
+ * current list is detached under the filter lock, the entries on @head
+ * (if non-NULL) take its place, and the detached entries are freed.
+ */
+static void perf_addr_filters_splice(struct perf_event *event,
+				     struct list_head *head)
+{
+	unsigned long flags;
+	LIST_HEAD(old_filters);
+
+	/* don't bother with children, they don't have their own filters */
+	if (!has_addr_filter(event) || event->parent)
+		return;
+
+	raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
+	list_splice_init(&event->addr_filters.list, &old_filters);
+	if (head)
+		list_splice(head, &event->addr_filters.list);
+	raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
+
+	/* the old entries are freed after the lock has been dropped */
+	free_filters_list(&old_filters);
+}
+
+/*
+ * Scan through mm's vmas and see if one of them matches the
+ * @filter; if so, return that vma's start address, otherwise 0.
+ * Called with mm::mmap_sem down for reading.
+ */
+static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
+					    struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		/* vmas without a backing file cannot match */
+		if (!vma->vm_file)
+			continue;
+
+		/* the first matching vma wins */
+		if (perf_addr_filter_match(filter, vma->vm_file,
+					   vma->vm_pgoff << PAGE_SHIFT,
+					   vma->vm_end - vma->vm_start))
+			return vma->vm_start;
+	}
+
+	return 0;
+}
+
+/*
+ * Update event's address range filters based on the
+ * task's existing mappings, if any.
+ *
+ * Every filter slot is reset to 0; filters that need an mmap lookup get the
+ * start address of the first matching vma instead. The event is restarted
+ * afterwards, also when the task has no mm.
+ */
+static void perf_event_addr_filters_apply(struct perf_event *event)
+{
+	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+	struct task_struct *task = READ_ONCE(event->ctx->task);
+	struct perf_addr_filter *filter;
+	struct mm_struct *mm = NULL;
+	unsigned int count = 0;
+	unsigned long flags;
+
+	/*
+	 * We may observe TASK_TOMBSTONE, which means that the event tear-down
+	 * will stop on the parent's child_mutex that our caller is also holding
+	 */
+	if (task == TASK_TOMBSTONE)
+		return;
+
+	/*
+	 * Use the snapshot that was just checked: re-reading
+	 * event->ctx->task here could return TASK_TOMBSTONE after the check
+	 * above has already passed.
+	 */
+	mm = get_task_mm(task);
+	if (!mm)
+		goto restart;
+
+	down_read(&mm->mmap_sem);
+
+	raw_spin_lock_irqsave(&ifh->lock, flags);
+	list_for_each_entry(filter, &ifh->list, entry) {
+		event->addr_filters_offs[count] = 0;
+
+		if (perf_addr_filter_needs_mmap(filter))
+			event->addr_filters_offs[count] =
+				perf_addr_filter_apply(filter, mm);
+
+		count++;
+	}
+
+	event->addr_filters_gen++;
+	raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+	up_read(&mm->mmap_sem);
+
+	mmput(mm);
+
+restart:
+	perf_event_restart(event);
+}
+
+/*
+ * Address range filtering: limiting the data to certain
+ * instruction address ranges. Filters are ioctl()ed to us from
+ * userspace as ascii strings.
+ *
+ * Filter string format:
+ *
+ * ACTION RANGE_SPEC
+ * where ACTION is one of the
+ * * "filter": limit the trace to this region
+ * * "start": start tracing from this address
+ * * "stop": stop tracing at this address/region;
+ * RANGE_SPEC is
+ * * for kernel addresses: <start address>[/<size>]
+ * * for object files: <start address>[/<size>]@</path/to/object/file>
+ *
+ * if <size> is not specified, the range is treated as a single address.
+ */
+/* Token ids returned by match_token() for the address filter grammar. */
+enum {
+	IF_ACT_NONE = -1,	/* table terminator: nothing matched */
+	IF_ACT_FILTER,
+	IF_ACT_START,
+	IF_ACT_STOP,
+	IF_SRC_FILE,
+	IF_SRC_KERNEL,
+	IF_SRC_FILEADDR,
+	IF_SRC_KERNELADDR,
+};
+
+/* Parser states: expecting an action, expecting a source, filter complete. */
+enum {
+	IF_STATE_ACTION = 0,
+	IF_STATE_SOURCE,
+	IF_STATE_END,
+};
+
+static const match_table_t if_tokens = {
+	{ IF_ACT_FILTER,	"filter" },
+	{ IF_ACT_START,		"start" },
+	{ IF_ACT_STOP,		"stop" },
+	{ IF_SRC_FILE,		"%u/%u@%s" },
+	{ IF_SRC_KERNEL,	"%u/%u" },
+	{ IF_SRC_FILEADDR,	"%u@%s" },
+	{ IF_SRC_KERNELADDR,	"%u" },
+	/*
+	 * match_token() stops at the first entry with a NULL pattern;
+	 * without this terminator an unmatched token makes it walk past
+	 * the end of the table.
+	 */
+	{ IF_ACT_NONE,		NULL },
+};
+
+/*
+ * Address filter string parser
+ *
+ * Splits a private copy of @fstr on spaces, commas and newlines and feeds
+ * each word through if_tokens. Every "ACTION SOURCE" pair yields one
+ * perf_addr_filter appended to @filters; for file-based sources the path
+ * is looked up and the filter holds a reference on its inode.
+ *
+ * Returns 0 on success. On any failure every entry on @filters is freed
+ * and a negative errno is returned (-EINVAL for malformed or contradictory
+ * input, -ENOMEM for allocation failures, or the error from
+ * kstrtoul()/kern_path()).
+ */
+static int
+perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
+			     struct list_head *filters)
+{
+	struct perf_addr_filter *filter = NULL;
+	char *start, *orig, *filename = NULL;
+	struct path path;
+	substring_t args[MAX_OPT_ARGS];
+	int state = IF_STATE_ACTION, token;
+	unsigned int kernel = 0;
+	int ret = -EINVAL;
+
+	/* strsep() modifies its input, so work on a copy */
+	orig = fstr = kstrdup(fstr, GFP_KERNEL);
+	if (!fstr)
+		return -ENOMEM;
+
+	while ((start = strsep(&fstr, " ,\n")) != NULL) {
+		ret = -EINVAL;
+
+		/* consecutive separators produce empty words, skip them */
+		if (!*start)
+			continue;
+
+		/* filter definition begins */
+		if (state == IF_STATE_ACTION) {
+			filter = perf_addr_filter_new(event, filters);
+			if (!filter) {
+				ret = -ENOMEM;
+				goto fail;
+			}
+		}
+
+		token = match_token(start, if_tokens, args);
+		switch (token) {
+		case IF_ACT_FILTER:
+		case IF_ACT_START:
+			filter->filter = 1;
+			/* fall through */
+
+		case IF_ACT_STOP:
+			if (state != IF_STATE_ACTION)
+				goto fail;
+
+			state = IF_STATE_SOURCE;
+			break;
+
+		case IF_SRC_KERNELADDR:
+		case IF_SRC_KERNEL:
+			kernel = 1;
+			/* fall through */
+
+		case IF_SRC_FILEADDR:
+		case IF_SRC_FILE:
+			if (state != IF_STATE_SOURCE)
+				goto fail;
+
+			if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
+				filter->range = 1;
+
+			/* terminate the substring in place for kstrtoul() */
+			*args[0].to = 0;
+			ret = kstrtoul(args[0].from, 0, &filter->offset);
+			if (ret)
+				goto fail;
+
+			if (filter->range) {
+				*args[1].to = 0;
+				ret = kstrtoul(args[1].from, 0, &filter->size);
+				if (ret)
+					goto fail;
+			}
+
+			/*
+			 * Both file-based forms carry a path: it is the
+			 * third argument of "%u/%u@%s" and the second one
+			 * of "%u@%s".
+			 */
+			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+				int fpos = token == IF_SRC_FILE ? 2 : 1;
+
+				filename = match_strdup(&args[fpos]);
+				if (!filename) {
+					ret = -ENOMEM;
+					goto fail;
+				}
+			}
+
+			state = IF_STATE_END;
+			break;
+
+		default:
+			goto fail;
+		}
+
+		/*
+		 * Filter definition is fully parsed, validate and install it.
+		 * Make sure that it doesn't contradict itself or the event's
+		 * attribute.
+		 */
+		if (state == IF_STATE_END) {
+			/* kstrtoul() left ret at 0; failures below are -EINVAL */
+			ret = -EINVAL;
+			if (kernel && event->attr.exclude_kernel)
+				goto fail;
+
+			if (!kernel) {
+				if (!filename)
+					goto fail;
+
+				/* look up the path and grab its inode */
+				ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+				if (ret)
+					goto fail;
+
+				filter->inode = igrab(d_inode(path.dentry));
+				path_put(&path);
+				kfree(filename);
+				filename = NULL;
+
+				ret = -EINVAL;
+				if (!filter->inode ||
+				    !S_ISREG(filter->inode->i_mode))
+					/* free_filters_list() will iput() */
+					goto fail;
+			}
+
+			/* ready to consume more filters */
+			state = IF_STATE_ACTION;
+			filter = NULL;
+			/* the next filter must not inherit this one's source kind */
+			kernel = 0;
+		}
+	}
+
+	/* an action without a source is an incomplete definition */
+	if (state != IF_STATE_ACTION) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	kfree(orig);
+
+	return 0;
+
+fail:
+	/* filename is NULL unless a path was extracted but not yet consumed */
+	kfree(filename);
+	free_filters_list(filters);
+	kfree(orig);
+
+	return ret;
+}
+
+/*
+ * Parse @filter_str, have the PMU validate the resulting filters, and
+ * replace @event's current filters with them. Returns 0 on success or a
+ * negative errno; on failure the existing filters are left in place.
+ */
+static int
+perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
+{
+	LIST_HEAD(new_filters);
+	int err;
+
+	/*
+	 * Since this is called in perf_ioctl() path, we're already holding
+	 * ctx::mutex.
+	 */
+	lockdep_assert_held(&event->ctx->mutex);
+
+	if (WARN_ON_ONCE(event->parent))
+		return -EINVAL;
+
+	/*
+	 * For now, we only support filtering in per-task events; doing so
+	 * for CPU-wide events requires additional context switching trickery,
+	 * since same object code will be mapped at different virtual
+	 * addresses in different processes.
+	 */
+	if (!event->ctx->task)
+		return -EOPNOTSUPP;
+
+	err = perf_event_parse_addr_filter(event, filter_str, &new_filters);
+	if (err)
+		return err;
+
+	err = event->pmu->addr_filters_validate(&new_filters);
+	if (!err) {
+		/* remove existing filters, if any */
+		perf_addr_filters_splice(event, &new_filters);
+
+		/* install new filters */
+		perf_event_for_each_child(event,
+					  perf_event_addr_filters_apply);
+	} else {
+		/* rejected by the PMU: discard the parsed filters */
+		free_filters_list(&new_filters);
+	}
+
+	return err;
+}
+
+/*
+ * Copy a filter string from user space and hand it to the tracepoint
+ * filter code (tracepoint events, when CONFIG_EVENT_TRACING is enabled) or
+ * to the address filter code (events with address filters). Any other
+ * event gets -EINVAL.
+ */
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+	bool tracepoint = event->attr.type == PERF_TYPE_TRACEPOINT;
+	char *filter_str;
+	int ret = -EINVAL;
+
+	if (!(IS_ENABLED(CONFIG_EVENT_TRACING) && tracepoint) &&
+	    !has_addr_filter(event))
+		return -EINVAL;
+
+	/* the user string is capped at PAGE_SIZE bytes */
+	filter_str = strndup_user(arg, PAGE_SIZE);
+	if (IS_ERR(filter_str))
+		return PTR_ERR(filter_str);
+
+	if (IS_ENABLED(CONFIG_EVENT_TRACING) && tracepoint)
+		ret = ftrace_profile_set_filter(event, event->attr.config,
+						filter_str);
+	else if (has_addr_filter(event))
+		ret = perf_event_set_addr_filter(event, filter_str);
+
+	kfree(filter_str);
+	return ret;
+}
+
+/*
* hrtimer based swevent callback
*/
@@ -7542,6 +8273,20 @@ static void free_pmu_context(struct pmu *pmu)
out:
mutex_unlock(&pmus_lock);
}
+
+/*
+ * Let userspace know that this PMU supports address range filtering:
+ * the attribute prints pmu->nr_addr_filters followed by a newline.
+ */
+static ssize_t nr_addr_filters_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	int len;
+
+	len = snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+
+	return len;
+}
+DEVICE_ATTR_RO(nr_addr_filters);
+
static struct idr pmu_idr;
static ssize_t
@@ -7643,9 +8388,19 @@ static int pmu_dev_alloc(struct pmu *pmu)
if (ret)
goto free_dev;
+ /* For PMUs with address filters, throw in an extra attribute: */
+ if (pmu->nr_addr_filters)
+ ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
+
+ if (ret)
+ goto del_dev;
+
out:
return ret;
+del_dev:
+ device_del(pmu->dev);
+
free_dev:
put_device(pmu->dev);
goto out;
@@ -7685,6 +8440,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
}
skip_type:
+ if (pmu->task_ctx_nr == perf_hw_context) {
+ static int hw_context_taken = 0;
+
+ /*
+ * Other than systems with heterogeneous CPUs, it never makes
+ * sense for two PMUs to share perf_hw_context. PMUs which are
+ * uncore must use perf_invalid_context.
+ */
+ if (WARN_ON_ONCE(hw_context_taken &&
+ !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
+ pmu->task_ctx_nr = perf_invalid_context;
+
+ hw_context_taken = 1;
+ }
+
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
if (pmu->pmu_cpu_context)
goto got_cpu_context;
@@ -7772,6 +8542,8 @@ void perf_pmu_unregister(struct pmu *pmu)
free_percpu(pmu->pmu_disable_count);
if (pmu->type >= PERF_TYPE_MAX)
idr_remove(&pmu_idr, pmu->type);
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
put_device(pmu->dev);
free_pmu_context(pmu);
@@ -7965,6 +8737,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
+ INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
@@ -7972,6 +8745,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
+ raw_spin_lock_init(&event->addr_filters.lock);
atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
@@ -8006,8 +8780,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
context = parent_event->overflow_handler_context;
}
- event->overflow_handler = overflow_handler;
- event->overflow_handler_context = context;
+ if (overflow_handler) {
+ event->overflow_handler = overflow_handler;
+ event->overflow_handler_context = context;
+ } else if (is_write_backward(event)){
+ event->overflow_handler = perf_event_output_backward;
+ event->overflow_handler_context = NULL;
+ } else {
+ event->overflow_handler = perf_event_output_forward;
+ event->overflow_handler_context = NULL;
+ }
perf_event__state_init(event);
@@ -8048,11 +8830,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (err)
goto err_pmu;
+ if (has_addr_filter(event)) {
+ event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!event->addr_filters_offs)
+ goto err_per_task;
+
+ /* force hw sync on the address filters */
+ event->addr_filters_gen = 1;
+ }
+
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers();
if (err)
- goto err_per_task;
+ goto err_addr_filters;
}
}
@@ -8061,6 +8854,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
return event;
+err_addr_filters:
+ kfree(event->addr_filters_offs);
+
err_per_task:
exclusive_event_destroy(event);
@@ -8240,6 +9036,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
goto out;
/*
+ * Either writing ring buffer from beginning or from end.
+ * Mixing is not allowed.
+ */
+ if (is_write_backward(output_event) != is_write_backward(event))
+ goto out;
+
+ /*
* If both events generate aux data, they must be on the same PMU
*/
if (has_aux(event) && has_aux(output_event) &&
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 4199b6d193f5..05f9f6d626df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,13 +11,13 @@
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
- struct irq_work irq_work;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
int overwrite; /* can overwrite itself */
+ int paused; /* can write into ring buffer */
atomic_t poll; /* POLL_ for wakeups */
@@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
rb_free(rb);
}
+/*
+ * Set or clear rb->paused. A buffer with no data pages (nr_pages == 0)
+ * is left paused even when @pause is false.
+ */
+static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
+{
+	rb->paused = (pause || !rb->nr_pages) ? 1 : 0;
+}
+
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index c61f0cbd308b..ae9b90dc9a5a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -102,8 +102,21 @@ out:
preempt_enable();
}
-int perf_output_begin(struct perf_output_handle *handle,
- struct perf_event *event, unsigned int size)
+/*
+ * Whether @size bytes fit between @head and @tail of a buffer of
+ * @data_size bytes. For backward writing the roles of head and tail in
+ * CIRC_SPACE() are swapped.
+ */
+static bool __always_inline
+ring_buffer_has_space(unsigned long head, unsigned long tail,
+		      unsigned long data_size, unsigned int size,
+		      bool backward)
+{
+	unsigned long space = backward ?
+			      CIRC_SPACE(tail, head, data_size) :
+			      CIRC_SPACE(head, tail, data_size);
+
+	return space >= size;
+}
+
+static int __always_inline
+__perf_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size,
+ bool backward)
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
@@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle,
if (unlikely(!rb))
goto out;
- if (unlikely(!rb->nr_pages))
+ if (unlikely(rb->paused)) {
+ if (rb->nr_pages)
+ local_inc(&rb->lost);
goto out;
+ }
handle->rb = rb;
handle->event = event;
@@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle,
do {
tail = READ_ONCE(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
- if (!rb->overwrite &&
- unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
- goto fail;
+ if (!rb->overwrite) {
+ if (unlikely(!ring_buffer_has_space(head, tail,
+ perf_data_size(rb),
+ size, backward)))
+ goto fail;
+ }
/*
* The above forms a control dependency barrier separating the
@@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle,
* See perf_output_put_handle().
*/
- head += size;
+ if (!backward)
+ head += size;
+ else
+ head -= size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
+ if (backward) {
+ offset = head;
+ head = (u64)(-head);
+ }
+
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
* none of the data stores below can be lifted up by the compiler.
@@ -203,6 +230,26 @@ out:
return -ENOSPC;
}
+/* Begin an output transaction with the write direction fixed to forward. */
+int perf_output_begin_forward(struct perf_output_handle *handle,
+			      struct perf_event *event, unsigned int size)
+{
+	const bool backward = false;
+
+	return __perf_output_begin(handle, event, size, backward);
+}
+
+/* Begin an output transaction with the write direction fixed to backward. */
+int perf_output_begin_backward(struct perf_output_handle *handle,
+			       struct perf_event *event, unsigned int size)
+{
+	const bool backward = true;
+
+	return __perf_output_begin(handle, event, size, backward);
+}
+
+/*
+ * Begin an output transaction, taking the write direction from
+ * is_write_backward(@event); backward writing is hinted as the unlikely
+ * case.
+ */
+int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_event *event, unsigned int size)
+{
+	bool backward = unlikely(is_write_backward(event));
+
+	return __perf_output_begin(handle, event, size, backward);
+}
+
unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
@@ -221,8 +268,6 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}
-static void rb_irq_work(struct irq_work *work);
-
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
@@ -243,16 +288,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
- init_irq_work(&rb->irq_work, rb_irq_work);
-}
-static void ring_buffer_put_async(struct ring_buffer *rb)
-{
- if (!atomic_dec_and_test(&rb->refcount))
- return;
-
- rb->rcu_head.next = (void *)rb;
- irq_work_queue(&rb->irq_work);
+ /*
+ * perf_output_begin() only checks rb->paused, therefore
+ * rb->paused must be true if we have no pages for output.
+ */
+ if (!rb->nr_pages)
+ rb->paused = 1;
}
/*
@@ -264,6 +306,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb)
* The ordering is similar to that of perf_output_{begin,end}, with
* the exception of (B), which should be taken care of by the pmu
* driver, since ordering rules will differ depending on hardware.
+ *
+ * Call this from pmu::start(); see the comment in perf_aux_output_end()
+ * about its use in pmu callbacks. Both can also be called from the PMI
+ * handler if needed.
*/
void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event)
@@ -288,6 +334,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
goto err;
/*
+ * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
+ * the aux buffer is in perf_mmap_close(), about to get freed.
+ */
+ if (!atomic_read(&rb->aux_mmap_count))
+ goto err_put;
+
+ /*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
*/
@@ -328,10 +381,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
return handle->rb->aux_priv;
err_put:
+ /* can't be last */
rb_free_aux(rb);
err:
- ring_buffer_put_async(rb);
+ ring_buffer_put(rb);
handle->event = NULL;
return NULL;
@@ -342,11 +396,16 @@ err:
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
* pmu driver's responsibility to observe ordering rules of the hardware,
* so that all the data is externally visible before this is called.
+ *
+ * Note: this has to be called from pmu::stop() callback, as the assumption
+ * of the AUX buffer management code is that after pmu::stop(), the AUX
+ * transaction must be stopped and therefore drop the AUX reference count.
*/
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
bool truncated)
{
struct ring_buffer *rb = handle->rb;
+ bool wakeup = truncated;
unsigned long aux_head;
u64 flags = 0;
@@ -375,14 +434,22 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
- perf_output_wakeup(handle);
+ wakeup = true;
local_add(rb->aux_watermark, &rb->aux_wakeup);
}
+
+ if (wakeup) {
+ if (truncated)
+ handle->event->pending_disable = 1;
+ perf_output_wakeup(handle);
+ }
+
handle->event = NULL;
local_set(&rb->aux_nest, 0);
+ /* can't be last */
rb_free_aux(rb);
- ring_buffer_put_async(rb);
+ ring_buffer_put(rb);
}
/*
@@ -463,6 +530,14 @@ static void __rb_free_aux(struct ring_buffer *rb)
{
int pg;
+ /*
+ * Should never happen, the last reference should be dropped from
+ * perf_mmap_close() path, which first stops aux transactions (which
+ * in turn are the atomic holders of aux_refcount) and then does the
+ * last rb_free_aux().
+ */
+ WARN_ON_ONCE(in_atomic());
+
if (rb->aux_priv) {
rb->free_aux(rb->aux_priv);
rb->free_aux = NULL;
@@ -574,18 +649,7 @@ out:
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
- irq_work_queue(&rb->irq_work);
-}
-
-static void rb_irq_work(struct irq_work *work)
-{
- struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
-
- if (!atomic_read(&rb->aux_refcount))
__rb_free_aux(rb);
-
- if (rb->rcu_head.next == (void *)rb)
- call_rcu(&rb->rcu_head, rb_free_rcu);
}
#ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/fork.c b/kernel/fork.c
index d277e83ed3e0..3e8451527cbe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1494,7 +1494,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
- p->sas_ss_sp = p->sas_ss_size = 0;
+ sas_ss_reset(p);
/*
* Syscall tracing and stepping should be turned off in the
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 68bc6a654ca3..81f1a7107c0e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -709,7 +709,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
* yet. Otherwise we look it up. We cache the result in the lock object
* itself, so actual lookup of the hash should be once per lock object.
*/
-static inline struct lock_class *
+static struct lock_class *
register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
{
struct lockdep_subclass_key *key;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8ef1919d63b2..f8c5af52a131 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -75,12 +75,7 @@ struct lock_stress_stats {
long n_lock_acquired;
};
-#if defined(MODULE)
-#define LOCKTORTURE_RUNNABLE_INIT 1
-#else
-#define LOCKTORTURE_RUNNABLE_INIT 0
-#endif
-int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+int torture_runnable = IS_ENABLED(MODULE);
module_param(torture_runnable, int, 0444);
MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
@@ -394,12 +389,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
if (!rt_task(current)) {
/*
- * (1) Boost priority once every ~50k operations. When the
+ * Boost priority once every ~50k operations. When the
* task tries to take the lock, the rtmutex it will account
* for the new priority, and do any corresponding pi-dance.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
policy = SCHED_FIFO;
param.sched_priority = MAX_RT_PRIO - 1;
} else /* common case, do nothing */
@@ -748,6 +743,15 @@ static void lock_torture_cleanup(void)
if (torture_cleanup_begin())
return;
+ /*
+ * Indicates early cleanup, meaning that the test has not run,
+ * such as when passing bogus args when loading the module. As
+ * such, only perform the underlying torture-specific cleanups,
+ * and avoid anything related to locktorture.
+ */
+ if (!cxt.lwsa)
+ goto end;
+
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
torture_stop_kthread(lock_torture_writer,
@@ -776,6 +780,7 @@ static void lock_torture_cleanup(void)
else
lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+end:
torture_cleanup_end();
}
@@ -870,6 +875,7 @@ static int __init lock_torture_init(void)
VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
firsterr = -ENOMEM;
kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
goto unwind;
}
@@ -878,6 +884,7 @@ static int __init lock_torture_init(void)
cxt.lrsa[i].n_lock_acquired = 0;
}
}
+
lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
/* Prepare torture context. */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index d734b7502001..22e025309845 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -191,8 +191,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf,
for (i = 0 ; i < qstat_num; i++)
WRITE_ONCE(ptr[i], 0);
- for (i = 0 ; i < qstat_num; i++)
- WRITE_ONCE(ptr[i], 0);
}
return count;
}
@@ -214,10 +212,8 @@ static int __init init_qspinlock_stat(void)
struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
int i;
- if (!d_qstat) {
- pr_warn("Could not create 'qlockstat' debugfs directory\n");
- return 0;
- }
+ if (!d_qstat)
+ goto out;
/*
* Create the debugfs files
@@ -227,12 +223,20 @@ static int __init init_qspinlock_stat(void)
* performance.
*/
for (i = 0; i < qstat_num; i++)
- debugfs_create_file(qstat_names[i], 0400, d_qstat,
- (void *)(long)i, &fops_qstat);
+ if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
+ (void *)(long)i, &fops_qstat))
+ goto fail_undo;
+
+ if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+ (void *)(long)qstat_reset_cnts, &fops_qstat))
+ goto fail_undo;
- debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
- (void *)(long)qstat_reset_cnts, &fops_qstat);
return 0;
+fail_undo:
+ debugfs_remove_recursive(d_qstat);
+out:
+ pr_warn("Could not create 'qlockstat' debugfs entries\n");
+ return -ENOMEM;
}
fs_initcall(init_qspinlock_stat);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 3a5048572065..1591f6b3539f 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem)
/*
* get a write lock on the semaphore
*/
-void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
+int __sched __down_write_common(struct rw_semaphore *sem, int state)
{
struct rwsem_waiter waiter;
struct task_struct *tsk;
unsigned long flags;
+ int ret = 0;
raw_spin_lock_irqsave(&sem->wait_lock, flags);
@@ -215,21 +216,33 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
*/
if (sem->count == 0)
break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ set_task_state(tsk, state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
/* got the lock */
sem->count = -1;
+out:
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
}
void __sched __down_write(struct rw_semaphore *sem)
{
- __down_write_nested(sem, 0);
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
}
/*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a4d4de05b2d1..09e30c6225e5 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
/*
* Wait until we successfully acquire the write lock
*/
-__visible
-struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
+static inline struct rw_semaphore *
+__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
{
long count;
bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
+ struct rw_semaphore *ret = sem;
/* undo write bias from down_write operation, stop active locking */
count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
@@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
/* wait until we successfully acquire the lock */
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(state);
while (true) {
if (rwsem_try_write_lock(count, sem))
break;
@@ -486,21 +487,48 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
/* Block until there are no active lockers. */
do {
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(state);
} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
+ list_del(&waiter.list);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ return ret;
+
+out_nolock:
+ __set_current_state(TASK_RUNNING);
+ raw_spin_lock_irq(&sem->wait_lock);
list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
+ else
+ __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
raw_spin_unlock_irq(&sem->wait_lock);
- return sem;
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(rwsem_down_write_failed);
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_write_failed_killable);
+
/*
* handle waking up a waiter on the semaphore
* - up_read/up_write has decremented the active part of count if we come here
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 205be0ce34de..c817216c1615 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -55,6 +55,25 @@ void __sched down_write(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_write);
/*
+ * lock for writing
+ */
+int __sched down_write_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_owner(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable);
+
+/*
* trylock for writing -- returns 1 if successful, 0 if contention
*/
int down_write_trylock(struct rw_semaphore *sem)
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 032b2c015beb..18dfc485225c 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n
obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
new file mode 100644
index 000000000000..3cee0d8393ed
--- /dev/null
+++ b/kernel/rcu/rcuperf.c
@@ -0,0 +1,655 @@
+/*
+ * Read-Copy Update module-based performance-test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2015
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+#include <linux/vmalloc.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
+
+#define PERF_FLAG "-perf:"
+#define PERFOUT_STRING(s) \
+ pr_alert("%s" PERF_FLAG s "\n", perf_type)
+#define VERBOSE_PERFOUT_STRING(s) \
+ do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0)
+#define VERBOSE_PERFOUT_ERRSTRING(s) \
+ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+
+torture_param(bool, gp_exp, true, "Use expedited GP wait primitives");
+torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nwriters, -1, "Number of RCU updater threads");
+torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
+torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+
+static char *perf_type = "rcu";
+module_param(perf_type, charp, 0444);
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
+
+static int nrealreaders;
+static int nrealwriters;
+static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *shutdown_task;
+
+static u64 **writer_durations;
+static int *writer_n_durations;
+static atomic_t n_rcu_perf_reader_started;
+static atomic_t n_rcu_perf_writer_started;
+static atomic_t n_rcu_perf_writer_finished;
+static wait_queue_head_t shutdown_wq;
+static u64 t_rcu_perf_writer_started;
+static u64 t_rcu_perf_writer_finished;
+static unsigned long b_rcu_perf_writer_started;
+static unsigned long b_rcu_perf_writer_finished;
+
+/*
+ * Phase most recently entered by a writer kthread.  It is only ever
+ * written in this file, never read back.  Every RTWS_* value must be
+ * distinct so that the recorded phase is unambiguous.
+ */
+static int rcu_perf_writer_state;
+#define RTWS_INIT		0
+#define RTWS_EXP_SYNC		1
+#define RTWS_SYNC		2
+#define RTWS_IDLE		3
+#define RTWS_STOPPING		4
+
+#define MAX_MEAS 10000
+#define MIN_MEAS 100
+
+#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE)
+#define RCUPERF_RUNNABLE_INIT 1
+#else
+#define RCUPERF_RUNNABLE_INIT 0
+#endif
+static int perf_runnable = RCUPERF_RUNNABLE_INIT;
+module_param(perf_runnable, int, 0444);
+MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_perf_ops {
+ int ptype; /* Flavor constant (RCU_FLAVOR, SRCU_FLAVOR, ...). */
+ void (*init)(void); /* Optional; called once by rcu_perf_init(). */
+ void (*cleanup)(void); /* Optional; called last by rcu_perf_cleanup(). */
+ int (*readlock)(void); /* Enter reader; return value is passed to ->readunlock(). */
+ void (*readunlock)(int idx); /* Leave reader; idx comes from ->readlock(). */
+ unsigned long (*started)(void); /* Not invoked anywhere in this file. */
+ unsigned long (*completed)(void); /* Batch counter sampled when !gp_exp. */
+ unsigned long (*exp_completed)(void); /* Counter sampled (and halved) when gp_exp. */
+ void (*sync)(void); /* Grace-period wait timed when !gp_exp. */
+ void (*exp_sync)(void); /* Grace-period wait timed when gp_exp. */
+ const char *name; /* Matched against the perf_type module parameter. */
+};
+
+static struct rcu_perf_ops *cur_ops;
+
+/*
+ * Definitions for rcu perf testing.
+ */
+
+static int rcu_perf_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_perf_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static unsigned long __maybe_unused rcu_no_completed(void)
+{
+ return 0;
+}
+
+static void rcu_sync_perf_init(void)
+{
+}
+
+static struct rcu_perf_ops rcu_ops = {
+ .ptype = RCU_FLAVOR,
+ .init = rcu_sync_perf_init,
+ .readlock = rcu_perf_read_lock,
+ .readunlock = rcu_perf_read_unlock,
+ .started = rcu_batches_started,
+ .completed = rcu_batches_completed,
+ .exp_completed = rcu_exp_batches_completed,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh perf testing.
+ */
+
+static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH)
+{
+ rcu_read_lock_bh();
+ return 0;
+}
+
+static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH)
+{
+ rcu_read_unlock_bh();
+}
+
+static struct rcu_perf_ops rcu_bh_ops = {
+ .ptype = RCU_BH_FLAVOR,
+ .init = rcu_sync_perf_init,
+ .readlock = rcu_bh_perf_read_lock,
+ .readunlock = rcu_bh_perf_read_unlock,
+ .started = rcu_batches_started_bh,
+ .completed = rcu_batches_completed_bh,
+ /*
+ * The expedited counter wired in here is the RCU-sched one.
+ * NOTE(review): presumably because no rcu_bh-specific accessor
+ * exists -- confirm that the reported batch counts are meaningful.
+ */
+ .exp_completed = rcu_exp_batches_completed_sched,
+ .sync = synchronize_rcu_bh,
+ .exp_sync = synchronize_rcu_bh_expedited,
+ .name = "rcu_bh"
+};
+
+/*
+ * Definitions for srcu perf testing.
+ */
+
+DEFINE_STATIC_SRCU(srcu_ctl_perf);
+static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf;
+
+static int srcu_perf_read_lock(void) __acquires(srcu_ctlp)
+{
+ return srcu_read_lock(srcu_ctlp);
+}
+
+static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp)
+{
+ srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static unsigned long srcu_perf_completed(void)
+{
+ return srcu_batches_completed(srcu_ctlp);
+}
+
+static void srcu_perf_synchronize(void)
+{
+ synchronize_srcu(srcu_ctlp);
+}
+
+static void srcu_perf_synchronize_expedited(void)
+{
+ synchronize_srcu_expedited(srcu_ctlp);
+}
+
+static struct rcu_perf_ops srcu_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = rcu_sync_perf_init,
+ .readlock = srcu_perf_read_lock,
+ .readunlock = srcu_perf_read_unlock,
+ /* No ->started hook; nothing in this file calls ->started(). */
+ .started = NULL,
+ .completed = srcu_perf_completed,
+ /*
+ * The same counter serves both modes.  rcu_perf_writer() halves the
+ * ->exp_completed() value, so the batch delta reported for expedited
+ * SRCU runs is half the raw counter delta.
+ * NOTE(review): confirm that this scaling is wanted for SRCU.
+ */
+ .exp_completed = srcu_perf_completed,
+ .sync = srcu_perf_synchronize,
+ .exp_sync = srcu_perf_synchronize_expedited,
+ .name = "srcu"
+};
+
+/*
+ * Definitions for sched perf testing.
+ */
+
+static int sched_perf_read_lock(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void sched_perf_read_unlock(int idx)
+{
+ preempt_enable();
+}
+
+static struct rcu_perf_ops sched_ops = {
+ .ptype = RCU_SCHED_FLAVOR,
+ .init = rcu_sync_perf_init,
+ .readlock = sched_perf_read_lock,
+ .readunlock = sched_perf_read_unlock,
+ .started = rcu_batches_started_sched,
+ .completed = rcu_batches_completed_sched,
+ .exp_completed = rcu_exp_batches_completed_sched,
+ .sync = synchronize_sched,
+ .exp_sync = synchronize_sched_expedited,
+ .name = "sched"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks perf testing.
+ */
+
+static int tasks_perf_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_perf_read_unlock(int idx)
+{
+}
+
+/*
+ * Operations vector for RCU-tasks.  There is no separate expedited
+ * primitive, so ->exp_sync reuses synchronize_rcu_tasks(), and all of the
+ * batch counters report zero via rcu_no_completed().  ->exp_completed
+ * must be populated because rcu_perf_writer() calls it unconditionally
+ * whenever gp_exp is set, which is the default.
+ */
+static struct rcu_perf_ops tasks_ops = {
+	.ptype		= RCU_TASKS_FLAVOR,
+	.init		= rcu_sync_perf_init,
+	.readlock	= tasks_perf_read_lock,
+	.readunlock	= tasks_perf_read_unlock,
+	.started	= rcu_no_completed,
+	.completed	= rcu_no_completed,
+	.exp_completed	= rcu_no_completed,
+	.sync		= synchronize_rcu_tasks,
+	.exp_sync	= synchronize_rcu_tasks,
+	.name		= "tasks"
+};
+
+#define RCUPERF_TASKS_OPS &tasks_ops,
+
+static bool __maybe_unused torturing_tasks(void)
+{
+ return cur_ops == &tasks_ops;
+}
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+
+#define RCUPERF_TASKS_OPS
+
+static bool __maybe_unused torturing_tasks(void)
+{
+ return false;
+}
+
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
+/*
+ * Called by readers and writers on every pass through their main loop.
+ * Invokes cond_resched_rcu_qs() and then, once every writer has finished
+ * its measurements, parks the caller until the test is told to stop.
+ */
+static void rcu_perf_wait_shutdown(void)
+{
+	cond_resched_rcu_qs();
+	if (atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) {
+		/* Measurements are complete: sleep in one-jiffy steps. */
+		while (!torture_must_stop())
+			schedule_timeout_uninterruptible(1);
+	}
+}
+
+/*
+ * RCU perf reader kthread. Repeatedly does empty RCU read-side
+ * critical section, minimizing update-side interference.
+ */
+static int
+rcu_perf_reader(void *arg)
+{
+ unsigned long flags;
+ int idx;
+ long me = (long)arg; /* Reader index passed by rcu_perf_init(). */
+
+ VERBOSE_PERFOUT_STRING("rcu_perf_reader task started");
+ /* Bind to a CPU derived from the reader index, at the lowest nice level. */
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+ /* rcu_perf_init() waits on this count before creating the writers. */
+ atomic_inc(&n_rcu_perf_reader_started);
+
+ do {
+ /* Empty read-side critical section, entered with irqs disabled. */
+ local_irq_save(flags);
+ idx = cur_ops->readlock();
+ cur_ops->readunlock(idx);
+ local_irq_restore(flags);
+ rcu_perf_wait_shutdown();
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_perf_reader");
+ return 0;
+}
+
+/*
+ * RCU perf writer kthread. Repeatedly does a grace period.
+ */
+static int
+rcu_perf_writer(void *arg)
+{
+ int i = 0; /* Index of the duration slot currently being filled. */
+ int i_max; /* Index of the last slot written; set on every pass. */
+ long me = (long)arg; /* Writer index passed by rcu_perf_init(). */
+ struct sched_param sp;
+ bool started = false, done = false, alldone = false;
+ u64 t;
+ u64 *wdp;
+ u64 *wdpp = writer_durations[me]; /* This writer's MAX_MEAS-entry array. */
+
+ VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
+ /* Complain if the system-wide GP mode contradicts the gp_exp parameter. */
+ WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
+ WARN_ON(rcu_gp_is_normal() && gp_exp);
+ WARN_ON(!wdpp);
+ /* Bind to a CPU derived from the writer index and run as SCHED_FIFO. */
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ sp.sched_priority = 1;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+
+ /* holdoff is in seconds. */
+ if (holdoff)
+ schedule_timeout_uninterruptible(holdoff * HZ);
+
+ t = ktime_get_mono_fast_ns();
+ /* The last writer to get here records the test's starting point. */
+ if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
+ t_rcu_perf_writer_started = t;
+ if (gp_exp) {
+ /* Halve the ->exp_completed() value to count batches. */
+ b_rcu_perf_writer_started =
+ cur_ops->exp_completed() / 2;
+ } else {
+ b_rcu_perf_writer_started =
+ cur_ops->completed();
+ }
+ }
+
+ do {
+ /* The slot first holds the start time, then the elapsed time. */
+ wdp = &wdpp[i];
+ *wdp = ktime_get_mono_fast_ns();
+ if (gp_exp) {
+ rcu_perf_writer_state = RTWS_EXP_SYNC;
+ cur_ops->exp_sync();
+ } else {
+ rcu_perf_writer_state = RTWS_SYNC;
+ cur_ops->sync();
+ }
+ rcu_perf_writer_state = RTWS_IDLE;
+ t = ktime_get_mono_fast_ns();
+ *wdp = t - *wdp;
+ i_max = i;
+ /* Measurements are kept only once every writer is running. */
+ if (!started &&
+ atomic_read(&n_rcu_perf_writer_started) >= nrealwriters)
+ started = true;
+ /* First time past MIN_MEAS: drop priority and report in. */
+ if (!done && i >= MIN_MEAS) {
+ done = true;
+ sp.sched_priority = 0;
+ sched_setscheduler_nocheck(current,
+ SCHED_NORMAL, &sp);
+ pr_alert("%s" PERF_FLAG
+ "rcu_perf_writer %ld has %d measurements\n",
+ perf_type, me, MIN_MEAS);
+ /* The last writer to finish records the test's end point. */
+ if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
+ nrealwriters) {
+ schedule_timeout_interruptible(10);
+ rcu_ftrace_dump(DUMP_ALL);
+ PERFOUT_STRING("Test complete");
+ t_rcu_perf_writer_finished = t;
+ if (gp_exp) {
+ b_rcu_perf_writer_finished =
+ cur_ops->exp_completed() / 2;
+ } else {
+ b_rcu_perf_writer_finished =
+ cur_ops->completed();
+ }
+ /* shutdown_wq is initialized only when shutdown is set. */
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+ }
+ if (done && !alldone &&
+ atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters)
+ alldone = true;
+ /*
+ * Advance to a new slot only while all writers are running and
+ * not all have finished; otherwise slot i is overwritten on
+ * the next pass.  The last slot is reused once the array is full.
+ */
+ if (started && !alldone && i < MAX_MEAS - 1)
+ i++;
+ rcu_perf_wait_shutdown();
+ } while (!torture_must_stop());
+ rcu_perf_writer_state = RTWS_STOPPING;
+ /* Publish the last slot index for rcu_perf_cleanup() to print. */
+ writer_n_durations[me] = i_max;
+ torture_kthread_stopping("rcu_perf_writer");
+ return 0;
+}
+
+/*
+ * Print the effective test parameters, prefixed by perf_type and tagged
+ * with the caller-supplied string.  The cur_ops parameter shadows the
+ * file-scope cur_ops and is not used in the body.
+ */
+static inline void
+rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" PERF_FLAG
+ "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
+ perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
+}
+
+/*
+ * Stop all reader and writer kthreads, print the collected measurements,
+ * and free everything rcu_perf_init() allocated.  Used as the module exit
+ * handler, as the error-unwind path of rcu_perf_init(), and by the
+ * shutdown kthread, so it tolerates partially completed setup.
+ */
+static void
+rcu_perf_cleanup(void)
+{
+ int i;
+ int j;
+ int ngps = 0; /* Sum of the per-writer values in writer_n_durations[]. */
+ u64 *wdp;
+ u64 *wdpp;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_perf_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+
+ if (writer_tasks) {
+ /* Stop each writer, then report the value it published. */
+ for (i = 0; i < nrealwriters; i++) {
+ torture_stop_kthread(rcu_perf_writer,
+ writer_tasks[i]);
+ if (!writer_n_durations)
+ continue;
+ j = writer_n_durations[i];
+ pr_alert("%s%s writer %d gps: %d\n",
+ perf_type, PERF_FLAG, i, j);
+ ngps += j;
+ }
+ pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+ perf_type, PERF_FLAG,
+ t_rcu_perf_writer_started, t_rcu_perf_writer_finished,
+ t_rcu_perf_writer_finished -
+ t_rcu_perf_writer_started,
+ ngps,
+ b_rcu_perf_writer_finished -
+ b_rcu_perf_writer_started);
+ /* Dump every recorded duration, then free the per-writer arrays. */
+ for (i = 0; i < nrealwriters; i++) {
+ if (!writer_durations)
+ break;
+ if (!writer_n_durations)
+ continue;
+ wdpp = writer_durations[i];
+ if (!wdpp)
+ continue;
+ /* Inclusive bound: the stored value is the last slot index. */
+ for (j = 0; j <= writer_n_durations[i]; j++) {
+ wdp = &wdpp[j];
+ pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+ perf_type, PERF_FLAG,
+ i, j, *wdp);
+ /* Sleep briefly every 100 lines of output. */
+ if (j % 100 == 0)
+ schedule_timeout_uninterruptible(1);
+ }
+ kfree(writer_durations[i]);
+ }
+ kfree(writer_tasks);
+ kfree(writer_durations);
+ kfree(writer_n_durations);
+ }
+
+ /* Do flavor-specific cleanup operations. */
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+/*
+ * Translate a thread-count module parameter into an actual count.
+ * Zero and positive values are used as is.  Negative values are taken
+ * relative to the number of online CPUs: -1 yields one thread per CPU,
+ * -2 one fewer, and so on, but the result never drops below one.
+ */
+static int compute_real(int n)
+{
+	int cpu_relative;
+
+	if (n >= 0)
+		return n;
+
+	cpu_relative = num_online_cpus() + 1 + n;
+	return cpu_relative <= 0 ? 1 : cpu_relative;
+}
+
+/*
+ * RCU perf shutdown kthread. Just waits to be awakened, then shuts
+ * down system.
+ */
+static int
+rcu_perf_shutdown(void *arg)
+{
+ /* Sleep until every writer has reported that it has finished. */
+ do {
+ wait_event(shutdown_wq,
+ atomic_read(&n_rcu_perf_writer_finished) >=
+ nrealwriters);
+ } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
+ smp_mb(); /* Wake before output. */
+ /* Print results and free resources, then power the machine off. */
+ rcu_perf_cleanup();
+ kernel_power_off();
+ /* Reached only if kernel_power_off() returns. */
+ return -EINVAL;
+}
+
+/*
+ * Module initialization.  Selects the flavor named by perf_type, sizes the
+ * reader and writer populations, and spawns the optional shutdown kthread
+ * followed by the reader and writer kthreads.
+ *
+ * Returns 0 on success, -EBUSY if torture_init_begin() refuses to start
+ * the test, or a negative errno after undoing any partial setup through
+ * rcu_perf_cleanup().
+ */
+static int __init
+rcu_perf_init(void)
+{
+	long i;
+	int firsterr = 0;
+	static struct rcu_perf_ops *perf_ops[] = {
+		&rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+		RCUPERF_TASKS_OPS
+	};
+
+	if (!torture_init_begin(perf_type, verbose, &perf_runnable))
+		return -EBUSY;
+
+	/* Process args and tell the world that the perf'er is on the job. */
+	for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
+		cur_ops = perf_ops[i];
+		if (strcmp(perf_type, cur_ops->name) == 0)
+			break;
+	}
+	if (i == ARRAY_SIZE(perf_ops)) {
+		pr_alert("rcu-perf: invalid perf type: \"%s\"\n",
+			 perf_type);
+		/* List the valid types on one line via pr_cont(). */
+		pr_alert("rcu-perf types:");
+		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
+			pr_cont(" %s", perf_ops[i]->name);
+		pr_cont("\n");
+		firsterr = -EINVAL;
+		goto unwind;
+	}
+	if (cur_ops->init)
+		cur_ops->init();
+
+	nrealwriters = compute_real(nwriters);
+	nrealreaders = compute_real(nreaders);
+	atomic_set(&n_rcu_perf_reader_started, 0);
+	atomic_set(&n_rcu_perf_writer_started, 0);
+	atomic_set(&n_rcu_perf_writer_finished, 0);
+	rcu_perf_print_module_parms(cur_ops, "Start of test");
+
+	/* Start up the kthreads. */
+
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(rcu_perf_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+	reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (reader_tasks == NULL) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealreaders; i++) {
+		firsterr = torture_create_kthread(rcu_perf_reader, (void *)i,
+						  reader_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+	/* Let every reader get going before any writer is created. */
+	while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders)
+		schedule_timeout_uninterruptible(1);
+	writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]),
+			       GFP_KERNEL);
+	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
+				   GFP_KERNEL);
+	writer_n_durations =
+		kcalloc(nrealwriters, sizeof(*writer_n_durations),
+			GFP_KERNEL);
+	if (!writer_tasks || !writer_durations || !writer_n_durations) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealwriters; i++) {
+		writer_durations[i] =
+			kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
+				GFP_KERNEL);
+		if (!writer_durations[i]) {
+			/* Report the failure instead of returning success. */
+			VERBOSE_PERFOUT_ERRSTRING("out of memory");
+			firsterr = -ENOMEM;
+			goto unwind;
+		}
+		firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
+						  writer_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	rcu_perf_cleanup();
+	return firsterr;
+}
+
+module_init(rcu_perf_init);
+module_exit(rcu_perf_cleanup);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 250ea67c1615..084a28a732eb 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current;
static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch);
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
@@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
- bool can_expedite = !rcu_gp_is_expedited();
+ bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
int expediting = 0;
unsigned long gp_snap;
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
@@ -932,7 +932,7 @@ rcu_torture_writer(void *arg)
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite) {
pr_alert("%s" TORTURE_FLAG
- " Grace periods expedited from boot/sysfs for %s,\n",
+ " GP expediting controlled from boot/sysfs for %s,\n",
torture_type, cur_ops->name);
pr_alert("%s" TORTURE_FLAG
" Disabled dynamic grace-period expediting.\n",
@@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
-static void rcutorture_trace_dump(void)
-{
- static atomic_t beenhere = ATOMIC_INIT(0);
-
- if (atomic_read(&beenhere))
- return;
- if (atomic_xchg(&beenhere, 1) != 0)
- return;
- ftrace_dump(DUMP_ALL);
-}
-
/*
* RCU torture reader from timer handler. Dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
@@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused)
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
started, completed);
- rcutorture_trace_dump();
+ rcu_ftrace_dump(DUMP_ALL);
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
completed = completed - started;
@@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg)
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
ts, started, completed);
- rcutorture_trace_dump();
+ rcu_ftrace_dump(DUMP_ALL);
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
completed = completed - started;
@@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void)
rcu_torture_writer_state,
gpnum, completed, flags);
show_rcu_gp_kthreads();
- rcutorture_trace_dump();
+ rcu_ftrace_dump(DUMP_ALL);
}
rtcv_snap = rcu_torture_current_version;
}
@@ -1489,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg)
* The above smp_load_acquire() ensures barrier_phase load
* is ordered before the folloiwng ->call().
*/
+ local_irq_disable(); /* Just to test no-irq call_rcu(). */
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ local_irq_enable();
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
@@ -1596,7 +1587,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self,
{
long cpu = (long)hcpu;
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
(void)rcutorture_booster_init(cpu);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9a535a86e732..c7f1bc4f817c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -102,6 +102,8 @@ struct rcu_state sname##_state = { \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
+ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
+ .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \
}
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
@@ -370,6 +372,21 @@ void rcu_all_qs(void)
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
+ if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) {
+ /*
+ * Yes, we just checked a per-CPU variable with preemption
+ * enabled, so we might be migrated to some other CPU at
+ * this point. That is OK because in that case, the
+ * migration will supply the needed quiescent state.
+ * We might end up needlessly disabling preemption and
+ * invoking rcu_sched_qs() on the destination CPU, but
+ * the probability and cost are both quite low, so this
+ * should not be a problem in practice.
+ */
+ preempt_disable();
+ rcu_sched_qs();
+ preempt_enable();
+ }
this_cpu_inc(rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -385,9 +402,11 @@ module_param(qlowmark, long, 0444);
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
+static bool rcu_kick_kthreads;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
+module_param(rcu_kick_kthreads, bool, 0644);
/*
* How long the grace period must be before we start recruiting
@@ -460,6 +479,28 @@ unsigned long rcu_batches_completed_bh(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
/*
+ * Return the number of RCU expedited batches completed thus far for
+ * debug & stats. Odd numbers mean that a batch is in progress, even
+ * numbers mean idle. The value returned will thus be roughly double
+ * the cumulative batches since boot.
+ */
+unsigned long rcu_exp_batches_completed(void)
+{
+ return rcu_state_p->expedited_sequence;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
+
+/*
+ * Return the number of RCU-sched expedited batches completed thus far
+ * for debug & stats. Similar to rcu_exp_batches_completed().
+ */
+unsigned long rcu_exp_batches_completed_sched(void)
+{
+ return rcu_sched_state.expedited_sequence;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
+
+/*
* Force a quiescent state.
*/
void rcu_force_quiescent_state(void)
@@ -637,7 +678,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
idle_task(smp_processor_id());
trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
- ftrace_dump(DUMP_ORIG);
+ rcu_ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
@@ -799,7 +840,7 @@ static void rcu_eqs_exit_common(long long oldval, int user)
trace_rcu_dyntick(TPS("Error on exit: not idle task"),
oldval, rdtp->dynticks_nesting);
- ftrace_dump(DUMP_ORIG);
+ rcu_ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
@@ -1224,8 +1265,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
rsp->gp_flags,
gp_state_getname(rsp->gp_state), rsp->gp_state,
rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
- if (rsp->gp_kthread)
+ if (rsp->gp_kthread) {
sched_show_task(rsp->gp_kthread);
+ wake_up_process(rsp->gp_kthread);
+ }
}
}
@@ -1249,6 +1292,25 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
}
}
+/*
+ * If too much time has passed in the current grace period, and if
+ * so configured, go kick the relevant kthreads.
+ */
+static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
+{
+ unsigned long j;
+
+ /* Kicking is opt-in through the rcu_kick_kthreads module parameter. */
+ if (!rcu_kick_kthreads)
+ return;
+ j = READ_ONCE(rsp->jiffies_kick_kthreads);
+ /* Act only once the deadline has passed and a GP kthread exists. */
+ if (time_after(jiffies, j) && rsp->gp_kthread) {
+ WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name);
+ rcu_ftrace_dump(DUMP_ALL);
+ wake_up_process(rsp->gp_kthread);
+ /* Move the deadline HZ jiffies past its previous value. */
+ WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ);
+ }
+}
+
static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
@@ -1260,6 +1322,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads(rsp);
+ if (rcu_cpu_stall_suppress)
+ return;
+
/* Only let one CPU complain about others per time interval. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -1333,6 +1400,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads(rsp);
+ if (rcu_cpu_stall_suppress)
+ return;
+
/*
* OK, time to rat on ourselves...
* See Documentation/RCU/stallwarn.txt for info on how to debug
@@ -1377,8 +1449,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
unsigned long js;
struct rcu_node *rnp;
- if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
+ if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
+ !rcu_gp_in_progress(rsp))
return;
+ rcu_stall_kick_kthreads(rsp);
j = jiffies;
/*
@@ -2117,8 +2191,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
ret = 0;
for (;;) {
- if (!ret)
+ if (!ret) {
rsp->jiffies_force_qs = jiffies + j;
+ WRITE_ONCE(rsp->jiffies_kick_kthreads,
+ jiffies + 3 * j);
+ }
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
@@ -2144,6 +2221,15 @@ static int __noreturn rcu_gp_kthread(void *arg)
TPS("fqsend"));
cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
+ ret = 0; /* Force full wait till next FQS. */
+ j = jiffies_till_next_fqs;
+ if (j > HZ) {
+ j = HZ;
+ jiffies_till_next_fqs = HZ;
+ } else if (j < 1) {
+ j = 1;
+ jiffies_till_next_fqs = 1;
+ }
} else {
/* Deal with stray signal. */
cond_resched_rcu_qs();
@@ -2152,14 +2238,12 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqswaitsig"));
- }
- j = jiffies_till_next_fqs;
- if (j > HZ) {
- j = HZ;
- jiffies_till_next_fqs = HZ;
- } else if (j < 1) {
- j = 1;
- jiffies_till_next_fqs = 1;
+ ret = 1; /* Keep old FQS timing. */
+ j = jiffies;
+ if (time_after(jiffies, rsp->jiffies_force_qs))
+ j = 1;
+ else
+ j = rsp->jiffies_force_qs - j;
}
}
@@ -3376,8 +3460,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
}
static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
{
+ unsigned long s;
+
smp_mb(); /* Caller's modifications seen first by other CPUs. */
- return rcu_seq_snap(&rsp->expedited_sequence);
+ s = rcu_seq_snap(&rsp->expedited_sequence);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
+ return s;
}
static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
{
@@ -3469,7 +3557,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementation use other means.
*
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the rcu_state's exp_mutex.
*/
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
@@ -3485,8 +3573,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Caller must hold the root rcu_node's exp_funnel_mutex and the
- * specified rcu_node structure's ->lock.
+ * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
+ * structure's ->lock.
*/
static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake, unsigned long flags)
@@ -3523,7 +3611,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
* Report expedited quiescent state for specified node. This is a
* lock-acquisition wrapper function for __rcu_report_exp_rnp().
*
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the rcu_state's exp_mutex.
*/
static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
struct rcu_node *rnp, bool wake)
@@ -3536,8 +3624,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
/*
* Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure. Caller must hold the root
- * rcu_node's exp_funnel_mutex.
+ * specified leaf rcu_node structure. Caller must hold the rcu_state's
+ * exp_mutex.
*/
static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
unsigned long mask, bool wake)
@@ -3555,7 +3643,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Report expedited quiescent state for specified rcu_data (CPU).
- * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
bool wake)
@@ -3564,15 +3651,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
}
/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp,
- atomic_long_t *stat, unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
+ unsigned long s)
{
if (rcu_exp_gp_seq_done(rsp, s)) {
- if (rnp)
- mutex_unlock(&rnp->exp_funnel_mutex);
- else if (rdp)
- mutex_unlock(&rdp->exp_funnel_mutex);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
/* Ensure test happens before caller kfree(). */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(stat);
@@ -3582,59 +3665,65 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
}
/*
- * Funnel-lock acquisition for expedited grace periods. Returns a
- * pointer to the root rcu_node structure, or NULL if some other
- * task did the expedited grace period for us.
+ * Funnel-lock acquisition for expedited grace periods. Returns true
+ * if some other task completed an expedited grace period that this task
+ * can piggy-back on, and with no mutex held. Otherwise, returns false
+ * with the mutex held, indicating that the caller must actually do the
+ * expedited grace period.
*/
-static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
{
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
- struct rcu_node *rnp0;
- struct rcu_node *rnp1 = NULL;
+ struct rcu_node *rnp = rdp->mynode;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ /* Low-contention fastpath. */
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
+ (rnp == rnp_root ||
+ ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
+ !mutex_is_locked(&rsp->exp_mutex) &&
+ mutex_trylock(&rsp->exp_mutex))
+ goto fastpath;
/*
- * First try directly acquiring the root lock in order to reduce
- * latency in the common case where expedited grace periods are
- * rare. We check mutex_is_locked() to avoid pathological levels of
- * memory contention on ->exp_funnel_mutex in the heavy-load case.
+ * Each pass through the following loop works its way up
+ * the rcu_node tree, returning if others have done the work or
+ * otherwise falls through to acquire rsp->exp_mutex. The mapping
+ * from CPU to rcu_node structure can be inexact, as it is just
+ * promoting locality and is not strictly needed for correctness.
*/
- rnp0 = rcu_get_root(rsp);
- if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
- if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
- if (sync_exp_work_done(rsp, rnp0, NULL,
- &rdp->expedited_workdone0, s))
- return NULL;
- return rnp0;
+ for (; rnp != NULL; rnp = rnp->parent) {
+ if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+ return true;
+
+ /* Work not done, either wait here or go up. */
+ spin_lock(&rnp->exp_lock);
+ if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
+
+ /* Someone else doing GP, so wait for them. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+ rnp->grplo, rnp->grphi,
+ TPS("wait"));
+ wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+ sync_exp_work_done(rsp,
+ &rdp->exp_workdone2, s));
+ return true;
}
+ rnp->exp_seq_rq = s; /* Followers can wait on us. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
+ rnp->grphi, TPS("nxtlvl"));
}
-
- /*
- * Each pass through the following loop works its way
- * up the rcu_node tree, returning if others have done the
- * work or otherwise falls through holding the root rnp's
- * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
- * can be inexact, as it is just promoting locality and is not
- * strictly needed for correctness.
- */
- if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
- return NULL;
- mutex_lock(&rdp->exp_funnel_mutex);
- rnp0 = rdp->mynode;
- for (; rnp0 != NULL; rnp0 = rnp0->parent) {
- if (sync_exp_work_done(rsp, rnp1, rdp,
- &rdp->expedited_workdone2, s))
- return NULL;
- mutex_lock(&rnp0->exp_funnel_mutex);
- if (rnp1)
- mutex_unlock(&rnp1->exp_funnel_mutex);
- else
- mutex_unlock(&rdp->exp_funnel_mutex);
- rnp1 = rnp0;
+ mutex_lock(&rsp->exp_mutex);
+fastpath:
+ if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+ mutex_unlock(&rsp->exp_mutex);
+ return true;
}
- if (sync_exp_work_done(rsp, rnp1, rdp,
- &rdp->expedited_workdone3, s))
- return NULL;
- return rnp1;
+ rcu_exp_gp_seq_start(rsp);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
+ return false;
}
/* Invoked on each online non-idle CPU for expedited quiescent state. */
@@ -3649,6 +3738,11 @@ static void sync_sched_exp_handler(void *data)
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
return;
+ if (rcu_is_cpu_rrupt_from_idle()) {
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
+ return;
+ }
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
resched_cpu(smp_processor_id());
}
@@ -3773,7 +3867,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
rsp->name);
ndetected = 0;
rcu_for_each_leaf_node(rsp, rnp) {
- ndetected = rcu_print_task_exp_stall(rnp);
+ ndetected += rcu_print_task_exp_stall(rnp);
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
struct rcu_data *rdp;
@@ -3783,7 +3877,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
ndetected++;
rdp = per_cpu_ptr(rsp->rda, cpu);
pr_cont(" %d-%c%c%c", cpu,
- "O."[cpu_online(cpu)],
+ "O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
}
@@ -3792,7 +3886,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
jiffies - jiffies_start, rsp->expedited_sequence,
rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
- if (!ndetected) {
+ if (ndetected) {
pr_err("blocking rcu_node structures:");
rcu_for_each_node_breadth_first(rsp, rnp) {
if (rnp == rnp_root)
@@ -3818,6 +3912,41 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
}
}
+/*
+ * Wait for the current expedited grace period to complete, and then
+ * wake up everyone who piggybacked on the just-completed expedited
+ * grace period. Also update all the ->exp_seq_rq counters as needed
+ * in order to avoid counter-wrap problems.
+ */
+static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_node *rnp;
+
+ synchronize_sched_expedited_wait(rsp);
+ rcu_exp_gp_seq_end(rsp);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
+
+ /*
+ * Switch over to wakeup mode, allowing the next GP, but -only- the
+ * next GP, to proceed.
+ */
+ mutex_lock(&rsp->exp_wake_mutex);
+ mutex_unlock(&rsp->exp_mutex);
+
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
+ spin_lock(&rnp->exp_lock);
+ /* Recheck, avoid hang in case someone just arrived. */
+ if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
+ rnp->exp_seq_rq = s;
+ spin_unlock(&rnp->exp_lock);
+ }
+ wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
+ }
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+ mutex_unlock(&rsp->exp_wake_mutex);
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3837,7 +3966,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
void synchronize_sched_expedited(void)
{
unsigned long s;
- struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
/* If only one CPU, this is automatically a grace period. */
@@ -3852,17 +3980,14 @@ void synchronize_sched_expedited(void)
/* Take a snapshot of the sequence number. */
s = rcu_exp_gp_seq_snap(rsp);
-
- rnp = exp_funnel_lock(rsp, s);
- if (rnp == NULL)
+ if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */
- rcu_exp_gp_seq_start(rsp);
+ /* Initialize the rcu_node tree in preparation for the wait. */
sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
- synchronize_sched_expedited_wait(rsp);
- rcu_exp_gp_seq_end(rsp);
- mutex_unlock(&rnp->exp_funnel_mutex);
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(rsp, s);
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
@@ -4162,7 +4287,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
rdp->cpu = cpu;
rdp->rsp = rsp;
- mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -4420,10 +4544,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
{
static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT;
- static const char * const exp[] = RCU_EXP_NAME_INIT;
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
- static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4482,9 +4604,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
- mutex_init(&rnp->exp_funnel_mutex);
- lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
- &rcu_exp_class[i], exp[i]);
+ init_waitqueue_head(&rnp->exp_wq[0]);
+ init_waitqueue_head(&rnp->exp_wq[1]);
+ init_waitqueue_head(&rnp->exp_wq[2]);
+ init_waitqueue_head(&rnp->exp_wq[3]);
+ spin_lock_init(&rnp->exp_lock);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index df668c0f9e64..e3959f5e6ddf 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -70,7 +70,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
# define RCU_NODE_NAME_INIT { "rcu_node_0" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
@@ -79,7 +78,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
@@ -89,7 +87,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
@@ -100,7 +97,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -252,7 +248,9 @@ struct rcu_node {
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
- struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
+ spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
+ unsigned long exp_seq_rq;
+ wait_queue_head_t exp_wq[4];
} ____cacheline_internodealigned_in_smp;
/*
@@ -387,11 +385,9 @@ struct rcu_data {
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
- struct mutex exp_funnel_mutex;
- atomic_long_t expedited_workdone0; /* # done by others #0. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_workdone3; /* # done by others #3. */
+ atomic_long_t exp_workdone1; /* # done by others #1. */
+ atomic_long_t exp_workdone2; /* # done by others #2. */
+ atomic_long_t exp_workdone3; /* # done by others #3. */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -505,6 +501,8 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ struct mutex exp_mutex; /* Serialize expedited GP. */
+ struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
@@ -513,6 +511,8 @@ struct rcu_state {
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
+ unsigned long jiffies_kick_kthreads; /* Time at which to kick */
+ /* kthreads, if configured. */
unsigned long n_force_qs; /* Number of calls to */
/* force_quiescent_state(). */
unsigned long n_force_qs_lh; /* ~Number of calls leaving */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index efdf7b61ce12..ff1cd4e1188d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -722,18 +722,22 @@ static void sync_rcu_exp_handler(void *info)
* synchronize_rcu_expedited - Brute-force RCU grace period
*
* Wait for an RCU-preempt grace period, but expedite it. The basic
- * idea is to invoke synchronize_sched_expedited() to push all the tasks to
- * the ->blkd_tasks lists and wait for this list to drain. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code.
- * In fact, if you are using synchronize_rcu_expedited() in a loop,
- * please restructure your code to batch your updates, and then Use a
- * single synchronize_rcu() instead.
+ * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
+ * checks whether the CPU is in an RCU-preempt critical section, and
+ * if so, it sets a flag that causes the outermost rcu_read_unlock()
+ * to report the quiescent state. On the other hand, if the CPU is
+ * not in an RCU read-side critical section, the IPI handler reports
+ * the quiescent state immediately.
+ *
+ * Although this is a great improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code. In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then use a single synchronize_rcu()
+ * instead.
*/
void synchronize_rcu_expedited(void)
{
- struct rcu_node *rnp;
- struct rcu_node *rnp_unlock;
struct rcu_state *rsp = rcu_state_p;
unsigned long s;
@@ -744,23 +748,14 @@ void synchronize_rcu_expedited(void)
}
s = rcu_exp_gp_seq_snap(rsp);
-
- rnp_unlock = exp_funnel_lock(rsp, s);
- if (rnp_unlock == NULL)
+ if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */
- rcu_exp_gp_seq_start(rsp);
-
/* Initialize the rcu_node tree in preparation for the wait. */
sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
- /* Wait for snapshotted ->blkd_tasks lists to drain. */
- rnp = rcu_get_root(rsp);
- synchronize_sched_expedited_wait(rsp);
-
- /* Clean up and exit. */
- rcu_exp_gp_seq_end(rsp);
- mutex_unlock(&rnp_unlock->exp_funnel_mutex);
+ /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
+ rcu_exp_wait_wake(rsp, s);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 1088e64f01ad..86782f9a4604 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v)
int cpu;
struct rcu_state *rsp = (struct rcu_state *)m->private;
struct rcu_data *rdp;
- unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+ unsigned long s1 = 0, s2 = 0, s3 = 0;
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(rsp->rda, cpu);
- s0 += atomic_long_read(&rdp->expedited_workdone0);
- s1 += atomic_long_read(&rdp->expedited_workdone1);
- s2 += atomic_long_read(&rdp->expedited_workdone2);
- s3 += atomic_long_read(&rdp->expedited_workdone3);
+ s1 += atomic_long_read(&rdp->exp_workdone1);
+ s2 += atomic_long_read(&rdp->exp_workdone2);
+ s3 += atomic_long_read(&rdp->exp_workdone3);
}
- seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence, s0, s1, s2, s3,
+ seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ rsp->expedited_sequence, s1, s2, s3,
atomic_long_read(&rsp->expedited_normal),
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index ca828b41c938..3ccdc8eebc5a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -67,7 +67,7 @@ static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
#endif /* #ifndef CONFIG_TINY_RCU */
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
* rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
*
@@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void)
return 0;
if (debug_locks)
lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
- return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
+ return lockdep_opinion || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index aa9bf00749c1..ab122a2cee41 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3099,12 +3099,14 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
oss.ss_sp = (void __user *) current->sas_ss_sp;
oss.ss_size = current->sas_ss_size;
- oss.ss_flags = sas_ss_flags(sp);
+ oss.ss_flags = sas_ss_flags(sp) |
+ (current->sas_ss_flags & SS_FLAG_BITS);
if (uss) {
void __user *ss_sp;
size_t ss_size;
- int ss_flags;
+ unsigned ss_flags;
+ int ss_mode;
error = -EFAULT;
if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
@@ -3119,18 +3121,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
if (on_sig_stack(sp))
goto out;
+ ss_mode = ss_flags & ~SS_FLAG_BITS;
error = -EINVAL;
- /*
- * Note - this code used to test ss_flags incorrectly:
- * old code may have been written using ss_flags==0
- * to mean ss_flags==SS_ONSTACK (as this was the only
- * way that worked) - this fix preserves that older
- * mechanism.
- */
- if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
+ if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
+ ss_mode != 0)
goto out;
- if (ss_flags == SS_DISABLE) {
+ if (ss_mode == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
@@ -3141,6 +3138,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
current->sas_ss_sp = (unsigned long) ss_sp;
current->sas_ss_size = ss_size;
+ current->sas_ss_flags = ss_flags;
}
error = 0;
@@ -3171,9 +3169,14 @@ int restore_altstack(const stack_t __user *uss)
int __save_altstack(stack_t __user *uss, unsigned long sp)
{
struct task_struct *t = current;
- return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
- __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
+ __put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
+ if (err)
+ return err;
+ if (t->sas_ss_flags & SS_AUTODISARM)
+ sas_ss_reset(t);
+ return 0;
}
#ifdef CONFIG_COMPAT
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..c8b318663525 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .procname = "perf_event_max_stack",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ .extra2 = &six_hundred_forty_kb,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31872bc53bc4..536ada80f6dd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -262,7 +262,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep,
{
int prev;
- prev = atomic_fetch_or(dep, BIT(bit));
+ prev = atomic_fetch_or(BIT(bit), dep);
if (!prev)
tick_nohz_full_kick_all();
}
@@ -292,7 +292,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
ts = per_cpu_ptr(&tick_cpu_sched, cpu);
- prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit));
+ prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
if (!prev) {
preempt_disable();
/* Perf needs local kick that is NMI safe */
diff --git a/kernel/torture.c b/kernel/torture.c
index 44aa462d033f..fa0bdeee17ac 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -451,6 +451,7 @@ static int torture_shutdown(void *arg)
torture_shutdown_hook();
else
VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
+ ftrace_dump(DUMP_ALL);
kernel_power_off(); /* Shut down the system. */
return 0;
}
@@ -602,8 +603,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable)
{
mutex_lock(&fullstop_mutex);
if (torture_type != NULL) {
- pr_alert("torture_init_begin: refusing %s init: %s running",
+ pr_alert("torture_init_begin: Refusing %s init: %s running.\n",
ttype, torture_type);
+ pr_alert("torture_init_begin: One torture test at a time!\n");
mutex_unlock(&fullstop_mutex);
return false;
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef..e11108f1d197 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!is_sampling_event(p_event))
+ return 0;
+
/*
* We don't allow user space callchains for function trace
* event, due to issues with page faults while tracing page
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3bfdff06eea7..5f5068e94003 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4554,6 +4554,17 @@ static void rebind_workers(struct worker_pool *pool)
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
+
+ /*
+ * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
+ * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
+ * being reworked and this can go away in time.
+ */
+ if (!(pool->flags & POOL_DISASSOCIATED)) {
+ spin_unlock_irq(&pool->lock);
+ return;
+ }
+
pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1e9a607534ca..f4b797a690ba 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1289,6 +1289,39 @@ config TORTURE_TEST
tristate
default n
+config RCU_PERF_TEST
+ tristate "performance tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ select SRCU
+ select TASKS_RCU
+ default n
+ help
+ This option provides a kernel module that runs performance
+ tests on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU performance tests to be built into
+ the kernel.
+ Say M if you want the RCU performance tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_PERF_TEST_RUNNABLE
+ bool "performance tests for RCU runnable by default"
+ depends on RCU_PERF_TEST = y
+ default n
+ help
+ This option provides a way to build the RCU performance tests
+ directly into the kernel without them starting up at boot time.
+ You can use /sys/module to manually override this setting.
+ This /sys/module entry is available only when the RCU performance
+ tests have been built into the kernel.
+
+ Say Y here if you want the RCU performance tests to start during
+ boot (you probably don't).
+ Say N here if you want the RCU performance tests to start only
+ after being manually enabled via /sys/module.
+
config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd436c97..a65e9a861535 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -23,7 +23,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o \
sha1.o md5.o irq_regs.o argv_split.o \
- proportions.o flex_proportions.o ratelimit.o show_mem.o \
+ flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o nmi_backtrace.o
diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c
index 2b3f46c049d4..554522934c44 100644
--- a/lib/asn1_decoder.c
+++ b/lib/asn1_decoder.c
@@ -74,7 +74,7 @@ next_tag:
/* Extract a tag from the data */
tag = data[dp++];
- if (tag == 0) {
+ if (tag == ASN1_EOC) {
/* It appears to be an EOC. */
if (data[dp++] != 0)
goto invalid_eoc;
@@ -96,10 +96,8 @@ next_tag:
/* Extract the length */
len = data[dp++];
- if (len <= 0x7f) {
- dp += len;
- goto next_tag;
- }
+ if (len <= 0x7f)
+ goto check_length;
if (unlikely(len == ASN1_INDEFINITE_LENGTH)) {
/* Indefinite length */
@@ -110,14 +108,18 @@ next_tag:
}
n = len - 0x80;
- if (unlikely(n > sizeof(size_t) - 1))
+ if (unlikely(n > sizeof(len) - 1))
goto length_too_long;
if (unlikely(n > datalen - dp))
goto data_overrun_error;
- for (len = 0; n > 0; n--) {
+ len = 0;
+ for (; n > 0; n--) {
len <<= 8;
len |= data[dp++];
}
+check_length:
+ if (len > datalen - dp)
+ goto data_overrun_error;
dp += len;
goto next_tag;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 5fecddc32b1b..ca5316e0087b 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -569,6 +569,25 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
}
EXPORT_SYMBOL(iov_iter_alignment);
+unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
+{
+ unsigned long res = 0;
+ size_t size = i->count;
+ if (!size)
+ return 0;
+
+ iterate_all_kinds(i, size, v,
+ (res |= (!res ? 0 : (unsigned long)v.iov_base) |
+ (size != v.iov_len ? size : 0), 0),
+ (res |= (!res ? 0 : (unsigned long)v.bv_offset) |
+ (size != v.bv_len ? size : 0)),
+ (res |= (!res ? 0 : (unsigned long)v.iov_base) |
+ (size != v.iov_len ? size : 0))
+ );
+ return res;
+}
+EXPORT_SYMBOL(iov_iter_gap_alignment);
+
ssize_t iov_iter_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
diff --git a/lib/proportions.c b/lib/proportions.c
deleted file mode 100644
index efa54f259ea9..000000000000
--- a/lib/proportions.c
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * Floating proportions
- *
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
- *
- * Description:
- *
- * The floating proportion is a time derivative with an exponentially decaying
- * history:
- *
- * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
- *
- * Where j is an element from {prop_local}, x_{j} is j's number of events,
- * and i the time period over which the differential is taken. So d/dt_{-i} is
- * the differential over the i-th last period.
- *
- * The decaying history gives smooth transitions. The time differential carries
- * the notion of speed.
- *
- * The denominator is 2^(1+i) because we want the series to be normalised, ie.
- *
- * \Sum_{i=0} 1/2^(1+i) = 1
- *
- * Further more, if we measure time (t) in the same events as x; so that:
- *
- * t = \Sum_{j} x_{j}
- *
- * we get that:
- *
- * \Sum_{j} p_{j} = 1
- *
- * Writing this in an iterative fashion we get (dropping the 'd's):
- *
- * if (++x_{j}, ++t > period)
- * t /= 2;
- * for_each (j)
- * x_{j} /= 2;
- *
- * so that:
- *
- * p_{j} = x_{j} / t;
- *
- * We optimize away the '/= 2' for the global time delta by noting that:
- *
- * if (++t > period) t /= 2:
- *
- * Can be approximated by:
- *
- * period/2 + (++t % period/2)
- *
- * [ Furthermore, when we choose period to be 2^n it can be written in terms of
- * binary operations and wraparound artefacts disappear. ]
- *
- * Also note that this yields a natural counter of the elapsed periods:
- *
- * c = t / (period/2)
- *
- * [ Its monotonic increasing property can be applied to mitigate the wrap-
- * around issue. ]
- *
- * This allows us to do away with the loop over all prop_locals on each period
- * expiration. By remembering the period count under which it was last accessed
- * as c_{j}, we can obtain the number of 'missed' cycles from:
- *
- * c - c_{j}
- *
- * We can then lazily catch up to the global period count every time we are
- * going to use x_{j}, by doing:
- *
- * x_{j} /= 2^(c - c_{j}), c_{j} = c
- */
-
-#include <linux/proportions.h>
-#include <linux/rcupdate.h>
-
-int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp)
-{
- int err;
-
- if (shift > PROP_MAX_SHIFT)
- shift = PROP_MAX_SHIFT;
-
- pd->index = 0;
- pd->pg[0].shift = shift;
- mutex_init(&pd->mutex);
- err = percpu_counter_init(&pd->pg[0].events, 0, gfp);
- if (err)
- goto out;
-
- err = percpu_counter_init(&pd->pg[1].events, 0, gfp);
- if (err)
- percpu_counter_destroy(&pd->pg[0].events);
-
-out:
- return err;
-}
-
-/*
- * We have two copies, and flip between them to make it seem like an atomic
- * update. The update is not really atomic wrt the events counter, but
- * it is internally consistent with the bit layout depending on shift.
- *
- * We copy the events count, move the bits around and flip the index.
- */
-void prop_change_shift(struct prop_descriptor *pd, int shift)
-{
- int index;
- int offset;
- u64 events;
- unsigned long flags;
-
- if (shift > PROP_MAX_SHIFT)
- shift = PROP_MAX_SHIFT;
-
- mutex_lock(&pd->mutex);
-
- index = pd->index ^ 1;
- offset = pd->pg[pd->index].shift - shift;
- if (!offset)
- goto out;
-
- pd->pg[index].shift = shift;
-
- local_irq_save(flags);
- events = percpu_counter_sum(&pd->pg[pd->index].events);
- if (offset < 0)
- events <<= -offset;
- else
- events >>= offset;
- percpu_counter_set(&pd->pg[index].events, events);
-
- /*
- * ensure the new pg is fully written before the switch
- */
- smp_wmb();
- pd->index = index;
- local_irq_restore(flags);
-
- synchronize_rcu();
-
-out:
- mutex_unlock(&pd->mutex);
-}
-
-/*
- * wrap the access to the data in an rcu_read_lock() section;
- * this is used to track the active references.
- */
-static struct prop_global *prop_get_global(struct prop_descriptor *pd)
-__acquires(RCU)
-{
- int index;
-
- rcu_read_lock();
- index = pd->index;
- /*
- * match the wmb from vcd_flip()
- */
- smp_rmb();
- return &pd->pg[index];
-}
-
-static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
-__releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static void
-prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
-{
- int offset = *pl_shift - new_shift;
-
- if (!offset)
- return;
-
- if (offset < 0)
- *pl_period <<= -offset;
- else
- *pl_period >>= offset;
-
- *pl_shift = new_shift;
-}
-
-/*
- * PERCPU
- */
-
-#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
-
-int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp)
-{
- raw_spin_lock_init(&pl->lock);
- pl->shift = 0;
- pl->period = 0;
- return percpu_counter_init(&pl->events, 0, gfp);
-}
-
-void prop_local_destroy_percpu(struct prop_local_percpu *pl)
-{
- percpu_counter_destroy(&pl->events);
-}
-
-/*
- * Catch up with missed period expirations.
- *
- * until (c_{j} == c)
- * x_{j} -= x_{j}/2;
- * c_{j}++;
- */
-static
-void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
-{
- unsigned long period = 1UL << (pg->shift - 1);
- unsigned long period_mask = ~(period - 1);
- unsigned long global_period;
- unsigned long flags;
-
- global_period = percpu_counter_read(&pg->events);
- global_period &= period_mask;
-
- /*
- * Fast path - check if the local and global period count still match
- * outside of the lock.
- */
- if (pl->period == global_period)
- return;
-
- raw_spin_lock_irqsave(&pl->lock, flags);
- prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
-
- /*
- * For each missed period, we half the local counter.
- * basically:
- * pl->events >> (global_period - pl->period);
- */
- period = (global_period - pl->period) >> (pg->shift - 1);
- if (period < BITS_PER_LONG) {
- s64 val = percpu_counter_read(&pl->events);
-
- if (val < (nr_cpu_ids * PROP_BATCH))
- val = percpu_counter_sum(&pl->events);
-
- __percpu_counter_add(&pl->events, -val + (val >> period),
- PROP_BATCH);
- } else
- percpu_counter_set(&pl->events, 0);
-
- pl->period = global_period;
- raw_spin_unlock_irqrestore(&pl->lock, flags);
-}
-
-/*
- * ++x_{j}, ++t
- */
-void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
-{
- struct prop_global *pg = prop_get_global(pd);
-
- prop_norm_percpu(pg, pl);
- __percpu_counter_add(&pl->events, 1, PROP_BATCH);
- percpu_counter_add(&pg->events, 1);
- prop_put_global(pd, pg);
-}
-
-/*
- * identical to __prop_inc_percpu, except that it limits this pl's fraction to
- * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded.
- */
-void __prop_inc_percpu_max(struct prop_descriptor *pd,
- struct prop_local_percpu *pl, long frac)
-{
- struct prop_global *pg = prop_get_global(pd);
-
- prop_norm_percpu(pg, pl);
-
- if (unlikely(frac != PROP_FRAC_BASE)) {
- unsigned long period_2 = 1UL << (pg->shift - 1);
- unsigned long counter_mask = period_2 - 1;
- unsigned long global_count;
- long numerator, denominator;
-
- numerator = percpu_counter_read_positive(&pl->events);
- global_count = percpu_counter_read(&pg->events);
- denominator = period_2 + (global_count & counter_mask);
-
- if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT))
- goto out_put;
- }
-
- percpu_counter_add(&pl->events, 1);
- percpu_counter_add(&pg->events, 1);
-
-out_put:
- prop_put_global(pd, pg);
-}
-
-/*
- * Obtain a fraction of this proportion
- *
- * p_{j} = x_{j} / (period/2 + t % period/2)
- */
-void prop_fraction_percpu(struct prop_descriptor *pd,
- struct prop_local_percpu *pl,
- long *numerator, long *denominator)
-{
- struct prop_global *pg = prop_get_global(pd);
- unsigned long period_2 = 1UL << (pg->shift - 1);
- unsigned long counter_mask = period_2 - 1;
- unsigned long global_count;
-
- prop_norm_percpu(pg, pl);
- *numerator = percpu_counter_read_positive(&pl->events);
-
- global_count = percpu_counter_read(&pg->events);
- *denominator = period_2 + (global_count & counter_mask);
-
- prop_put_global(pd, pg);
-}
-
-/*
- * SINGLE
- */
-
-int prop_local_init_single(struct prop_local_single *pl)
-{
- raw_spin_lock_init(&pl->lock);
- pl->shift = 0;
- pl->period = 0;
- pl->events = 0;
- return 0;
-}
-
-void prop_local_destroy_single(struct prop_local_single *pl)
-{
-}
-
-/*
- * Catch up with missed period expirations.
- */
-static
-void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl)
-{
- unsigned long period = 1UL << (pg->shift - 1);
- unsigned long period_mask = ~(period - 1);
- unsigned long global_period;
- unsigned long flags;
-
- global_period = percpu_counter_read(&pg->events);
- global_period &= period_mask;
-
- /*
- * Fast path - check if the local and global period count still match
- * outside of the lock.
- */
- if (pl->period == global_period)
- return;
-
- raw_spin_lock_irqsave(&pl->lock, flags);
- prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
- /*
- * For each missed period, we half the local counter.
- */
- period = (global_period - pl->period) >> (pg->shift - 1);
- if (likely(period < BITS_PER_LONG))
- pl->events >>= period;
- else
- pl->events = 0;
- pl->period = global_period;
- raw_spin_unlock_irqrestore(&pl->lock, flags);
-}
-
-/*
- * ++x_{j}, ++t
- */
-void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
-{
- struct prop_global *pg = prop_get_global(pd);
-
- prop_norm_single(pg, pl);
- pl->events++;
- percpu_counter_add(&pg->events, 1);
- prop_put_global(pd, pg);
-}
-
-/*
- * Obtain a fraction of this proportion
- *
- * p_{j} = x_{j} / (period/2 + t % period/2)
- */
-void prop_fraction_single(struct prop_descriptor *pd,
- struct prop_local_single *pl,
- long *numerator, long *denominator)
-{
- struct prop_global *pg = prop_get_global(pd);
- unsigned long period_2 = 1UL << (pg->shift - 1);
- unsigned long counter_mask = period_2 - 1;
- unsigned long global_count;
-
- prop_norm_single(pg, pl);
- *numerator = pl->events;
-
- global_count = percpu_counter_read(&pg->events);
- *denominator = period_2 + (global_count & counter_mask);
-
- prop_put_global(pd, pg);
-}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7daa7de8f48..b49ee126d4d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
/*
* We can only reuse the page if nobody else maps the huge page or it's
- * part. We can do it by checking page_mapcount() on each sub-page, but
- * it's expensive.
- * The cheaper way is to check page_count() to be equal 1: every
- * mapcount takes page reference reference, so this way we can
- * guarantee, that the PMD is the only mapping.
- * This can give false negative if somebody pinned the page, but that's
- * fine.
+ * part.
*/
- if (page_mapcount(page) == 1 && page_count(page) == 1) {
+ if (page_trans_huge_mapcount(page, NULL) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (pte_write(pteval)) {
writable = true;
} else {
- if (PageSwapCache(page) && !reuse_swap_page(page)) {
+ if (PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
unlock_page(page);
result = SCAN_SWAP_CACHE_PAGE;
goto out;
@@ -3223,6 +3218,64 @@ int total_mapcount(struct page *page)
}
/*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying them. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+ int i, ret, _total_mapcount, mapcount;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page))) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ return mapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = ret = 0;
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ ret = max(ret, mapcount);
+ _total_mapcount += mapcount;
+ }
+ if (PageDoubleMap(page)) {
+ ret -= 1;
+ _total_mapcount -= HPAGE_PMD_NR;
+ }
+ mapcount = compound_mapcount(page);
+ ret += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ return ret;
+}
+
+/*
* This function splits huge page into normal pages. @page can point to any
* subpage of huge page to split. Split doesn't change the position of @page.
*
diff --git a/mm/ksm.c b/mm/ksm.c
index b99e828172f6..4786b4150f62 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -783,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void)
}
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+ up_read(&mm->mmap_sem);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -794,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void)
free_mm_slot(mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- up_read(&mm->mmap_sem);
mmdrop(mm);
- } else {
+ } else
spin_unlock(&ksm_mmlist_lock);
- up_read(&mm->mmap_sem);
- }
}
/* Clean up stable nodes, but don't worry if some are still busy */
@@ -1663,8 +1661,15 @@ next_mm:
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
- spin_unlock(&ksm_mmlist_lock);
up_read(&mm->mmap_sem);
+ /*
+ * up_read(&mm->mmap_sem) first because after
+ * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
+ * already have been freed under us by __ksm_exit()
+ * because the "mm_slot" is still hashed and
+ * ksm_scan.mm_slot doesn't point to it anymore.
+ */
+ spin_unlock(&ksm_mmlist_lock);
}
/* Repeat until we've completed scanning the whole list */
diff --git a/mm/memory.c b/mm/memory.c
index 52c218e2b724..07493e34ab7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* not dirty accountable.
*/
if (PageAnon(old_page) && !PageKsm(old_page)) {
+ int total_mapcount;
if (!trylock_page(old_page)) {
get_page(old_page);
pte_unmap_unlock(page_table, ptl);
@@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
put_page(old_page);
}
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
+ if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (total_mapcount == 1) {
+ /*
+ * The page is all ours. Move it to
+ * our anon_vma so the rmap code will
+ * not search our parent or siblings.
+ * Protected against the rmap code by
+ * the page lock.
+ */
+ page_move_anon_rmap(compound_head(old_page),
+ vma, address);
+ }
unlock_page(old_page);
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, 0, 0);
@@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
inc_mm_counter_fast(mm, MM_ANONPAGES);
dec_mm_counter_fast(mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+ if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 83874eced5bf..031713ab40ce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -922,18 +922,19 @@ out:
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
+ *
+ * NOTE: total_mapcount should not be relied upon by the caller if
+ * reuse_swap_page() returns false, but it may be always overwritten
+ * (see the other implementation for CONFIG_SWAP=n).
*/
-int reuse_swap_page(struct page *page)
+bool reuse_swap_page(struct page *page, int *total_mapcount)
{
int count;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
- return 0;
- /* The page is part of THP and cannot be reused */
- if (PageTransCompound(page))
- return 0;
- count = page_mapcount(page);
+ return false;
+ count = page_trans_huge_mapcount(page, total_mapcount);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
if (count == 1 && !PageWriteback(page)) {
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d97268e8ff10..2b68418c7198 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -975,6 +975,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
val = 65535 - 40;
if (type == RTAX_MTU && val > 65535 - 15)
val = 65535 - 15;
+ if (type == RTAX_HOPLIMIT && val > 255)
+ val = 255;
if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
return -EINVAL;
fi->fib_metrics[type - 1] = val;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 205a2b8a5a84..4cc84212cce1 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -398,7 +398,10 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
- skb_pop_mac_header(skb);
+ if (tunnel->dev->type != ARPHRD_NONE)
+ skb_pop_mac_header(skb);
+ else
+ skb_reset_mac_header(skb);
if (tunnel->collect_md) {
__be16 flags;
__be64 tun_id;
@@ -1031,6 +1034,8 @@ static void ipgre_netlink_parms(struct net_device *dev,
struct ip_tunnel *t = netdev_priv(dev);
t->collect_md = true;
+ if (dev->type == ARPHRD_IPGRE)
+ dev->type = ARPHRD_NONE;
}
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 441ae9da3a23..79a03b87a771 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2640,8 +2640,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
skb_headroom(skb) >= 0xFFFF)) {
- struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
- GFP_ATOMIC);
+ struct sk_buff *nskb;
+
+ skb_mstamp_get(&skb->skb_mstamp);
+ nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-ENOBUFS;
} else {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d916d6ab9ad2..6f32944e0223 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1750,6 +1750,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
} else {
val = nla_get_u32(nla);
}
+ if (type == RTAX_HOPLIMIT && val > 255)
+ val = 255;
if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
goto err;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index afde5f5e728a..e27fd17c6743 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -66,7 +66,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
-static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
+static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
@@ -1778,6 +1778,7 @@ void nf_conntrack_init_end(void)
int nf_conntrack_init_net(struct net *net)
{
+ static atomic64_t unique_id;
int ret = -ENOMEM;
int cpu;
@@ -1800,7 +1801,8 @@ int nf_conntrack_init_net(struct net *net)
if (!net->ct.stat)
goto err_pcpu_lists;
- net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
+ net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu",
+ (u64)atomic64_inc_return(&unique_id));
if (!net->ct.slabname)
goto err_slabname;
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 4c2b4c0c4d5f..dbd0803b1827 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -96,6 +96,8 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
return -EINVAL;
if (flags & NFACCT_F_OVERQUOTA)
return -EINVAL;
+ if ((flags & NFACCT_F_QUOTA) && !tb[NFACCT_QUOTA])
+ return -EINVAL;
size += sizeof(u64);
}
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 29d2c31f406c..daf45da448fa 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -236,6 +236,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
list_del(&info->timer->entry);
del_timer_sync(&info->timer->timer);
+ cancel_work_sync(&info->timer->work);
sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
kfree(info->timer->attr.attr.name);
kfree(info->timer);
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index b5fea1101faa..10c84d882881 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -776,6 +776,19 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
return -EINVAL;
}
+ /* Userspace may decide to perform a ct lookup without a helper
+ * specified followed by a (recirculate and) commit with one.
+ * Therefore, for unconfirmed connections which we will commit,
+ * we need to attach the helper here.
+ */
+ if (!nf_ct_is_confirmed(ct) && info->commit &&
+ info->helper && !nfct_help(ct)) {
+ int err = __nf_ct_try_assign_helper(ct, info->ct,
+ GFP_ATOMIC);
+ if (err)
+ return err;
+ }
+
/* Call the helper only if:
* - nf_conntrack_in() was executed above ("!cached") for a
* confirmed connection, or
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index c589a9ba506a..343d011aa818 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -423,7 +423,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
u16 ife_type = 0;
u8 *daddr = NULL;
u8 *saddr = NULL;
- int ret = 0;
+ int ret = 0, exists = 0;
int err;
err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
@@ -435,25 +435,29 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_IFE_PARMS]);
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
if (parm->flags & IFE_ENCODE) {
/* Until we get issued the ethertype, we cant have
* a default..
**/
if (!tb[TCA_IFE_TYPE]) {
+ if (exists)
+ tcf_hash_release(a, bind);
pr_info("You MUST pass etherype for encoding\n");
return -EINVAL;
}
}
- if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ if (!exists) {
ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife),
bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
- if (bind) /* dont override defaults */
- return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
@@ -495,6 +499,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
NULL);
if (err) {
metadata_parse_err:
+ if (exists)
+ tcf_hash_release(a, bind);
if (ret == ACT_P_CREATED)
_tcf_ife_cleanup(a, bind);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 350e134cffb3..8b5270008a6e 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -96,7 +96,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
struct tcf_ipt *ipt;
struct xt_entry_target *td, *t;
char *tname;
- int ret = 0, err;
+ int ret = 0, err, exists = 0;
u32 hook = 0;
u32 index = 0;
@@ -107,18 +107,23 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
if (err < 0)
return err;
- if (tb[TCA_IPT_HOOK] == NULL)
- return -EINVAL;
- if (tb[TCA_IPT_TARG] == NULL)
+ if (tb[TCA_IPT_INDEX] != NULL)
+ index = nla_get_u32(tb[TCA_IPT_INDEX]);
+
+ exists = tcf_hash_check(tn, index, a, bind);
+ if (exists && bind)
+ return 0;
+
+ if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) {
+ if (exists)
+ tcf_hash_release(a, bind);
return -EINVAL;
+ }
td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
return -EINVAL;
- if (tb[TCA_IPT_INDEX] != NULL)
- index = nla_get_u32(tb[TCA_IPT_INDEX]);
-
if (!tcf_hash_check(tn, index, a, bind)) {
ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind,
false);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e8a760cf7775..8f3948dd38b8 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -61,7 +61,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct tc_mirred *parm;
struct tcf_mirred *m;
struct net_device *dev;
- int ret, ok_push = 0;
+ int ret, ok_push = 0, exists = 0;
if (nla == NULL)
return -EINVAL;
@@ -71,17 +71,27 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (tb[TCA_MIRRED_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_MIRRED_PARMS]);
+
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
switch (parm->eaction) {
case TCA_EGRESS_MIRROR:
case TCA_EGRESS_REDIR:
break;
default:
+ if (exists)
+ tcf_hash_release(a, bind);
return -EINVAL;
}
if (parm->ifindex) {
dev = __dev_get_by_index(net, parm->ifindex);
- if (dev == NULL)
+ if (dev == NULL) {
+ if (exists)
+ tcf_hash_release(a, bind);
return -ENODEV;
+ }
switch (dev->type) {
case ARPHRD_TUNNEL:
case ARPHRD_TUNNEL6:
@@ -99,7 +109,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
dev = NULL;
}
- if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ if (!exists) {
if (dev == NULL)
return -EINVAL;
ret = tcf_hash_create(tn, parm->index, est, a,
@@ -108,9 +118,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
return ret;
ret = ACT_P_CREATED;
} else {
- if (bind)
- return 0;
-
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 75b2be13fbcc..3a33fb648a6d 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -87,7 +87,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct tc_defact *parm;
struct tcf_defact *d;
char *defdata;
- int ret = 0, err;
+ int ret = 0, err, exists = 0;
if (nla == NULL)
return -EINVAL;
@@ -99,13 +99,21 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
if (tb[TCA_DEF_PARMS] == NULL)
return -EINVAL;
- if (tb[TCA_DEF_DATA] == NULL)
- return -EINVAL;
parm = nla_data(tb[TCA_DEF_PARMS]);
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
+ if (tb[TCA_DEF_DATA] == NULL) {
+ if (exists)
+ tcf_hash_release(a, bind);
+ return -EINVAL;
+ }
+
defdata = nla_data(tb[TCA_DEF_DATA]);
- if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ if (!exists) {
ret = tcf_hash_create(tn, parm->index, est, a,
sizeof(*d), bind, false);
if (ret)
@@ -122,8 +130,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
} else {
d = to_defact(a);
- if (bind)
- return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index cfcdbdc00c9b..69da5a8f0034 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -69,7 +69,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct tcf_skbedit *d;
u32 flags = 0, *priority = NULL, *mark = NULL;
u16 *queue_mapping = NULL;
- int ret = 0, err;
+ int ret = 0, err, exists = 0;
if (nla == NULL)
return -EINVAL;
@@ -96,12 +96,18 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
mark = nla_data(tb[TCA_SKBEDIT_MARK]);
}
- if (!flags)
- return -EINVAL;
-
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
- if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
+ if (!flags) {
+ tcf_hash_release(a, bind);
+ return -EINVAL;
+ }
+
+ if (!exists) {
ret = tcf_hash_create(tn, parm->index, est, a,
sizeof(*d), bind, false);
if (ret)
@@ -111,8 +117,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
ret = ACT_P_CREATED;
} else {
d = to_skbedit(a);
- if (bind)
- return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index bab8ae0cefc0..c45f926dafb9 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -77,7 +77,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
int action;
__be16 push_vid = 0;
__be16 push_proto = 0;
- int ret = 0;
+ int ret = 0, exists = 0;
int err;
if (!nla)
@@ -90,15 +90,25 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (!tb[TCA_VLAN_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_VLAN_PARMS]);
+ exists = tcf_hash_check(tn, parm->index, a, bind);
+ if (exists && bind)
+ return 0;
+
switch (parm->v_action) {
case TCA_VLAN_ACT_POP:
break;
case TCA_VLAN_ACT_PUSH:
- if (!tb[TCA_VLAN_PUSH_VLAN_ID])
+ if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
+ if (exists)
+ tcf_hash_release(a, bind);
return -EINVAL;
+ }
push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
- if (push_vid >= VLAN_VID_MASK)
+ if (push_vid >= VLAN_VID_MASK) {
+ if (exists)
+ tcf_hash_release(a, bind);
return -ERANGE;
+ }
if (tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]) {
push_proto = nla_get_be16(tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]);
@@ -114,11 +124,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
}
break;
default:
+ if (exists)
+ tcf_hash_release(a, bind);
return -EINVAL;
}
action = parm->v_action;
- if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ if (!exists) {
ret = tcf_hash_create(tn, parm->index, est, a,
sizeof(*v), bind, false);
if (ret)
@@ -126,8 +138,6 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
ret = ACT_P_CREATED;
} else {
- if (bind)
- return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index 7ecd04c21360..997ff7b2509b 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -277,6 +277,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
memset(&theirs, 0, sizeof(theirs));
memcpy(new, ours, sizeof(*new));
+ memset(dte, 0, sizeof(*dte));
len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask);
if (len < 0)
diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c
index 64e0d1d81ca5..9739fce9e032 100644
--- a/sound/pci/hda/hda_sysfs.c
+++ b/sound/pci/hda/hda_sysfs.c
@@ -141,14 +141,6 @@ static int reconfig_codec(struct hda_codec *codec)
err = snd_hda_codec_configure(codec);
if (err < 0)
goto error;
- /* rebuild PCMs */
- err = snd_hda_codec_build_pcms(codec);
- if (err < 0)
- goto error;
- /* rebuild mixers */
- err = snd_hda_codec_build_controls(codec);
- if (err < 0)
- goto error;
err = snd_card_register(codec->card);
error:
snd_hda_power_down(codec);
diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index 1483f85999ec..a010d704e0e2 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -3401,6 +3401,9 @@ static int patch_atihdmi(struct hda_codec *codec)
spec->ops.pin_hbr_setup = atihdmi_pin_hbr_setup;
spec->ops.setup_stream = atihdmi_setup_stream;
+ spec->chmap.ops.pin_get_slot_channel = atihdmi_pin_get_slot_channel;
+ spec->chmap.ops.pin_set_slot_channel = atihdmi_pin_set_slot_channel;
+
if (!has_amd_full_remap_support(codec)) {
/* override to ATI/AMD-specific versions with pairwise mapping */
spec->chmap.ops.chmap_cea_alloc_validate_get_type =
@@ -3408,10 +3411,6 @@ static int patch_atihdmi(struct hda_codec *codec)
spec->chmap.ops.cea_alloc_to_tlv_chmap =
atihdmi_paired_cea_alloc_to_tlv_chmap;
spec->chmap.ops.chmap_validate = atihdmi_paired_chmap_validate;
- spec->chmap.ops.pin_get_slot_channel =
- atihdmi_pin_get_slot_channel;
- spec->chmap.ops.pin_set_slot_channel =
- atihdmi_pin_set_slot_channel;
}
/* ATI/AMD converters do not advertise all of their capabilities */
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index ac4490a96863..4918ffa5ba68 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6426,6 +6426,7 @@ enum {
ALC668_FIXUP_DELL_DISABLE_AAMIX,
ALC668_FIXUP_DELL_XPS13,
ALC662_FIXUP_ASUS_Nx50,
+ ALC668_FIXUP_ASUS_Nx51,
};
static const struct hda_fixup alc662_fixups[] = {
@@ -6672,6 +6673,15 @@ static const struct hda_fixup alc662_fixups[] = {
.chained = true,
.chain_id = ALC662_FIXUP_BASS_1A
},
+ [ALC668_FIXUP_ASUS_Nx51] = {
+ .type = HDA_FIXUP_PINS,
+ .v.pins = (const struct hda_pintbl[]) {
+ {0x1a, 0x90170151}, /* bass speaker */
+ {}
+ },
+ .chained = true,
+ .chain_id = ALC662_FIXUP_BASS_CHMAP,
+ },
};
static const struct snd_pci_quirk alc662_fixup_tbl[] = {
@@ -6694,11 +6704,14 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = {
SND_PCI_QUIRK(0x1028, 0x0698, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1028, 0x069f, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800),
+ SND_PCI_QUIRK(0x1043, 0x1080, "Asus UX501VW", ALC668_FIXUP_HEADSET_MODE),
SND_PCI_QUIRK(0x1043, 0x11cd, "Asus N550", ALC662_FIXUP_ASUS_Nx50),
SND_PCI_QUIRK(0x1043, 0x13df, "Asus N550JX", ALC662_FIXUP_BASS_1A),
SND_PCI_QUIRK(0x1043, 0x129d, "Asus N750", ALC662_FIXUP_ASUS_Nx50),
SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_BASS_MODE4_CHMAP),
SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16),
+ SND_PCI_QUIRK(0x1043, 0x177d, "ASUS N551", ALC668_FIXUP_ASUS_Nx51),
+ SND_PCI_QUIRK(0x1043, 0x17bd, "ASUS N751", ALC668_FIXUP_ASUS_Nx51),
SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16),
SND_PCI_QUIRK(0x1043, 0x1bf3, "ASUS N76VZ", ALC662_FIXUP_BASS_MODE4_CHMAP),
SND_PCI_QUIRK(0x1043, 0x8469, "ASUS mobo", ALC662_FIXUP_NO_JACK_DETECT),
diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 0adfd9537cf7..6adde457b602 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -1137,8 +1137,11 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip)
case USB_ID(0x047F, 0x0415): /* Plantronics BT-300 */
case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */
case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */
+ case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */
case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */
+ case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */
case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */
+ case USB_ID(0x1de7, 0x0114): /* Phoenix Audio MT202pcs */
case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */
return true;
}
diff --git a/tools/Makefile b/tools/Makefile
index 60c7e6c8ff17..6bf68fe7dd29 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -137,7 +137,8 @@ libsubcmd_clean:
$(call descend,lib/subcmd,clean)
perf_clean:
- $(call descend,$(@:_clean=),clean)
+ $(Q)mkdir -p $(PERF_O) .
+ $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean
selftests_clean:
$(call descend,testing/$(@:_clean=),clean)
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 6b7707270aa3..57c8f98874e8 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -30,6 +30,7 @@ endef
FEATURE_TESTS_BASIC := \
backtrace \
dwarf \
+ dwarf_getlocations \
fortify-source \
sync-compare-and-swap \
glibc \
@@ -48,6 +49,10 @@ FEATURE_TESTS_BASIC := \
libslang \
libcrypto \
libunwind \
+ libunwind-x86 \
+ libunwind-x86_64 \
+ libunwind-arm \
+ libunwind-aarch64 \
pthread-attr-setaffinity-np \
stackprotector-all \
timerfd \
@@ -68,7 +73,9 @@ FEATURE_TESTS_EXTRA := \
libbabeltrace \
liberty \
liberty-z \
- libunwind-debug-frame
+ libunwind-debug-frame \
+ libunwind-debug-frame-arm \
+ libunwind-debug-frame-aarch64
FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC)
@@ -78,6 +85,7 @@ endif
FEATURE_DISPLAY ?= \
dwarf \
+ dwarf_getlocations \
glibc \
gtk2 \
libaudit \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index c5f4c417428d..3d88f09e188b 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -3,6 +3,7 @@ FILES= \
test-backtrace.bin \
test-bionic.bin \
test-dwarf.bin \
+ test-dwarf_getlocations.bin \
test-fortify-source.bin \
test-sync-compare-and-swap.bin \
test-glibc.bin \
@@ -26,6 +27,12 @@ FILES= \
test-libcrypto.bin \
test-libunwind.bin \
test-libunwind-debug-frame.bin \
+ test-libunwind-x86.bin \
+ test-libunwind-x86_64.bin \
+ test-libunwind-arm.bin \
+ test-libunwind-aarch64.bin \
+ test-libunwind-debug-frame-arm.bin \
+ test-libunwind-debug-frame-aarch64.bin \
test-pthread-attr-setaffinity-np.bin \
test-stackprotector-all.bin \
test-timerfd.bin \
@@ -82,6 +89,9 @@ endif
$(OUTPUT)test-dwarf.bin:
$(BUILD) $(DWARFLIBS)
+$(OUTPUT)test-dwarf_getlocations.bin:
+ $(BUILD) $(DWARFLIBS)
+
$(OUTPUT)test-libelf-mmap.bin:
$(BUILD) -lelf
@@ -99,6 +109,23 @@ $(OUTPUT)test-libunwind.bin:
$(OUTPUT)test-libunwind-debug-frame.bin:
$(BUILD) -lelf
+$(OUTPUT)test-libunwind-x86.bin:
+ $(BUILD) -lelf -lunwind-x86
+
+$(OUTPUT)test-libunwind-x86_64.bin:
+ $(BUILD) -lelf -lunwind-x86_64
+
+$(OUTPUT)test-libunwind-arm.bin:
+ $(BUILD) -lelf -lunwind-arm
+
+$(OUTPUT)test-libunwind-aarch64.bin:
+ $(BUILD) -lelf -lunwind-aarch64
+
+$(OUTPUT)test-libunwind-debug-frame-arm.bin:
+ $(BUILD) -lelf -lunwind-arm
+
+$(OUTPUT)test-libunwind-debug-frame-aarch64.bin:
+ $(BUILD) -lelf -lunwind-aarch64
$(OUTPUT)test-libaudit.bin:
$(BUILD) -laudit
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index e499a36c1e4a..a282e8cb84f3 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -41,6 +41,10 @@
# include "test-dwarf.c"
#undef main
+#define main main_test_dwarf_getlocations
+# include "test-dwarf_getlocations.c"
+#undef main
+
#define main main_test_libelf_getphdrnum
# include "test-libelf-getphdrnum.c"
#undef main
@@ -143,6 +147,7 @@ int main(int argc, char *argv[])
main_test_libelf_mmap();
main_test_glibc();
main_test_dwarf();
+ main_test_dwarf_getlocations();
main_test_libelf_getphdrnum();
main_test_libunwind();
main_test_libaudit();
diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c
index b389026839b9..e04ab89a1013 100644
--- a/tools/build/feature/test-bpf.c
+++ b/tools/build/feature/test-bpf.c
@@ -27,10 +27,9 @@ int main(void)
attr.log_level = 0;
attr.kern_version = 0;
- attr = attr;
/*
* Test existence of __NR_bpf and BPF_PROG_LOAD.
* This call should fail if we run the testcase.
*/
- return syscall(__NR_bpf, BPF_PROG_LOAD, attr, sizeof(attr));
+ return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
diff --git a/tools/build/feature/test-dwarf_getlocations.c b/tools/build/feature/test-dwarf_getlocations.c
new file mode 100644
index 000000000000..70162699dd43
--- /dev/null
+++ b/tools/build/feature/test-dwarf_getlocations.c
@@ -0,0 +1,12 @@
+#include <stdlib.h>
+#include <elfutils/libdw.h>
+/* Feature probe: calls dwarf_getlocations() from <elfutils/libdw.h> and returns its result cast to int. */
+int main(void)
+{
+ Dwarf_Addr base, start, end;
+ Dwarf_Attribute attr; /* NOTE(review): passed uninitialized; presumably this program is only compiled and linked, never run - confirm */
+ Dwarf_Op *op;
+ size_t nops;
+ ptrdiff_t offset = 0;
+ return (int)dwarf_getlocations(&attr, offset, &base, &start, &end, &op, &nops);
+}
diff --git a/tools/build/feature/test-libunwind-aarch64.c b/tools/build/feature/test-libunwind-aarch64.c
new file mode 100644
index 000000000000..fc03fb64e8c1
--- /dev/null
+++ b/tools/build/feature/test-libunwind-aarch64.c
@@ -0,0 +1,26 @@
+#include <libunwind-aarch64.h>
+#include <stdlib.h>
+
+extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+ unw_word_t ip,
+ unw_dyn_info_t *di,
+ unw_proc_info_t *pi,
+ int need_unwind_info, void *arg);
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+
+static unw_accessors_t accessors;
+/* Feature probe for libunwind-aarch64: references unw_create_addr_space(), unw_init_remote() and dwarf_search_unwind_table(). */
+int main(void)
+{
+ unw_addr_space_t addr_space;
+
+ addr_space = unw_create_addr_space(&accessors, 0);
+ if (addr_space)
+ return 0;
+ /* Reached only when addr_space is NULL; presumably these calls exist just so the symbols must resolve at link time - confirm. */
+ unw_init_remote(NULL, addr_space, NULL);
+ dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL);
+
+ return 0;
+}
diff --git a/tools/build/feature/test-libunwind-arm.c b/tools/build/feature/test-libunwind-arm.c
new file mode 100644
index 000000000000..632d95ec641f
--- /dev/null
+++ b/tools/build/feature/test-libunwind-arm.c
@@ -0,0 +1,27 @@
+#include <libunwind-arm.h>
+#include <stdlib.h>
+
+extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+ unw_word_t ip,
+ unw_dyn_info_t *di,
+ unw_proc_info_t *pi,
+ int need_unwind_info, void *arg);
+
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+
+static unw_accessors_t accessors;
+/* Feature probe for libunwind-arm: references unw_create_addr_space(), unw_init_remote() and dwarf_search_unwind_table(). */
+int main(void)
+{
+ unw_addr_space_t addr_space;
+
+ addr_space = unw_create_addr_space(&accessors, 0);
+ if (addr_space)
+ return 0;
+ /* Reached only when addr_space is NULL; presumably these calls exist just so the symbols must resolve at link time - confirm. */
+ unw_init_remote(NULL, addr_space, NULL);
+ dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL);
+
+ return 0;
+}
diff --git a/tools/build/feature/test-libunwind-debug-frame-aarch64.c b/tools/build/feature/test-libunwind-debug-frame-aarch64.c
new file mode 100644
index 000000000000..22844673fc26
--- /dev/null
+++ b/tools/build/feature/test-libunwind-debug-frame-aarch64.c
@@ -0,0 +1,16 @@
+#include <libunwind-aarch64.h>
+#include <stdlib.h>
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+ unw_word_t ip, unw_word_t segbase,
+ const char *obj_name, unw_word_t start,
+ unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+/* Feature probe for libunwind-aarch64 .debug_frame support: references dwarf_find_debug_frame() with null/zero arguments. */
+int main(void)
+{
+ dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0); /* return value is ignored; main always returns 0 */
+ return 0;
+}
diff --git a/tools/build/feature/test-libunwind-debug-frame-arm.c b/tools/build/feature/test-libunwind-debug-frame-arm.c
new file mode 100644
index 000000000000..f98859684fee
--- /dev/null
+++ b/tools/build/feature/test-libunwind-debug-frame-arm.c
@@ -0,0 +1,16 @@
+#include <libunwind-arm.h>
+#include <stdlib.h>
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+ unw_word_t ip, unw_word_t segbase,
+ const char *obj_name, unw_word_t start,
+ unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+/* Feature probe for libunwind-arm .debug_frame support: references dwarf_find_debug_frame() with null/zero arguments. */
+int main(void)
+{
+ dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0); /* return value is ignored; main always returns 0 */
+ return 0;
+}
diff --git a/tools/build/feature/test-libunwind-x86.c b/tools/build/feature/test-libunwind-x86.c
new file mode 100644
index 000000000000..3561edce305e
--- /dev/null
+++ b/tools/build/feature/test-libunwind-x86.c
@@ -0,0 +1,27 @@
+#include <libunwind-x86.h>
+#include <stdlib.h>
+
+extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+ unw_word_t ip,
+ unw_dyn_info_t *di,
+ unw_proc_info_t *pi,
+ int need_unwind_info, void *arg);
+
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+
+static unw_accessors_t accessors;
+/* Feature probe for libunwind-x86: references unw_create_addr_space(), unw_init_remote() and dwarf_search_unwind_table(). */
+int main(void)
+{
+ unw_addr_space_t addr_space;
+
+ addr_space = unw_create_addr_space(&accessors, 0);
+ if (addr_space)
+ return 0;
+ /* Reached only when addr_space is NULL; presumably these calls exist just so the symbols must resolve at link time - confirm. */
+ unw_init_remote(NULL, addr_space, NULL);
+ dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL);
+
+ return 0;
+}
diff --git a/tools/build/feature/test-libunwind-x86_64.c b/tools/build/feature/test-libunwind-x86_64.c
new file mode 100644
index 000000000000..5add2517b2a1
--- /dev/null
+++ b/tools/build/feature/test-libunwind-x86_64.c
@@ -0,0 +1,27 @@
+#include <libunwind-x86_64.h>
+#include <stdlib.h>
+
+extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+ unw_word_t ip,
+ unw_dyn_info_t *di,
+ unw_proc_info_t *pi,
+ int need_unwind_info, void *arg);
+
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+
+static unw_accessors_t accessors;
+/* Feature probe for libunwind-x86_64: references unw_create_addr_space(), unw_init_remote() and dwarf_search_unwind_table(). */
+int main(void)
+{
+ unw_addr_space_t addr_space;
+
+ addr_space = unw_create_addr_space(&accessors, 0);
+ if (addr_space)
+ return 0;
+ /* Reached only when addr_space is NULL; presumably these calls exist just so the symbols must resolve at link time - confirm. */
+ unw_init_remote(NULL, addr_space, NULL);
+ dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL);
+
+ return 0;
+}
diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
index ef78c22ff44d..08556cf2c70d 100644
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c
@@ -351,6 +351,19 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep)
return err;
}
+int procfs__read_str(const char *entry, char **buf, size_t *sizep)
+{
+ char path[PATH_MAX];
+ const char *procfs = procfs__mountpoint();
+ /* Reads "<procfs mountpoint>/<entry>" through filename__read_str(), forwarding buf and sizep unchanged. */
+ if (!procfs)
+ return -1;
+ /* NOTE(review): the snprintf() result is not checked, so a path longer than PATH_MAX is silently truncated. */
+ snprintf(path, sizeof(path), "%s/%s", procfs, entry);
+ /* The return value is whatever filename__read_str() reports for the composed path. */
+ return filename__read_str(path, buf, sizep);
+}
+
int sysfs__read_ull(const char *entry, unsigned long long *value)
{
char path[PATH_MAX];
diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h
index 9f6598098dc5..16c9c2ed7c5b 100644
--- a/tools/lib/api/fs/fs.h
+++ b/tools/lib/api/fs/fs.h
@@ -29,6 +29,8 @@ int filename__read_int(const char *filename, int *value);
int filename__read_ull(const char *filename, unsigned long long *value);
int filename__read_str(const char *filename, char **buf, size_t *sizep);
+int procfs__read_str(const char *entry, char **buf, size_t *sizep);
+
int sysctl__read_int(const char *sysctl, int *value);
int sysfs__read_int(const char *entry, int *value);
int sysfs__read_ull(const char *entry, unsigned long long *value);
diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c
index 0144b3d1bb77..88cccea3ca99 100644
--- a/tools/lib/traceevent/parse-filter.c
+++ b/tools/lib/traceevent/parse-filter.c
@@ -1164,11 +1164,11 @@ process_filter(struct event_format *event, struct filter_arg **parg,
current_op = current_exp;
ret = collapse_tree(current_op, parg, error_str);
+ /* collapse_tree() may free current_op, and updates parg accordingly */
+ current_op = NULL;
if (ret < 0)
goto fail;
- *parg = current_op;
-
free(token);
return 0;
diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index be764f9ec769..c6c8318e38a2 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -672,6 +672,7 @@ The letters are:
d create a debug log
g synthesize a call chain (use with i or x)
l synthesize last branch entries (use with i or x)
+ s skip initial number of events
"Instructions" events look like they were recorded by "perf record -e
instructions".
@@ -730,6 +731,12 @@ from one sample to the next.
To disable trace decoding entirely, use the option --no-itrace.
+It is also possible to skip events generated (instructions, branches, transactions)
+at the beginning. This is useful to ignore initialization code.
+
+ --itrace=i0nss1000000
+
+skips the first million instructions.
dump option
-----------
diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
index 65453f4c7006..e2a4c5e0dbe5 100644
--- a/tools/perf/Documentation/itrace.txt
+++ b/tools/perf/Documentation/itrace.txt
@@ -7,6 +7,7 @@
d create a debug log
g synthesize a call chain (use with i or x)
l synthesize last branch entries (use with i or x)
+ s skip initial number of events
The default is all events i.e. the same as --itrace=ibxe
@@ -24,3 +25,10 @@
Also the number of last branch entries (default 64, max. 1024) for
instructions or transactions events can be specified.
+
+ It is also possible to skip events generated (instructions, branches, transactions)
+ at the beginning. This is useful to ignore initialization code.
+
+ --itrace=i0nss1000000
+
+ skips the first million instructions.
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index e9cd39a92dc2..778f54d4d0bd 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -33,7 +33,7 @@ OPTIONS
-f::
--force::
- Don't complain, do it.
+ Don't do ownership validation.
-v::
--verbose::
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index d1deb573877f..3e9490b9c533 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -75,7 +75,7 @@ OPTIONS
-f::
--force::
- Don't complain, do it.
+ Don't do ownership validation.
--symfs=<directory>::
Look for files with symbols relative to this directory.
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index ec723d0a5bb3..a126e97a8114 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -93,6 +93,67 @@ raw encoding of 0x1A8 can be used:
You should refer to the processor specific documentation for getting these
details. Some of them are referenced in the SEE ALSO section below.
+ARBITRARY PMUS
+--------------
+
+perf also supports an extended syntax for specifying raw parameters
+to PMUs. Using this typically requires looking up the specific event
+in the CPU vendor specific documentation.
+
+The available PMUs and their raw parameters can be listed with
+
+ ls /sys/devices/*/format
+
+For example the raw event "LSD.UOPS" core pmu event above could
+be specified as
+
+ perf stat -e cpu/event=0xa8,umask=0x1,name=LSD.UOPS_CYCLES,cmask=1/ ...
+
+PER SOCKET PMUS
+---------------
+
+Some PMUs are not associated with a core, but with a whole CPU socket.
+Events on these PMUs generally cannot be sampled, but only counted globally
+with perf stat -a. They can be bound to one logical CPU, but will measure
+all the CPUs in the same socket.
+
+This example measures memory bandwidth every second
+on the first memory controller on socket 0 of an Intel Xeon system
+
+ perf stat -C 0 -a -e uncore_imc_0/cas_count_read/,uncore_imc_0/cas_count_write/ -I 1000 ...
+
+Each memory controller has its own PMU. Measuring the complete system
+bandwidth would require specifying all imc PMUs (see perf list output),
+and adding the values together.
+
+This example measures the combined core power every second
+
+ perf stat -I 1000 -e power/energy-cores/ -a
+
+ACCESS RESTRICTIONS
+-------------------
+
+For non root users generally only context switched PMU events are available.
+This is normally only the events in the cpu PMU, the predefined events
+like cycles and instructions and some software events.
+
+Other PMUs and global measurements are normally root only.
+Some event qualifiers, such as "any", are also root only.
+
+This can be overridden by setting the kernel.perf_event_paranoid
+sysctl to -1, which allows non root to use these events.
+
+For accessing trace point events perf needs to have read access to
+/sys/kernel/debug/tracing, even when perf_event_paranoid is in a relaxed
+setting.
+
+TRACING
+-------
+
+Some PMUs control advanced hardware tracing capabilities, such as Intel PT,
+that allows low overhead execution tracing. These are described in a separate
+intel-pt.txt document.
+
PARAMETERIZED EVENTS
--------------------
@@ -106,6 +167,50 @@ also be supplied. For example:
perf stat -C 0 -e 'hv_gpci/dtbp_ptitc,phys_processor_idx=0x2/' ...
+EVENT GROUPS
+------------
+
+Perf supports time based multiplexing of events, when the number of events
+active exceeds the number of hardware performance counters. Multiplexing
+can cause measurement errors when the workload changes its execution
+profile.
+
+When metrics are computed using formulas from event counts, it is useful to
+ensure some events are always measured together as a group to minimize multiplexing
+errors. Event groups can be specified using { }.
+
+ perf stat -e '{instructions,cycles}' ...
+
+The number of available performance counters depends on the CPU. A group
+cannot contain more events than available counters.
+For example Intel Core CPUs typically have four generic performance counters
+for the core, plus three fixed counters for instructions, cycles and
+ref-cycles. Some special events have restrictions on which counter they
+can schedule, and may not support multiple instances in a single group.
+When too many events are specified in the group, none of them will
+be measured.
+
+Globally pinned events can limit the number of counters available for
+other groups. On x86 systems, the NMI watchdog pins a counter by default.
+The NMI watchdog can be disabled as root with
+
+ echo 0 > /proc/sys/kernel/nmi_watchdog
+
+Events from multiple different PMUs cannot be mixed in a group, with
+some exceptions for software events.
+
+LEADER SAMPLING
+---------------
+
+perf also supports group leader sampling using the :S specifier.
+
+ perf record -e '{cycles,instructions}:S' ...
+ perf report --group
+
+Normally all events in an event group sample, but with :S only
+the first event (the leader) samples, and it only reads the values of the
+other events in the group.
+
OPTIONS
-------
@@ -143,5 +248,5 @@ SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-top[1],
linkperf:perf-record[1],
-http://www.intel.com/Assets/PDF/manual/253669.pdf[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
+http://www.intel.com/sdm/[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming]
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 43310d8661fe..1d6092c460dd 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -48,6 +48,14 @@ OPTIONS
option can be passed in record mode. It will be interpreted the same way as perf
record.
+-K::
+--all-kernel::
+ Configure all used events to run in kernel space.
+
+-U::
+--all-user::
+ Configure all used events to run in user space.
+
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 19aa17532a16..8dbee832abd9 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -347,6 +347,19 @@ Configure all used events to run in kernel space.
--all-user::
Configure all used events to run in user space.
+--timestamp-filename::
+Append timestamp to output file name.
+
+--switch-output::
+Generate multiple perf.data files, timestamp prefixed, switching to a new one
+when receiving a SIGUSR2.
+
+A possible use case is to, given an external event, slice the perf.data file
+that gets then processed, possibly via a perf script, to decide if that
+particular perf.data snapshot should be kept or not.
+
+Implies --timestamp-filename, --no-buildid and --no-buildid-cache.
+
SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 12113992ac9d..ebaf849e30ef 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -248,7 +248,7 @@ OPTIONS
Note that when using the --itrace option the synthesized callchain size
will override this value if the synthesized callchain size is bigger.
- Default: 127
+ Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
-G::
--inverted::
@@ -285,7 +285,7 @@ OPTIONS
-f::
--force::
- Don't complain, do it.
+ Don't do ownership validation.
--symfs=<directory>::
Look for files with symbols relative to this directory.
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 8ff4df956951..1cc08cc47ac5 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -50,6 +50,22 @@ OPTIONS
--dump-raw-trace=::
Display verbose dump of the sched data.
+OPTIONS for 'perf sched map'
+----------------------------
+
+--compact::
+ Show only CPUs with activity. Helps visualizing on high core
+ count systems.
+
+--cpus::
+ Show just entries with activities for the given CPUs.
+
+--color-cpus::
+ Highlight the given cpus.
+
+--color-pids::
+ Highlight the given pids.
+
SEE ALSO
--------
linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 382ddfb45d1d..a856a1095893 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -259,9 +259,23 @@ include::itrace.txt[]
--full-source-path::
Show the full path for source files for srcline output.
+--max-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ beyond the specified depth will be ignored. This is a trade-off
+ between information loss and faster processing especially for
+ workloads that can have a very long callchain stack.
+ Note that when using the --itrace option the synthesized callchain size
+ will override this value if the synthesized callchain size is bigger.
+
+ Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+
--ns::
Use 9 decimal places when displaying time (i.e. show the nanoseconds)
+-f::
+--force::
+ Don't do ownership validation.
+
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 19f046f027cd..91d638df3a6b 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -177,7 +177,7 @@ Default is to monitor all CPUS.
between information loss and faster processing especially for
workloads that can have a very long callchain stack.
- Default: 127
+ Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
--ignore-callees=<regex>::
Ignore callees of the function(s) matching the given regex.
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 13293de8869f..6afe20121bc0 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -117,9 +117,41 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
--syscalls::
Trace system calls. This options is enabled by default.
+--call-graph [mode,type,min[,limit],order[,key][,branch]]::
+ Setup and enable call-graph (stack chain/backtrace) recording.
+ See `--call-graph` section in perf-record and perf-report
+ man pages for details. The ones that are most useful in 'perf trace'
+ are 'dwarf' and 'lbr', where available, try: 'perf trace --call-graph dwarf'.
+
+ Using this will, for the root user, bump the value of --mmap-pages to 4
+ times the maximum for non-root users, based on the kernel.perf_event_mlock_kb
+ sysctl. This is done only if the user doesn't specify a --mmap-pages value.
+
+--kernel-syscall-graph::
+ Show the kernel callchains on the syscall exit path.
+
--event::
Trace other events, see 'perf list' for a complete list.
+--max-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ beyond the specified depth will be ignored. Note that at this point
+ this is just about the presentation part, i.e. the kernel is still
+ not limiting, the overhead of callchains needs to be set via the
+ knobs in --call-graph dwarf.
+
+ Implies '--call-graph dwarf' when --call-graph not present on the
+ command line, on systems where DWARF unwinding was built in.
+
+ Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
+
+--min-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ below the specified depth will be ignored. Disabled by default.
+
+ Implies '--call-graph dwarf' when --call-graph not present on the
+ command line, on systems where DWARF unwinding was built in.
+
--proc-map-timeout::
When processing pre-existing threads /proc/XXX/mmap, it may take a long time,
because the file may be huge. A time out is needed in such cases.
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 000ea210389d..bde8cbae7dd9 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -183,6 +183,11 @@ endif
include config/Makefile
endif
+ifeq ($(config),0)
+include $(srctree)/tools/scripts/Makefile.arch
+-include arch/$(ARCH)/Makefile
+endif
+
# The FEATURE_DUMP_EXPORT holds location of the actual
# FEATURE_DUMP file to be used to bypass feature detection
# (for bpf or any other subproject)
@@ -297,8 +302,6 @@ endif
# because maintaining the nesting to match is a pain. If
# we had "elif" things would have been much nicer...
--include arch/$(ARCH)/Makefile
-
ifneq ($(OUTPUT),)
CFLAGS += -I$(OUTPUT)
endif
@@ -390,7 +393,7 @@ endif
__build-dir = $(subst $(OUTPUT),,$(dir $@))
build-dir = $(if $(__build-dir),$(__build-dir),.)
-prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h fixdep
+prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h fixdep archheaders
$(OUTPUT)%.o: %.c prepare FORCE
$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
@@ -430,7 +433,7 @@ $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h)
LIBPERF_IN := $(OUTPUT)libperf-in.o
-$(LIBPERF_IN): fixdep FORCE
+$(LIBPERF_IN): prepare fixdep FORCE
$(Q)$(MAKE) $(build)=libperf
$(LIB_FILE): $(LIBPERF_IN)
@@ -625,7 +628,7 @@ config-clean:
$(call QUIET_CLEAN, config)
$(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null
-clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean
+clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean
$(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS)
$(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
$(Q)$(RM) $(OUTPUT).config-detected
@@ -662,5 +665,5 @@ FORCE:
.PHONY: all install clean config-clean strip install-gtk
.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
.PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope FORCE prepare
-.PHONY: libtraceevent_plugins
+.PHONY: libtraceevent_plugins archheaders
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index 56e05f126ad8..cc3930904d68 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -3,4 +3,5 @@ PERF_HAVE_DWARF_REGS := 1
endif
HAVE_KVM_STAT_SUPPORT := 1
+PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c
index 733151cdf46e..41bdf9530d82 100644
--- a/tools/perf/arch/powerpc/util/dwarf-regs.c
+++ b/tools/perf/arch/powerpc/util/dwarf-regs.c
@@ -10,19 +10,26 @@
*/
#include <stddef.h>
+#include <errno.h>
+#include <string.h>
#include <dwarf-regs.h>
-
+#include <linux/ptrace.h>
+#include <linux/kernel.h>
+#include "util.h"
struct pt_regs_dwarfnum {
const char *name;
unsigned int dwarfnum;
+ unsigned int ptregs_offset;
};
-#define STR(s) #s
-#define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num}
-#define GPR_DWARFNUM_NAME(num) \
- {.name = STR(%gpr##num), .dwarfnum = num}
-#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0}
+#define REG_DWARFNUM_NAME(r, num) \
+ {.name = STR(%)STR(r), .dwarfnum = num, \
+ .ptregs_offset = offsetof(struct pt_regs, r)}
+#define GPR_DWARFNUM_NAME(num) \
+ {.name = STR(%gpr##num), .dwarfnum = num, \
+ .ptregs_offset = offsetof(struct pt_regs, gpr[num])}
+#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0, .ptregs_offset = 0}
/*
* Reference:
@@ -61,12 +68,12 @@ static const struct pt_regs_dwarfnum regdwarfnum_table[] = {
GPR_DWARFNUM_NAME(29),
GPR_DWARFNUM_NAME(30),
GPR_DWARFNUM_NAME(31),
- REG_DWARFNUM_NAME("%msr", 66),
- REG_DWARFNUM_NAME("%ctr", 109),
- REG_DWARFNUM_NAME("%link", 108),
- REG_DWARFNUM_NAME("%xer", 101),
- REG_DWARFNUM_NAME("%dar", 119),
- REG_DWARFNUM_NAME("%dsisr", 118),
+ REG_DWARFNUM_NAME(msr, 66),
+ REG_DWARFNUM_NAME(ctr, 109),
+ REG_DWARFNUM_NAME(link, 108),
+ REG_DWARFNUM_NAME(xer, 101),
+ REG_DWARFNUM_NAME(dar, 119),
+ REG_DWARFNUM_NAME(dsisr, 118),
REG_DWARFNUM_END,
};
@@ -86,3 +93,12 @@ const char *get_arch_regstr(unsigned int n)
return roff->name;
return NULL;
}
+
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_dwarfnum *roff;
+ for (roff = regdwarfnum_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->ptregs_offset;
+ return -EINVAL;
+}
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index bbc1a50768dd..c6d0f91731a1 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -19,12 +19,6 @@ bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
ehdr.e_type == ET_DYN;
}
-#if defined(_CALL_ELF) && _CALL_ELF == 2
-void arch__elf_sym_adjust(GElf_Sym *sym)
-{
- sym->st_value += PPC64_LOCAL_ENTRY_OFFSET(sym->st_other);
-}
-#endif
#endif
#if !defined(_CALL_ELF) || _CALL_ELF != 2
@@ -65,18 +59,45 @@ bool arch__prefers_symtab(void)
return true;
}
+#ifdef HAVE_LIBELF_SUPPORT
+void arch__sym_update(struct symbol *s, GElf_Sym *sym)
+{
+ s->arch_sym = sym->st_other;
+}
+#endif
+
#define PPC64LE_LEP_OFFSET 8
void arch__fix_tev_from_maps(struct perf_probe_event *pev,
- struct probe_trace_event *tev, struct map *map)
+ struct probe_trace_event *tev, struct map *map,
+ struct symbol *sym)
{
+ int lep_offset;
+
/*
- * ppc64 ABIv2 local entry point is currently always 2 instructions
- * (8 bytes) after the global entry point.
+ * When probing at a function entry point, we normally always want the
+ * LEP since that catches calls to the function through both the GEP and
+ * the LEP. Hence, we would like to probe at an offset of 8 bytes if
+ * the user only specified the function entry.
+ *
+ * However, if the user specifies an offset, we fall back to using the
+ * GEP since all userspace applications (objdump/readelf) show function
+ * disassembly with offsets from the GEP.
+ *
+ * In addition, we shouldn't specify an offset for kretprobes.
*/
- if (!pev->uprobes && map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS) {
- tev->point.address += PPC64LE_LEP_OFFSET;
+ if (pev->point.offset || pev->point.retprobe || !map || !sym)
+ return;
+
+ lep_offset = PPC64_LOCAL_ENTRY_OFFSET(sym->arch_sym);
+
+ if (map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS)
tev->point.offset += PPC64LE_LEP_OFFSET;
+ else if (lep_offset) {
+ if (pev->uprobes)
+ tev->point.address += lep_offset;
+ else
+ tev->point.offset += lep_offset;
}
}
#endif
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
index 269af2143735..6c9211b18ec0 100644
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -4,3 +4,26 @@ endif
HAVE_KVM_STAT_SUPPORT := 1
PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
PERF_HAVE_JITDUMP := 1
+
+###
+# Syscall table generation
+#
+
+out := $(OUTPUT)arch/x86/include/generated/asm
+header := $(out)/syscalls_64.c
+sys := $(srctree)/tools/perf/arch/x86/entry/syscalls
+systbl := $(sys)/syscalltbl.sh
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+$(header): $(sys)/syscall_64.tbl $(systbl)
+ @(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \
+ (diff -B arch/x86/entry/syscalls/syscall_64.tbl ../../arch/x86/entry/syscalls/syscall_64.tbl >/dev/null) \
+ || echo "Warning: x86_64's syscall_64.tbl differs from kernel" >&2 )) || true
+ $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@
+
+clean::
+ $(call QUIET_CLEAN, x86) $(RM) $(header)
+
+archheaders: $(header)
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
new file mode 100644
index 000000000000..cac6d17ce5db
--- /dev/null
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -0,0 +1,376 @@
+#
+# 64-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The abi is "common", "64" or "x32" for this file.
+#
+0 common read sys_read
+1 common write sys_write
+2 common open sys_open
+3 common close sys_close
+4 common stat sys_newstat
+5 common fstat sys_newfstat
+6 common lstat sys_newlstat
+7 common poll sys_poll
+8 common lseek sys_lseek
+9 common mmap sys_mmap
+10 common mprotect sys_mprotect
+11 common munmap sys_munmap
+12 common brk sys_brk
+13 64 rt_sigaction sys_rt_sigaction
+14 common rt_sigprocmask sys_rt_sigprocmask
+15 64 rt_sigreturn sys_rt_sigreturn/ptregs
+16 64 ioctl sys_ioctl
+17 common pread64 sys_pread64
+18 common pwrite64 sys_pwrite64
+19 64 readv sys_readv
+20 64 writev sys_writev
+21 common access sys_access
+22 common pipe sys_pipe
+23 common select sys_select
+24 common sched_yield sys_sched_yield
+25 common mremap sys_mremap
+26 common msync sys_msync
+27 common mincore sys_mincore
+28 common madvise sys_madvise
+29 common shmget sys_shmget
+30 common shmat sys_shmat
+31 common shmctl sys_shmctl
+32 common dup sys_dup
+33 common dup2 sys_dup2
+34 common pause sys_pause
+35 common nanosleep sys_nanosleep
+36 common getitimer sys_getitimer
+37 common alarm sys_alarm
+38 common setitimer sys_setitimer
+39 common getpid sys_getpid
+40 common sendfile sys_sendfile64
+41 common socket sys_socket
+42 common connect sys_connect
+43 common accept sys_accept
+44 common sendto sys_sendto
+45 64 recvfrom sys_recvfrom
+46 64 sendmsg sys_sendmsg
+47 64 recvmsg sys_recvmsg
+48 common shutdown sys_shutdown
+49 common bind sys_bind
+50 common listen sys_listen
+51 common getsockname sys_getsockname
+52 common getpeername sys_getpeername
+53 common socketpair sys_socketpair
+54 64 setsockopt sys_setsockopt
+55 64 getsockopt sys_getsockopt
+56 common clone sys_clone/ptregs
+57 common fork sys_fork/ptregs
+58 common vfork sys_vfork/ptregs
+59 64 execve sys_execve/ptregs
+60 common exit sys_exit
+61 common wait4 sys_wait4
+62 common kill sys_kill
+63 common uname sys_newuname
+64 common semget sys_semget
+65 common semop sys_semop
+66 common semctl sys_semctl
+67 common shmdt sys_shmdt
+68 common msgget sys_msgget
+69 common msgsnd sys_msgsnd
+70 common msgrcv sys_msgrcv
+71 common msgctl sys_msgctl
+72 common fcntl sys_fcntl
+73 common flock sys_flock
+74 common fsync sys_fsync
+75 common fdatasync sys_fdatasync
+76 common truncate sys_truncate
+77 common ftruncate sys_ftruncate
+78 common getdents sys_getdents
+79 common getcwd sys_getcwd
+80 common chdir sys_chdir
+81 common fchdir sys_fchdir
+82 common rename sys_rename
+83 common mkdir sys_mkdir
+84 common rmdir sys_rmdir
+85 common creat sys_creat
+86 common link sys_link
+87 common unlink sys_unlink
+88 common symlink sys_symlink
+89 common readlink sys_readlink
+90 common chmod sys_chmod
+91 common fchmod sys_fchmod
+92 common chown sys_chown
+93 common fchown sys_fchown
+94 common lchown sys_lchown
+95 common umask sys_umask
+96 common gettimeofday sys_gettimeofday
+97 common getrlimit sys_getrlimit
+98 common getrusage sys_getrusage
+99 common sysinfo sys_sysinfo
+100 common times sys_times
+101 64 ptrace sys_ptrace
+102 common getuid sys_getuid
+103 common syslog sys_syslog
+104 common getgid sys_getgid
+105 common setuid sys_setuid
+106 common setgid sys_setgid
+107 common geteuid sys_geteuid
+108 common getegid sys_getegid
+109 common setpgid sys_setpgid
+110 common getppid sys_getppid
+111 common getpgrp sys_getpgrp
+112 common setsid sys_setsid
+113 common setreuid sys_setreuid
+114 common setregid sys_setregid
+115 common getgroups sys_getgroups
+116 common setgroups sys_setgroups
+117 common setresuid sys_setresuid
+118 common getresuid sys_getresuid
+119 common setresgid sys_setresgid
+120 common getresgid sys_getresgid
+121 common getpgid sys_getpgid
+122 common setfsuid sys_setfsuid
+123 common setfsgid sys_setfsgid
+124 common getsid sys_getsid
+125 common capget sys_capget
+126 common capset sys_capset
+127 64 rt_sigpending sys_rt_sigpending
+128 64 rt_sigtimedwait sys_rt_sigtimedwait
+129 64 rt_sigqueueinfo sys_rt_sigqueueinfo
+130 common rt_sigsuspend sys_rt_sigsuspend
+131 64 sigaltstack sys_sigaltstack
+132 common utime sys_utime
+133 common mknod sys_mknod
+134 64 uselib
+135 common personality sys_personality
+136 common ustat sys_ustat
+137 common statfs sys_statfs
+138 common fstatfs sys_fstatfs
+139 common sysfs sys_sysfs
+140 common getpriority sys_getpriority
+141 common setpriority sys_setpriority
+142 common sched_setparam sys_sched_setparam
+143 common sched_getparam sys_sched_getparam
+144 common sched_setscheduler sys_sched_setscheduler
+145 common sched_getscheduler sys_sched_getscheduler
+146 common sched_get_priority_max sys_sched_get_priority_max
+147 common sched_get_priority_min sys_sched_get_priority_min
+148 common sched_rr_get_interval sys_sched_rr_get_interval
+149 common mlock sys_mlock
+150 common munlock sys_munlock
+151 common mlockall sys_mlockall
+152 common munlockall sys_munlockall
+153 common vhangup sys_vhangup
+154 common modify_ldt sys_modify_ldt
+155 common pivot_root sys_pivot_root
+156 64 _sysctl sys_sysctl
+157 common prctl sys_prctl
+158 common arch_prctl sys_arch_prctl
+159 common adjtimex sys_adjtimex
+160 common setrlimit sys_setrlimit
+161 common chroot sys_chroot
+162 common sync sys_sync
+163 common acct sys_acct
+164 common settimeofday sys_settimeofday
+165 common mount sys_mount
+166 common umount2 sys_umount
+167 common swapon sys_swapon
+168 common swapoff sys_swapoff
+169 common reboot sys_reboot
+170 common sethostname sys_sethostname
+171 common setdomainname sys_setdomainname
+172 common iopl sys_iopl/ptregs
+173 common ioperm sys_ioperm
+174 64 create_module
+175 common init_module sys_init_module
+176 common delete_module sys_delete_module
+177 64 get_kernel_syms
+178 64 query_module
+179 common quotactl sys_quotactl
+180 64 nfsservctl
+181 common getpmsg
+182 common putpmsg
+183 common afs_syscall
+184 common tuxcall
+185 common security
+186 common gettid sys_gettid
+187 common readahead sys_readahead
+188 common setxattr sys_setxattr
+189 common lsetxattr sys_lsetxattr
+190 common fsetxattr sys_fsetxattr
+191 common getxattr sys_getxattr
+192 common lgetxattr sys_lgetxattr
+193 common fgetxattr sys_fgetxattr
+194 common listxattr sys_listxattr
+195 common llistxattr sys_llistxattr
+196 common flistxattr sys_flistxattr
+197 common removexattr sys_removexattr
+198 common lremovexattr sys_lremovexattr
+199 common fremovexattr sys_fremovexattr
+200 common tkill sys_tkill
+201 common time sys_time
+202 common futex sys_futex
+203 common sched_setaffinity sys_sched_setaffinity
+204 common sched_getaffinity sys_sched_getaffinity
+205 64 set_thread_area
+206 64 io_setup sys_io_setup
+207 common io_destroy sys_io_destroy
+208 common io_getevents sys_io_getevents
+209 64 io_submit sys_io_submit
+210 common io_cancel sys_io_cancel
+211 64 get_thread_area
+212 common lookup_dcookie sys_lookup_dcookie
+213 common epoll_create sys_epoll_create
+214 64 epoll_ctl_old
+215 64 epoll_wait_old
+216 common remap_file_pages sys_remap_file_pages
+217 common getdents64 sys_getdents64
+218 common set_tid_address sys_set_tid_address
+219 common restart_syscall sys_restart_syscall
+220 common semtimedop sys_semtimedop
+221 common fadvise64 sys_fadvise64
+222 64 timer_create sys_timer_create
+223 common timer_settime sys_timer_settime
+224 common timer_gettime sys_timer_gettime
+225 common timer_getoverrun sys_timer_getoverrun
+226 common timer_delete sys_timer_delete
+227 common clock_settime sys_clock_settime
+228 common clock_gettime sys_clock_gettime
+229 common clock_getres sys_clock_getres
+230 common clock_nanosleep sys_clock_nanosleep
+231 common exit_group sys_exit_group
+232 common epoll_wait sys_epoll_wait
+233 common epoll_ctl sys_epoll_ctl
+234 common tgkill sys_tgkill
+235 common utimes sys_utimes
+236 64 vserver
+237 common mbind sys_mbind
+238 common set_mempolicy sys_set_mempolicy
+239 common get_mempolicy sys_get_mempolicy
+240 common mq_open sys_mq_open
+241 common mq_unlink sys_mq_unlink
+242 common mq_timedsend sys_mq_timedsend
+243 common mq_timedreceive sys_mq_timedreceive
+244 64 mq_notify sys_mq_notify
+245 common mq_getsetattr sys_mq_getsetattr
+246 64 kexec_load sys_kexec_load
+247 64 waitid sys_waitid
+248 common add_key sys_add_key
+249 common request_key sys_request_key
+250 common keyctl sys_keyctl
+251 common ioprio_set sys_ioprio_set
+252 common ioprio_get sys_ioprio_get
+253 common inotify_init sys_inotify_init
+254 common inotify_add_watch sys_inotify_add_watch
+255 common inotify_rm_watch sys_inotify_rm_watch
+256 common migrate_pages sys_migrate_pages
+257 common openat sys_openat
+258 common mkdirat sys_mkdirat
+259 common mknodat sys_mknodat
+260 common fchownat sys_fchownat
+261 common futimesat sys_futimesat
+262 common newfstatat sys_newfstatat
+263 common unlinkat sys_unlinkat
+264 common renameat sys_renameat
+265 common linkat sys_linkat
+266 common symlinkat sys_symlinkat
+267 common readlinkat sys_readlinkat
+268 common fchmodat sys_fchmodat
+269 common faccessat sys_faccessat
+270 common pselect6 sys_pselect6
+271 common ppoll sys_ppoll
+272 common unshare sys_unshare
+273 64 set_robust_list sys_set_robust_list
+274 64 get_robust_list sys_get_robust_list
+275 common splice sys_splice
+276 common tee sys_tee
+277 common sync_file_range sys_sync_file_range
+278 64 vmsplice sys_vmsplice
+279 64 move_pages sys_move_pages
+280 common utimensat sys_utimensat
+281 common epoll_pwait sys_epoll_pwait
+282 common signalfd sys_signalfd
+283 common timerfd_create sys_timerfd_create
+284 common eventfd sys_eventfd
+285 common fallocate sys_fallocate
+286 common timerfd_settime sys_timerfd_settime
+287 common timerfd_gettime sys_timerfd_gettime
+288 common accept4 sys_accept4
+289 common signalfd4 sys_signalfd4
+290 common eventfd2 sys_eventfd2
+291 common epoll_create1 sys_epoll_create1
+292 common dup3 sys_dup3
+293 common pipe2 sys_pipe2
+294 common inotify_init1 sys_inotify_init1
+295 64 preadv sys_preadv
+296 64 pwritev sys_pwritev
+297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo
+298 common perf_event_open sys_perf_event_open
+299 64 recvmmsg sys_recvmmsg
+300 common fanotify_init sys_fanotify_init
+301 common fanotify_mark sys_fanotify_mark
+302 common prlimit64 sys_prlimit64
+303 common name_to_handle_at sys_name_to_handle_at
+304 common open_by_handle_at sys_open_by_handle_at
+305 common clock_adjtime sys_clock_adjtime
+306 common syncfs sys_syncfs
+307 64 sendmmsg sys_sendmmsg
+308 common setns sys_setns
+309 common getcpu sys_getcpu
+310 64 process_vm_readv sys_process_vm_readv
+311 64 process_vm_writev sys_process_vm_writev
+312 common kcmp sys_kcmp
+313 common finit_module sys_finit_module
+314 common sched_setattr sys_sched_setattr
+315 common sched_getattr sys_sched_getattr
+316 common renameat2 sys_renameat2
+317 common seccomp sys_seccomp
+318 common getrandom sys_getrandom
+319 common memfd_create sys_memfd_create
+320 common kexec_file_load sys_kexec_file_load
+321 common bpf sys_bpf
+322 64 execveat sys_execveat/ptregs
+323 common userfaultfd sys_userfaultfd
+324 common membarrier sys_membarrier
+325 common mlock2 sys_mlock2
+326 common copy_file_range sys_copy_file_range
+327 64 preadv2 sys_preadv2
+328 64 pwritev2 sys_pwritev2
+
+#
+# x32-specific system call numbers start at 512 to avoid cache impact
+# for native 64-bit operation.
+#
+512 x32 rt_sigaction compat_sys_rt_sigaction
+513 x32 rt_sigreturn sys32_x32_rt_sigreturn
+514 x32 ioctl compat_sys_ioctl
+515 x32 readv compat_sys_readv
+516 x32 writev compat_sys_writev
+517 x32 recvfrom compat_sys_recvfrom
+518 x32 sendmsg compat_sys_sendmsg
+519 x32 recvmsg compat_sys_recvmsg
+520 x32 execve compat_sys_execve/ptregs
+521 x32 ptrace compat_sys_ptrace
+522 x32 rt_sigpending compat_sys_rt_sigpending
+523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
+524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo
+525 x32 sigaltstack compat_sys_sigaltstack
+526 x32 timer_create compat_sys_timer_create
+527 x32 mq_notify compat_sys_mq_notify
+528 x32 kexec_load compat_sys_kexec_load
+529 x32 waitid compat_sys_waitid
+530 x32 set_robust_list compat_sys_set_robust_list
+531 x32 get_robust_list compat_sys_get_robust_list
+532 x32 vmsplice compat_sys_vmsplice
+533 x32 move_pages compat_sys_move_pages
+534 x32 preadv compat_sys_preadv64
+535 x32 pwritev compat_sys_pwritev64
+536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
+537 x32 recvmmsg compat_sys_recvmmsg
+538 x32 sendmmsg compat_sys_sendmmsg
+539 x32 process_vm_readv compat_sys_process_vm_readv
+540 x32 process_vm_writev compat_sys_process_vm_writev
+541 x32 setsockopt compat_sys_setsockopt
+542 x32 getsockopt compat_sys_getsockopt
+543 x32 io_setup compat_sys_io_setup
+544 x32 io_submit compat_sys_io_submit
+545 x32 execveat compat_sys_execveat/ptregs
diff --git a/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh
new file mode 100755
index 000000000000..49a18b9ad9cf
--- /dev/null
+++ b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+in="$1"
+arch="$2"
+
+syscall_macro() {
+ nr="$1"
+ name="$2"
+
+ echo " [$nr] = \"$name\","
+}
+
+emit() {
+ nr="$1"
+ entry="$2"
+
+ syscall_macro "$nr" "$entry"
+}
+
+echo "static const char *syscalltbl_${arch}[] = {"
+
+sorted_table=$(mktemp /tmp/syscalltbl.XXXXXX)
+grep '^[0-9]' "$in" | sort -n > $sorted_table
+
+max_nr=0
+while read nr abi name entry compat; do
+ if [ $nr -ge 512 ] ; then # discard compat syscalls
+ break
+ fi
+
+ emit "$nr" "$name"
+ max_nr=$nr
+done < $sorted_table
+
+rm -f $sorted_table
+
+echo "};"
+
+echo "#define SYSCALLTBL_${arch}_MAX_ID ${max_nr}"
diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c
index 9d29ee283ac5..d4aa567a29c4 100644
--- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c
+++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c
@@ -71,7 +71,7 @@ int test__perf_time_to_tsc(int subtest __maybe_unused)
CHECK__(parse_events(evlist, "cycles:u", NULL));
- perf_evlist__config(evlist, &opts);
+ perf_evlist__config(evlist, &opts, NULL);
evsel = perf_evlist__first(evlist);
diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c
index 9223c164e545..1f86ee8fb831 100644
--- a/tools/perf/arch/x86/util/dwarf-regs.c
+++ b/tools/perf/arch/x86/util/dwarf-regs.c
@@ -63,6 +63,8 @@ struct pt_regs_offset {
# define REG_OFFSET_NAME_32(n, r) {.name = n, .offset = offsetof(struct pt_regs, r)}
#endif
+/* TODO: switching by dwarf address size */
+#ifndef __x86_64__
static const struct pt_regs_offset x86_32_regoffset_table[] = {
REG_OFFSET_NAME_32("%ax", eax),
REG_OFFSET_NAME_32("%cx", ecx),
@@ -75,6 +77,8 @@ static const struct pt_regs_offset x86_32_regoffset_table[] = {
REG_OFFSET_END,
};
+#define regoffset_table x86_32_regoffset_table
+#else
static const struct pt_regs_offset x86_64_regoffset_table[] = {
REG_OFFSET_NAME_64("%ax", rax),
REG_OFFSET_NAME_64("%dx", rdx),
@@ -95,11 +99,7 @@ static const struct pt_regs_offset x86_64_regoffset_table[] = {
REG_OFFSET_END,
};
-/* TODO: switching by dwarf address size */
-#ifdef __x86_64__
#define regoffset_table x86_64_regoffset_table
-#else
-#define regoffset_table x86_32_regoffset_table
#endif
/* Minus 1 for the ending REG_OFFSET_END */
diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c
index d66f9ad4df2e..7dc30637cf66 100644
--- a/tools/perf/arch/x86/util/intel-bts.c
+++ b/tools/perf/arch/x86/util/intel-bts.c
@@ -438,6 +438,11 @@ struct auxtrace_record *intel_bts_recording_init(int *err)
if (!intel_bts_pmu)
return NULL;
+ if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) {
+ *err = -errno;
+ return NULL;
+ }
+
btsr = zalloc(sizeof(struct intel_bts_recording));
if (!btsr) {
*err = -ENOMEM;
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index a3395179c9ee..a07b9605e93b 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -1027,6 +1027,11 @@ struct auxtrace_record *intel_pt_recording_init(int *err)
if (!intel_pt_pmu)
return NULL;
+ if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) {
+ *err = -errno;
+ return NULL;
+ }
+
ptr = zalloc(sizeof(struct intel_pt_recording));
if (!ptr) {
*err = -ENOMEM;
diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c
index fd2868490d00..357f1b13b5ae 100644
--- a/tools/perf/arch/x86/util/tsc.c
+++ b/tools/perf/arch/x86/util/tsc.c
@@ -7,7 +7,6 @@
#include <linux/types.h>
#include "../../util/debug.h"
#include "../../util/tsc.h"
-#include "tsc.h"
int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
struct perf_tsc_conversion *tc)
@@ -46,3 +45,34 @@ u64 rdtsc(void)
return low | ((u64)high) << 32;
}
+
+int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc,
+ struct perf_tool *tool,
+ perf_event__handler_t process,
+ struct machine *machine)
+{
+ union perf_event event = {
+ .time_conv = {
+ .header = {
+ .type = PERF_RECORD_TIME_CONV,
+ .size = sizeof(struct time_conv_event),
+ },
+ },
+ };
+ struct perf_tsc_conversion tc;
+ int err;
+
+ err = perf_read_tsc_conversion(pc, &tc);
+ if (err == -EOPNOTSUPP)
+ return 0;
+ if (err)
+ return err;
+
+ pr_debug2("Synthesizing TSC conversion information\n");
+
+ event.time_conv.time_mult = tc.time_mult;
+ event.time_conv.time_shift = tc.time_shift;
+ event.time_conv.time_zero = tc.time_zero;
+
+ return process(tool, &event, NULL, machine);
+}
diff --git a/tools/perf/arch/x86/util/tsc.h b/tools/perf/arch/x86/util/tsc.h
deleted file mode 100644
index 2edc4d31065c..000000000000
--- a/tools/perf/arch/x86/util/tsc.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef TOOLS_PERF_ARCH_X86_UTIL_TSC_H__
-#define TOOLS_PERF_ARCH_X86_UTIL_TSC_H__
-
-#include <linux/types.h>
-
-struct perf_tsc_conversion {
- u16 time_shift;
- u32 time_mult;
- u64 time_zero;
-};
-
-struct perf_event_mmap_page;
-
-int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
- struct perf_tsc_conversion *tc);
-
-#endif /* TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ */
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index 6a18ce21f865..6952db65508a 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -83,7 +83,7 @@ static void *workerfn(void *arg)
do {
int ret;
again:
- ret = futex_lock_pi(w->futex, NULL, 0, futex_flag);
+ ret = futex_lock_pi(w->futex, NULL, futex_flag);
if (ret) { /* handle lock acquisition */
if (!silent)
diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h
index d44de9f44281..b2e06d1190d0 100644
--- a/tools/perf/bench/futex.h
+++ b/tools/perf/bench/futex.h
@@ -57,13 +57,11 @@ futex_wake(u_int32_t *uaddr, int nr_wake, int opflags)
/**
* futex_lock_pi() - block on uaddr as a PI mutex
- * @detect: whether (1) or not (0) to perform deadlock detection
*/
static inline int
-futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int detect,
- int opflags)
+futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags)
{
- return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags);
+ return futex(uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0, opflags);
}
/**
diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c
index a91aa85d80ff..2b54d0f2672a 100644
--- a/tools/perf/bench/mem-functions.c
+++ b/tools/perf/bench/mem-functions.c
@@ -6,6 +6,7 @@
* Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
*/
+#include "debug.h"
#include "../perf.h"
#include "../util/util.h"
#include <subcmd/parse-options.h>
@@ -63,14 +64,16 @@ static struct perf_event_attr cycle_attr = {
.config = PERF_COUNT_HW_CPU_CYCLES
};
-static void init_cycles(void)
+static int init_cycles(void)
{
cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag());
- if (cycles_fd < 0 && errno == ENOSYS)
- die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
- else
- BUG_ON(cycles_fd < 0);
+ if (cycles_fd < 0 && errno == ENOSYS) {
+ pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
+ return -1;
+ }
+
+ return cycles_fd;
}
static u64 get_cycles(void)
@@ -155,8 +158,13 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *
argc = parse_options(argc, argv, options, info->usage, 0);
- if (use_cycles)
- init_cycles();
+ if (use_cycles) {
+ i = init_cycles();
+ if (i < 0) {
+ fprintf(stderr, "Failed to open cycles counter\n");
+ return i;
+ }
+ }
size = (size_t)perf_atoll((char *)size_str);
size_total = (double)size * nr_loops;
diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index c42448ed5dfe..fe1b77fa21f9 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -12,6 +12,7 @@
#include <subcmd/parse-options.h>
#include "util/util.h"
#include "util/debug.h"
+#include "util/config.h"
static bool use_system_config, use_user_config;
@@ -32,13 +33,28 @@ static struct option config_options[] = {
OPT_END()
};
-static int show_config(const char *key, const char *value,
- void *cb __maybe_unused)
+static int show_config(struct perf_config_set *set)
{
- if (value)
- printf("%s=%s\n", key, value);
- else
- printf("%s\n", key);
+ struct perf_config_section *section;
+ struct perf_config_item *item;
+ struct list_head *sections;
+
+ if (set == NULL)
+ return -1;
+
+ sections = &set->sections;
+ if (list_empty(sections))
+ return -1;
+
+ list_for_each_entry(section, sections, node) {
+ list_for_each_entry(item, &section->items, node) {
+ char *value = item->value;
+
+ if (value)
+ printf("%s.%s=%s\n", section->name,
+ item->name, value);
+ }
+ }
return 0;
}
@@ -46,6 +62,7 @@ static int show_config(const char *key, const char *value,
int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
{
int ret = 0;
+ struct perf_config_set *set;
char *user_config = mkpath("%s/.perfconfig", getenv("HOME"));
argc = parse_options(argc, argv, config_options, config_usage,
@@ -63,13 +80,19 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
else if (use_user_config)
config_exclusive_filename = user_config;
+ set = perf_config_set__new();
+ if (!set) {
+ ret = -1;
+ goto out_err;
+ }
+
switch (actions) {
case ACTION_LIST:
if (argc) {
pr_err("Error: takes no arguments\n");
parse_options_usage(config_usage, config_options, "l", 1);
} else {
- ret = perf_config(show_config, NULL);
+ ret = show_config(set);
if (ret < 0) {
const char * config_filename = config_exclusive_filename;
if (!config_exclusive_filename)
@@ -83,5 +106,7 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
usage_with_options(config_usage, config_options);
}
+ perf_config_set__delete(set);
+out_err:
return ret;
}
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 8053a8ceefda..9ce354f469dc 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -428,7 +428,7 @@ static void hists__baseline_only(struct hists *hists)
struct rb_root *root;
struct rb_node *next;
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root = &hists->entries_collapsed;
else
root = hists->entries_in;
@@ -450,7 +450,7 @@ static void hists__precompute(struct hists *hists)
struct rb_root *root;
struct rb_node *next;
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root = &hists->entries_collapsed;
else
root = hists->entries_in;
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index bc1de9b8fd67..f9830c902b78 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -61,6 +61,7 @@ static int check_emacsclient_version(void)
struct child_process ec_process;
const char *argv_ec[] = { "emacsclient", "--version", NULL };
int version;
+ int ret = -1;
/* emacsclient prints its version number on stderr */
memset(&ec_process, 0, sizeof(ec_process));
@@ -71,7 +72,10 @@ static int check_emacsclient_version(void)
fprintf(stderr, "Failed to start emacsclient.\n");
return -1;
}
- strbuf_read(&buffer, ec_process.err, 20);
+ if (strbuf_read(&buffer, ec_process.err, 20) < 0) {
+ fprintf(stderr, "Failed to read emacsclient version\n");
+ goto out;
+ }
close(ec_process.err);
/*
@@ -82,8 +86,7 @@ static int check_emacsclient_version(void)
if (prefixcmp(buffer.buf, "emacsclient")) {
fprintf(stderr, "Failed to parse emacsclient version.\n");
- strbuf_release(&buffer);
- return -1;
+ goto out;
}
version = atoi(buffer.buf + strlen("emacsclient"));
@@ -92,12 +95,11 @@ static int check_emacsclient_version(void)
fprintf(stderr,
"emacsclient version '%d' too old (< 22).\n",
version);
- strbuf_release(&buffer);
- return -1;
- }
-
+ } else
+ ret = 0;
+out:
strbuf_release(&buffer);
- return 0;
+ return ret;
}
static void exec_woman_emacs(const char *path, const char *page)
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index d1a2d104f2bc..e5afa8fe1bf1 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -748,6 +748,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
.auxtrace_info = perf_event__repipe_op2_synth,
.auxtrace = perf_event__repipe_auxtrace,
.auxtrace_error = perf_event__repipe_op2_synth,
+ .time_conv = perf_event__repipe_op2_synth,
.finished_round = perf_event__repipe_oe_synth,
.build_id = perf_event__repipe_op2_synth,
.id_index = perf_event__repipe_op2_synth,
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index c9cb3be47cff..58adfee230de 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -375,7 +375,7 @@ static u64 find_callsite(struct perf_evsel *evsel, struct perf_sample *sample)
}
al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
- sample__resolve_callchain(sample, NULL, evsel, &al, 16);
+ sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, 16);
callchain_cursor_commit(&callchain_cursor);
while (true) {
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index bff666458b28..6487c06d2708 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -982,7 +982,7 @@ static int kvm_live_open_events(struct perf_kvm_stat *kvm)
struct perf_evlist *evlist = kvm->evlist;
char sbuf[STRERR_BUFSIZE];
- perf_evlist__config(evlist, &kvm->opts);
+ perf_evlist__config(evlist, &kvm->opts, NULL);
/*
* Note: exclude_{guest,host} do not apply here.
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 85db3be4b3cb..1dc140c5481d 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -62,19 +62,22 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
int rec_argc, i = 0, j;
const char **rec_argv;
int ret;
+ bool all_user = false, all_kernel = false;
struct option options[] = {
OPT_CALLBACK('e', "event", &mem, "event",
"event selector. use 'perf mem record -e list' to list available events",
parse_record_events),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show counter open errors, etc)"),
+ OPT_BOOLEAN('U', "--all-user", &all_user, "collect only user level data"),
+ OPT_BOOLEAN('K', "--all-kernel", &all_kernel, "collect only kernel level data"),
OPT_END()
};
argc = parse_options(argc, argv, options, record_mem_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
- rec_argc = argc + 7; /* max number of arguments */
+ rec_argc = argc + 9; /* max number of arguments */
rec_argv = calloc(rec_argc + 1, sizeof(char *));
if (!rec_argv)
return -1;
@@ -103,6 +106,12 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
rec_argv[i++] = perf_mem_events__name(j);
};
+ if (all_user)
+ rec_argv[i++] = "--all-user";
+
+ if (all_kernel)
+ rec_argv[i++] = "--all-kernel";
+
for (j = 0; j < argc; j++, i++)
rec_argv[i] = argv[j];
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 515510ecc76a..f3679c44d3f3 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -29,10 +29,12 @@
#include "util/data.h"
#include "util/perf_regs.h"
#include "util/auxtrace.h"
+#include "util/tsc.h"
#include "util/parse-branch-options.h"
#include "util/parse-regs-options.h"
#include "util/llvm-utils.h"
#include "util/bpf-loader.h"
+#include "util/trigger.h"
#include "asm/bug.h"
#include <unistd.h>
@@ -55,6 +57,8 @@ struct record {
bool no_buildid_cache;
bool no_buildid_cache_set;
bool buildid_all;
+ bool timestamp_filename;
+ bool switch_output;
unsigned long long samples;
};
@@ -124,9 +128,10 @@ out:
static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;
-static volatile int auxtrace_snapshot_enabled;
-static volatile int auxtrace_snapshot_err;
+
static volatile int auxtrace_record__snapshot_started;
+static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
+static DEFINE_TRIGGER(switch_output_trigger);
static void sig_handler(int sig)
{
@@ -244,11 +249,12 @@ static void record__read_auxtrace_snapshot(struct record *rec)
{
pr_debug("Recording AUX area tracing snapshot\n");
if (record__auxtrace_read_snapshot_all(rec) < 0) {
- auxtrace_snapshot_err = -1;
+ trigger_error(&auxtrace_snapshot_trigger);
} else {
- auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
- if (!auxtrace_snapshot_err)
- auxtrace_snapshot_enabled = 1;
+ if (auxtrace_record__snapshot_finish(rec->itr))
+ trigger_error(&auxtrace_snapshot_trigger);
+ else
+ trigger_ready(&auxtrace_snapshot_trigger);
}
}
@@ -283,7 +289,7 @@ static int record__open(struct record *rec)
struct record_opts *opts = &rec->opts;
int rc = 0;
- perf_evlist__config(evlist, opts);
+ perf_evlist__config(evlist, opts, &callchain_param);
evlist__for_each(evlist, pos) {
try_again:
@@ -494,6 +500,73 @@ record__finish_output(struct record *rec)
return;
}
+static int record__synthesize_workload(struct record *rec)
+{
+ struct {
+ struct thread_map map;
+ struct thread_map_data map_data;
+ } thread_map;
+
+ thread_map.map.nr = 1;
+ thread_map.map.map[0].pid = rec->evlist->workload.pid;
+ thread_map.map.map[0].comm = NULL;
+ return perf_event__synthesize_thread_map(&rec->tool, &thread_map.map,
+ process_synthesized_event,
+ &rec->session->machines.host,
+ rec->opts.sample_address,
+ rec->opts.proc_map_timeout);
+}
+
+static int record__synthesize(struct record *rec);
+
+static int
+record__switch_output(struct record *rec, bool at_exit)
+{
+ struct perf_data_file *file = &rec->file;
+ int fd, err;
+
+ /* Same size as a real timestamp, e.g. "2015122520103046" */
+ char timestamp[] = "InvalidTimestamp";
+
+ rec->samples = 0;
+ record__finish_output(rec);
+ err = fetch_current_timestamp(timestamp, sizeof(timestamp));
+ if (err) {
+ pr_err("Failed to get current timestamp\n");
+ return -EINVAL;
+ }
+
+ fd = perf_data_file__switch(file, timestamp,
+ rec->session->header.data_offset,
+ at_exit);
+ if (fd >= 0 && !at_exit) {
+ rec->bytes_written = 0;
+ rec->session->header.data_size = 0;
+ }
+
+ if (!quiet)
+ fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
+ file->path, timestamp);
+
+ /* Output tracking events */
+ if (!at_exit) {
+ record__synthesize(rec);
+
+ /*
+ * In 'perf record --switch-output' without -a,
+ * record__synthesize() in record__switch_output() won't
+ * generate tracking events because there's no thread_map
+ * in evlist. As a result, the newly created perf.data would not
+ * contain map and comm information.
+ * Create a fake thread_map and directly call
+ * perf_event__synthesize_thread_map() for those events.
+ */
+ if (target__none(&rec->opts.target))
+ record__synthesize_workload(rec);
+ }
+ return fd;
+}
+
static volatile int workload_exec_errno;
/*
@@ -512,6 +585,15 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
static void snapshot_sig_handler(int sig);
+int __weak
+perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
+ struct perf_tool *tool __maybe_unused,
+ perf_event__handler_t process __maybe_unused,
+ struct machine *machine __maybe_unused)
+{
+ return 0;
+}
+
static int record__synthesize(struct record *rec)
{
struct perf_session *session = rec->session;
@@ -549,6 +631,11 @@ static int record__synthesize(struct record *rec)
}
}
+ err = perf_event__synth_time_conv(rec->evlist->mmap[0].base, tool,
+ process_synthesized_event, machine);
+ if (err)
+ goto out;
+
if (rec->opts.full_auxtrace) {
err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
session, process_synthesized_event);
@@ -600,10 +687,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);
signal(SIGTERM, sig_handler);
- if (rec->opts.auxtrace_snapshot_mode)
+
+ if (rec->opts.auxtrace_snapshot_mode || rec->switch_output) {
signal(SIGUSR2, snapshot_sig_handler);
- else
+ if (rec->opts.auxtrace_snapshot_mode)
+ trigger_on(&auxtrace_snapshot_trigger);
+ if (rec->switch_output)
+ trigger_on(&switch_output_trigger);
+ } else {
signal(SIGUSR2, SIG_IGN);
+ }
session = perf_session__new(file, false, tool);
if (session == NULL) {
@@ -729,27 +822,45 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
perf_evlist__enable(rec->evlist);
}
- auxtrace_snapshot_enabled = 1;
+ trigger_ready(&auxtrace_snapshot_trigger);
+ trigger_ready(&switch_output_trigger);
for (;;) {
unsigned long long hits = rec->samples;
if (record__mmap_read_all(rec) < 0) {
- auxtrace_snapshot_enabled = 0;
+ trigger_error(&auxtrace_snapshot_trigger);
+ trigger_error(&switch_output_trigger);
err = -1;
goto out_child;
}
if (auxtrace_record__snapshot_started) {
auxtrace_record__snapshot_started = 0;
- if (!auxtrace_snapshot_err)
+ if (!trigger_is_error(&auxtrace_snapshot_trigger))
record__read_auxtrace_snapshot(rec);
- if (auxtrace_snapshot_err) {
+ if (trigger_is_error(&auxtrace_snapshot_trigger)) {
pr_err("AUX area tracing snapshot failed\n");
err = -1;
goto out_child;
}
}
+ if (trigger_is_hit(&switch_output_trigger)) {
+ trigger_ready(&switch_output_trigger);
+
+ if (!quiet)
+ fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
+ waking);
+ waking = 0;
+ fd = record__switch_output(rec, false);
+ if (fd < 0) {
+ pr_err("Failed to switch to new file\n");
+ trigger_error(&switch_output_trigger);
+ err = fd;
+ goto out_child;
+ }
+ }
+
if (hits == rec->samples) {
if (done || draining)
break;
@@ -772,12 +883,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
* disable events in this case.
*/
if (done && !disabled && !target__none(&opts->target)) {
- auxtrace_snapshot_enabled = 0;
+ trigger_off(&auxtrace_snapshot_trigger);
perf_evlist__disable(rec->evlist);
disabled = true;
}
}
- auxtrace_snapshot_enabled = 0;
+ trigger_off(&auxtrace_snapshot_trigger);
+ trigger_off(&switch_output_trigger);
if (forks && workload_exec_errno) {
char msg[STRERR_BUFSIZE];
@@ -811,11 +923,22 @@ out_child:
/* this will be recalculated during process_buildids() */
rec->samples = 0;
- if (!err)
- record__finish_output(rec);
+ if (!err) {
+ if (!rec->timestamp_filename) {
+ record__finish_output(rec);
+ } else {
+ fd = record__switch_output(rec, true);
+ if (fd < 0) {
+ status = fd;
+ goto out_delete_session;
+ }
+ }
+ }
if (!err && !quiet) {
char samples[128];
+ const char *postfix = rec->timestamp_filename ?
+ ".<timestamp>" : "";
if (rec->samples && !rec->opts.full_auxtrace)
scnprintf(samples, sizeof(samples),
@@ -823,9 +946,9 @@ out_child:
else
samples[0] = '\0';
- fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s ]\n",
+ fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n",
perf_data_file__size(file) / 1024.0 / 1024.0,
- file->path, samples);
+ file->path, postfix, samples);
}
out_delete_session:
@@ -833,58 +956,61 @@ out_delete_session:
return status;
}
-static void callchain_debug(void)
+static void callchain_debug(struct callchain_param *callchain)
{
static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
- pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
+ pr_debug("callchain: type %s\n", str[callchain->record_mode]);
- if (callchain_param.record_mode == CALLCHAIN_DWARF)
+ if (callchain->record_mode == CALLCHAIN_DWARF)
pr_debug("callchain: stack dump size %d\n",
- callchain_param.dump_size);
+ callchain->dump_size);
}
-int record_parse_callchain_opt(const struct option *opt,
- const char *arg,
- int unset)
+int record_opts__parse_callchain(struct record_opts *record,
+ struct callchain_param *callchain,
+ const char *arg, bool unset)
{
int ret;
- struct record_opts *record = (struct record_opts *)opt->value;
-
- record->callgraph_set = true;
- callchain_param.enabled = !unset;
+ callchain->enabled = !unset;
/* --no-call-graph */
if (unset) {
- callchain_param.record_mode = CALLCHAIN_NONE;
+ callchain->record_mode = CALLCHAIN_NONE;
pr_debug("callchain: disabled\n");
return 0;
}
- ret = parse_callchain_record_opt(arg, &callchain_param);
+ ret = parse_callchain_record_opt(arg, callchain);
if (!ret) {
/* Enable data address sampling for DWARF unwind. */
- if (callchain_param.record_mode == CALLCHAIN_DWARF)
+ if (callchain->record_mode == CALLCHAIN_DWARF)
record->sample_address = true;
- callchain_debug();
+ callchain_debug(callchain);
}
return ret;
}
+int record_parse_callchain_opt(const struct option *opt,
+ const char *arg,
+ int unset)
+{
+ return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
+}
+
int record_callchain_opt(const struct option *opt,
const char *arg __maybe_unused,
int unset __maybe_unused)
{
- struct record_opts *record = (struct record_opts *)opt->value;
+ struct callchain_param *callchain = opt->value;
- record->callgraph_set = true;
- callchain_param.enabled = true;
+ callchain->enabled = true;
- if (callchain_param.record_mode == CALLCHAIN_NONE)
- callchain_param.record_mode = CALLCHAIN_FP;
+ if (callchain->record_mode == CALLCHAIN_NONE)
+ callchain->record_mode = CALLCHAIN_FP;
- callchain_debug();
+ callchain_debug(callchain);
return 0;
}
@@ -1122,7 +1248,7 @@ struct option __record_options[] = {
record__parse_mmap_pages),
OPT_BOOLEAN(0, "group", &record.opts.group,
"put the counters into a counter group"),
- OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
+ OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
NULL, "enables call-graph recording" ,
&record_callchain_opt),
OPT_CALLBACK(0, "call-graph", &record.opts,
@@ -1195,6 +1321,10 @@ struct option __record_options[] = {
"file", "vmlinux pathname"),
OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
"Record build-id of all DSOs regardless of hits"),
+ OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
+ "append timestamp to output filename"),
+ OPT_BOOLEAN(0, "switch-output", &record.switch_output,
+ "Switch output when receive SIGUSR2"),
OPT_END()
};
@@ -1250,6 +1380,9 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
return -EINVAL;
}
+ if (rec->switch_output)
+ rec->timestamp_filename = true;
+
if (!rec->itr) {
rec->itr = auxtrace_record__init(rec->evlist, &err);
if (err)
@@ -1261,6 +1394,14 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
if (err)
return err;
+ err = bpf__setup_stdout(rec->evlist);
+ if (err) {
+ bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
+ pr_err("ERROR: Setup BPF stdout failed: %s\n",
+ errbuf);
+ return err;
+ }
+
err = -ENOMEM;
symbol__init(NULL);
@@ -1275,8 +1416,36 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
- if (rec->no_buildid_cache || rec->no_buildid)
+ if (rec->no_buildid_cache || rec->no_buildid) {
disable_buildid_cache();
+ } else if (rec->switch_output) {
+ /*
+ * In 'perf record --switch-output', disable buildid
+ * generation by default to reduce data file switching
+ * overhead. Still generate buildid if they are required
+ * explicitly using
+ *
+ * perf record --switch-output --no-no-buildid \
+ * --no-no-buildid-cache
+ *
+ * The following code is equivalent to:
+ *
+ * if ((rec->no_buildid || !rec->no_buildid_set) &&
+ * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
+ * disable_buildid_cache();
+ */
+ bool disable = true;
+
+ if (rec->no_buildid_set && !rec->no_buildid)
+ disable = false;
+ if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
+ disable = false;
+ if (disable) {
+ rec->no_buildid = true;
+ rec->no_buildid_cache = true;
+ disable_buildid_cache();
+ }
+ }
if (rec->evlist->nr_entries == 0 &&
perf_evlist__add_default(rec->evlist) < 0) {
@@ -1335,9 +1504,13 @@ out_symbol_exit:
static void snapshot_sig_handler(int sig __maybe_unused)
{
- if (!auxtrace_snapshot_enabled)
- return;
- auxtrace_snapshot_enabled = 0;
- auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
- auxtrace_record__snapshot_started = 1;
+ if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
+ trigger_hit(&auxtrace_snapshot_trigger);
+ auxtrace_record__snapshot_started = 1;
+ if (auxtrace_record__snapshot_start(record.itr))
+ trigger_error(&auxtrace_snapshot_trigger);
+ }
+
+ if (trigger_is_ready(&switch_output_trigger))
+ trigger_hit(&switch_output_trigger);
}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 160ea23b45aa..87d40e3c4078 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -47,7 +47,6 @@ struct report {
struct perf_tool tool;
struct perf_session *session;
bool use_tui, use_gtk, use_stdio;
- bool dont_use_callchains;
bool show_full_info;
bool show_threads;
bool inverted_callchain;
@@ -235,7 +234,7 @@ static int report__setup_sample_type(struct report *rep)
sample_type |= PERF_SAMPLE_BRANCH_STACK;
if (!is_pipe && !(sample_type & PERF_SAMPLE_CALLCHAIN)) {
- if (sort__has_parent) {
+ if (perf_hpp_list.parent) {
ui__error("Selected --sort parent, but no "
"callchain data. Did you call "
"'perf record' without -g?\n");
@@ -247,7 +246,7 @@ static int report__setup_sample_type(struct report *rep)
"you call 'perf record' without -g?\n");
return -1;
}
- } else if (!rep->dont_use_callchains &&
+ } else if (!callchain_param.enabled &&
callchain_param.mode != CHAIN_NONE &&
!symbol_conf.use_callchain) {
symbol_conf.use_callchain = true;
@@ -599,13 +598,15 @@ static int __cmd_report(struct report *rep)
static int
report_parse_callchain_opt(const struct option *opt, const char *arg, int unset)
{
- struct report *rep = (struct report *)opt->value;
+ struct callchain_param *callchain = opt->value;
+ callchain->enabled = !unset;
/*
* --no-call-graph
*/
if (unset) {
- rep->dont_use_callchains = true;
+ symbol_conf.use_callchain = false;
+ callchain->mode = CHAIN_NONE;
return 0;
}
@@ -690,7 +691,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
.ordered_events = true,
.ordering_requires_timestamps = true,
},
- .max_stack = PERF_MAX_STACK_DEPTH,
+ .max_stack = sysctl_perf_event_max_stack,
.pretty_printing_style = "normal",
.socket_filter = -1,
};
@@ -734,7 +735,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
"regex filter to identify parent, see: '--sort parent'"),
OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
"Only display entries with parent-match"),
- OPT_CALLBACK_DEFAULT('g', "call-graph", &report,
+ OPT_CALLBACK_DEFAULT('g', "call-graph", &callchain_param,
"print_type,threshold[,print_limit],order,sort_key[,branch],value",
report_callchain_help, &report_parse_callchain_opt,
callchain_default_opt),
@@ -743,7 +744,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_INTEGER(0, "max-stack", &report.max_stack,
"Set the maximum stack depth when parsing the callchain, "
"anything beyond the specified depth will be ignored. "
- "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
+ "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
"alias for inverted call graph"),
OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
@@ -935,7 +936,7 @@ repeat:
goto error;
}
- sort__need_collapse = true;
+ perf_hpp_list.need_collapse = true;
}
/* Force tty output for header output and per-thread stat. */
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 871b55ae22a4..afa057666c2a 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -11,6 +11,8 @@
#include "util/session.h"
#include "util/tool.h"
#include "util/cloexec.h"
+#include "util/thread_map.h"
+#include "util/color.h"
#include <subcmd/parse-options.h>
#include "util/trace-event.h"
@@ -122,6 +124,21 @@ struct trace_sched_handler {
struct machine *machine);
};
+#define COLOR_PIDS PERF_COLOR_BLUE
+#define COLOR_CPUS PERF_COLOR_BG_RED
+
+struct perf_sched_map {
+ DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
+ int *comp_cpus;
+ bool comp;
+ struct thread_map *color_pids;
+ const char *color_pids_str;
+ struct cpu_map *color_cpus;
+ const char *color_cpus_str;
+ struct cpu_map *cpus;
+ const char *cpus_str;
+};
+
struct perf_sched {
struct perf_tool tool;
const char *sort_order;
@@ -173,6 +190,7 @@ struct perf_sched {
struct list_head sort_list, cmp_pid;
bool force;
bool skip_merge;
+ struct perf_sched_map map;
};
static u64 get_nsecs(void)
@@ -1339,6 +1357,38 @@ static int process_sched_wakeup_event(struct perf_tool *tool,
return 0;
}
+union map_priv {
+ void *ptr;
+ bool color;
+};
+
+static bool thread__has_color(struct thread *thread)
+{
+ union map_priv priv = {
+ .ptr = thread__priv(thread),
+ };
+
+ return priv.color;
+}
+
+static struct thread*
+map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid)
+{
+ struct thread *thread = machine__findnew_thread(machine, pid, tid);
+ union map_priv priv = {
+ .color = false,
+ };
+
+ if (!sched->map.color_pids || !thread || thread__priv(thread))
+ return thread;
+
+ if (thread_map__has(sched->map.color_pids, tid))
+ priv.color = true;
+
+ thread__set_priv(thread, priv.ptr);
+ return thread;
+}
+
static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
struct perf_sample *sample, struct machine *machine)
{
@@ -1347,13 +1397,25 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
int new_shortname;
u64 timestamp0, timestamp = sample->time;
s64 delta;
- int cpu, this_cpu = sample->cpu;
+ int i, this_cpu = sample->cpu;
+ int cpus_nr;
+ bool new_cpu = false;
+ const char *color = PERF_COLOR_NORMAL;
BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
if (this_cpu > sched->max_cpu)
sched->max_cpu = this_cpu;
+ if (sched->map.comp) {
+ cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
+ if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) {
+ sched->map.comp_cpus[cpus_nr++] = this_cpu;
+ new_cpu = true;
+ }
+ } else
+ cpus_nr = sched->max_cpu;
+
timestamp0 = sched->cpu_last_switched[this_cpu];
sched->cpu_last_switched[this_cpu] = timestamp;
if (timestamp0)
@@ -1366,7 +1428,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
return -1;
}
- sched_in = machine__findnew_thread(machine, -1, next_pid);
+ sched_in = map__findnew_thread(sched, machine, -1, next_pid);
if (sched_in == NULL)
return -1;
@@ -1400,26 +1462,52 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
new_shortname = 1;
}
- for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
+ for (i = 0; i < cpus_nr; i++) {
+ int cpu = sched->map.comp ? sched->map.comp_cpus[i] : i;
+ struct thread *curr_thread = sched->curr_thread[cpu];
+ const char *pid_color = color;
+ const char *cpu_color = color;
+
+ if (curr_thread && thread__has_color(curr_thread))
+ pid_color = COLOR_PIDS;
+
+ if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu))
+ continue;
+
+ if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu))
+ cpu_color = COLOR_CPUS;
+
if (cpu != this_cpu)
- printf(" ");
+ color_fprintf(stdout, cpu_color, " ");
else
- printf("*");
+ color_fprintf(stdout, cpu_color, "*");
if (sched->curr_thread[cpu])
- printf("%2s ", sched->curr_thread[cpu]->shortname);
+ color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname);
else
- printf(" ");
+ color_fprintf(stdout, color, " ");
}
- printf(" %12.6f secs ", (double)timestamp/1e9);
+ if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu))
+ goto out;
+
+ color_fprintf(stdout, color, " %12.6f secs ", (double)timestamp/1e9);
if (new_shortname) {
- printf("%s => %s:%d\n",
+ const char *pid_color = color;
+
+ if (thread__has_color(sched_in))
+ pid_color = COLOR_PIDS;
+
+ color_fprintf(stdout, pid_color, "%s => %s:%d",
sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
- } else {
- printf("\n");
}
+ if (sched->map.comp && new_cpu)
+ color_fprintf(stdout, color, " (CPU %d)", this_cpu);
+
+out:
+ color_fprintf(stdout, color, "\n");
+
thread__put(sched_in);
return 0;
@@ -1675,9 +1763,75 @@ static int perf_sched__lat(struct perf_sched *sched)
return 0;
}
+static int setup_map_cpus(struct perf_sched *sched)
+{
+ struct cpu_map *map;
+
+ sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
+
+ if (sched->map.comp) {
+ sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int));
+ if (!sched->map.comp_cpus)
+ return -1;
+ }
+
+ if (!sched->map.cpus_str)
+ return 0;
+
+ map = cpu_map__new(sched->map.cpus_str);
+ if (!map) {
+ pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
+ return -1;
+ }
+
+ sched->map.cpus = map;
+ return 0;
+}
+
+static int setup_color_pids(struct perf_sched *sched)
+{
+ struct thread_map *map;
+
+ if (!sched->map.color_pids_str)
+ return 0;
+
+ map = thread_map__new_by_tid_str(sched->map.color_pids_str);
+ if (!map) {
+ pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
+ return -1;
+ }
+
+ sched->map.color_pids = map;
+ return 0;
+}
+
+static int setup_color_cpus(struct perf_sched *sched)
+{
+ struct cpu_map *map;
+
+ if (!sched->map.color_cpus_str)
+ return 0;
+
+ map = cpu_map__new(sched->map.color_cpus_str);
+ if (!map) {
+ pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
+ return -1;
+ }
+
+ sched->map.color_cpus = map;
+ return 0;
+}
+
static int perf_sched__map(struct perf_sched *sched)
{
- sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
+ if (setup_map_cpus(sched))
+ return -1;
+
+ if (setup_color_pids(sched))
+ return -1;
+
+ if (setup_color_cpus(sched))
+ return -1;
setup_pager();
if (perf_sched__read_events(sched))
@@ -1831,6 +1985,17 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
"dump raw trace in ASCII"),
OPT_END()
};
+ const struct option map_options[] = {
+ OPT_BOOLEAN(0, "compact", &sched.map.comp,
+ "map output in compact mode"),
+ OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids",
+ "highlight given pids in map"),
+ OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus",
+ "highlight given CPUs in map"),
+ OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus",
+ "display given CPUs in map"),
+ OPT_END()
+ };
const char * const latency_usage[] = {
"perf sched latency [<options>]",
NULL
@@ -1839,6 +2004,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
"perf sched replay [<options>]",
NULL
};
+ const char * const map_usage[] = {
+ "perf sched map [<options>]",
+ NULL
+ };
const char *const sched_subcommands[] = { "record", "latency", "map",
"replay", "script", NULL };
const char *sched_usage[] = {
@@ -1887,6 +2056,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
setup_sorting(&sched, latency_options, latency_usage);
return perf_sched__lat(&sched);
} else if (!strcmp(argv[0], "map")) {
+ if (argc) {
+ argc = parse_options(argc, argv, map_options, map_usage, 0);
+ if (argc)
+ usage_with_options(map_usage, map_options);
+ }
sched.tp_handler = &map_ops;
setup_sorting(&sched, latency_options, latency_usage);
return perf_sched__map(&sched);
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 3770c3dffe5e..efca81679bb3 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -22,6 +22,7 @@
#include "util/thread_map.h"
#include "util/stat.h"
#include <linux/bitmap.h>
+#include <linux/stringify.h>
#include "asm/bug.h"
#include "util/mem-events.h"
@@ -317,19 +318,19 @@ static void set_print_ip_opts(struct perf_event_attr *attr)
output[type].print_ip_opts = 0;
if (PRINT_FIELD(IP))
- output[type].print_ip_opts |= PRINT_IP_OPT_IP;
+ output[type].print_ip_opts |= EVSEL__PRINT_IP;
if (PRINT_FIELD(SYM))
- output[type].print_ip_opts |= PRINT_IP_OPT_SYM;
+ output[type].print_ip_opts |= EVSEL__PRINT_SYM;
if (PRINT_FIELD(DSO))
- output[type].print_ip_opts |= PRINT_IP_OPT_DSO;
+ output[type].print_ip_opts |= EVSEL__PRINT_DSO;
if (PRINT_FIELD(SYMOFFSET))
- output[type].print_ip_opts |= PRINT_IP_OPT_SYMOFFSET;
+ output[type].print_ip_opts |= EVSEL__PRINT_SYMOFFSET;
if (PRINT_FIELD(SRCLINE))
- output[type].print_ip_opts |= PRINT_IP_OPT_SRCLINE;
+ output[type].print_ip_opts |= EVSEL__PRINT_SRCLINE;
}
/*
@@ -569,18 +570,23 @@ static void print_sample_bts(struct perf_sample *sample,
/* print branch_from information */
if (PRINT_FIELD(IP)) {
unsigned int print_opts = output[attr->type].print_ip_opts;
+ struct callchain_cursor *cursor = NULL;
- if (symbol_conf.use_callchain && sample->callchain) {
- printf("\n");
- } else {
- printf(" ");
- if (print_opts & PRINT_IP_OPT_SRCLINE) {
+ if (symbol_conf.use_callchain && sample->callchain &&
+ thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
+ sample, NULL, NULL, scripting_max_stack) == 0)
+ cursor = &callchain_cursor;
+
+ if (cursor == NULL) {
+ putchar(' ');
+ if (print_opts & EVSEL__PRINT_SRCLINE) {
print_srcline_last = true;
- print_opts &= ~PRINT_IP_OPT_SRCLINE;
+ print_opts &= ~EVSEL__PRINT_SRCLINE;
}
- }
- perf_evsel__print_ip(evsel, sample, al, print_opts,
- scripting_max_stack);
+ } else
+ putchar('\n');
+
+ sample__fprintf_sym(sample, al, 0, print_opts, cursor, stdout);
}
/* print branch_to information */
@@ -783,14 +789,15 @@ static void process_event(struct perf_script *script,
printf("%16" PRIu64, sample->weight);
if (PRINT_FIELD(IP)) {
- if (!symbol_conf.use_callchain)
- printf(" ");
- else
- printf("\n");
+ struct callchain_cursor *cursor = NULL;
+
+ if (symbol_conf.use_callchain && sample->callchain &&
+ thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
+ sample, NULL, NULL, scripting_max_stack) == 0)
+ cursor = &callchain_cursor;
- perf_evsel__print_ip(evsel, sample, al,
- output[attr->type].print_ip_opts,
- scripting_max_stack);
+ putchar(cursor ? '\n' : ' ');
+ sample__fprintf_sym(sample, al, 0, output[attr->type].print_ip_opts, cursor, stdout);
}
if (PRINT_FIELD(IREGS))
@@ -1415,21 +1422,19 @@ static int is_directory(const char *base_path, const struct dirent *dent)
return S_ISDIR(st.st_mode);
}
-#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\
- while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \
- lang_next) \
- if ((lang_dirent.d_type == DT_DIR || \
- (lang_dirent.d_type == DT_UNKNOWN && \
- is_directory(scripts_path, &lang_dirent))) && \
- (strcmp(lang_dirent.d_name, ".")) && \
- (strcmp(lang_dirent.d_name, "..")))
+#define for_each_lang(scripts_path, scripts_dir, lang_dirent) \
+ while ((lang_dirent = readdir(scripts_dir)) != NULL) \
+ if ((lang_dirent->d_type == DT_DIR || \
+ (lang_dirent->d_type == DT_UNKNOWN && \
+ is_directory(scripts_path, lang_dirent))) && \
+ (strcmp(lang_dirent->d_name, ".")) && \
+ (strcmp(lang_dirent->d_name, "..")))
-#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\
- while (!readdir_r(lang_dir, &script_dirent, &script_next) && \
- script_next) \
- if (script_dirent.d_type != DT_DIR && \
- (script_dirent.d_type != DT_UNKNOWN || \
- !is_directory(lang_path, &script_dirent)))
+#define for_each_script(lang_path, lang_dir, script_dirent) \
+ while ((script_dirent = readdir(lang_dir)) != NULL) \
+ if (script_dirent->d_type != DT_DIR && \
+ (script_dirent->d_type != DT_UNKNOWN || \
+ !is_directory(lang_path, script_dirent)))
#define RECORD_SUFFIX "-record"
@@ -1575,7 +1580,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused,
const char *s __maybe_unused,
int unset __maybe_unused)
{
- struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+ struct dirent *script_dirent, *lang_dirent;
char scripts_path[MAXPATHLEN];
DIR *scripts_dir, *lang_dir;
char script_path[MAXPATHLEN];
@@ -1590,19 +1595,19 @@ static int list_available_scripts(const struct option *opt __maybe_unused,
if (!scripts_dir)
return -1;
- for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+ for_each_lang(scripts_path, scripts_dir, lang_dirent) {
snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
- lang_dirent.d_name);
+ lang_dirent->d_name);
lang_dir = opendir(lang_path);
if (!lang_dir)
continue;
- for_each_script(lang_path, lang_dir, script_dirent, script_next) {
- script_root = get_script_root(&script_dirent, REPORT_SUFFIX);
+ for_each_script(lang_path, lang_dir, script_dirent) {
+ script_root = get_script_root(script_dirent, REPORT_SUFFIX);
if (script_root) {
desc = script_desc__findnew(script_root);
snprintf(script_path, MAXPATHLEN, "%s/%s",
- lang_path, script_dirent.d_name);
+ lang_path, script_dirent->d_name);
read_script_info(desc, script_path);
free(script_root);
}
@@ -1690,7 +1695,7 @@ static int check_ev_match(char *dir_name, char *scriptname,
*/
int find_scripts(char **scripts_array, char **scripts_path_array)
{
- struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+ struct dirent *script_dirent, *lang_dirent;
char scripts_path[MAXPATHLEN], lang_path[MAXPATHLEN];
DIR *scripts_dir, *lang_dir;
struct perf_session *session;
@@ -1713,9 +1718,9 @@ int find_scripts(char **scripts_array, char **scripts_path_array)
return -1;
}
- for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+ for_each_lang(scripts_path, scripts_dir, lang_dirent) {
snprintf(lang_path, MAXPATHLEN, "%s/%s", scripts_path,
- lang_dirent.d_name);
+ lang_dirent->d_name);
#ifdef NO_LIBPERL
if (strstr(lang_path, "perl"))
continue;
@@ -1729,16 +1734,16 @@ int find_scripts(char **scripts_array, char **scripts_path_array)
if (!lang_dir)
continue;
- for_each_script(lang_path, lang_dir, script_dirent, script_next) {
+ for_each_script(lang_path, lang_dir, script_dirent) {
/* Skip those real time scripts: xxxtop.p[yl] */
- if (strstr(script_dirent.d_name, "top."))
+ if (strstr(script_dirent->d_name, "top."))
continue;
sprintf(scripts_path_array[i], "%s/%s", lang_path,
- script_dirent.d_name);
- temp = strchr(script_dirent.d_name, '.');
+ script_dirent->d_name);
+ temp = strchr(script_dirent->d_name, '.');
snprintf(scripts_array[i],
- (temp - script_dirent.d_name) + 1,
- "%s", script_dirent.d_name);
+ (temp - script_dirent->d_name) + 1,
+ "%s", script_dirent->d_name);
if (check_ev_match(lang_path,
scripts_array[i], session))
@@ -1756,7 +1761,7 @@ int find_scripts(char **scripts_array, char **scripts_path_array)
static char *get_script_path(const char *script_root, const char *suffix)
{
- struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+ struct dirent *script_dirent, *lang_dirent;
char scripts_path[MAXPATHLEN];
char script_path[MAXPATHLEN];
DIR *scripts_dir, *lang_dir;
@@ -1769,21 +1774,21 @@ static char *get_script_path(const char *script_root, const char *suffix)
if (!scripts_dir)
return NULL;
- for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+ for_each_lang(scripts_path, scripts_dir, lang_dirent) {
snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
- lang_dirent.d_name);
+ lang_dirent->d_name);
lang_dir = opendir(lang_path);
if (!lang_dir)
continue;
- for_each_script(lang_path, lang_dir, script_dirent, script_next) {
- __script_root = get_script_root(&script_dirent, suffix);
+ for_each_script(lang_path, lang_dir, script_dirent) {
+ __script_root = get_script_root(script_dirent, suffix);
if (__script_root && !strcmp(script_root, __script_root)) {
free(__script_root);
closedir(lang_dir);
closedir(scripts_dir);
snprintf(script_path, MAXPATHLEN, "%s/%s",
- lang_path, script_dirent.d_name);
+ lang_path, script_dirent->d_name);
return strdup(script_path);
}
free(__script_root);
@@ -1961,6 +1966,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.attr = process_attr,
+ .event_update = perf_event__process_event_update,
.tracing_data = perf_event__process_tracing_data,
.build_id = perf_event__process_build_id,
.id_index = perf_event__process_id_index,
@@ -2022,6 +2028,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"only consider symbols in these pids"),
OPT_STRING(0, "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
"only consider symbols in these tids"),
+ OPT_UINTEGER(0, "max-stack", &scripting_max_stack,
+ "Set the maximum stack depth when parsing the callchain, "
+ "anything beyond the specified depth will be ignored. "
+ "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_BOOLEAN('I', "show-info", &show_full_info,
"display extended information from perf.data file"),
OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path,
@@ -2057,6 +2067,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
NULL
};
+ scripting_max_stack = sysctl_perf_event_max_stack;
+
setup_scripting();
argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage,
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 1f19f2f999c8..e459b685a4e9 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -298,6 +298,14 @@ static int read_counter(struct perf_evsel *counter)
return -1;
}
}
+
+ if (verbose > 1) {
+ fprintf(stat_config.output,
+ "%s: %d: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
+ perf_evsel__name(counter),
+ cpu,
+ count->val, count->ena, count->run);
+ }
}
}
@@ -528,6 +536,7 @@ static int __run_perf_stat(int argc, const char **argv)
perf_evlist__set_leader(evsel_list);
evlist__for_each(evsel_list, counter) {
+try_again:
if (create_perf_stat_counter(counter) < 0) {
/*
* PPC returns ENXIO for HW counters until 2.6.37
@@ -544,7 +553,11 @@ static int __run_perf_stat(int argc, const char **argv)
if ((counter->leader != counter) ||
!(counter->leader->nr_members > 1))
continue;
- }
+ } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
+ if (verbose)
+ ui__warning("%s\n", msg);
+ goto try_again;
+ }
perf_evsel__open_strerror(counter, &target,
errno, msg, sizeof(msg));
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 833214979c4f..1793da585676 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -688,7 +688,7 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter,
struct hist_entry *he = iter->he;
struct perf_evsel *evsel = iter->evsel;
- if (sort__has_sym && single)
+ if (perf_hpp_list.sym && single)
perf_top__record_precise_ip(top, he, evsel->idx, al->addr);
hist__account_cycles(iter->sample->branch_stack, al, iter->sample,
@@ -886,7 +886,7 @@ static int perf_top__start_counters(struct perf_top *top)
struct perf_evlist *evlist = top->evlist;
struct record_opts *opts = &top->record_opts;
- perf_evlist__config(evlist, opts);
+ perf_evlist__config(evlist, opts, &callchain_param);
evlist__for_each(evlist, counter) {
try_again:
@@ -917,15 +917,15 @@ out_err:
return -1;
}
-static int perf_top__setup_sample_type(struct perf_top *top __maybe_unused)
+static int callchain_param__setup_sample_type(struct callchain_param *callchain)
{
- if (!sort__has_sym) {
- if (symbol_conf.use_callchain) {
+ if (!perf_hpp_list.sym) {
+ if (callchain->enabled) {
ui__error("Selected -g but \"sym\" not present in --sort/-s.");
return -EINVAL;
}
- } else if (callchain_param.mode != CHAIN_NONE) {
- if (callchain_register_param(&callchain_param) < 0) {
+ } else if (callchain->mode != CHAIN_NONE) {
+ if (callchain_register_param(callchain) < 0) {
ui__error("Can't register callchain params.\n");
return -EINVAL;
}
@@ -952,7 +952,7 @@ static int __cmd_top(struct perf_top *top)
goto out_delete;
}
- ret = perf_top__setup_sample_type(top);
+ ret = callchain_param__setup_sample_type(&callchain_param);
if (ret)
goto out_delete;
@@ -962,7 +962,7 @@ static int __cmd_top(struct perf_top *top)
machine__synthesize_threads(&top->session->machines.host, &opts->target,
top->evlist->threads, false, opts->proc_map_timeout);
- if (sort__has_socket) {
+ if (perf_hpp_list.socket) {
ret = perf_env__read_cpu_topology_map(&perf_env);
if (ret < 0)
goto out_err_cpu_topo;
@@ -1045,18 +1045,17 @@ callchain_opt(const struct option *opt, const char *arg, int unset)
static int
parse_callchain_opt(const struct option *opt, const char *arg, int unset)
{
- struct record_opts *record = (struct record_opts *)opt->value;
+ struct callchain_param *callchain = opt->value;
- record->callgraph_set = true;
- callchain_param.enabled = !unset;
- callchain_param.record_mode = CALLCHAIN_FP;
+ callchain->enabled = !unset;
+ callchain->record_mode = CALLCHAIN_FP;
/*
* --no-call-graph
*/
if (unset) {
symbol_conf.use_callchain = false;
- callchain_param.record_mode = CALLCHAIN_NONE;
+ callchain->record_mode = CALLCHAIN_NONE;
return 0;
}
@@ -1104,7 +1103,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
},
.proc_map_timeout = 500,
},
- .max_stack = PERF_MAX_STACK_DEPTH,
+ .max_stack = sysctl_perf_event_max_stack,
.sym_pcnt_filter = 5,
};
struct record_opts *opts = &top.record_opts;
@@ -1162,17 +1161,17 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
"output field(s): overhead, period, sample plus all of sort keys"),
OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
"Show a column with the number of samples"),
- OPT_CALLBACK_NOOPT('g', NULL, &top.record_opts,
+ OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
NULL, "enables call-graph recording and display",
&callchain_opt),
- OPT_CALLBACK(0, "call-graph", &top.record_opts,
+ OPT_CALLBACK(0, "call-graph", &callchain_param,
"record_mode[,record_size],print_type,threshold[,print_limit],order,sort_key[,branch]",
top_callchain_help, &parse_callchain_opt),
OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain,
"Accumulate callchains of children and show total overhead as well"),
OPT_INTEGER(0, "max-stack", &top.max_stack,
"Set the maximum stack depth when parsing the callchain. "
- "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
+ "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
"ignore callees of these functions in call graphs",
report_parse_ignore_callees_opt),
@@ -1256,7 +1255,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
sort__mode = SORT_MODE__TOP;
/* display thread wants entries to be collapsed in a different tree */
- sort__need_collapse = 1;
+ perf_hpp_list.need_collapse = 1;
if (top.use_stdio)
use_browser = 0;
@@ -1312,7 +1311,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
top.sym_evsel = perf_evlist__first(top.evlist);
- if (!symbol_conf.use_callchain) {
+ if (!callchain_param.enabled) {
symbol_conf.cumulate_callchain = false;
perf_hpp__cancel_cumulate();
}
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 93ac724fb635..6e5c325148e4 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -34,79 +34,76 @@
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
+#include "callchain.h"
+#include "syscalltbl.h"
+#include "rb_resort.h"
-#include <libaudit.h>
+#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
#include <stdlib.h>
-#include <sys/mman.h>
-#include <linux/futex.h>
#include <linux/err.h>
-
-/* For older distros: */
-#ifndef MAP_STACK
-# define MAP_STACK 0x20000
-#endif
-
-#ifndef MADV_HWPOISON
-# define MADV_HWPOISON 100
-
-#endif
-
-#ifndef MADV_MERGEABLE
-# define MADV_MERGEABLE 12
-#endif
-
-#ifndef MADV_UNMERGEABLE
-# define MADV_UNMERGEABLE 13
-#endif
-
-#ifndef EFD_SEMAPHORE
-# define EFD_SEMAPHORE 1
-#endif
-
-#ifndef EFD_NONBLOCK
-# define EFD_NONBLOCK 00004000
-#endif
-
-#ifndef EFD_CLOEXEC
-# define EFD_CLOEXEC 02000000
-#endif
+#include <linux/filter.h>
+#include <linux/audit.h>
+#include <sys/ptrace.h>
+#include <linux/random.h>
+#include <linux/stringify.h>
#ifndef O_CLOEXEC
# define O_CLOEXEC 02000000
#endif
-#ifndef SOCK_DCCP
-# define SOCK_DCCP 6
-#endif
-
-#ifndef SOCK_CLOEXEC
-# define SOCK_CLOEXEC 02000000
-#endif
-
-#ifndef SOCK_NONBLOCK
-# define SOCK_NONBLOCK 00004000
-#endif
-
-#ifndef MSG_CMSG_CLOEXEC
-# define MSG_CMSG_CLOEXEC 0x40000000
-#endif
-
-#ifndef PERF_FLAG_FD_NO_GROUP
-# define PERF_FLAG_FD_NO_GROUP (1UL << 0)
-#endif
-
-#ifndef PERF_FLAG_FD_OUTPUT
-# define PERF_FLAG_FD_OUTPUT (1UL << 1)
-#endif
-
-#ifndef PERF_FLAG_PID_CGROUP
-# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
-#endif
-
-#ifndef PERF_FLAG_FD_CLOEXEC
-# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
-#endif
-
+struct trace {
+ struct perf_tool tool;
+ struct syscalltbl *sctbl;
+ struct {
+ int max;
+ struct syscall *table;
+ struct {
+ struct perf_evsel *sys_enter,
+ *sys_exit;
+ } events;
+ } syscalls;
+ struct record_opts opts;
+ struct perf_evlist *evlist;
+ struct machine *host;
+ struct thread *current;
+ u64 base_time;
+ FILE *output;
+ unsigned long nr_events;
+ struct strlist *ev_qualifier;
+ struct {
+ size_t nr;
+ int *entries;
+ } ev_qualifier_ids;
+ struct intlist *tid_list;
+ struct intlist *pid_list;
+ struct {
+ size_t nr;
+ pid_t *entries;
+ } filter_pids;
+ double duration_filter;
+ double runtime_ms;
+ struct {
+ u64 vfs_getname,
+ proc_getname;
+ } stats;
+ unsigned int max_stack;
+ unsigned int min_stack;
+ bool not_ev_qualifier;
+ bool live;
+ bool full_time;
+ bool sched;
+ bool multiple_threads;
+ bool summary;
+ bool summary_only;
+ bool show_comm;
+ bool show_tool_stats;
+ bool trace_syscalls;
+ bool kernel_syscallchains;
+ bool force;
+ bool vfs_getname;
+ int trace_pgfaults;
+ int open_id;
+};
struct tp_field {
int offset;
@@ -371,221 +368,6 @@ static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
#define SCA_INT syscall_arg__scnprintf_int
-static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int printed = 0, prot = arg->val;
-
- if (prot == PROT_NONE)
- return scnprintf(bf, size, "NONE");
-#define P_MMAP_PROT(n) \
- if (prot & PROT_##n) { \
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
- prot &= ~PROT_##n; \
- }
-
- P_MMAP_PROT(EXEC);
- P_MMAP_PROT(READ);
- P_MMAP_PROT(WRITE);
-#ifdef PROT_SEM
- P_MMAP_PROT(SEM);
-#endif
- P_MMAP_PROT(GROWSDOWN);
- P_MMAP_PROT(GROWSUP);
-#undef P_MMAP_PROT
-
- if (prot)
- printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
-
- return printed;
-}
-
-#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
-
-static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int printed = 0, flags = arg->val;
-
-#define P_MMAP_FLAG(n) \
- if (flags & MAP_##n) { \
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
- flags &= ~MAP_##n; \
- }
-
- P_MMAP_FLAG(SHARED);
- P_MMAP_FLAG(PRIVATE);
-#ifdef MAP_32BIT
- P_MMAP_FLAG(32BIT);
-#endif
- P_MMAP_FLAG(ANONYMOUS);
- P_MMAP_FLAG(DENYWRITE);
- P_MMAP_FLAG(EXECUTABLE);
- P_MMAP_FLAG(FILE);
- P_MMAP_FLAG(FIXED);
- P_MMAP_FLAG(GROWSDOWN);
-#ifdef MAP_HUGETLB
- P_MMAP_FLAG(HUGETLB);
-#endif
- P_MMAP_FLAG(LOCKED);
- P_MMAP_FLAG(NONBLOCK);
- P_MMAP_FLAG(NORESERVE);
- P_MMAP_FLAG(POPULATE);
- P_MMAP_FLAG(STACK);
-#ifdef MAP_UNINITIALIZED
- P_MMAP_FLAG(UNINITIALIZED);
-#endif
-#undef P_MMAP_FLAG
-
- if (flags)
- printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
-
- return printed;
-}
-
-#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
-
-static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int printed = 0, flags = arg->val;
-
-#define P_MREMAP_FLAG(n) \
- if (flags & MREMAP_##n) { \
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
- flags &= ~MREMAP_##n; \
- }
-
- P_MREMAP_FLAG(MAYMOVE);
-#ifdef MREMAP_FIXED
- P_MREMAP_FLAG(FIXED);
-#endif
-#undef P_MREMAP_FLAG
-
- if (flags)
- printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
-
- return printed;
-}
-
-#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
-
-static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int behavior = arg->val;
-
- switch (behavior) {
-#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
- P_MADV_BHV(NORMAL);
- P_MADV_BHV(RANDOM);
- P_MADV_BHV(SEQUENTIAL);
- P_MADV_BHV(WILLNEED);
- P_MADV_BHV(DONTNEED);
- P_MADV_BHV(REMOVE);
- P_MADV_BHV(DONTFORK);
- P_MADV_BHV(DOFORK);
- P_MADV_BHV(HWPOISON);
-#ifdef MADV_SOFT_OFFLINE
- P_MADV_BHV(SOFT_OFFLINE);
-#endif
- P_MADV_BHV(MERGEABLE);
- P_MADV_BHV(UNMERGEABLE);
-#ifdef MADV_HUGEPAGE
- P_MADV_BHV(HUGEPAGE);
-#endif
-#ifdef MADV_NOHUGEPAGE
- P_MADV_BHV(NOHUGEPAGE);
-#endif
-#ifdef MADV_DONTDUMP
- P_MADV_BHV(DONTDUMP);
-#endif
-#ifdef MADV_DODUMP
- P_MADV_BHV(DODUMP);
-#endif
-#undef P_MADV_PHV
- default: break;
- }
-
- return scnprintf(bf, size, "%#x", behavior);
-}
-
-#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
-
-static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int printed = 0, op = arg->val;
-
- if (op == 0)
- return scnprintf(bf, size, "NONE");
-#define P_CMD(cmd) \
- if ((op & LOCK_##cmd) == LOCK_##cmd) { \
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
- op &= ~LOCK_##cmd; \
- }
-
- P_CMD(SH);
- P_CMD(EX);
- P_CMD(NB);
- P_CMD(UN);
- P_CMD(MAND);
- P_CMD(RW);
- P_CMD(READ);
- P_CMD(WRITE);
-#undef P_OP
-
- if (op)
- printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
-
- return printed;
-}
-
-#define SCA_FLOCK syscall_arg__scnprintf_flock
-
-static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
-{
- enum syscall_futex_args {
- SCF_UADDR = (1 << 0),
- SCF_OP = (1 << 1),
- SCF_VAL = (1 << 2),
- SCF_TIMEOUT = (1 << 3),
- SCF_UADDR2 = (1 << 4),
- SCF_VAL3 = (1 << 5),
- };
- int op = arg->val;
- int cmd = op & FUTEX_CMD_MASK;
- size_t printed = 0;
-
- switch (cmd) {
-#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
- P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
- P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
- P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
- P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break;
- P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break;
- P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break;
- P_FUTEX_OP(WAKE_OP); break;
- P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
- P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
- P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
- P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break;
- P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break;
- P_FUTEX_OP(WAIT_REQUEUE_PI); break;
- default: printed = scnprintf(bf, size, "%#x", cmd); break;
- }
-
- if (op & FUTEX_PRIVATE_FLAG)
- printed += scnprintf(bf + printed, size - printed, "|PRIV");
-
- if (op & FUTEX_CLOCK_REALTIME)
- printed += scnprintf(bf + printed, size - printed, "|CLKRT");
-
- return printed;
-}
-
-#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op
-
static const char *bpf_cmd[] = {
"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
"MAP_GET_NEXT_KEY", "PROG_LOAD",
@@ -652,110 +434,6 @@ static const char *socket_families[] = {
};
static DEFINE_STRARRAY(socket_families);
-#ifndef SOCK_TYPE_MASK
-#define SOCK_TYPE_MASK 0xf
-#endif
-
-static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- size_t printed;
- int type = arg->val,
- flags = type & ~SOCK_TYPE_MASK;
-
- type &= SOCK_TYPE_MASK;
- /*
- * Can't use a strarray, MIPS may override for ABI reasons.
- */
- switch (type) {
-#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
- P_SK_TYPE(STREAM);
- P_SK_TYPE(DGRAM);
- P_SK_TYPE(RAW);
- P_SK_TYPE(RDM);
- P_SK_TYPE(SEQPACKET);
- P_SK_TYPE(DCCP);
- P_SK_TYPE(PACKET);
-#undef P_SK_TYPE
- default:
- printed = scnprintf(bf, size, "%#x", type);
- }
-
-#define P_SK_FLAG(n) \
- if (flags & SOCK_##n) { \
- printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
- flags &= ~SOCK_##n; \
- }
-
- P_SK_FLAG(CLOEXEC);
- P_SK_FLAG(NONBLOCK);
-#undef P_SK_FLAG
-
- if (flags)
- printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
-
- return printed;
-}
-
-#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
-
-#ifndef MSG_PROBE
-#define MSG_PROBE 0x10
-#endif
-#ifndef MSG_WAITFORONE
-#define MSG_WAITFORONE 0x10000
-#endif
-#ifndef MSG_SENDPAGE_NOTLAST
-#define MSG_SENDPAGE_NOTLAST 0x20000
-#endif
-#ifndef MSG_FASTOPEN
-#define MSG_FASTOPEN 0x20000000
-#endif
-
-static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int printed = 0, flags = arg->val;
-
- if (flags == 0)
- return scnprintf(bf, size, "NONE");
-#define P_MSG_FLAG(n) \
- if (flags & MSG_##n) { \
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
- flags &= ~MSG_##n; \
- }
-
- P_MSG_FLAG(OOB);
- P_MSG_FLAG(PEEK);
- P_MSG_FLAG(DONTROUTE);
- P_MSG_FLAG(TRYHARD);
- P_MSG_FLAG(CTRUNC);
- P_MSG_FLAG(PROBE);
- P_MSG_FLAG(TRUNC);
- P_MSG_FLAG(DONTWAIT);
- P_MSG_FLAG(EOR);
- P_MSG_FLAG(WAITALL);
- P_MSG_FLAG(FIN);
- P_MSG_FLAG(SYN);
- P_MSG_FLAG(CONFIRM);
- P_MSG_FLAG(RST);
- P_MSG_FLAG(ERRQUEUE);
- P_MSG_FLAG(NOSIGNAL);
- P_MSG_FLAG(MORE);
- P_MSG_FLAG(WAITFORONE);
- P_MSG_FLAG(SENDPAGE_NOTLAST);
- P_MSG_FLAG(FASTOPEN);
- P_MSG_FLAG(CMSG_CLOEXEC);
-#undef P_MSG_FLAG
-
- if (flags)
- printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
-
- return printed;
-}
-
-#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
-
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
struct syscall_arg *arg)
{
@@ -788,116 +466,6 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
#define SCA_FILENAME syscall_arg__scnprintf_filename
-static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int printed = 0, flags = arg->val;
-
- if (!(flags & O_CREAT))
- arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
-
- if (flags == 0)
- return scnprintf(bf, size, "RDONLY");
-#define P_FLAG(n) \
- if (flags & O_##n) { \
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
- flags &= ~O_##n; \
- }
-
- P_FLAG(APPEND);
- P_FLAG(ASYNC);
- P_FLAG(CLOEXEC);
- P_FLAG(CREAT);
- P_FLAG(DIRECT);
- P_FLAG(DIRECTORY);
- P_FLAG(EXCL);
- P_FLAG(LARGEFILE);
- P_FLAG(NOATIME);
- P_FLAG(NOCTTY);
-#ifdef O_NONBLOCK
- P_FLAG(NONBLOCK);
-#elif O_NDELAY
- P_FLAG(NDELAY);
-#endif
-#ifdef O_PATH
- P_FLAG(PATH);
-#endif
- P_FLAG(RDWR);
-#ifdef O_DSYNC
- if ((flags & O_SYNC) == O_SYNC)
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
- else {
- P_FLAG(DSYNC);
- }
-#else
- P_FLAG(SYNC);
-#endif
- P_FLAG(TRUNC);
- P_FLAG(WRONLY);
-#undef P_FLAG
-
- if (flags)
- printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
-
- return printed;
-}
-
-#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
-
-static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int printed = 0, flags = arg->val;
-
- if (flags == 0)
- return 0;
-
-#define P_FLAG(n) \
- if (flags & PERF_FLAG_##n) { \
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
- flags &= ~PERF_FLAG_##n; \
- }
-
- P_FLAG(FD_NO_GROUP);
- P_FLAG(FD_OUTPUT);
- P_FLAG(PID_CGROUP);
- P_FLAG(FD_CLOEXEC);
-#undef P_FLAG
-
- if (flags)
- printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
-
- return printed;
-}
-
-#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
-
-static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
- struct syscall_arg *arg)
-{
- int printed = 0, flags = arg->val;
-
- if (flags == 0)
- return scnprintf(bf, size, "NONE");
-#define P_FLAG(n) \
- if (flags & EFD_##n) { \
- printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
- flags &= ~EFD_##n; \
- }
-
- P_FLAG(SEMAPHORE);
- P_FLAG(CLOEXEC);
- P_FLAG(NONBLOCK);
-#undef P_FLAG
-
- if (flags)
- printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
-
- return printed;
-}
-
-#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
-
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
struct syscall_arg *arg)
{
@@ -921,59 +489,6 @@ static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
-static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
-{
- int sig = arg->val;
-
- switch (sig) {
-#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
- P_SIGNUM(HUP);
- P_SIGNUM(INT);
- P_SIGNUM(QUIT);
- P_SIGNUM(ILL);
- P_SIGNUM(TRAP);
- P_SIGNUM(ABRT);
- P_SIGNUM(BUS);
- P_SIGNUM(FPE);
- P_SIGNUM(KILL);
- P_SIGNUM(USR1);
- P_SIGNUM(SEGV);
- P_SIGNUM(USR2);
- P_SIGNUM(PIPE);
- P_SIGNUM(ALRM);
- P_SIGNUM(TERM);
- P_SIGNUM(CHLD);
- P_SIGNUM(CONT);
- P_SIGNUM(STOP);
- P_SIGNUM(TSTP);
- P_SIGNUM(TTIN);
- P_SIGNUM(TTOU);
- P_SIGNUM(URG);
- P_SIGNUM(XCPU);
- P_SIGNUM(XFSZ);
- P_SIGNUM(VTALRM);
- P_SIGNUM(PROF);
- P_SIGNUM(WINCH);
- P_SIGNUM(IO);
- P_SIGNUM(PWR);
- P_SIGNUM(SYS);
-#ifdef SIGEMT
- P_SIGNUM(EMT);
-#endif
-#ifdef SIGSTKFLT
- P_SIGNUM(STKFLT);
-#endif
-#ifdef SIGSWI
- P_SIGNUM(SWI);
-#endif
- default: break;
- }
-
- return scnprintf(bf, size, "%#x", sig);
-}
-
-#define SCA_SIGNUM syscall_arg__scnprintf_signum
-
#if defined(__i386__) || defined(__x86_64__)
/*
* FIXME: Make this available to all arches.
@@ -1001,16 +516,62 @@ static const char *tioctls[] = {
static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
+#ifndef GRND_NONBLOCK
+#define GRND_NONBLOCK 0x0001
+#endif
+#ifndef GRND_RANDOM
+#define GRND_RANDOM 0x0002
+#endif
+
+static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int printed = 0, flags = arg->val;
+
+#define P_FLAG(n) \
+ if (flags & GRND_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~GRND_##n; \
+ }
+
+ P_FLAG(RANDOM);
+ P_FLAG(NONBLOCK);
+#undef P_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
+}
+
+#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
+
#define STRARRAY(arg, name, array) \
.arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
.arg_parm = { [arg] = &strarray__##array, }
+#include "trace/beauty/eventfd.c"
+#include "trace/beauty/flock.c"
+#include "trace/beauty/futex_op.c"
+#include "trace/beauty/mmap.c"
+#include "trace/beauty/mode_t.c"
+#include "trace/beauty/msg_flags.c"
+#include "trace/beauty/open_flags.c"
+#include "trace/beauty/perf_event_open.c"
+#include "trace/beauty/pid.c"
+#include "trace/beauty/sched_policy.c"
+#include "trace/beauty/seccomp.c"
+#include "trace/beauty/signum.c"
+#include "trace/beauty/socket_type.c"
+#include "trace/beauty/waitid_options.c"
+
static struct syscall_fmt {
const char *name;
const char *alias;
size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
void *arg_parm[6];
bool errmsg;
+ bool errpid;
bool timeout;
bool hexret;
} syscall_fmts[] = {
@@ -1028,6 +589,7 @@ static struct syscall_fmt {
{ .name = "chroot", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
{ .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
+ { .name = "clone", .errpid = true, },
{ .name = "close", .errmsg = true,
.arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
{ .name = "connect", .errmsg = true, },
@@ -1093,6 +655,11 @@ static struct syscall_fmt {
{ .name = "getdents64", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
{ .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
+ { .name = "getpid", .errpid = true, },
+ { .name = "getpgid", .errpid = true, },
+ { .name = "getppid", .errpid = true, },
+ { .name = "getrandom", .errmsg = true,
+ .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
{ .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
{ .name = "getxattr", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
@@ -1186,8 +753,7 @@ static struct syscall_fmt {
[1] = SCA_FILENAME, /* filename */
[2] = SCA_OPEN_FLAGS, /* flags */ }, },
{ .name = "perf_event_open", .errmsg = true,
- .arg_scnprintf = { [1] = SCA_INT, /* pid */
- [2] = SCA_INT, /* cpu */
+ .arg_scnprintf = { [2] = SCA_INT, /* cpu */
[3] = SCA_FD, /* group_fd */
[4] = SCA_PERF_FLAGS, /* flags */ }, },
{ .name = "pipe2", .errmsg = true,
@@ -1234,6 +800,11 @@ static struct syscall_fmt {
.arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
{ .name = "rt_tgsigqueueinfo", .errmsg = true,
.arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
+ { .name = "sched_setscheduler", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
+ { .name = "seccomp", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
+ [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
{ .name = "select", .errmsg = true, .timeout = true, },
{ .name = "sendmmsg", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FD, /* fd */
@@ -1244,7 +815,9 @@ static struct syscall_fmt {
{ .name = "sendto", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FD, /* fd */
[3] = SCA_MSG_FLAGS, /* flags */ }, },
+ { .name = "set_tid_address", .errpid = true, },
{ .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
+ { .name = "setpgid", .errmsg = true, },
{ .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
{ .name = "setxattr", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
@@ -1287,6 +860,10 @@ static struct syscall_fmt {
.arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
{ .name = "vmsplice", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "wait4", .errpid = true,
+ .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
+ { .name = "waitid", .errpid = true,
+ .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
{ .name = "write", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
{ .name = "writev", .errmsg = true,
@@ -1398,59 +975,6 @@ fail:
static const size_t trace__entry_str_size = 2048;
-struct trace {
- struct perf_tool tool;
- struct {
- int machine;
- int open_id;
- } audit;
- struct {
- int max;
- struct syscall *table;
- struct {
- struct perf_evsel *sys_enter,
- *sys_exit;
- } events;
- } syscalls;
- struct record_opts opts;
- struct perf_evlist *evlist;
- struct machine *host;
- struct thread *current;
- u64 base_time;
- FILE *output;
- unsigned long nr_events;
- struct strlist *ev_qualifier;
- struct {
- size_t nr;
- int *entries;
- } ev_qualifier_ids;
- struct intlist *tid_list;
- struct intlist *pid_list;
- struct {
- size_t nr;
- pid_t *entries;
- } filter_pids;
- double duration_filter;
- double runtime_ms;
- struct {
- u64 vfs_getname,
- proc_getname;
- } stats;
- bool not_ev_qualifier;
- bool live;
- bool full_time;
- bool sched;
- bool multiple_threads;
- bool summary;
- bool summary_only;
- bool show_comm;
- bool show_tool_stats;
- bool trace_syscalls;
- bool force;
- bool vfs_getname;
- int trace_pgfaults;
-};
-
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
struct thread_trace *ttrace = thread__priv(thread);
@@ -1618,6 +1142,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine,
color_fprintf(trace->output, PERF_COLOR_RED,
"LOST %" PRIu64 " events!\n", event->lost.lost);
ret = machine__process_lost_event(machine, event, sample);
+ break;
default:
ret = machine__process_event(machine, event, sample);
break;
@@ -1675,6 +1200,10 @@ static int syscall__set_arg_fmts(struct syscall *sc)
sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
else if (field->flags & FIELD_IS_POINTER)
sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
+ else if (strcmp(field->type, "pid_t") == 0)
+ sc->arg_scnprintf[idx] = SCA_PID;
+ else if (strcmp(field->type, "umode_t") == 0)
+ sc->arg_scnprintf[idx] = SCA_MODE_T;
++idx;
}
@@ -1685,7 +1214,7 @@ static int trace__read_syscall_info(struct trace *trace, int id)
{
char tp_name[128];
struct syscall *sc;
- const char *name = audit_syscall_to_name(id, trace->audit.machine);
+ const char *name = syscalltbl__name(trace->sctbl, id);
if (name == NULL)
return -1;
@@ -1760,7 +1289,7 @@ static int trace__validate_ev_qualifier(struct trace *trace)
strlist__for_each(pos, trace->ev_qualifier) {
const char *sc = pos->s;
- int id = audit_name_to_syscall(sc, trace->audit.machine);
+ int id = syscalltbl__id(trace->sctbl, sc);
if (id < 0) {
if (err == 0) {
@@ -1846,7 +1375,12 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
"%ld", val);
}
}
- } else {
+ } else if (IS_ERR(sc->tp_format)) {
+ /*
+ * If we managed to read the tracepoint /format file, then we
+ * may end up not having any args, like with gettid(), so only
+ * print the raw args when we didn't manage to read it.
+ */
int i = 0;
while (i < 6) {
@@ -1987,7 +1521,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
goto out_put;
}
- if (!trace->summary_only)
+ if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
trace__printf_interrupted_entry(trace, sample);
ttrace->entry_time = sample->time;
@@ -1998,7 +1532,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
args, trace, thread);
if (sc->is_exit) {
- if (!trace->duration_filter && !trace->summary_only) {
+ if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
fprintf(trace->output, "%-70s\n", ttrace->entry_str);
}
@@ -2018,6 +1552,29 @@ out_put:
return err;
}
+static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct callchain_cursor *cursor)
+{
+ struct addr_location al;
+
+ if (machine__resolve(trace->host, &al, sample) < 0 ||
+ thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
+ return -1;
+
+ return 0;
+}
+
+static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
+{
+ /* TODO: user-configurable print_opts */
+ const unsigned int print_opts = EVSEL__PRINT_SYM |
+ EVSEL__PRINT_DSO |
+ EVSEL__PRINT_UNKNOWN_AS_ADDR;
+
+ return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
+}
+
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
@@ -2025,7 +1582,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
long ret;
u64 duration = 0;
struct thread *thread;
- int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
+ int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
struct syscall *sc = trace__syscall_info(trace, evsel, id);
struct thread_trace *ttrace;
@@ -2042,7 +1599,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
- if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
+ if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
trace__set_fd_pathname(thread, ret, ttrace->filename.name);
ttrace->filename.pending_open = false;
++trace->stats.vfs_getname;
@@ -2057,6 +1614,15 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
} else if (trace->duration_filter)
goto out;
+ if (sample->callchain) {
+ callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+ if (callchain_ret == 0) {
+ if (callchain_cursor.nr < trace->min_stack)
+ goto out;
+ callchain_ret = 1;
+ }
+ }
+
if (trace->summary_only)
goto out;
@@ -2073,7 +1639,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
if (sc->fmt == NULL) {
signed_print:
fprintf(trace->output, ") = %ld", ret);
- } else if (ret < 0 && sc->fmt->errmsg) {
+ } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
char bf[STRERR_BUFSIZE];
const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
*e = audit_errno_to_name(-ret);
@@ -2083,10 +1649,24 @@ signed_print:
fprintf(trace->output, ") = 0 Timeout");
else if (sc->fmt->hexret)
fprintf(trace->output, ") = %#lx", ret);
- else
+ else if (sc->fmt->errpid) {
+ struct thread *child = machine__find_thread(trace->host, ret, ret);
+
+ if (child != NULL) {
+ fprintf(trace->output, ") = %ld", ret);
+ if (child->comm_set)
+ fprintf(trace->output, " (%s)", thread__comm_str(child));
+ thread__put(child);
+ }
+ } else
goto signed_print;
fputc('\n', trace->output);
+
+ if (callchain_ret > 0)
+ trace__fprintf_callchain(trace, sample);
+ else if (callchain_ret < 0)
+ pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
ttrace->entry_pending = false;
err = 0;
@@ -2217,6 +1797,17 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
{
+ int callchain_ret = 0;
+
+ if (sample->callchain) {
+ callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+ if (callchain_ret == 0) {
+ if (callchain_cursor.nr < trace->min_stack)
+ goto out;
+ callchain_ret = 1;
+ }
+ }
+
trace__printf_interrupted_entry(trace, sample);
trace__fprintf_tstamp(trace, sample->time, trace->output);
@@ -2234,6 +1825,12 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
}
fprintf(trace->output, ")\n");
+
+ if (callchain_ret > 0)
+ trace__fprintf_callchain(trace, sample);
+ else if (callchain_ret < 0)
+ pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
+out:
return 0;
}
@@ -2264,8 +1861,19 @@ static int trace__pgfault(struct trace *trace,
char map_type = 'd';
struct thread_trace *ttrace;
int err = -1;
+ int callchain_ret = 0;
thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+
+ if (sample->callchain) {
+ callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+ if (callchain_ret == 0) {
+ if (callchain_cursor.nr < trace->min_stack)
+ goto out_put;
+ callchain_ret = 1;
+ }
+ }
+
ttrace = thread__trace(thread, trace->output);
if (ttrace == NULL)
goto out_put;
@@ -2307,6 +1915,11 @@ static int trace__pgfault(struct trace *trace,
print_location(trace->output, sample, &al, true, false);
fprintf(trace->output, " (%c%c)\n", map_type, al.level);
+
+ if (callchain_ret > 0)
+ trace__fprintf_callchain(trace, sample);
+ else if (callchain_ret < 0)
+ pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
err = 0;
out_put:
@@ -2326,6 +1939,23 @@ static bool skip_sample(struct trace *trace, struct perf_sample *sample)
return false;
}
+static void trace__set_base_time(struct trace *trace,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample)
+{
+ /*
+ * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
+ * and don't use sample->time unconditionally, we may end up having
+ * some other event in the future without PERF_SAMPLE_TIME for good
+ * reason, i.e. we may not be interested in its timestamps, just in
+ * it taking place, picking some piece of information when it
+ * appears in our event stream (vfs_getname comes to mind).
+ */
+ if (trace->base_time == 0 && !trace->full_time &&
+ (evsel->attr.sample_type & PERF_SAMPLE_TIME))
+ trace->base_time = sample->time;
+}
+
static int trace__process_sample(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -2340,8 +1970,7 @@ static int trace__process_sample(struct perf_tool *tool,
if (skip_sample(trace, sample))
return 0;
- if (!trace->full_time && trace->base_time == 0)
- trace->base_time = sample->time;
+ trace__set_base_time(trace, evsel, sample);
if (handler) {
++trace->nr_events;
@@ -2450,8 +2079,7 @@ static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
return true;
}
-static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
- u64 config)
+static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
struct perf_evsel *evsel;
struct perf_event_attr attr = {
@@ -2465,13 +2093,10 @@ static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
event_attr_init(&attr);
evsel = perf_evsel__new(&attr);
- if (!evsel)
- return -ENOMEM;
-
- evsel->handler = trace__pgfault;
- perf_evlist__add(evlist, evsel);
+ if (evsel)
+ evsel->handler = trace__pgfault;
- return 0;
+ return evsel;
}
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
@@ -2479,9 +2104,6 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st
const u32 type = event->header.type;
struct perf_evsel *evsel;
- if (!trace->full_time && trace->base_time == 0)
- trace->base_time = sample->time;
-
if (type != PERF_RECORD_SAMPLE) {
trace__process_event(trace, trace->host, event, sample);
return;
@@ -2493,6 +2115,8 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st
return;
}
+ trace__set_base_time(trace, evsel, sample);
+
if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
sample->raw_data == NULL) {
fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
@@ -2527,6 +2151,15 @@ static int trace__add_syscall_newtp(struct trace *trace)
perf_evlist__add(evlist, sys_enter);
perf_evlist__add(evlist, sys_exit);
+ if (callchain_param.enabled && !trace->kernel_syscallchains) {
+ /*
+ * We're interested only in the user space callchain
+ * leading to the syscall, allow overriding that for
+ * debugging reasons using --kernel-syscall-graph
+ */
+ sys_exit->attr.exclude_callchain_kernel = 1;
+ }
+
trace->syscalls.events.sys_enter = sys_enter;
trace->syscalls.events.sys_exit = sys_exit;
@@ -2565,7 +2198,7 @@ out_enomem:
static int trace__run(struct trace *trace, int argc, const char **argv)
{
struct perf_evlist *evlist = trace->evlist;
- struct perf_evsel *evsel;
+ struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
int err = -1, i;
unsigned long before;
const bool forks = argc > 0;
@@ -2579,14 +2212,19 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
if (trace->trace_syscalls)
trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
- if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
- perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
- goto out_error_mem;
+ if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
+ pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
+ if (pgfault_maj == NULL)
+ goto out_error_mem;
+ perf_evlist__add(evlist, pgfault_maj);
}
- if ((trace->trace_pgfaults & TRACE_PFMIN) &&
- perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
- goto out_error_mem;
+ if ((trace->trace_pgfaults & TRACE_PFMIN)) {
+ pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
+ if (pgfault_min == NULL)
+ goto out_error_mem;
+ perf_evlist__add(evlist, pgfault_min);
+ }
if (trace->sched &&
perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
@@ -2605,7 +2243,45 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
goto out_delete_evlist;
}
- perf_evlist__config(evlist, &trace->opts);
+ perf_evlist__config(evlist, &trace->opts, NULL);
+
+ if (callchain_param.enabled) {
+ bool use_identifier = false;
+
+ if (trace->syscalls.events.sys_exit) {
+ perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
+ &trace->opts, &callchain_param);
+ use_identifier = true;
+ }
+
+ if (pgfault_maj) {
+ perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
+ use_identifier = true;
+ }
+
+ if (pgfault_min) {
+ perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
+ use_identifier = true;
+ }
+
+ if (use_identifier) {
+ /*
+ * Now we have evsels with different sample_ids, use
+ * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
+ * from a fixed position in each ring buffer record.
+ *
+ * As of the changeset introducing this comment, this
+ * isn't strictly needed, as the fields that can come before
+ * PERF_SAMPLE_ID are all used, but we'll probably disable
+ * some of those for things like copying the payload of
+ * pointer syscall arguments, and for vfs_getname we don't
+ * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
+ * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
+ */
+ perf_evlist__set_sample_bit(evlist, IDENTIFIER);
+ perf_evlist__reset_sample_bit(evlist, ID);
+ }
+ }
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);
@@ -2883,15 +2559,29 @@ static size_t trace__fprintf_threads_header(FILE *fp)
return printed;
}
+DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
+ struct stats *stats;
+ double msecs;
+ int syscall;
+)
+{
+ struct int_node *source = rb_entry(nd, struct int_node, rb_node);
+ struct stats *stats = source->priv;
+
+ entry->syscall = source->i;
+ entry->stats = stats;
+ entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
+}
+
static size_t thread__dump_stats(struct thread_trace *ttrace,
struct trace *trace, FILE *fp)
{
- struct stats *stats;
size_t printed = 0;
struct syscall *sc;
- struct int_node *inode = intlist__first(ttrace->syscall_stats);
+ struct rb_node *nd;
+ DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
- if (inode == NULL)
+ if (syscall_stats == NULL)
return 0;
printed += fprintf(fp, "\n");
@@ -2900,9 +2590,8 @@ static size_t thread__dump_stats(struct thread_trace *ttrace,
printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
- /* each int_node is a syscall */
- while (inode) {
- stats = inode->priv;
+ resort_rb__for_each(nd, syscall_stats) {
+ struct stats *stats = syscall_stats_entry->stats;
if (stats) {
double min = (double)(stats->min) / NSEC_PER_MSEC;
double max = (double)(stats->max) / NSEC_PER_MSEC;
@@ -2913,34 +2602,23 @@ static size_t thread__dump_stats(struct thread_trace *ttrace,
pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
avg /= NSEC_PER_MSEC;
- sc = &trace->syscalls.table[inode->i];
+ sc = &trace->syscalls.table[syscall_stats_entry->syscall];
printed += fprintf(fp, " %-15s", sc->name);
printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
- n, avg * n, min, avg);
+ n, syscall_stats_entry->msecs, min, avg);
printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
}
-
- inode = intlist__next(inode);
}
+ resort_rb__delete(syscall_stats);
printed += fprintf(fp, "\n\n");
return printed;
}
-/* struct used to pass data to per-thread function */
-struct summary_data {
- FILE *fp;
- struct trace *trace;
- size_t printed;
-};
-
-static int trace__fprintf_one_thread(struct thread *thread, void *priv)
+static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
- struct summary_data *data = priv;
- FILE *fp = data->fp;
- size_t printed = data->printed;
- struct trace *trace = data->trace;
+ size_t printed = 0;
struct thread_trace *ttrace = thread__priv(thread);
double ratio;
@@ -2956,25 +2634,45 @@ static int trace__fprintf_one_thread(struct thread *thread, void *priv)
printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
if (ttrace->pfmin)
printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
- printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
+ if (trace->sched)
+ printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
+ else if (fputc('\n', fp) != EOF)
+ ++printed;
+
printed += thread__dump_stats(ttrace, trace, fp);
- data->printed += printed;
+ return printed;
+}
- return 0;
+static unsigned long thread__nr_events(struct thread_trace *ttrace)
+{
+ return ttrace ? ttrace->nr_events : 0;
+}
+
+DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
+ struct thread *thread;
+)
+{
+ entry->thread = rb_entry(nd, struct thread, rb_node);
}
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
- struct summary_data data = {
- .fp = fp,
- .trace = trace
- };
- data.printed = trace__fprintf_threads_header(fp);
+ DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
+ size_t printed = trace__fprintf_threads_header(fp);
+ struct rb_node *nd;
+
+ if (threads == NULL) {
+ fprintf(fp, "%s", "Error sorting output by nr_events!\n");
+ return 0;
+ }
- machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
+ resort_rb__for_each(nd, threads)
+ printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
- return data.printed;
+ resort_rb__delete(threads);
+
+ return printed;
}
static int trace__set_duration(const struct option *opt, const char *str,
@@ -3070,10 +2768,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
NULL
};
struct trace trace = {
- .audit = {
- .machine = audit_detect_machine(),
- .open_id = audit_name_to_syscall("open", trace.audit.machine),
- },
.syscalls = {
. max = -1,
},
@@ -3091,6 +2785,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
.output = stderr,
.show_comm = true,
.trace_syscalls = true,
+ .kernel_syscallchains = false,
+ .max_stack = UINT_MAX,
};
const char *output_name = NULL;
const char *ev_qualifier_str = NULL;
@@ -3136,10 +2832,24 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
"Trace pagefaults", parse_pagefaults, "maj"),
OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
+ OPT_CALLBACK(0, "call-graph", &trace.opts,
+ "record_mode[,record_size]", record_callchain_help,
+ &record_parse_callchain_opt),
+ OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
+ "Show the kernel callchains on the syscall exit path"),
+ OPT_UINTEGER(0, "min-stack", &trace.min_stack,
+ "Set the minimum stack depth when parsing the callchain, "
+ "anything below the specified depth will be ignored."),
+ OPT_UINTEGER(0, "max-stack", &trace.max_stack,
+ "Set the maximum stack depth when parsing the callchain, "
+ "anything beyond the specified depth will be ignored. "
+ "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
"per thread proc mmap processing timeout in ms"),
OPT_END()
};
+ bool __maybe_unused max_stack_user_set = true;
+ bool mmap_pages_user_set = true;
const char * const trace_subcommands[] = { "record", NULL };
int err;
char bf[BUFSIZ];
@@ -3148,8 +2858,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
signal(SIGFPE, sighandler_dump_stack);
trace.evlist = perf_evlist__new();
+ trace.sctbl = syscalltbl__new();
- if (trace.evlist == NULL) {
+ if (trace.evlist == NULL || trace.sctbl == NULL) {
pr_err("Not enough memory to run!\n");
err = -ENOMEM;
goto out;
@@ -3158,11 +2869,40 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
+ err = bpf__setup_stdout(trace.evlist);
+ if (err) {
+ bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
+ pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
+ goto out;
+ }
+
+ err = -1;
+
if (trace.trace_pgfaults) {
trace.opts.sample_address = true;
trace.opts.sample_time = true;
}
+ if (trace.opts.mmap_pages == UINT_MAX)
+ mmap_pages_user_set = false;
+
+ if (trace.max_stack == UINT_MAX) {
+ trace.max_stack = sysctl_perf_event_max_stack;
+ max_stack_user_set = false;
+ }
+
+#ifdef HAVE_DWARF_UNWIND_SUPPORT
+ if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled)
+ record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
+#endif
+
+ if (callchain_param.enabled) {
+ if (!mmap_pages_user_set && geteuid() == 0)
+ trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
+
+ symbol_conf.use_callchain = true;
+ }
+
if (trace.evlist->nr_entries > 0)
evlist__set_evsel_handler(trace.evlist, trace__event_handler);
@@ -3179,6 +2919,11 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
return -1;
}
+ if (!trace.trace_syscalls && ev_qualifier_str) {
+ pr_err("The -e option can't be used with --no-syscalls.\n");
+ goto out;
+ }
+
if (output_name != NULL) {
err = trace__open_output(&trace, output_name);
if (err < 0) {
@@ -3187,6 +2932,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
}
}
+ trace.open_id = syscalltbl__id(trace.sctbl, "open");
+
if (ev_qualifier_str != NULL) {
const char *s = ev_qualifier_str;
struct strlist_config slist_config = {
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index f7d7f5a1cad5..1e46277286c2 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -27,7 +27,7 @@ NO_PERF_REGS := 1
ifeq ($(ARCH),x86)
$(call detected,CONFIG_X86)
ifeq (${IS_64_BIT}, 1)
- CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT
+ CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -DHAVE_SYSCALL_TABLE -I$(OUTPUT)arch/x86/include/generated
ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
$(call detected,CONFIG_X86_64)
@@ -268,6 +268,12 @@ else
ifneq ($(feature-dwarf), 1)
msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
NO_DWARF := 1
+ else
+ ifneq ($(feature-dwarf_getlocations), 1)
+ msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157);
+ else
+ CFLAGS += -DHAVE_DWARF_GETLOCATIONS
+ endif # dwarf_getlocations
endif # Dwarf support
endif # libelf support
endif # NO_LIBELF
@@ -289,9 +295,6 @@ ifndef NO_LIBELF
CFLAGS += -DHAVE_ELF_GETPHDRNUM_SUPPORT
endif
- # include ARCH specific config
- -include $(src-perf)/arch/$(ARCH)/Makefile
-
ifndef NO_DWARF
ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c
index 6461e02ab940..3573f315f955 100644
--- a/tools/perf/jvmti/jvmti_agent.c
+++ b/tools/perf/jvmti/jvmti_agent.c
@@ -92,6 +92,22 @@ error:
return ret;
}
+static int use_arch_timestamp;
+
+static inline uint64_t
+get_arch_timestamp(void)
+{
+#if defined(__i386__) || defined(__x86_64__)
+ unsigned int low, high;
+
+ asm volatile("rdtsc" : "=a" (low), "=d" (high));
+
+ return low | ((uint64_t)high) << 32;
+#else
+ return 0;
+#endif
+}
+
#define NSEC_PER_SEC 1000000000
static int perf_clk_id = CLOCK_MONOTONIC;
@@ -107,6 +123,9 @@ perf_get_timestamp(void)
struct timespec ts;
int ret;
+ if (use_arch_timestamp)
+ return get_arch_timestamp();
+
ret = clock_gettime(perf_clk_id, &ts);
if (ret)
return 0;
@@ -203,6 +222,17 @@ perf_close_marker_file(void)
munmap(marker_addr, pgsz);
}
+static void
+init_arch_timestamp(void)
+{
+ char *str = getenv("JITDUMP_USE_ARCH_TIMESTAMP");
+
+ if (!str || !*str || !strcmp(str, "0"))
+ return;
+
+ use_arch_timestamp = 1;
+}
+
void *jvmti_open(void)
{
int pad_cnt;
@@ -211,11 +241,17 @@ void *jvmti_open(void)
int fd;
FILE *fp;
+ init_arch_timestamp();
+
/*
* check if clockid is supported
*/
- if (!perf_get_timestamp())
- warnx("jvmti: kernel does not support %d clock id", perf_clk_id);
+ if (!perf_get_timestamp()) {
+ if (use_arch_timestamp)
+ warnx("jvmti: arch timestamp not supported");
+ else
+ warnx("jvmti: kernel does not support %d clock id", perf_clk_id);
+ }
memset(&header, 0, sizeof(header));
@@ -263,6 +299,9 @@ void *jvmti_open(void)
header.timestamp = perf_get_timestamp();
+ if (use_arch_timestamp)
+ header.flags |= JITDUMP_FLAGS_ARCH_TIMESTAMP;
+
if (!fwrite(&header, sizeof(header), 1, fp)) {
warn("jvmti: cannot write dumpfile header");
goto error;
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index aaee0a782747..797000842d40 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -17,6 +17,7 @@
#include <subcmd/parse-options.h>
#include "util/bpf-loader.h"
#include "util/debug.h"
+#include <api/fs/fs.h>
#include <api/fs/tracing_path.h>
#include <pthread.h>
#include <stdlib.h>
@@ -308,9 +309,11 @@ static int handle_alias(int *argcp, const char ***argv)
if (*argcp > 1) {
struct strbuf buf;
- strbuf_init(&buf, PATH_MAX);
- strbuf_addstr(&buf, alias_string);
- sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
+ if (strbuf_init(&buf, PATH_MAX) < 0 ||
+ strbuf_addstr(&buf, alias_string) < 0 ||
+ sq_quote_argv(&buf, (*argv) + 1,
+ PATH_MAX) < 0)
+ die("Failed to allocate memory.");
free(alias_string);
alias_string = buf.buf;
}
@@ -533,6 +536,7 @@ int main(int argc, const char **argv)
{
const char *cmd;
char sbuf[STRERR_BUFSIZE];
+ int value;
/* libsubcmd init */
exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT);
@@ -542,6 +546,9 @@ int main(int argc, const char **argv)
page_size = sysconf(_SC_PAGE_SIZE);
cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+ if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0)
+ sysctl_perf_event_max_stack = value;
+
cmd = extract_argv0_path(argv[0]);
if (!cmd)
cmd = "perf-help";
@@ -549,6 +556,7 @@ int main(int argc, const char **argv)
srandom(time(NULL));
perf_config(perf_default_config, NULL);
+ set_buildid_dir(NULL);
/* get debugfs/tracefs mount point from /proc/mounts */
tracing_path_mount();
@@ -572,7 +580,6 @@ int main(int argc, const char **argv)
}
if (!prefixcmp(cmd, "trace")) {
#ifdef HAVE_LIBAUDIT_SUPPORT
- set_buildid_dir(NULL);
setup_path();
argv[0] = "trace";
return cmd_trace(argc, argv, NULL);
@@ -587,7 +594,6 @@ int main(int argc, const char **argv)
argc--;
handle_options(&argv, &argc, NULL);
commit_pager_choice();
- set_buildid_dir(NULL);
if (argc > 0) {
if (!prefixcmp(argv[0], "--"))
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 5381a01c0610..cd8f1b150f9e 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -52,7 +52,6 @@ struct record_opts {
bool sample_weight;
bool sample_time;
bool sample_time_set;
- bool callgraph_set;
bool period;
bool running_time;
bool full_auxtrace;
diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py
index 1b02cdc0cab6..7656ff8aa066 100644
--- a/tools/perf/scripts/python/export-to-postgresql.py
+++ b/tools/perf/scripts/python/export-to-postgresql.py
@@ -34,10 +34,9 @@ import datetime
#
# ubuntu:
#
-# $ sudo apt-get install postgresql
+# $ sudo apt-get install postgresql python-pyside.qtsql libqt4-sql-psql
# $ sudo su - postgres
-# $ createuser <your user id here>
-# Shall the new role be a superuser? (y/n) y
+# $ createuser -s <your user id here>
#
# An example of using this script with Intel PT:
#
@@ -224,11 +223,14 @@ sys.path.append(os.environ['PERF_EXEC_PATH'] + \
perf_db_export_mode = True
perf_db_export_calls = False
+perf_db_export_callchains = False
+
def usage():
- print >> sys.stderr, "Usage is: export-to-postgresql.py <database name> [<columns>] [<calls>]"
+ print >> sys.stderr, "Usage is: export-to-postgresql.py <database name> [<columns>] [<calls>] [<callchains>]"
print >> sys.stderr, "where: columns 'all' or 'branches'"
- print >> sys.stderr, " calls 'calls' => create calls table"
+ print >> sys.stderr, " calls 'calls' => create calls and call_paths table"
+ print >> sys.stderr, " callchains 'callchains' => create call_paths table"
raise Exception("Too few arguments")
if (len(sys.argv) < 2):
@@ -246,9 +248,11 @@ if columns not in ("all", "branches"):
branches = (columns == "branches")
-if (len(sys.argv) >= 4):
- if (sys.argv[3] == "calls"):
+for i in range(3,len(sys.argv)):
+ if (sys.argv[i] == "calls"):
perf_db_export_calls = True
+ elif (sys.argv[i] == "callchains"):
+ perf_db_export_callchains = True
else:
usage()
@@ -359,14 +363,16 @@ else:
'transaction bigint,'
'data_src bigint,'
'branch_type integer,'
- 'in_tx boolean)')
+ 'in_tx boolean,'
+ 'call_path_id bigint)')
-if perf_db_export_calls:
+if perf_db_export_calls or perf_db_export_callchains:
do_query(query, 'CREATE TABLE call_paths ('
'id bigint NOT NULL,'
'parent_id bigint,'
'symbol_id bigint,'
'ip bigint)')
+if perf_db_export_calls:
do_query(query, 'CREATE TABLE calls ('
'id bigint NOT NULL,'
'thread_id bigint,'
@@ -428,7 +434,7 @@ do_query(query, 'CREATE VIEW comm_threads_view AS '
'(SELECT tid FROM threads WHERE id = thread_id) AS tid'
' FROM comm_threads')
-if perf_db_export_calls:
+if perf_db_export_calls or perf_db_export_callchains:
do_query(query, 'CREATE VIEW call_paths_view AS '
'SELECT '
'c.id,'
@@ -444,6 +450,7 @@ if perf_db_export_calls:
'(SELECT dso_id FROM symbols WHERE id = p.symbol_id) AS parent_dso_id,'
'(SELECT dso FROM symbols_view WHERE id = p.symbol_id) AS parent_dso_short_name'
' FROM call_paths c INNER JOIN call_paths p ON p.id = c.parent_id')
+if perf_db_export_calls:
do_query(query, 'CREATE VIEW calls_view AS '
'SELECT '
'calls.id,'
@@ -541,8 +548,9 @@ dso_file = open_output_file("dso_table.bin")
symbol_file = open_output_file("symbol_table.bin")
branch_type_file = open_output_file("branch_type_table.bin")
sample_file = open_output_file("sample_table.bin")
-if perf_db_export_calls:
+if perf_db_export_calls or perf_db_export_callchains:
call_path_file = open_output_file("call_path_table.bin")
+if perf_db_export_calls:
call_file = open_output_file("call_table.bin")
def trace_begin():
@@ -554,8 +562,8 @@ def trace_begin():
comm_table(0, "unknown")
dso_table(0, 0, "unknown", "unknown", "")
symbol_table(0, 0, 0, 0, 0, "unknown")
- sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
- if perf_db_export_calls:
+ sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+ if perf_db_export_calls or perf_db_export_callchains:
call_path_table(0, 0, 0, 0)
unhandled_count = 0
@@ -571,8 +579,9 @@ def trace_end():
copy_output_file(symbol_file, "symbols")
copy_output_file(branch_type_file, "branch_types")
copy_output_file(sample_file, "samples")
- if perf_db_export_calls:
+ if perf_db_export_calls or perf_db_export_callchains:
copy_output_file(call_path_file, "call_paths")
+ if perf_db_export_calls:
copy_output_file(call_file, "calls")
print datetime.datetime.today(), "Removing intermediate files..."
@@ -585,8 +594,9 @@ def trace_end():
remove_output_file(symbol_file)
remove_output_file(branch_type_file)
remove_output_file(sample_file)
- if perf_db_export_calls:
+ if perf_db_export_calls or perf_db_export_callchains:
remove_output_file(call_path_file)
+ if perf_db_export_calls:
remove_output_file(call_file)
os.rmdir(output_dir_name)
print datetime.datetime.today(), "Adding primary keys"
@@ -599,8 +609,9 @@ def trace_end():
do_query(query, 'ALTER TABLE symbols ADD PRIMARY KEY (id)')
do_query(query, 'ALTER TABLE branch_types ADD PRIMARY KEY (id)')
do_query(query, 'ALTER TABLE samples ADD PRIMARY KEY (id)')
- if perf_db_export_calls:
+ if perf_db_export_calls or perf_db_export_callchains:
do_query(query, 'ALTER TABLE call_paths ADD PRIMARY KEY (id)')
+ if perf_db_export_calls:
do_query(query, 'ALTER TABLE calls ADD PRIMARY KEY (id)')
print datetime.datetime.today(), "Adding foreign keys"
@@ -623,10 +634,11 @@ def trace_end():
'ADD CONSTRAINT symbolfk FOREIGN KEY (symbol_id) REFERENCES symbols (id),'
'ADD CONSTRAINT todsofk FOREIGN KEY (to_dso_id) REFERENCES dsos (id),'
'ADD CONSTRAINT tosymbolfk FOREIGN KEY (to_symbol_id) REFERENCES symbols (id)')
- if perf_db_export_calls:
+ if perf_db_export_calls or perf_db_export_callchains:
do_query(query, 'ALTER TABLE call_paths '
'ADD CONSTRAINT parentfk FOREIGN KEY (parent_id) REFERENCES call_paths (id),'
'ADD CONSTRAINT symbolfk FOREIGN KEY (symbol_id) REFERENCES symbols (id)')
+ if perf_db_export_calls:
do_query(query, 'ALTER TABLE calls '
'ADD CONSTRAINT threadfk FOREIGN KEY (thread_id) REFERENCES threads (id),'
'ADD CONSTRAINT commfk FOREIGN KEY (comm_id) REFERENCES comms (id),'
@@ -694,11 +706,11 @@ def branch_type_table(branch_type, name, *x):
value = struct.pack(fmt, 2, 4, branch_type, n, name)
branch_type_file.write(value)
-def sample_table(sample_id, evsel_id, machine_id, thread_id, comm_id, dso_id, symbol_id, sym_offset, ip, time, cpu, to_dso_id, to_symbol_id, to_sym_offset, to_ip, period, weight, transaction, data_src, branch_type, in_tx, *x):
+def sample_table(sample_id, evsel_id, machine_id, thread_id, comm_id, dso_id, symbol_id, sym_offset, ip, time, cpu, to_dso_id, to_symbol_id, to_sym_offset, to_ip, period, weight, transaction, data_src, branch_type, in_tx, call_path_id, *x):
if branches:
- value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiiiB", 17, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 4, branch_type, 1, in_tx)
+ value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiiiBiq", 18, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 4, branch_type, 1, in_tx, 8, call_path_id)
else:
- value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiqiqiqiqiiiB", 21, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 8, period, 8, weight, 8, transaction, 8, data_src, 4, branch_type, 1, in_tx)
+ value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiqiqiqiqiiiBiq", 22, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 8, period, 8, weight, 8, transaction, 8, data_src, 4, branch_type, 1, in_tx, 8, call_path_id)
sample_file.write(value)
def call_path_table(cp_id, parent_id, symbol_id, ip, *x):
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 1ba628ed049a..66a28982547b 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -37,6 +37,8 @@ perf-y += topology.o
perf-y += cpumap.o
perf-y += stat.o
perf-y += event_update.o
+perf-y += event-times.o
+perf-y += backward-ring-buffer.o
$(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build
$(call rule_mkdir)
diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
new file mode 100644
index 000000000000..d9ba991a9a30
--- /dev/null
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -0,0 +1,151 @@
+/*
+ * Test backward bit in event attribute, read ring buffer from end to
+ * beginning
+ */
+
+#include <perf.h>
+#include <evlist.h>
+#include <sys/prctl.h>
+#include "tests.h"
+#include "debug.h"
+
+#define NR_ITERS 111
+
+/*
+ * Workload for the test: set the name of the calling task NR_ITERS times
+ * via prctl(PR_SET_NAME), using "p:<iteration>\n" as the name each time.
+ */
+static void testcase(void)
+{
+	int iter = 0;
+
+	while (iter < NR_ITERS) {
+		char name[10];
+
+		snprintf(name, sizeof(name), "p:%d\n", iter);
+		prctl(PR_SET_NAME, name);
+		iter++;
+	}
+}
+
+/*
+ * Walk every mmap of @evlist with the backward reader and tally what is
+ * found: PERF_RECORD_SAMPLE records are added to *@sample_count and
+ * PERF_RECORD_COMM records to *@comm_count.
+ *
+ * Returns TEST_OK once all maps are drained, or TEST_FAIL as soon as a
+ * record of any other type shows up.
+ */
+static int count_samples(struct perf_evlist *evlist, int *sample_count,
+			 int *comm_count)
+{
+	int idx;
+
+	for (idx = 0; idx < evlist->nr_mmaps; idx++) {
+		union perf_event *ev;
+
+		perf_evlist__mmap_read_catchup(evlist, idx);
+
+		for (;;) {
+			u32 type;
+
+			ev = perf_evlist__mmap_read_backward(evlist, idx);
+			if (ev == NULL)
+				break;
+
+			type = ev->header.type;
+			if (type == PERF_RECORD_SAMPLE) {
+				++*sample_count;
+			} else if (type == PERF_RECORD_COMM) {
+				++*comm_count;
+			} else {
+				pr_err("Unexpected record of type %d\n", type);
+				return TEST_FAIL;
+			}
+		}
+	}
+
+	return TEST_OK;
+}
+
+/*
+ * One measurement round: mmap @evlist with @mmap_pages pages, run the
+ * workload with the events enabled, then count the records that ended up
+ * in the ring buffers and unmap again.
+ *
+ * Returns TEST_FAIL if the mmap fails, otherwise whatever count_samples()
+ * returns. The counters are only ever incremented, never reset here.
+ */
+static int do_test(struct perf_evlist *evlist, int mmap_pages,
+		   int *sample_count, int *comm_count)
+{
+	char errbuf[STRERR_BUFSIZE];
+	int res;
+
+	if (perf_evlist__mmap(evlist, mmap_pages, true) < 0) {
+		pr_debug("perf_evlist__mmap: %s\n",
+			 strerror_r(errno, errbuf, sizeof(errbuf)));
+		return TEST_FAIL;
+	}
+
+	/* Only the workload itself runs with the events enabled. */
+	perf_evlist__enable(evlist);
+	testcase();
+	perf_evlist__disable(evlist);
+
+	res = count_samples(evlist, sample_count, comm_count);
+	perf_evlist__munmap(evlist);
+
+	return res;
+}
+
+
+/*
+ * Entry point of the "backward reading from ring buffer" test.
+ *
+ * Opens the syscalls:sys_enter_prctl tracepoint on the current process
+ * with attr.write_backward set, runs the workload and verifies that
+ * exactly NR_ITERS sample records and NR_ITERS comm records are read
+ * back. A second round with a single mmap page must also complete
+ * without error; its counts are accumulated but not checked.
+ *
+ * Returns TEST_OK on success and TEST_FAIL if the evlist cannot be
+ * allocated or a measurement round fails. Every failure between evlist
+ * creation and a successful perf_evlist__open() leaves the result at
+ * TEST_SKIP.
+ */
+int test__backward_ring_buffer(int subtest __maybe_unused)
+{
+	int ret = TEST_SKIP, err, sample_count = 0, comm_count = 0;
+	char pid[16], sbuf[STRERR_BUFSIZE];
+	struct perf_evlist *evlist;
+	struct perf_evsel *evsel __maybe_unused;
+	struct parse_events_error parse_error;
+	struct record_opts opts = {
+		.target = {
+			.uid = UINT_MAX,
+			.uses_mmap = true,
+		},
+		.freq = 0,
+		.mmap_pages = 256,
+		.default_interval = 1,
+	};
+
+	/* Target ourselves: the workload runs in this very process. */
+	snprintf(pid, sizeof(pid), "%d", getpid());
+	pid[sizeof(pid) - 1] = '\0';
+	opts.target.tid = opts.target.pid = pid;
+
+	evlist = perf_evlist__new();
+	if (!evlist) {
+		pr_debug("Not enough memory to create evlist\n");
+		return TEST_FAIL;
+	}
+
+	err = perf_evlist__create_maps(evlist, &opts.target);
+	if (err < 0) {
+		pr_debug("Not enough memory to create thread/cpu maps\n");
+		goto out_delete_evlist;
+	}
+
+	bzero(&parse_error, sizeof(parse_error));
+	err = parse_events(evlist, "syscalls:sys_enter_prctl", &parse_error);
+	if (err) {
+		pr_debug("Failed to parse tracepoint event, try use root\n");
+		ret = TEST_SKIP;
+		goto out_delete_evlist;
+	}
+
+	perf_evlist__config(evlist, &opts, NULL);
+
+	/* Set backward bit, ring buffer should be writing from end */
+	evlist__for_each(evlist, evsel)
+		evsel->attr.write_backward = 1;
+
+	err = perf_evlist__open(evlist);
+	if (err < 0) {
+		pr_debug("perf_evlist__open: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
+		goto out_delete_evlist;
+	}
+
+	/* From here on any early exit is a real failure, not a skip. */
+	ret = TEST_FAIL;
+	err = do_test(evlist, opts.mmap_pages, &sample_count,
+		      &comm_count);
+	if (err != TEST_OK)
+		goto out_delete_evlist;
+
+	if ((sample_count != NR_ITERS) || (comm_count != NR_ITERS)) {
+		pr_err("Unexpected counter: sample_count=%d, comm_count=%d\n",
+		       sample_count, comm_count);
+		goto out_delete_evlist;
+	}
+
+	/* Second round with a single page; only the return code is checked. */
+	err = do_test(evlist, 1, &sample_count, &comm_count);
+	if (err != TEST_OK)
+		goto out_delete_evlist;
+
+	ret = TEST_OK;
+out_delete_evlist:
+	perf_evlist__delete(evlist);
+	return ret;
+}
diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c
index 199501c71e27..f31eed31c1a9 100644
--- a/tools/perf/tests/bpf.c
+++ b/tools/perf/tests/bpf.c
@@ -138,7 +138,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
perf_evlist__splice_list_tail(evlist, &parse_evlist.list);
evlist->nr_groups = parse_evlist.nr_groups;
- perf_evlist__config(evlist, &opts);
+ perf_evlist__config(evlist, &opts, NULL);
err = perf_evlist__open(evlist);
if (err < 0) {
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index f2b1dcac45d3..0e95c20ecf6e 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -204,6 +204,14 @@ static struct test generic_tests[] = {
.func = test__event_update,
},
{
+ .desc = "Test events times",
+ .func = test__event_times,
+ },
+ {
+ .desc = "Test backward reading from ring buffer",
+ .func = test__backward_ring_buffer,
+ },
+ {
.func = NULL,
},
};
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index abd3f0ec0c0b..68a69a195545 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -532,7 +532,7 @@ static int do_test_code_reading(bool try_kcore)
goto out_put;
}
- perf_evlist__config(evlist, &opts);
+ perf_evlist__config(evlist, &opts, NULL);
evsel = perf_evlist__first(evlist);
diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c
index dc673ff7c437..8cf0d9e189a8 100644
--- a/tools/perf/tests/dso-data.c
+++ b/tools/perf/tests/dso-data.c
@@ -202,7 +202,7 @@ static int dsos__create(int cnt, int size)
{
int i;
- dsos = malloc(sizeof(dsos) * cnt);
+ dsos = malloc(sizeof(*dsos) * cnt);
TEST_ASSERT_VAL("failed to alloc dsos array", dsos);
for (i = 0; i < cnt; i++) {
diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c
new file mode 100644
index 000000000000..95fb744f6628
--- /dev/null
+++ b/tools/perf/tests/event-times.c
@@ -0,0 +1,236 @@
+#include <linux/compiler.h>
+#include <string.h>
+#include "tests.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "util.h"
+#include "debug.h"
+#include "thread_map.h"
+#include "target.h"
+
+/*
+ * Attach the last event of @evlist to a freshly prepared "true" workload
+ * with attr.enable_on_exec set, open the events and start the workload.
+ *
+ * Returns TEST_OK when perf_evlist__start_workload() returns 1 and
+ * TEST_FAIL when it returns anything else; a failing setup step returns
+ * that step's negative error code.
+ */
+static int attach__enable_on_exec(struct perf_evlist *evlist)
+{
+	const char *argv[] = { "true", NULL, };
+	struct target target = {
+		.uid = UINT_MAX,
+	};
+	struct perf_evsel *evsel = perf_evlist__last(evlist);
+	char sbuf[STRERR_BUFSIZE];
+	int rc;
+
+	pr_debug("attaching to spawned child, enable on exec\n");
+
+	rc = perf_evlist__create_maps(evlist, &target);
+	if (rc < 0) {
+		pr_debug("Not enough memory to create thread/cpu maps\n");
+		return rc;
+	}
+
+	rc = perf_evlist__prepare_workload(evlist, &target, argv, false, NULL);
+	if (rc < 0) {
+		pr_debug("Couldn't run the workload!\n");
+		return rc;
+	}
+
+	evsel->attr.enable_on_exec = 1;
+
+	rc = perf_evlist__open(evlist);
+	if (rc < 0) {
+		pr_debug("perf_evlist__open: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
+		return rc;
+	}
+
+	if (perf_evlist__start_workload(evlist) == 1)
+		return TEST_OK;
+
+	return TEST_FAIL;
+}
+
+/*
+ * Detach step for the enable-on-exec case: wait for the workload child
+ * recorded in @evlist. The wait result is ignored; always returns 0.
+ */
+static int detach__enable_on_exec(struct perf_evlist *evlist)
+{
+	(void)waitpid(evlist->workload.pid, NULL, 0);
+
+	return 0;
+}
+
+/*
+ * Open the last event of @evlist on the current process with
+ * attr.disabled set, then enable it explicitly.
+ *
+ * Returns TEST_OK if the event was opened and enabled, TEST_FAIL if
+ * enabling failed, -1 if the thread map could not be created, or the
+ * error returned by perf_evsel__open_per_thread().
+ */
+static int attach__current_disabled(struct perf_evlist *evlist)
+{
+	struct perf_evsel *evsel = perf_evlist__last(evlist);
+	struct thread_map *threads;
+	int err;
+
+	pr_debug("attaching to current thread as disabled\n");
+
+	threads = thread_map__new(-1, getpid(), UINT_MAX);
+	if (threads == NULL) {
+		pr_debug("thread_map__new\n");
+		return -1;
+	}
+
+	evsel->attr.disabled = 1;
+
+	err = perf_evsel__open_per_thread(evsel, threads);
+	/*
+	 * Drop our reference on every path; previously the map was leaked
+	 * when the open failed. The success path already released it
+	 * before enabling the event.
+	 */
+	thread_map__put(threads);
+	if (err) {
+		pr_debug("Failed to open event cpu-clock:u\n");
+		return err;
+	}
+
+	return perf_evsel__enable(evsel) == 0 ? TEST_OK : TEST_FAIL;
+}
+
+/*
+ * Open the last event of @evlist on the current process, leaving the
+ * attr.disabled setting untouched.
+ *
+ * Returns TEST_OK if the open succeeded, TEST_FAIL if it did not, and -1
+ * if the thread map could not be created.
+ */
+static int attach__current_enabled(struct perf_evlist *evlist)
+{
+	struct perf_evsel *evsel = perf_evlist__last(evlist);
+	struct thread_map *map;
+	int rc;
+
+	pr_debug("attaching to current thread as enabled\n");
+
+	map = thread_map__new(-1, getpid(), UINT_MAX);
+	if (map == NULL) {
+		pr_debug("failed to call thread_map__new\n");
+		return -1;
+	}
+
+	rc = perf_evsel__open_per_thread(evsel, map);
+	thread_map__put(map);
+
+	if (rc != 0)
+		return TEST_FAIL;
+
+	return TEST_OK;
+}
+
+/*
+ * Detach step shared by the non-exec cases: disable the last event of
+ * @evlist so its counters stop advancing before they are read.
+ *
+ * Previously this called perf_evsel__enable(), which contradicted both
+ * the function name and its role as the counterpart of the attach steps.
+ * NOTE(review): assumes perf_evsel__disable() is declared next to
+ * perf_evsel__enable() in evsel.h -- confirm.
+ *
+ * Returns the result of perf_evsel__disable().
+ */
+static int detach__disable(struct perf_evlist *evlist)
+{
+	struct perf_evsel *evsel = perf_evlist__last(evlist);
+
+	return perf_evsel__disable(evsel);
+}
+
+/*
+ * Open the last event of @evlist on CPU 0 with attr.disabled set, then
+ * enable it explicitly.
+ *
+ * Returns the result of perf_evsel__enable() on success, TEST_SKIP if
+ * the open was refused with -EACCES, -1 if the cpu map could not be
+ * created, or the error returned by perf_evsel__open_per_cpu().
+ */
+static int attach__cpu_disabled(struct perf_evlist *evlist)
+{
+	struct perf_evsel *evsel = perf_evlist__last(evlist);
+	struct cpu_map *cpus;
+	int err;
+
+	/* The message used to say "enabled", copied from the sibling test. */
+	pr_debug("attaching to CPU 0 as disabled\n");
+
+	cpus = cpu_map__new("0");
+	if (cpus == NULL) {
+		pr_debug("failed to call cpu_map__new\n");
+		return -1;
+	}
+
+	evsel->attr.disabled = 1;
+
+	err = perf_evsel__open_per_cpu(evsel, cpus);
+	/*
+	 * Drop our reference on every path; previously both error returns
+	 * below leaked the map. The success path already released it
+	 * before enabling the event.
+	 */
+	cpu_map__put(cpus);
+	if (err) {
+		if (err == -EACCES)
+			return TEST_SKIP;
+
+		pr_debug("Failed to open event cpu-clock:u\n");
+		return err;
+	}
+
+	return perf_evsel__enable(evsel);
+}
+
+/*
+ * Open the last event of @evlist on CPU 0, leaving the attr.disabled
+ * setting untouched.
+ *
+ * Returns TEST_OK if the open succeeded, TEST_SKIP if it was refused
+ * with -EACCES, TEST_FAIL on any other open error, and -1 if the cpu map
+ * could not be created.
+ */
+static int attach__cpu_enabled(struct perf_evlist *evlist)
+{
+	struct perf_evsel *evsel = perf_evlist__last(evlist);
+	struct cpu_map *cpus;
+	int err;
+
+	pr_debug("attaching to CPU 0 as enabled\n");
+
+	cpus = cpu_map__new("0");
+	if (cpus == NULL) {
+		pr_debug("failed to call cpu_map__new\n");
+		return -1;
+	}
+
+	err = perf_evsel__open_per_cpu(evsel, cpus);
+	/* Release the map before any return; the -EACCES path leaked it. */
+	cpu_map__put(cpus);
+	if (err == -EACCES)
+		return TEST_SKIP;
+
+	return err ? TEST_FAIL : TEST_OK;
+}
+
+/*
+ * Run one attach/measure/detach cycle on a fresh "cpu-clock:u" event and
+ * check that its enabled time equals its running time.
+ *
+ * @attach: sets up and opens the event; may return TEST_SKIP.
+ * @detach: called after the busy loop, before the counter is read.
+ *
+ * Returns TEST_OK when both times match, TEST_SKIP when @attach asked to
+ * skip, and TEST_FAIL otherwise. The evlist is released on every path;
+ * it used to be leaked on the skip path and when attach or detach failed.
+ * Those two failures are now reported with a plain pr_debug() message
+ * instead of TEST_ASSERT_VAL(), so that the cleanup can run.
+ */
+static int test_times(int (attach)(struct perf_evlist *),
+		      int (detach)(struct perf_evlist *))
+{
+	struct perf_counts_values count;
+	struct perf_evlist *evlist = NULL;
+	struct perf_evsel *evsel;
+	int ret = TEST_FAIL, err, i;
+
+	evlist = perf_evlist__new();
+	if (!evlist) {
+		pr_debug("failed to create event list\n");
+		goto out;
+	}
+
+	err = parse_events(evlist, "cpu-clock:u", NULL);
+	if (err) {
+		pr_debug("failed to parse event cpu-clock:u\n");
+		goto out;
+	}
+
+	/* Ask for both times so they can be compared after the run. */
+	evsel = perf_evlist__last(evlist);
+	evsel->attr.read_format |=
+		PERF_FORMAT_TOTAL_TIME_ENABLED |
+		PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+	err = attach(evlist);
+	if (err == TEST_SKIP) {
+		pr_debug(" SKIP : not enough rights\n");
+		ret = TEST_SKIP;
+		goto out;
+	}
+
+	if (err) {
+		pr_debug("failed to attach\n");
+		goto out;
+	}
+
+	/* Burn some CPU time so the event has something to count. */
+	for (i = 0; i < 100000000; i++) { }
+
+	if (detach(evlist)) {
+		pr_debug("failed to detach\n");
+		goto out;
+	}
+
+	perf_evsel__read(evsel, 0, 0, &count);
+
+	if (count.ena == count.run)
+		ret = TEST_OK;
+
+	pr_debug(" %s: ena %" PRIu64", run %" PRIu64"\n",
+		 ret == TEST_OK ? "OK " : "FAILED",
+		 count.ena, count.run);
+
+out:
+	if (evlist)
+		perf_evlist__delete(evlist);
+	return ret;
+}
+
+/*
+ * Create the software event 'cpu-clock', attach it in each of the ways
+ * listed in the table below and check, for every one of them, that the
+ * enabled and running times match.
+ *
+ * The first non-zero result from test_times() is returned, except that a
+ * TEST_SKIP result is replaced by any later non-zero result.
+ */
+int test__event_times(int subtest __maybe_unused)
+{
+	static const struct {
+		int (*attach)(struct perf_evlist *);
+		int (*detach)(struct perf_evlist *);
+	} cases[] = {
+		/* attach on newly spawned process after exec */
+		{ attach__enable_on_exec,   detach__enable_on_exec },
+		/* attach on current process as enabled */
+		{ attach__current_enabled,  detach__disable },
+		/* attach on current process as disabled */
+		{ attach__current_disabled, detach__disable },
+		/* attach on cpu as disabled */
+		{ attach__cpu_disabled,     detach__disable },
+		/* attach on cpu as enabled */
+		{ attach__cpu_enabled,      detach__disable },
+	};
+	int ret = 0;
+	size_t k;
+
+	for (k = 0; k < sizeof(cases) / sizeof(cases[0]); k++) {
+		int err = test_times(cases[k].attach, cases[k].detach);
+
+		if (err && (ret == TEST_OK || ret == TEST_SKIP))
+			ret = err;
+	}
+
+	return ret;
+}
diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c
index 012eab5d1df1..63ecf21750eb 100644
--- a/tools/perf/tests/event_update.c
+++ b/tools/perf/tests/event_update.c
@@ -30,7 +30,7 @@ static int process_event_scale(struct perf_tool *tool __maybe_unused,
TEST_ASSERT_VAL("wrong id", ev->id == 123);
TEST_ASSERT_VAL("wrong id", ev->type == PERF_EVENT_UPDATE__SCALE);
- TEST_ASSERT_VAL("wrong scale", ev_data->scale = 0.123);
+ TEST_ASSERT_VAL("wrong scale", ev_data->scale == 0.123);
return 0;
}
diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c
index f55f4bd47932..6b21746d6eec 100644
--- a/tools/perf/tests/hists_common.c
+++ b/tools/perf/tests/hists_common.c
@@ -161,7 +161,7 @@ void print_hists_in(struct hists *hists)
struct rb_root *root;
struct rb_node *node;
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root = &hists->entries_collapsed;
else
root = hists->entries_in;
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c
index ed5aa9eaeb6c..a9e3db3afac4 100644
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -101,7 +101,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
if (machine__resolve(machine, &al, &sample) < 0)
goto out;
- if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH,
+ if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack,
NULL) < 0) {
addr_location__put(&al);
goto out;
@@ -126,7 +126,7 @@ static void del_hist_entries(struct hists *hists)
struct rb_root *root_out;
struct rb_node *node;
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root_in = &hists->entries_collapsed;
else
root_in = hists->entries_in;
diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c
index b825d24f8186..e846f8c42013 100644
--- a/tools/perf/tests/hists_filter.c
+++ b/tools/perf/tests/hists_filter.c
@@ -81,7 +81,7 @@ static int add_hist_entries(struct perf_evlist *evlist,
al.socket = fake_samples[i].socket;
if (hist_entry_iter__add(&iter, &al,
- PERF_MAX_STACK_DEPTH, NULL) < 0) {
+ sysctl_perf_event_max_stack, NULL) < 0) {
addr_location__put(&al);
goto out;
}
diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c
index 358324e47805..acf5a1301c07 100644
--- a/tools/perf/tests/hists_link.c
+++ b/tools/perf/tests/hists_link.c
@@ -145,7 +145,7 @@ static int __validate_match(struct hists *hists)
/*
* Only entries from fake_common_samples should have a pair.
*/
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root = &hists->entries_collapsed;
else
root = hists->entries_in;
@@ -197,7 +197,7 @@ static int __validate_link(struct hists *hists, int idx)
* and some entries will have no pair. However every entry
* in other hists should have (dummy) pair.
*/
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root = &hists->entries_collapsed;
else
root = hists->entries_in;
diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c
index d3556fbe8c5c..63c5efaba1b5 100644
--- a/tools/perf/tests/hists_output.c
+++ b/tools/perf/tests/hists_output.c
@@ -67,7 +67,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
if (machine__resolve(machine, &al, &sample) < 0)
goto out;
- if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH,
+ if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack,
NULL) < 0) {
addr_location__put(&al);
goto out;
@@ -92,7 +92,7 @@ static void del_hist_entries(struct hists *hists)
struct rb_root *root_out;
struct rb_node *node;
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root_in = &hists->entries_collapsed;
else
root_in = hists->entries_in;
diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c
index ddb78fae064a..614e45a3c603 100644
--- a/tools/perf/tests/keep-tracking.c
+++ b/tools/perf/tests/keep-tracking.c
@@ -80,7 +80,7 @@ int test__keep_tracking(int subtest __maybe_unused)
CHECK__(parse_events(evlist, "dummy:u", NULL));
CHECK__(parse_events(evlist, "cycles:u", NULL));
- perf_evlist__config(evlist, &opts);
+ perf_evlist__config(evlist, &opts, NULL);
evsel = perf_evlist__first(evlist);
diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c
index eb99a105f31c..4344fe482c1d 100644
--- a/tools/perf/tests/openat-syscall-tp-fields.c
+++ b/tools/perf/tests/openat-syscall-tp-fields.c
@@ -44,7 +44,7 @@ int test__syscall_openat_tp_fields(int subtest __maybe_unused)
goto out_delete_evlist;
}
- perf_evsel__config(evsel, &opts);
+ perf_evsel__config(evsel, &opts, NULL);
thread_map__set_pid(evlist->threads, 0, getpid());
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index 1cc78cefe399..b836ee6a8d9b 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -99,7 +99,7 @@ int test__PERF_RECORD(int subtest __maybe_unused)
perf_evsel__set_sample_bit(evsel, CPU);
perf_evsel__set_sample_bit(evsel, TID);
perf_evsel__set_sample_bit(evsel, TIME);
- perf_evlist__config(evlist, &opts);
+ perf_evlist__config(evlist, &opts, NULL);
err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask);
if (err < 0) {
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index ebd80168d51e..39a689bf7574 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -417,7 +417,7 @@ int test__switch_tracking(int subtest __maybe_unused)
perf_evsel__set_sample_bit(tracking_evsel, TIME);
/* Config events */
- perf_evlist__config(evlist, &opts);
+ perf_evlist__config(evlist, &opts, NULL);
/* Check moved event is still at the front */
if (cycles_evsel != perf_evlist__first(evlist)) {
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 82b2b5e6ba7c..c57e72c826d2 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -85,6 +85,8 @@ int test__synthesize_stat_config(int subtest);
int test__synthesize_stat(int subtest);
int test__synthesize_stat_round(int subtest);
int test__event_update(int subtest);
+int test__event_times(int subtest);
+int test__backward_ring_buffer(int subtest);
#if defined(__arm__) || defined(__aarch64__)
#ifdef HAVE_DWARF_UNWIND_SUPPORT
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 630b0b409b97..e63abab7d5a1 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -54,8 +54,14 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
* Step 3:
*
* Load and split /proc/kallsyms into multiple maps, one per module.
+ * Do not use kcore, as this test was designed before kcore support
+ * and has parts that only make sense if using the non-kcore code.
+ * XXX: extend it to stress the kcore code as well, hint: the list
+ * of modules extracted from /proc/kcore, in its current form, can't
+ * be compacted against the list of modules found in the "vmlinux"
+ * code and with the one got from /proc/modules from the "kallsyms" code.
*/
- if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, NULL) <= 0) {
+ if (__machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, true, NULL) <= 0) {
pr_debug("dso__load_kallsyms ");
goto out;
}
@@ -157,6 +163,9 @@ next_pair:
pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
mem_start, sym->name, pair->name);
+ } else {
+ pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n",
+ mem_start, sym->name, first_pair->name);
}
}
} else
diff --git a/tools/perf/trace/beauty/eventfd.c b/tools/perf/trace/beauty/eventfd.c
new file mode 100644
index 000000000000..d64f4a9128a1
--- /dev/null
+++ b/tools/perf/trace/beauty/eventfd.c
@@ -0,0 +1,38 @@
+#include <sys/eventfd.h>
+
+#ifndef EFD_SEMAPHORE
+#define EFD_SEMAPHORE 1
+#endif
+
+#ifndef EFD_NONBLOCK
+#define EFD_NONBLOCK 00004000
+#endif
+
+#ifndef EFD_CLOEXEC
+#define EFD_CLOEXEC 02000000
+#endif
+
+/*
+ * Format the eventfd flags in arg->val into @bf (at most @size bytes) as
+ * a '|'-separated list of names with the EFD_ prefix dropped. Zero is
+ * printed as "NONE"; bits without a known name are appended in hex.
+ * Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size, struct syscall_arg *arg)
+{
+	static const struct {
+		int	    bit;
+		const char *name;
+	} known[] = {
+		{ EFD_SEMAPHORE, "SEMAPHORE" },
+		{ EFD_CLOEXEC,	 "CLOEXEC"   },
+		{ EFD_NONBLOCK,	 "NONBLOCK"  },
+	};
+	int printed = 0, flags = arg->val;
+	size_t i;
+
+	if (flags == 0)
+		return scnprintf(bf, size, "NONE");
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (!(flags & known[i].bit))
+			continue;
+
+		printed += scnprintf(bf + printed, size - printed, "%s%s",
+				     printed ? "|" : "", known[i].name);
+		flags &= ~known[i].bit;
+	}
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
diff --git a/tools/perf/trace/beauty/flock.c b/tools/perf/trace/beauty/flock.c
new file mode 100644
index 000000000000..021bb48c6336
--- /dev/null
+++ b/tools/perf/trace/beauty/flock.c
@@ -0,0 +1,31 @@
+
+/*
+ * Format the flock() operation in arg->val into @bf (at most @size
+ * bytes) as a '|'-separated list of names with the LOCK_ prefix dropped.
+ * Zero is printed as "NONE"; bits without a known name are appended in
+ * hex. Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
+					   struct syscall_arg *arg)
+{
+	int printed = 0, op = arg->val;
+
+	if (op == 0)
+		return scnprintf(bf, size, "NONE");
+	/* A name is printed only when all bits of its LOCK_ constant are set. */
+#define	P_CMD(cmd) \
+	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
+		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
+		op &= ~LOCK_##cmd; \
+	}
+
+	P_CMD(SH);
+	P_CMD(EX);
+	P_CMD(NB);
+	P_CMD(UN);
+	P_CMD(MAND);
+	P_CMD(RW);
+	P_CMD(READ);
+	P_CMD(WRITE);
+	/* Was "#undef P_OP", which left P_CMD defined past this function. */
+#undef P_CMD
+
+	if (op)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
+
+	return printed;
+}
+
+#define SCA_FLOCK syscall_arg__scnprintf_flock
diff --git a/tools/perf/trace/beauty/futex_op.c b/tools/perf/trace/beauty/futex_op.c
new file mode 100644
index 000000000000..e2476211f22d
--- /dev/null
+++ b/tools/perf/trace/beauty/futex_op.c
@@ -0,0 +1,44 @@
+#include <linux/futex.h>
+
+/*
+ * Format the futex operation in arg->val into @bf (at most @size bytes).
+ * The command part (op & FUTEX_CMD_MASK) is printed by name with the
+ * FUTEX_ prefix dropped, or in hex when unknown, followed by "|PRIV"
+ * and/or "|CLKRT" when the corresponding option bits are set.
+ *
+ * For most commands a set of SCF_* bits is OR-ed into arg->mask;
+ * presumably this hides the arguments that command does not use.
+ * Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
+{
+	enum syscall_futex_args {
+		SCF_UADDR   = (1 << 0),
+		SCF_OP	    = (1 << 1),
+		SCF_VAL	    = (1 << 2),
+		SCF_TIMEOUT = (1 << 3),
+		SCF_UADDR2  = (1 << 4),
+		SCF_VAL3    = (1 << 5),
+	};
+	int op = arg->val;
+	int cmd = op & FUTEX_CMD_MASK;
+	const char *name = NULL;
+	int unused_args = 0;
+	size_t printed;
+
+	switch (cmd) {
+	case FUTEX_WAIT:
+		name = "WAIT";
+		unused_args = SCF_VAL3|SCF_UADDR2;
+		break;
+	case FUTEX_WAKE:
+		name = "WAKE";
+		unused_args = SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT;
+		break;
+	case FUTEX_FD:
+		name = "FD";
+		unused_args = SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT;
+		break;
+	case FUTEX_REQUEUE:
+		name = "REQUEUE";
+		unused_args = SCF_VAL3|SCF_TIMEOUT;
+		break;
+	case FUTEX_CMP_REQUEUE:
+		name = "CMP_REQUEUE";
+		unused_args = SCF_TIMEOUT;
+		break;
+	case FUTEX_CMP_REQUEUE_PI:
+		name = "CMP_REQUEUE_PI";
+		unused_args = SCF_TIMEOUT;
+		break;
+	case FUTEX_WAKE_OP:
+		name = "WAKE_OP";
+		break;
+	case FUTEX_LOCK_PI:
+		name = "LOCK_PI";
+		unused_args = SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT;
+		break;
+	case FUTEX_UNLOCK_PI:
+		name = "UNLOCK_PI";
+		unused_args = SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT;
+		break;
+	case FUTEX_TRYLOCK_PI:
+		name = "TRYLOCK_PI";
+		unused_args = SCF_VAL3|SCF_UADDR2;
+		break;
+	case FUTEX_WAIT_BITSET:
+		name = "WAIT_BITSET";
+		unused_args = SCF_UADDR2;
+		break;
+	case FUTEX_WAKE_BITSET:
+		name = "WAKE_BITSET";
+		unused_args = SCF_UADDR2;
+		break;
+	case FUTEX_WAIT_REQUEUE_PI:
+		name = "WAIT_REQUEUE_PI";
+		break;
+	default:
+		break;
+	}
+
+	if (name != NULL) {
+		printed = scnprintf(bf, size, "%s", name);
+		arg->mask |= unused_args;
+	} else {
+		printed = scnprintf(bf, size, "%#x", cmd);
+	}
+
+	if (op & FUTEX_PRIVATE_FLAG)
+		printed += scnprintf(bf + printed, size - printed, "|PRIV");
+
+	if (op & FUTEX_CLOCK_REALTIME)
+		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
+
+	return printed;
+}
+
+#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op
diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c
new file mode 100644
index 000000000000..3444a4d5382d
--- /dev/null
+++ b/tools/perf/trace/beauty/mmap.c
@@ -0,0 +1,158 @@
+#include <sys/mman.h>
+
+/*
+ * Format the mmap protection bits in arg->val into @bf (at most @size
+ * bytes) as a '|'-separated list of names with the PROT_ prefix dropped.
+ * PROT_NONE is printed as "NONE"; bits without a known name are appended
+ * in hex. Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
+					       struct syscall_arg *arg)
+{
+	static const struct {
+		int	    bit;
+		const char *name;
+	} known[] = {
+		{ PROT_EXEC,	  "EXEC"      },
+		{ PROT_READ,	  "READ"      },
+		{ PROT_WRITE,	  "WRITE"     },
+#ifdef PROT_SEM
+		{ PROT_SEM,	  "SEM"	      },
+#endif
+		{ PROT_GROWSDOWN, "GROWSDOWN" },
+		{ PROT_GROWSUP,	  "GROWSUP"   },
+	};
+	int printed = 0, prot = arg->val;
+	size_t i;
+
+	if (prot == PROT_NONE)
+		return scnprintf(bf, size, "NONE");
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (!(prot & known[i].bit))
+			continue;
+
+		printed += scnprintf(bf + printed, size - printed, "%s%s",
+				     printed ? "|" : "", known[i].name);
+		prot &= ~known[i].bit;
+	}
+
+	if (prot)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
+
+	return printed;
+}
+
+#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
+
+#ifndef MAP_STACK
+# define MAP_STACK 0x20000
+#endif
+
+/*
+ * Format the mmap flags in arg->val into @bf (at most @size bytes) as a
+ * '|'-separated list of names with the MAP_ prefix dropped. Bits without
+ * a known name are appended in hex; a zero value prints nothing.
+ * Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
+						struct syscall_arg *arg)
+{
+	static const struct {
+		int	    bit;
+		const char *name;
+	} known[] = {
+		{ MAP_SHARED,	     "SHARED"	     },
+		{ MAP_PRIVATE,	     "PRIVATE"	     },
+#ifdef MAP_32BIT
+		{ MAP_32BIT,	     "32BIT"	     },
+#endif
+		{ MAP_ANONYMOUS,     "ANONYMOUS"     },
+		{ MAP_DENYWRITE,     "DENYWRITE"     },
+		{ MAP_EXECUTABLE,    "EXECUTABLE"    },
+		{ MAP_FILE,	     "FILE"	     },
+		{ MAP_FIXED,	     "FIXED"	     },
+		{ MAP_GROWSDOWN,     "GROWSDOWN"     },
+#ifdef MAP_HUGETLB
+		{ MAP_HUGETLB,	     "HUGETLB"	     },
+#endif
+		{ MAP_LOCKED,	     "LOCKED"	     },
+		{ MAP_NONBLOCK,	     "NONBLOCK"	     },
+		{ MAP_NORESERVE,     "NORESERVE"     },
+		{ MAP_POPULATE,	     "POPULATE"	     },
+		{ MAP_STACK,	     "STACK"	     },
+#ifdef MAP_UNINITIALIZED
+		{ MAP_UNINITIALIZED, "UNINITIALIZED" },
+#endif
+	};
+	int printed = 0, flags = arg->val;
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (!(flags & known[i].bit))
+			continue;
+
+		printed += scnprintf(bf + printed, size - printed, "%s%s",
+				     printed ? "|" : "", known[i].name);
+		flags &= ~known[i].bit;
+	}
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
+
+/*
+ * Format the mremap flags in arg->val into @bf (at most @size bytes) as
+ * a '|'-separated list of names with the MREMAP_ prefix dropped. Bits
+ * without a known name are appended in hex; a zero value prints nothing.
+ * Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
+						  struct syscall_arg *arg)
+{
+	static const struct {
+		int	    bit;
+		const char *name;
+	} known[] = {
+		{ MREMAP_MAYMOVE, "MAYMOVE" },
+#ifdef MREMAP_FIXED
+		{ MREMAP_FIXED,	  "FIXED"   },
+#endif
+	};
+	int printed = 0, flags = arg->val;
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (!(flags & known[i].bit))
+			continue;
+
+		printed += scnprintf(bf + printed, size - printed, "%s%s",
+				     printed ? "|" : "", known[i].name);
+		flags &= ~known[i].bit;
+	}
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
+
+#ifndef MADV_HWPOISON
+#define MADV_HWPOISON 100
+#endif
+
+#ifndef MADV_MERGEABLE
+#define MADV_MERGEABLE 12
+#endif
+
+#ifndef MADV_UNMERGEABLE
+#define MADV_UNMERGEABLE 13
+#endif
+
+/*
+ * Format the madvise() advice value in arg->val into @bf (at most @size
+ * bytes): known values are printed by name with the MADV_ prefix
+ * dropped, anything else in hex. Returns the number of characters
+ * written.
+ */
+static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
+						      struct syscall_arg *arg)
+{
+	int behavior = arg->val;
+
+	switch (behavior) {
+#define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
+	P_MADV_BHV(NORMAL);
+	P_MADV_BHV(RANDOM);
+	P_MADV_BHV(SEQUENTIAL);
+	P_MADV_BHV(WILLNEED);
+	P_MADV_BHV(DONTNEED);
+	P_MADV_BHV(REMOVE);
+	P_MADV_BHV(DONTFORK);
+	P_MADV_BHV(DOFORK);
+	P_MADV_BHV(HWPOISON);
+#ifdef MADV_SOFT_OFFLINE
+	P_MADV_BHV(SOFT_OFFLINE);
+#endif
+	P_MADV_BHV(MERGEABLE);
+	P_MADV_BHV(UNMERGEABLE);
+#ifdef MADV_HUGEPAGE
+	P_MADV_BHV(HUGEPAGE);
+#endif
+#ifdef MADV_NOHUGEPAGE
+	P_MADV_BHV(NOHUGEPAGE);
+#endif
+#ifdef MADV_DONTDUMP
+	P_MADV_BHV(DONTDUMP);
+#endif
+#ifdef MADV_DODUMP
+	P_MADV_BHV(DODUMP);
+#endif
+	/* Was "#undef P_MADV_PHV", a typo that left P_MADV_BHV defined. */
+#undef P_MADV_BHV
+	default: break;
+	}
+
+	return scnprintf(bf, size, "%#x", behavior);
+}
+
+#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
diff --git a/tools/perf/trace/beauty/mode_t.c b/tools/perf/trace/beauty/mode_t.c
new file mode 100644
index 000000000000..930d8fef2400
--- /dev/null
+++ b/tools/perf/trace/beauty/mode_t.c
@@ -0,0 +1,68 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/* From include/linux/stat.h */
+#ifndef S_IRWXUGO
+#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO)
+#endif
+#ifndef S_IALLUGO
+#define S_IALLUGO (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO)
+#endif
+#ifndef S_IRUGO
+#define S_IRUGO (S_IRUSR|S_IRGRP|S_IROTH)
+#endif
+#ifndef S_IWUGO
+#define S_IWUGO (S_IWUSR|S_IWGRP|S_IWOTH)
+#endif
+#ifndef S_IXUGO
+#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
+#endif
+
+/*
+ * Format the file mode in arg->val into @bf (at most @size bytes) as a
+ * '|'-separated list of names with the S_ prefix dropped. A name is
+ * printed only when all bits of its mask are set, and the masks are
+ * tried in table order, so the combined masks at the top win over the
+ * single-bit ones. Whatever is left over is appended in hex.
+ * Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_mode_t(char *bf, size_t size, struct syscall_arg *arg)
+{
+	static const struct {
+		int	    mask;
+		const char *name;
+	} known[] = {
+		{ S_IALLUGO, "IALLUGO" },
+		{ S_IRWXUGO, "IRWXUGO" },
+		{ S_IRUGO,   "IRUGO"   },
+		{ S_IWUGO,   "IWUGO"   },
+		{ S_IXUGO,   "IXUGO"   },
+		{ S_IFMT,    "IFMT"    },
+		{ S_IFSOCK,  "IFSOCK"  },
+		{ S_IFLNK,   "IFLNK"   },
+		{ S_IFREG,   "IFREG"   },
+		{ S_IFBLK,   "IFBLK"   },
+		{ S_IFDIR,   "IFDIR"   },
+		{ S_IFCHR,   "IFCHR"   },
+		{ S_IFIFO,   "IFIFO"   },
+		{ S_ISUID,   "ISUID"   },
+		{ S_ISGID,   "ISGID"   },
+		{ S_ISVTX,   "ISVTX"   },
+		{ S_IRWXU,   "IRWXU"   },
+		{ S_IRUSR,   "IRUSR"   },
+		{ S_IWUSR,   "IWUSR"   },
+		{ S_IXUSR,   "IXUSR"   },
+		{ S_IRWXG,   "IRWXG"   },
+		{ S_IRGRP,   "IRGRP"   },
+		{ S_IWGRP,   "IWGRP"   },
+		{ S_IXGRP,   "IXGRP"   },
+		{ S_IRWXO,   "IRWXO"   },
+		{ S_IROTH,   "IROTH"   },
+		{ S_IWOTH,   "IWOTH"   },
+		{ S_IXOTH,   "IXOTH"   },
+	};
+	int printed = 0, mode = arg->val;
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if ((mode & known[i].mask) != known[i].mask)
+			continue;
+
+		printed += scnprintf(bf + printed, size - printed, "%s%s",
+				     printed ? "|" : "", known[i].name);
+		mode &= ~known[i].mask;
+	}
+
+	if (mode)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", mode);
+
+	return printed;
+}
+
+#define SCA_MODE_T syscall_arg__scnprintf_mode_t
diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c
new file mode 100644
index 000000000000..07fa8a0acad6
--- /dev/null
+++ b/tools/perf/trace/beauty/msg_flags.c
@@ -0,0 +1,62 @@
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#ifndef MSG_PROBE
+#define MSG_PROBE 0x10
+#endif
+#ifndef MSG_WAITFORONE
+#define MSG_WAITFORONE 0x10000
+#endif
+#ifndef MSG_SENDPAGE_NOTLAST
+#define MSG_SENDPAGE_NOTLAST 0x20000
+#endif
+#ifndef MSG_FASTOPEN
+#define MSG_FASTOPEN 0x20000000
+#endif
+#ifndef MSG_CMSG_CLOEXEC
+# define MSG_CMSG_CLOEXEC 0x40000000
+#endif
+
+/*
+ * Format the send/recv message flags in arg->val into @bf (at most @size
+ * bytes) as a '|'-separated list of names with the MSG_ prefix dropped.
+ * Zero is printed as "NONE"; bits without a known name are appended in
+ * hex. Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
+					       struct syscall_arg *arg)
+{
+	static const struct {
+		int	    bit;
+		const char *name;
+	} known[] = {
+		{ MSG_OOB,		"OOB"		   },
+		{ MSG_PEEK,		"PEEK"		   },
+		{ MSG_DONTROUTE,	"DONTROUTE"	   },
+		{ MSG_TRYHARD,		"TRYHARD"	   },
+		{ MSG_CTRUNC,		"CTRUNC"	   },
+		{ MSG_PROBE,		"PROBE"		   },
+		{ MSG_TRUNC,		"TRUNC"		   },
+		{ MSG_DONTWAIT,		"DONTWAIT"	   },
+		{ MSG_EOR,		"EOR"		   },
+		{ MSG_WAITALL,		"WAITALL"	   },
+		{ MSG_FIN,		"FIN"		   },
+		{ MSG_SYN,		"SYN"		   },
+		{ MSG_CONFIRM,		"CONFIRM"	   },
+		{ MSG_RST,		"RST"		   },
+		{ MSG_ERRQUEUE,		"ERRQUEUE"	   },
+		{ MSG_NOSIGNAL,		"NOSIGNAL"	   },
+		{ MSG_MORE,		"MORE"		   },
+		{ MSG_WAITFORONE,	"WAITFORONE"	   },
+		{ MSG_SENDPAGE_NOTLAST,	"SENDPAGE_NOTLAST" },
+		{ MSG_FASTOPEN,		"FASTOPEN"	   },
+		{ MSG_CMSG_CLOEXEC,	"CMSG_CLOEXEC"	   },
+	};
+	int printed = 0, flags = arg->val;
+	size_t i;
+
+	if (flags == 0)
+		return scnprintf(bf, size, "NONE");
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (!(flags & known[i].bit))
+			continue;
+
+		printed += scnprintf(bf + printed, size - printed, "%s%s",
+				     printed ? "|" : "", known[i].name);
+		flags &= ~known[i].bit;
+	}
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
diff --git a/tools/perf/trace/beauty/open_flags.c b/tools/perf/trace/beauty/open_flags.c
new file mode 100644
index 000000000000..0f3679e0cdcf
--- /dev/null
+++ b/tools/perf/trace/beauty/open_flags.c
@@ -0,0 +1,56 @@
+
+/*
+ * Format the open() flags in arg->val into @bf (at most @size bytes) as
+ * a '|'-separated list of names with the O_ prefix dropped. Zero is
+ * printed as "RDONLY"; bits without a known name are appended in hex.
+ *
+ * When O_CREAT is not set, the bit for the following argument is set in
+ * arg->mask (the mode parameter, per the comment in the code).
+ * Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
+						struct syscall_arg *arg)
+{
+	int printed = 0, flags = arg->val;
+
+	if (!(flags & O_CREAT))
+		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
+
+	if (flags == 0)
+		return scnprintf(bf, size, "RDONLY");
+#define	P_FLAG(n) \
+	if (flags & O_##n) { \
+		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+		flags &= ~O_##n; \
+	}
+
+	P_FLAG(APPEND);
+	P_FLAG(ASYNC);
+	P_FLAG(CLOEXEC);
+	P_FLAG(CREAT);
+	P_FLAG(DIRECT);
+	P_FLAG(DIRECTORY);
+	P_FLAG(EXCL);
+	P_FLAG(LARGEFILE);
+	P_FLAG(NOATIME);
+	P_FLAG(NOCTTY);
+#ifdef O_NONBLOCK
+	P_FLAG(NONBLOCK);
+#elif O_NDELAY
+	P_FLAG(NDELAY);
+#endif
+#ifdef O_PATH
+	P_FLAG(PATH);
+#endif
+	P_FLAG(RDWR);
+#ifdef O_DSYNC
+	/*
+	 * O_SYNC is tested as a whole mask first so that a full O_SYNC is
+	 * not reported as DSYNC.
+	 */
+	if ((flags & O_SYNC) == O_SYNC) {
+		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
+		/*
+		 * Clear the bits just reported; they used to stay set and
+		 * were printed a second time in the hex remainder below.
+		 */
+		flags &= ~O_SYNC;
+	} else {
+		P_FLAG(DSYNC);
+	}
+#else
+	P_FLAG(SYNC);
+#endif
+	P_FLAG(TRUNC);
+	P_FLAG(WRONLY);
+#undef P_FLAG
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c
new file mode 100644
index 000000000000..311f09dd718d
--- /dev/null
+++ b/tools/perf/trace/beauty/perf_event_open.c
@@ -0,0 +1,43 @@
+#ifndef PERF_FLAG_FD_NO_GROUP
+# define PERF_FLAG_FD_NO_GROUP (1UL << 0)
+#endif
+
+#ifndef PERF_FLAG_FD_OUTPUT
+# define PERF_FLAG_FD_OUTPUT (1UL << 1)
+#endif
+
+#ifndef PERF_FLAG_PID_CGROUP
+# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
+#endif
+
+#ifndef PERF_FLAG_FD_CLOEXEC
+# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
+#endif
+
+/*
+ * Format the perf_event_open() flags in arg->val into @bf (at most @size
+ * bytes) as a '|'-separated list of names with the PERF_FLAG_ prefix
+ * dropped. Zero prints nothing and returns 0; bits without a known name
+ * are appended in hex. Returns the number of characters written.
+ */
+static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
+						struct syscall_arg *arg)
+{
+	static const struct {
+		unsigned long bit;
+		const char   *name;
+	} known[] = {
+		{ PERF_FLAG_FD_NO_GROUP, "FD_NO_GROUP" },
+		{ PERF_FLAG_FD_OUTPUT,	 "FD_OUTPUT"   },
+		{ PERF_FLAG_PID_CGROUP,	 "PID_CGROUP"  },
+		{ PERF_FLAG_FD_CLOEXEC,	 "FD_CLOEXEC"  },
+	};
+	int printed = 0, flags = arg->val;
+	size_t i;
+
+	if (flags == 0)
+		return 0;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (!(flags & known[i].bit))
+			continue;
+
+		printed += scnprintf(bf + printed, size - printed, "%s%s",
+				     printed ? "|" : "", known[i].name);
+		flags &= ~known[i].bit;
+	}
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c
new file mode 100644
index 000000000000..07486ea65ae3
--- /dev/null
+++ b/tools/perf/trace/beauty/pid.c
@@ -0,0 +1,21 @@
+static size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg)
+{
+ int pid = arg->val;
+ struct trace *trace = arg->trace;
+ size_t printed = scnprintf(bf, size, "%d", pid);
+ struct thread *thread = machine__findnew_thread(trace->host, pid, pid);
+
+ if (thread != NULL) {
+ if (!thread->comm_set)
+ thread__set_comm_from_proc(thread);
+
+ if (thread->comm_set)
+ printed += scnprintf(bf + printed, size - printed,
+ " (%s)", thread__comm_str(thread));
+ thread__put(thread);
+ }
+
+ return printed;
+}
+
+#define SCA_PID syscall_arg__scnprintf_pid
diff --git a/tools/perf/trace/beauty/sched_policy.c b/tools/perf/trace/beauty/sched_policy.c
new file mode 100644
index 000000000000..c205bc608b3c
--- /dev/null
+++ b/tools/perf/trace/beauty/sched_policy.c
@@ -0,0 +1,44 @@
+#include <sched.h>
+
+/*
+ * Not defined anywhere else, probably, just to make sure we
+ * catch future flags
+ */
+#define SCHED_POLICY_MASK 0xff
+
+#ifndef SCHED_DEADLINE
+#define SCHED_DEADLINE 6
+#endif
+
+static size_t syscall_arg__scnprintf_sched_policy(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ const char *policies[] = {
+ "NORMAL", "FIFO", "RR", "BATCH", "ISO", "IDLE", "DEADLINE",
+ };
+ size_t printed;
+ int policy = arg->val,
+ flags = policy & ~SCHED_POLICY_MASK;
+
+ policy &= SCHED_POLICY_MASK;
+ if (policy <= SCHED_DEADLINE)
+ printed = scnprintf(bf, size, "%s", policies[policy]);
+ else
+ printed = scnprintf(bf, size, "%#x", policy);
+
+#define P_POLICY_FLAG(n) \
+ if (flags & SCHED_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
+ flags &= ~SCHED_##n; \
+ }
+
+ P_POLICY_FLAG(RESET_ON_FORK);
+#undef P_POLICY_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
+
+ return printed;
+}
+
+#define SCA_SCHED_POLICY syscall_arg__scnprintf_sched_policy
diff --git a/tools/perf/trace/beauty/seccomp.c b/tools/perf/trace/beauty/seccomp.c
new file mode 100644
index 000000000000..213c5a7e3e92
--- /dev/null
+++ b/tools/perf/trace/beauty/seccomp.c
@@ -0,0 +1,52 @@
+#include <linux/seccomp.h>
+
+#ifndef SECCOMP_SET_MODE_STRICT
+#define SECCOMP_SET_MODE_STRICT 0
+#endif
+#ifndef SECCOMP_SET_MODE_FILTER
+#define SECCOMP_SET_MODE_FILTER 1
+#endif
+
+static size_t syscall_arg__scnprintf_seccomp_op(char *bf, size_t size, struct syscall_arg *arg)
+{
+ int op = arg->val;
+ size_t printed = 0;
+
+ switch (op) {
+#define P_SECCOMP_SET_MODE_OP(n) case SECCOMP_SET_MODE_##n: printed = scnprintf(bf, size, #n); break
+ P_SECCOMP_SET_MODE_OP(STRICT);
+ P_SECCOMP_SET_MODE_OP(FILTER);
+#undef P_SECCOMP_SET_MODE_OP
+ default: printed = scnprintf(bf, size, "%#x", op); break;
+ }
+
+ return printed;
+}
+
+#define SCA_SECCOMP_OP syscall_arg__scnprintf_seccomp_op
+
+#ifndef SECCOMP_FILTER_FLAG_TSYNC
+#define SECCOMP_FILTER_FLAG_TSYNC 1
+#endif
+
+static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int printed = 0, flags = arg->val;
+
+#define P_FLAG(n) \
+ if (flags & SECCOMP_FILTER_FLAG_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~SECCOMP_FILTER_FLAG_##n; \
+ }
+
+ P_FLAG(TSYNC);
+#undef P_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
+}
+
+#define SCA_SECCOMP_FLAGS syscall_arg__scnprintf_seccomp_flags
diff --git a/tools/perf/trace/beauty/signum.c b/tools/perf/trace/beauty/signum.c
new file mode 100644
index 000000000000..d3b0b1fab077
--- /dev/null
+++ b/tools/perf/trace/beauty/signum.c
@@ -0,0 +1,53 @@
+
+static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
+{
+ int sig = arg->val;
+
+ switch (sig) {
+#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
+ P_SIGNUM(HUP);
+ P_SIGNUM(INT);
+ P_SIGNUM(QUIT);
+ P_SIGNUM(ILL);
+ P_SIGNUM(TRAP);
+ P_SIGNUM(ABRT);
+ P_SIGNUM(BUS);
+ P_SIGNUM(FPE);
+ P_SIGNUM(KILL);
+ P_SIGNUM(USR1);
+ P_SIGNUM(SEGV);
+ P_SIGNUM(USR2);
+ P_SIGNUM(PIPE);
+ P_SIGNUM(ALRM);
+ P_SIGNUM(TERM);
+ P_SIGNUM(CHLD);
+ P_SIGNUM(CONT);
+ P_SIGNUM(STOP);
+ P_SIGNUM(TSTP);
+ P_SIGNUM(TTIN);
+ P_SIGNUM(TTOU);
+ P_SIGNUM(URG);
+ P_SIGNUM(XCPU);
+ P_SIGNUM(XFSZ);
+ P_SIGNUM(VTALRM);
+ P_SIGNUM(PROF);
+ P_SIGNUM(WINCH);
+ P_SIGNUM(IO);
+ P_SIGNUM(PWR);
+ P_SIGNUM(SYS);
+#ifdef SIGEMT
+ P_SIGNUM(EMT);
+#endif
+#ifdef SIGSTKFLT
+ P_SIGNUM(STKFLT);
+#endif
+#ifdef SIGSWI
+ P_SIGNUM(SWI);
+#endif
+ default: break;
+ }
+
+ return scnprintf(bf, size, "%#x", sig);
+}
+
+#define SCA_SIGNUM syscall_arg__scnprintf_signum
diff --git a/tools/perf/trace/beauty/socket_type.c b/tools/perf/trace/beauty/socket_type.c
new file mode 100644
index 000000000000..0a5ce818131c
--- /dev/null
+++ b/tools/perf/trace/beauty/socket_type.c
@@ -0,0 +1,60 @@
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#ifndef SOCK_DCCP
+# define SOCK_DCCP 6
+#endif
+
+#ifndef SOCK_CLOEXEC
+# define SOCK_CLOEXEC 02000000
+#endif
+
+#ifndef SOCK_NONBLOCK
+# define SOCK_NONBLOCK 00004000
+#endif
+
+#ifndef SOCK_TYPE_MASK
+#define SOCK_TYPE_MASK 0xf
+#endif
+
+static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size, struct syscall_arg *arg)
+{
+ size_t printed;
+ int type = arg->val,
+ flags = type & ~SOCK_TYPE_MASK;
+
+ type &= SOCK_TYPE_MASK;
+ /*
+ * Can't use a strarray, MIPS may override for ABI reasons.
+ */
+ switch (type) {
+#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
+ P_SK_TYPE(STREAM);
+ P_SK_TYPE(DGRAM);
+ P_SK_TYPE(RAW);
+ P_SK_TYPE(RDM);
+ P_SK_TYPE(SEQPACKET);
+ P_SK_TYPE(DCCP);
+ P_SK_TYPE(PACKET);
+#undef P_SK_TYPE
+ default:
+ printed = scnprintf(bf, size, "%#x", type);
+ }
+
+#define P_SK_FLAG(n) \
+ if (flags & SOCK_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
+ flags &= ~SOCK_##n; \
+ }
+
+ P_SK_FLAG(CLOEXEC);
+ P_SK_FLAG(NONBLOCK);
+#undef P_SK_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
+
+ return printed;
+}
+
+#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
diff --git a/tools/perf/trace/beauty/waitid_options.c b/tools/perf/trace/beauty/waitid_options.c
new file mode 100644
index 000000000000..7942724adec8
--- /dev/null
+++ b/tools/perf/trace/beauty/waitid_options.c
@@ -0,0 +1,26 @@
+#include <sys/types.h>
+#include <sys/wait.h>
+
+static size_t syscall_arg__scnprintf_waitid_options(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int printed = 0, options = arg->val;
+
+#define P_OPTION(n) \
+ if (options & W##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ options &= ~W##n; \
+ }
+
+ P_OPTION(NOHANG);
+ P_OPTION(UNTRACED);
+ P_OPTION(CONTINUED);
+#undef P_OPTION
+
+ if (options)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", options);
+
+ return printed;
+}
+
+#define SCA_WAITID_OPTIONS syscall_arg__scnprintf_waitid_options
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 2a83414159a6..538bae880bfe 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -1607,9 +1607,8 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows
ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
dummy_hpp.buf[ret] = '\0';
- rtrim(dummy_hpp.buf);
- start = ltrim(dummy_hpp.buf);
+ start = trim(dummy_hpp.buf);
ret = strlen(start);
if (start != dummy_hpp.buf)
@@ -1897,11 +1896,10 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser,
bool first = true;
int ret;
- if (symbol_conf.use_callchain)
+ if (symbol_conf.use_callchain) {
folded_sign = hist_entry__folded(he);
-
- if (symbol_conf.use_callchain)
printed += fprintf(fp, "%c ", folded_sign);
+ }
hists__for_each_format(browser->hists, fmt) {
if (perf_hpp__should_skip(fmt, he->hists))
@@ -2137,7 +2135,7 @@ static int hists__browser_title(struct hists *hists,
printed += snprintf(bf + printed, size - printed,
", UID: %s", hists->uid_filter_str);
if (thread) {
- if (sort__has_thread) {
+ if (hists__has(hists, thread)) {
printed += scnprintf(bf + printed, size - printed,
", Thread: %s(%d)",
(thread->comm_set ? thread__comm_str(thread) : ""),
@@ -2322,7 +2320,8 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act)
{
struct thread *thread = act->thread;
- if ((!sort__has_thread && !sort__has_comm) || thread == NULL)
+ if ((!hists__has(browser->hists, thread) &&
+ !hists__has(browser->hists, comm)) || thread == NULL)
return 0;
if (browser->hists->thread_filter) {
@@ -2331,7 +2330,7 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act)
thread__zput(browser->hists->thread_filter);
ui_helpline__pop();
} else {
- if (sort__has_thread) {
+ if (hists__has(browser->hists, thread)) {
ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"",
thread->comm_set ? thread__comm_str(thread) : "",
thread->tid);
@@ -2356,10 +2355,11 @@ add_thread_opt(struct hist_browser *browser, struct popup_action *act,
{
int ret;
- if ((!sort__has_thread && !sort__has_comm) || thread == NULL)
+ if ((!hists__has(browser->hists, thread) &&
+ !hists__has(browser->hists, comm)) || thread == NULL)
return 0;
- if (sort__has_thread) {
+ if (hists__has(browser->hists, thread)) {
ret = asprintf(optstr, "Zoom %s %s(%d) thread",
browser->hists->thread_filter ? "out of" : "into",
thread->comm_set ? thread__comm_str(thread) : "",
@@ -2382,7 +2382,7 @@ do_zoom_dso(struct hist_browser *browser, struct popup_action *act)
{
struct map *map = act->ms.map;
- if (!sort__has_dso || map == NULL)
+ if (!hists__has(browser->hists, dso) || map == NULL)
return 0;
if (browser->hists->dso_filter) {
@@ -2409,7 +2409,7 @@ static int
add_dso_opt(struct hist_browser *browser, struct popup_action *act,
char **optstr, struct map *map)
{
- if (!sort__has_dso || map == NULL)
+ if (!hists__has(browser->hists, dso) || map == NULL)
return 0;
if (asprintf(optstr, "Zoom %s %s DSO",
@@ -2431,10 +2431,10 @@ do_browse_map(struct hist_browser *browser __maybe_unused,
}
static int
-add_map_opt(struct hist_browser *browser __maybe_unused,
+add_map_opt(struct hist_browser *browser,
struct popup_action *act, char **optstr, struct map *map)
{
- if (!sort__has_dso || map == NULL)
+ if (!hists__has(browser->hists, dso) || map == NULL)
return 0;
if (asprintf(optstr, "Browse map details") < 0)
@@ -2536,7 +2536,7 @@ add_exit_opt(struct hist_browser *browser __maybe_unused,
static int
do_zoom_socket(struct hist_browser *browser, struct popup_action *act)
{
- if (!sort__has_socket || act->socket < 0)
+ if (!hists__has(browser->hists, socket) || act->socket < 0)
return 0;
if (browser->hists->socket_filter > -1) {
@@ -2558,7 +2558,7 @@ static int
add_socket_opt(struct hist_browser *browser, struct popup_action *act,
char **optstr, int socket_id)
{
- if (!sort__has_socket || socket_id < 0)
+ if (!hists__has(browser->hists, socket) || socket_id < 0)
return 0;
if (asprintf(optstr, "Zoom %s Processor Socket %d",
@@ -2749,7 +2749,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
*/
goto out_free_stack;
case 'a':
- if (!sort__has_sym) {
+ if (!hists__has(hists, sym)) {
ui_browser__warning(&browser->b, delay_secs * 2,
"Annotation is only available for symbolic views, "
"include \"sym*\" in --sort to use it.");
@@ -2912,7 +2912,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
continue;
}
- if (!sort__has_sym || browser->selection == NULL)
+ if (!hists__has(hists, sym) || browser->selection == NULL)
goto skip_annotation;
if (sort__mode == SORT_MODE__BRANCH) {
@@ -2956,7 +2956,7 @@ skip_annotation:
goto skip_scripting;
if (browser->he_selection) {
- if (sort__has_thread && thread) {
+ if (hists__has(hists, thread) && thread) {
nr_options += add_script_opt(browser,
&actions[nr_options],
&options[nr_options],
@@ -2971,7 +2971,7 @@ skip_annotation:
*
* See hist_browser__show_entry.
*/
- if (sort__has_sym && browser->selection->sym) {
+ if (hists__has(hists, sym) && browser->selection->sym) {
nr_options += add_script_opt(browser,
&actions[nr_options],
&options[nr_options],
diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c
index 2aa45b606fa4..932adfaa05af 100644
--- a/tools/perf/ui/gtk/hists.c
+++ b/tools/perf/ui/gtk/hists.c
@@ -379,7 +379,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists,
gtk_tree_store_set(store, &iter, col_idx++, s, -1);
}
- if (symbol_conf.use_callchain && sort__has_sym) {
+ if (symbol_conf.use_callchain && hists__has(hists, sym)) {
if (callchain_param.mode == CHAIN_GRAPH_REL)
total = symbol_conf.cumulate_callchain ?
h->stat_acc->period : h->stat.period;
diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c
index 3baeaa6e71b5..af07ffb129ca 100644
--- a/tools/perf/ui/hist.c
+++ b/tools/perf/ui/hist.c
@@ -635,7 +635,7 @@ unsigned int hists__sort_list_width(struct hists *hists)
ret += fmt->width(fmt, &dummy_hpp, hists_to_evsel(hists));
}
- if (verbose && sort__has_sym) /* Addr + origin */
+ if (verbose && hists__has(hists, sym)) /* Addr + origin */
ret += 3 + BITS_PER_LONG / 4;
return ret;
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 7aff5acf3265..560eb47d56f9 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -569,9 +569,8 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp,
first_col = false;
fmt->header(fmt, hpp, hists_to_evsel(hists));
- rtrim(hpp->buf);
- header_width += fprintf(fp, "%s", ltrim(hpp->buf));
+ header_width += fprintf(fp, "%s", trim(hpp->buf));
}
}
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index da48fd843438..8c6c8a0ca642 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -8,6 +8,7 @@ libperf-y += env.o
libperf-y += event.o
libperf-y += evlist.o
libperf-y += evsel.o
+libperf-y += evsel_fprintf.o
libperf-y += find_bit.o
libperf-y += kallsyms.o
libperf-y += levenshtein.o
@@ -26,9 +27,9 @@ libperf-y += strlist.o
libperf-y += strfilter.o
libperf-y += top.o
libperf-y += usage.o
-libperf-y += wrapper.o
libperf-y += dso.o
libperf-y += symbol.o
+libperf-y += symbol_fprintf.o
libperf-y += color.o
libperf-y += header.o
libperf-y += callchain.o
@@ -38,6 +39,7 @@ libperf-y += machine.o
libperf-y += map.o
libperf-y += pstack.o
libperf-y += session.o
+libperf-$(CONFIG_AUDIT) += syscalltbl.o
libperf-y += ordered-events.o
libperf-y += comm.o
libperf-y += thread.o
@@ -69,9 +71,9 @@ libperf-y += stat-shadow.o
libperf-y += record.o
libperf-y += srcline.o
libperf-y += data.o
-libperf-$(CONFIG_X86) += tsc.o
-libperf-$(CONFIG_AUXTRACE) += tsc.o
+libperf-y += tsc.o
libperf-y += cloexec.o
+libperf-y += call-path.o
libperf-y += thread-stack.o
libperf-$(CONFIG_AUXTRACE) += auxtrace.o
libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index b795b6994144..4db73d5a0dbc 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1138,7 +1138,7 @@ fallback:
if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS &&
!dso__is_kcore(dso)) {
- char bf[BUILD_ID_SIZE * 2 + 16] = " with build id ";
+ char bf[SBUILD_ID_SIZE + 15] = " with build id ";
char *build_id_msg = NULL;
if (dso->annotate_warned)
@@ -1665,5 +1665,5 @@ int hist_entry__annotate(struct hist_entry *he, size_t privsize)
bool ui__has_annotation(void)
{
- return use_browser == 1 && sort__has_sym;
+ return use_browser == 1 && perf_hpp_list.sym;
}
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index ec164fe70718..c9169011e55e 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -940,6 +940,7 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ;
+ synth_opts->initial_skip = 0;
}
/*
@@ -1064,6 +1065,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str,
synth_opts->last_branch_sz = val;
}
break;
+ case 's':
+ synth_opts->initial_skip = strtoul(p, &endptr, 10);
+ if (p == endptr)
+ goto out_err;
+ p = endptr;
+ break;
case ' ':
case ',':
break;
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index 57ff31ecb8e4..767989e0e312 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -68,6 +68,7 @@ enum itrace_period_type {
* @last_branch_sz: branch context size
* @period: 'instructions' events period
* @period_type: 'instructions' events period type
+ * @initial_skip: skip N events at the beginning.
*/
struct itrace_synth_opts {
bool set;
@@ -86,6 +87,7 @@ struct itrace_synth_opts {
unsigned int last_branch_sz;
unsigned long long period;
enum itrace_period_type period_type;
+ unsigned long initial_skip;
};
/**
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 0967ce601931..493307d1414c 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -842,6 +842,58 @@ bpf_map_op__new(struct parse_events_term *term)
return op;
}
+static struct bpf_map_op *
+bpf_map_op__clone(struct bpf_map_op *op)
+{
+ struct bpf_map_op *newop;
+
+ newop = memdup(op, sizeof(*op));
+ if (!newop) {
+ pr_debug("Failed to alloc bpf_map_op\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&newop->list);
+ if (op->key_type == BPF_MAP_KEY_RANGES) {
+ size_t memsz = op->k.array.nr_ranges *
+ sizeof(op->k.array.ranges[0]);
+
+ newop->k.array.ranges = memdup(op->k.array.ranges, memsz);
+ if (!newop->k.array.ranges) {
+ pr_debug("Failed to alloc indices for map\n");
+ free(newop);
+ return NULL;
+ }
+ }
+
+ return newop;
+}
+
+static struct bpf_map_priv *
+bpf_map_priv__clone(struct bpf_map_priv *priv)
+{
+ struct bpf_map_priv *newpriv;
+ struct bpf_map_op *pos, *newop;
+
+ newpriv = zalloc(sizeof(*newpriv));
+ if (!newpriv) {
+ pr_debug("No enough memory to alloc map private\n");
+ return NULL;
+ }
+ INIT_LIST_HEAD(&newpriv->ops_list);
+
+ list_for_each_entry(pos, &priv->ops_list, list) {
+ newop = bpf_map_op__clone(pos);
+ if (!newop) {
+ bpf_map_priv__purge(newpriv);
+ return NULL;
+ }
+ list_add_tail(&newop->list, &newpriv->ops_list);
+ }
+
+ return newpriv;
+}
+
static int
bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op)
{
@@ -1417,6 +1469,89 @@ int bpf__apply_obj_config(void)
return 0;
}
+#define bpf__for_each_map(pos, obj, objtmp) \
+ bpf_object__for_each_safe(obj, objtmp) \
+ bpf_map__for_each(pos, obj)
+
+#define bpf__for_each_stdout_map(pos, obj, objtmp) \
+ bpf__for_each_map(pos, obj, objtmp) \
+ if (bpf_map__get_name(pos) && \
+ (strcmp("__bpf_stdout__", \
+ bpf_map__get_name(pos)) == 0))
+
+int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
+{
+ struct bpf_map_priv *tmpl_priv = NULL;
+ struct bpf_object *obj, *tmp;
+ struct perf_evsel *evsel = NULL;
+ struct bpf_map *map;
+ int err;
+ bool need_init = false;
+
+ bpf__for_each_stdout_map(map, obj, tmp) {
+ struct bpf_map_priv *priv;
+
+ err = bpf_map__get_private(map, (void **)&priv);
+ if (err)
+ return -BPF_LOADER_ERRNO__INTERNAL;
+
+ /*
+ * No need to check map type: type should have been
+ * verified by kernel.
+ */
+ if (!need_init && !priv)
+ need_init = !priv;
+ if (!tmpl_priv && priv)
+ tmpl_priv = priv;
+ }
+
+ if (!need_init)
+ return 0;
+
+ if (!tmpl_priv) {
+ err = parse_events(evlist, "bpf-output/no-inherit=1,name=__bpf_stdout__/",
+ NULL);
+ if (err) {
+ pr_debug("ERROR: failed to create bpf-output event\n");
+ return -err;
+ }
+
+ evsel = perf_evlist__last(evlist);
+ }
+
+ bpf__for_each_stdout_map(map, obj, tmp) {
+ struct bpf_map_priv *priv;
+
+ err = bpf_map__get_private(map, (void **)&priv);
+ if (err)
+ return -BPF_LOADER_ERRNO__INTERNAL;
+ if (priv)
+ continue;
+
+ if (tmpl_priv) {
+ priv = bpf_map_priv__clone(tmpl_priv);
+ if (!priv)
+ return -ENOMEM;
+
+ err = bpf_map__set_private(map, priv, bpf_map_priv__clear);
+ if (err) {
+ bpf_map_priv__clear(map, priv);
+ return err;
+ }
+ } else if (evsel) {
+ struct bpf_map_op *op;
+
+ op = bpf_map__add_newop(map, NULL);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+ op->op_type = BPF_MAP_OP_SET_EVSEL;
+ op->v.evsel = evsel;
+ }
+ }
+
+ return 0;
+}
+
#define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START)
#define ERRCODE_OFFSET(c) ERRNO_OFFSET(BPF_LOADER_ERRNO__##c)
#define NR_ERRNO (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START)
@@ -1590,3 +1725,11 @@ int bpf__strerror_apply_obj_config(int err, char *buf, size_t size)
bpf__strerror_end(buf, size);
return 0;
}
+
+int bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused,
+ int err, char *buf, size_t size)
+{
+ bpf__strerror_head(err, buf, size);
+ bpf__strerror_end(buf, size);
+ return 0;
+}
diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h
index be4311944e3d..941e17275aa7 100644
--- a/tools/perf/util/bpf-loader.h
+++ b/tools/perf/util/bpf-loader.h
@@ -79,6 +79,11 @@ int bpf__strerror_config_obj(struct bpf_object *obj,
size_t size);
int bpf__apply_obj_config(void);
int bpf__strerror_apply_obj_config(int err, char *buf, size_t size);
+
+int bpf__setup_stdout(struct perf_evlist *evlist);
+int bpf__strerror_setup_stdout(struct perf_evlist *evlist, int err,
+ char *buf, size_t size);
+
#else
static inline struct bpf_object *
bpf__prepare_load(const char *filename __maybe_unused,
@@ -125,6 +130,12 @@ bpf__apply_obj_config(void)
}
static inline int
+bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
+{
+ return 0;
+}
+
+static inline int
__bpf_strerror(char *buf, size_t size)
{
if (!size)
@@ -177,5 +188,13 @@ bpf__strerror_apply_obj_config(int err __maybe_unused,
{
return __bpf_strerror(buf, size);
}
+
+static inline int
+bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused,
+ int err __maybe_unused, char *buf,
+ size_t size)
+{
+ return __bpf_strerror(buf, size);
+}
#endif
#endif
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 0573c2ec861d..bff425e1232c 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -261,14 +261,14 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
if (dso__is_vdso(pos)) {
name = pos->short_name;
- name_len = pos->short_name_len + 1;
+ name_len = pos->short_name_len;
} else if (dso__is_kcore(pos)) {
machine__mmap_name(machine, nm, sizeof(nm));
name = nm;
- name_len = strlen(nm) + 1;
+ name_len = strlen(nm);
} else {
name = pos->long_name;
- name_len = pos->long_name_len + 1;
+ name_len = pos->long_name_len;
}
in_kernel = pos->kernel ||
@@ -365,39 +365,17 @@ static char *build_id_cache__dirname_from_path(const char *name,
int build_id_cache__list_build_ids(const char *pathname,
struct strlist **result)
{
- struct strlist *list;
char *dir_name;
- DIR *dir;
- struct dirent *d;
int ret = 0;
- list = strlist__new(NULL, NULL);
dir_name = build_id_cache__dirname_from_path(pathname, false, false);
- if (!list || !dir_name) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!dir_name)
+ return -ENOMEM;
- /* List up all dirents */
- dir = opendir(dir_name);
- if (!dir) {
+ *result = lsdir(dir_name, lsdir_no_dot_filter);
+ if (!*result)
ret = -errno;
- goto out;
- }
-
- while ((d = readdir(dir)) != NULL) {
- if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
- continue;
- strlist__add(list, d->d_name);
- }
- closedir(dir);
-
-out:
free(dir_name);
- if (ret)
- strlist__delete(list);
- else
- *result = list;
return ret;
}
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 1f5a93c2c9a2..0d814bb74661 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -40,25 +40,6 @@ int split_cmdline(char *cmdline, const char ***argv);
#define alloc_nr(x) (((x)+16)*3/2)
-/*
- * Realloc the buffer pointed at by variable 'x' so that it can hold
- * at least 'nr' entries; the number of entries currently allocated
- * is 'alloc', using the standard growing factor alloc_nr() macro.
- *
- * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
- */
-#define ALLOC_GROW(x, nr, alloc) \
- do { \
- if ((nr) > alloc) { \
- if (alloc_nr(alloc) < (nr)) \
- alloc = (nr); \
- else \
- alloc = alloc_nr(alloc); \
- x = xrealloc((x), alloc * sizeof(*(x))); \
- } \
- } while(0)
-
-
static inline int is_absolute_path(const char *path)
{
return path[0] == '/';
diff --git a/tools/perf/util/call-path.c b/tools/perf/util/call-path.c
new file mode 100644
index 000000000000..904a17052e38
--- /dev/null
+++ b/tools/perf/util/call-path.c
@@ -0,0 +1,122 @@
+/*
+ * call-path.c: Manipulate a tree data structure containing function call paths
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+#include "util.h"
+#include "call-path.h"
+
+static void call_path__init(struct call_path *cp, struct call_path *parent,
+ struct symbol *sym, u64 ip, bool in_kernel)
+{
+ cp->parent = parent;
+ cp->sym = sym;
+ cp->ip = sym ? 0 : ip;
+ cp->db_id = 0;
+ cp->in_kernel = in_kernel;
+ RB_CLEAR_NODE(&cp->rb_node);
+ cp->children = RB_ROOT;
+}
+
+struct call_path_root *call_path_root__new(void)
+{
+ struct call_path_root *cpr;
+
+ cpr = zalloc(sizeof(struct call_path_root));
+ if (!cpr)
+ return NULL;
+ call_path__init(&cpr->call_path, NULL, NULL, 0, false);
+ INIT_LIST_HEAD(&cpr->blocks);
+ return cpr;
+}
+
+void call_path_root__free(struct call_path_root *cpr)
+{
+ struct call_path_block *pos, *n;
+
+ list_for_each_entry_safe(pos, n, &cpr->blocks, node) {
+ list_del(&pos->node);
+ free(pos);
+ }
+ free(cpr);
+}
+
+static struct call_path *call_path__new(struct call_path_root *cpr,
+ struct call_path *parent,
+ struct symbol *sym, u64 ip,
+ bool in_kernel)
+{
+ struct call_path_block *cpb;
+ struct call_path *cp;
+ size_t n;
+
+ if (cpr->next < cpr->sz) {
+ cpb = list_last_entry(&cpr->blocks, struct call_path_block,
+ node);
+ } else {
+ cpb = zalloc(sizeof(struct call_path_block));
+ if (!cpb)
+ return NULL;
+ list_add_tail(&cpb->node, &cpr->blocks);
+ cpr->sz += CALL_PATH_BLOCK_SIZE;
+ }
+
+ n = cpr->next++ & CALL_PATH_BLOCK_MASK;
+ cp = &cpb->cp[n];
+
+ call_path__init(cp, parent, sym, ip, in_kernel);
+
+ return cp;
+}
+
+struct call_path *call_path__findnew(struct call_path_root *cpr,
+ struct call_path *parent,
+ struct symbol *sym, u64 ip, u64 ks)
+{
+ struct rb_node **p;
+ struct rb_node *node_parent = NULL;
+ struct call_path *cp;
+ bool in_kernel = ip >= ks;
+
+ if (sym)
+ ip = 0;
+
+ if (!parent)
+ return call_path__new(cpr, parent, sym, ip, in_kernel);
+
+ p = &parent->children.rb_node;
+ while (*p != NULL) {
+ node_parent = *p;
+ cp = rb_entry(node_parent, struct call_path, rb_node);
+
+ if (cp->sym == sym && cp->ip == ip)
+ return cp;
+
+ if (sym < cp->sym || (sym == cp->sym && ip < cp->ip))
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+
+ cp = call_path__new(cpr, parent, sym, ip, in_kernel);
+ if (!cp)
+ return NULL;
+
+ rb_link_node(&cp->rb_node, node_parent, p);
+ rb_insert_color(&cp->rb_node, &parent->children);
+
+ return cp;
+}
diff --git a/tools/perf/util/call-path.h b/tools/perf/util/call-path.h
new file mode 100644
index 000000000000..477f6d03b659
--- /dev/null
+++ b/tools/perf/util/call-path.h
@@ -0,0 +1,77 @@
+/*
+ * call-path.h: Manipulate a tree data structure containing function call paths
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef __PERF_CALL_PATH_H
+#define __PERF_CALL_PATH_H
+
+#include <sys/types.h>
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+/**
+ * struct call_path - node in list of calls leading to a function call.
+ * @parent: call path to the parent function call
+ * @sym: symbol of function called
+ * @ip: only if sym is null, the ip of the function
+ * @db_id: id used for db-export
+ * @in_kernel: whether function is in the kernel
+ * @rb_node: node in parent's tree of called functions
+ * @children: tree of call paths of functions called
+ *
+ * In combination with the call_return structure, the call_path structure
+ * defines a context-sensitive call-graph.
+ */
+struct call_path {
+ struct call_path *parent;
+ struct symbol *sym;
+ u64 ip;
+ u64 db_id;
+ bool in_kernel;
+ struct rb_node rb_node;
+ struct rb_root children;
+};
+
+#define CALL_PATH_BLOCK_SHIFT 8
+#define CALL_PATH_BLOCK_SIZE (1 << CALL_PATH_BLOCK_SHIFT)
+#define CALL_PATH_BLOCK_MASK (CALL_PATH_BLOCK_SIZE - 1)
+
+struct call_path_block {
+ struct call_path cp[CALL_PATH_BLOCK_SIZE];
+ struct list_head node;
+};
+
+/**
+ * struct call_path_root - root of all call paths.
+ * @call_path: root call path
+ * @blocks: list of blocks to store call paths
+ * @next: next free space
+ * @sz: number of spaces
+ */
+struct call_path_root {
+ struct call_path call_path;
+ struct list_head blocks;
+ size_t next;
+ size_t sz;
+};
+
+struct call_path_root *call_path_root__new(void);
+void call_path_root__free(struct call_path_root *cpr);
+
+struct call_path *call_path__findnew(struct call_path_root *cpr,
+ struct call_path *parent,
+ struct symbol *sym, u64 ip, u64 ks);
+
+#endif
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 24b4bd0d7754..07fd30bc2f81 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -109,6 +109,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt)
bool record_opt_set = false;
bool try_stack_size = false;
+ callchain_param.enabled = true;
symbol_conf.use_callchain = true;
if (!arg)
@@ -117,6 +118,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt)
while ((tok = strtok((char *)arg, ",")) != NULL) {
if (!strncmp(tok, "none", strlen(tok))) {
callchain_param.mode = CHAIN_NONE;
+ callchain_param.enabled = false;
symbol_conf.use_callchain = false;
return 0;
}
@@ -788,7 +790,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
return 0;
}
-int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent,
+int sample__resolve_callchain(struct perf_sample *sample,
+ struct callchain_cursor *cursor, struct symbol **parent,
struct perf_evsel *evsel, struct addr_location *al,
int max_stack)
{
@@ -796,8 +799,8 @@ int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent
return 0;
if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain ||
- sort__has_parent) {
- return thread__resolve_callchain(al->thread, evsel, sample,
+ perf_hpp_list.parent) {
+ return thread__resolve_callchain(al->thread, cursor, evsel, sample,
parent, al, max_stack);
}
return 0;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index d2a9e694810c..65e2a4f7cb4e 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -212,7 +212,14 @@ struct hist_entry;
int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset);
int record_callchain_opt(const struct option *opt, const char *arg, int unset);
-int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent,
+struct record_opts;
+
+int record_opts__parse_callchain(struct record_opts *record,
+ struct callchain_param *callchain,
+ const char *arg, bool unset);
+
+int sample__resolve_callchain(struct perf_sample *sample,
+ struct callchain_cursor *cursor, struct symbol **parent,
struct perf_evsel *evsel, struct addr_location *al,
int max_stack);
int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *sample);
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 4e727635476e..dad7d8272168 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -13,6 +13,7 @@
#include <subcmd/exec-cmd.h>
#include "util/hist.h" /* perf_hist_config */
#include "util/llvm-utils.h" /* perf_llvm_config */
+#include "config.h"
#define MAXNAME (256)
@@ -377,6 +378,21 @@ const char *perf_config_dirname(const char *name, const char *value)
return value;
}
+static int perf_buildid_config(const char *var, const char *value)
+{
+ /* same dir for all commands */
+ if (!strcmp(var, "buildid.dir")) {
+ const char *dir = perf_config_dirname(var, value);
+
+ if (!dir)
+ return -1;
+ strncpy(buildid_dir, dir, MAXPATHLEN-1);
+ buildid_dir[MAXPATHLEN-1] = '\0';
+ }
+
+ return 0;
+}
+
static int perf_default_core_config(const char *var __maybe_unused,
const char *value __maybe_unused)
{
@@ -412,6 +428,9 @@ int perf_default_config(const char *var, const char *value,
if (!prefixcmp(var, "llvm."))
return perf_llvm_config(var, value);
+ if (!prefixcmp(var, "buildid."))
+ return perf_buildid_config(var, value);
+
/* Add other config variables here. */
return 0;
}
@@ -506,41 +525,185 @@ out:
return ret;
}
-/*
- * Call this to report error for your variable that should not
- * get a boolean value (i.e. "[my] var" means "true").
- */
-int config_error_nonbool(const char *var)
+static struct perf_config_section *find_section(struct list_head *sections,
+ const char *section_name)
{
- return error("Missing value for '%s'", var);
+ struct perf_config_section *section;
+
+ list_for_each_entry(section, sections, node)
+ if (!strcmp(section->name, section_name))
+ return section;
+
+ return NULL;
+}
+
+static struct perf_config_item *find_config_item(const char *name,
+ struct perf_config_section *section)
+{
+ struct perf_config_item *item;
+
+ list_for_each_entry(item, &section->items, node)
+ if (!strcmp(item->name, name))
+ return item;
+
+ return NULL;
}
-struct buildid_dir_config {
- char *dir;
-};
+static struct perf_config_section *add_section(struct list_head *sections,
+ const char *section_name)
+{
+ struct perf_config_section *section = zalloc(sizeof(*section));
+
+ if (!section)
+ return NULL;
+
+ INIT_LIST_HEAD(&section->items);
+ section->name = strdup(section_name);
+ if (!section->name) {
+ pr_debug("%s: strdup failed\n", __func__);
+ free(section);
+ return NULL;
+ }
+
+ list_add_tail(&section->node, sections);
+ return section;
+}
-static int buildid_dir_command_config(const char *var, const char *value,
- void *data)
+static struct perf_config_item *add_config_item(struct perf_config_section *section,
+ const char *name)
{
- struct buildid_dir_config *c = data;
- const char *v;
+ struct perf_config_item *item = zalloc(sizeof(*item));
- /* same dir for all commands */
- if (!strcmp(var, "buildid.dir")) {
- v = perf_config_dirname(var, value);
- if (!v)
- return -1;
- strncpy(c->dir, v, MAXPATHLEN-1);
- c->dir[MAXPATHLEN-1] = '\0';
+ if (!item)
+ return NULL;
+
+ item->name = strdup(name);
+ if (!item->name) {
+ pr_debug("%s: strdup failed\n", __func__);
+ free(item);
+ return NULL;
}
+
+ list_add_tail(&item->node, &section->items);
+ return item;
+}
+
+static int set_value(struct perf_config_item *item, const char *value)
+{
+ char *val = strdup(value);
+
+ if (!val)
+ return -1;
+
+ zfree(&item->value);
+ item->value = val;
return 0;
}
-static void check_buildid_dir_config(void)
+static int collect_config(const char *var, const char *value,
+ void *perf_config_set)
{
- struct buildid_dir_config c;
- c.dir = buildid_dir;
- perf_config(buildid_dir_command_config, &c);
+ int ret = -1;
+ char *ptr, *key;
+ char *section_name, *name;
+ struct perf_config_section *section = NULL;
+ struct perf_config_item *item = NULL;
+ struct perf_config_set *set = perf_config_set;
+ struct list_head *sections = &set->sections;
+
+ key = ptr = strdup(var);
+ if (!key) {
+ pr_debug("%s: strdup failed\n", __func__);
+ return -1;
+ }
+
+ section_name = strsep(&ptr, ".");
+ name = ptr;
+ if (name == NULL || value == NULL)
+ goto out_free;
+
+ section = find_section(sections, section_name);
+ if (!section) {
+ section = add_section(sections, section_name);
+ if (!section)
+ goto out_free;
+ }
+
+ item = find_config_item(name, section);
+ if (!item) {
+ item = add_config_item(section, name);
+ if (!item)
+ goto out_free;
+ }
+
+ ret = set_value(item, value);
+ return ret;
+
+out_free:
+ free(key);
+ perf_config_set__delete(set);
+ return -1;
+}
+
+struct perf_config_set *perf_config_set__new(void)
+{
+ struct perf_config_set *set = zalloc(sizeof(*set));
+
+ if (set) {
+ INIT_LIST_HEAD(&set->sections);
+ perf_config(collect_config, set);
+ }
+
+ return set;
+}
+
+static void perf_config_item__delete(struct perf_config_item *item)
+{
+ zfree(&item->name);
+ zfree(&item->value);
+ free(item);
+}
+
+static void perf_config_section__purge(struct perf_config_section *section)
+{
+ struct perf_config_item *item, *tmp;
+
+ list_for_each_entry_safe(item, tmp, &section->items, node) {
+ list_del_init(&item->node);
+ perf_config_item__delete(item);
+ }
+}
+
+static void perf_config_section__delete(struct perf_config_section *section)
+{
+ perf_config_section__purge(section);
+ zfree(&section->name);
+ free(section);
+}
+
+static void perf_config_set__purge(struct perf_config_set *set)
+{
+ struct perf_config_section *section, *tmp;
+
+ list_for_each_entry_safe(section, tmp, &set->sections, node) {
+ list_del_init(&section->node);
+ perf_config_section__delete(section);
+ }
+}
+
+void perf_config_set__delete(struct perf_config_set *set)
+{
+ perf_config_set__purge(set);
+ free(set);
+}
+
+/*
+ * Call this to report error for your variable that should not
+ * get a boolean value (i.e. "[my] var" means "true").
+ */
+int config_error_nonbool(const char *var)
+{
+ return error("Missing value for '%s'", var);
}
void set_buildid_dir(const char *dir)
@@ -548,16 +711,13 @@ void set_buildid_dir(const char *dir)
if (dir)
scnprintf(buildid_dir, MAXPATHLEN-1, "%s", dir);
- /* try config file */
- if (buildid_dir[0] == '\0')
- check_buildid_dir_config();
-
/* default to $HOME/.debug */
if (buildid_dir[0] == '\0') {
- char *v = getenv("HOME");
- if (v) {
+ char *home = getenv("HOME");
+
+ if (home) {
snprintf(buildid_dir, MAXPATHLEN-1, "%s/%s",
- v, DEBUG_CACHE_DIR);
+ home, DEBUG_CACHE_DIR);
} else {
strncpy(buildid_dir, DEBUG_CACHE_DIR, MAXPATHLEN-1);
}
diff --git a/tools/perf/util/config.h b/tools/perf/util/config.h
new file mode 100644
index 000000000000..22ec626ac718
--- /dev/null
+++ b/tools/perf/util/config.h
@@ -0,0 +1,26 @@
+#ifndef __PERF_CONFIG_H
+#define __PERF_CONFIG_H
+
+#include <stdbool.h>
+#include <linux/list.h>
+
+struct perf_config_item {
+ char *name;
+ char *value;
+ struct list_head node;
+};
+
+struct perf_config_section {
+ char *name;
+ struct list_head items;
+ struct list_head node;
+};
+
+struct perf_config_set {
+ struct list_head sections;
+};
+
+struct perf_config_set *perf_config_set__new(void);
+void perf_config_set__delete(struct perf_config_set *set);
+
+#endif /* __PERF_CONFIG_H */
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 9bcf2bed3a6d..02d801670f30 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -587,3 +587,15 @@ int cpu__setup_cpunode_map(void)
closedir(dir1);
return 0;
}
+
+bool cpu_map__has(struct cpu_map *cpus, int cpu)
+{
+ int i;
+
+ for (i = 0; i < cpus->nr; ++i) {
+ if (cpus->map[i] == cpu)
+ return true;
+ }
+
+ return false;
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 81a2562aaa2b..1a0a35073ce1 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -66,4 +66,6 @@ int cpu__get_node(int cpu);
int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
int (*f)(struct cpu_map *map, int cpu, void *data),
void *data);
+
+bool cpu_map__has(struct cpu_map *cpus, int cpu);
#endif /* __PERF_CPUMAP_H */
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 1921942fc2e0..be83516155ee 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -136,3 +136,44 @@ ssize_t perf_data_file__write(struct perf_data_file *file,
{
return writen(file->fd, buf, size);
}
+
+int perf_data_file__switch(struct perf_data_file *file,
+ const char *postfix,
+ size_t pos, bool at_exit)
+{
+ char *new_filepath;
+ int ret;
+
+ if (check_pipe(file))
+ return -EINVAL;
+ if (perf_data_file__is_read(file))
+ return -EINVAL;
+
+ if (asprintf(&new_filepath, "%s.%s", file->path, postfix) < 0)
+ return -ENOMEM;
+
+ /*
+ * Only fire a warning; don't return an error, and continue
+ * filling the original file.
+ */
+ if (rename(file->path, new_filepath))
+ pr_warning("Failed to rename %s to %s\n", file->path, new_filepath);
+
+ if (!at_exit) {
+ close(file->fd);
+ ret = perf_data_file__open(file);
+ if (ret < 0)
+ goto out;
+
+ if (lseek(file->fd, pos, SEEK_SET) == (off_t)-1) {
+ ret = -errno;
+ pr_debug("Failed to lseek to %zu: %s",
+ pos, strerror(errno));
+ goto out;
+ }
+ }
+ ret = file->fd;
+out:
+ free(new_filepath);
+ return ret;
+}
diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h
index 2b15d0c95c7f..ae510ce16cb1 100644
--- a/tools/perf/util/data.h
+++ b/tools/perf/util/data.h
@@ -46,5 +46,14 @@ int perf_data_file__open(struct perf_data_file *file);
void perf_data_file__close(struct perf_data_file *file);
ssize_t perf_data_file__write(struct perf_data_file *file,
void *buf, size_t size);
-
+/*
+ * If at_exit is set, only rename the current perf.data to
+ * perf.data.<postfix> and continue writing to the original file.
+ * Set at_exit when flushing the last output.
+ *
+ * Returns the fd of the new output, or a negative error code on failure.
+ */
+int perf_data_file__switch(struct perf_data_file *file,
+ const char *postfix,
+ size_t pos, bool at_exit);
#endif /* __PERF_DATA_H */
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index 049438d51b9a..8d96c80cc67e 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -23,6 +23,8 @@
#include "event.h"
#include "util.h"
#include "thread-stack.h"
+#include "callchain.h"
+#include "call-path.h"
#include "db-export.h"
struct deferred_export {
@@ -258,8 +260,7 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al,
if (!al->sym) {
al->sym = symbol__new(al->addr, 0, 0, "unknown");
if (al->sym)
- symbols__insert(&dso->symbols[al->map->type],
- al->sym);
+ dso__insert_symbol(dso, al->map->type, al->sym);
}
if (al->sym) {
@@ -276,6 +277,80 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al,
return 0;
}
+static struct call_path *call_path_from_sample(struct db_export *dbe,
+ struct machine *machine,
+ struct thread *thread,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel)
+{
+ u64 kernel_start = machine__kernel_start(machine);
+ struct call_path *current = &dbe->cpr->call_path;
+ enum chain_order saved_order = callchain_param.order;
+ int err;
+
+ if (!symbol_conf.use_callchain || !sample->callchain)
+ return NULL;
+
+ /*
+ * Since the call path tree must be built starting with the root, we
+ * must use ORDER_CALLER for call chain resolution, in order to process
+ * the callchain starting with the root node and ending with the leaf.
+ */
+ callchain_param.order = ORDER_CALLER;
+ err = thread__resolve_callchain(thread, &callchain_cursor, evsel,
+ sample, NULL, NULL,
+ sysctl_perf_event_max_stack);
+ if (err) {
+ callchain_param.order = saved_order;
+ return NULL;
+ }
+ callchain_cursor_commit(&callchain_cursor);
+
+ while (1) {
+ struct callchain_cursor_node *node;
+ struct addr_location al;
+ u64 dso_db_id = 0, sym_db_id = 0, offset = 0;
+
+ memset(&al, 0, sizeof(al));
+
+ node = callchain_cursor_current(&callchain_cursor);
+ if (!node)
+ break;
+ /*
+ * Handle export of symbol and dso for this node by
+ * constructing an addr_location struct and then passing it to
+ * db_ids_from_al() to perform the export.
+ */
+ al.sym = node->sym;
+ al.map = node->map;
+ al.machine = machine;
+ al.addr = node->ip;
+
+ if (al.map && !al.sym)
+ al.sym = dso__find_symbol(al.map->dso, MAP__FUNCTION,
+ al.addr);
+
+ db_ids_from_al(dbe, &al, &dso_db_id, &sym_db_id, &offset);
+
+ /* add node to the call path tree if it doesn't exist */
+ current = call_path__findnew(dbe->cpr, current,
+ al.sym, node->ip,
+ kernel_start);
+
+ callchain_cursor_advance(&callchain_cursor);
+ }
+
+ /* Reset the callchain order to its prior value. */
+ callchain_param.order = saved_order;
+
+ if (current == &dbe->cpr->call_path) {
+ /* Bail because the callchain was empty. */
+ return NULL;
+ }
+
+ return current;
+}
+
int db_export__branch_type(struct db_export *dbe, u32 branch_type,
const char *name)
{
@@ -329,6 +404,16 @@ int db_export__sample(struct db_export *dbe, union perf_event *event,
if (err)
goto out_put;
+ if (dbe->cpr) {
+ struct call_path *cp = call_path_from_sample(dbe, al->machine,
+ thread, sample,
+ evsel);
+ if (cp) {
+ db_export__call_path(dbe, cp);
+ es.call_path_id = cp->db_id;
+ }
+ }
+
if ((evsel->attr.sample_type & PERF_SAMPLE_ADDR) &&
sample_addr_correlates_sym(&evsel->attr)) {
struct addr_location addr_al;
diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h
index 25e22fd76aca..67bc6b8ad2d6 100644
--- a/tools/perf/util/db-export.h
+++ b/tools/perf/util/db-export.h
@@ -27,6 +27,7 @@ struct dso;
struct perf_sample;
struct addr_location;
struct call_return_processor;
+struct call_path_root;
struct call_path;
struct call_return;
@@ -43,6 +44,7 @@ struct export_sample {
u64 addr_dso_db_id;
u64 addr_sym_db_id;
u64 addr_offset; /* addr offset from symbol start */
+ u64 call_path_id;
};
struct db_export {
@@ -64,6 +66,7 @@ struct db_export {
int (*export_call_return)(struct db_export *dbe,
struct call_return *cr);
struct call_return_processor *crp;
+ struct call_path_root *cpr;
u64 evsel_last_db_id;
u64 machine_last_db_id;
u64 thread_last_db_id;
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 8e6395439ca0..3357479082ca 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -38,7 +38,7 @@ int dso__read_binary_type_filename(const struct dso *dso,
enum dso_binary_type type,
char *root_dir, char *filename, size_t size)
{
- char build_id_hex[BUILD_ID_SIZE * 2 + 1];
+ char build_id_hex[SBUILD_ID_SIZE];
int ret = 0;
size_t len;
@@ -1301,7 +1301,7 @@ size_t __dsos__fprintf(struct list_head *head, FILE *fp)
size_t dso__fprintf_buildid(struct dso *dso, FILE *fp)
{
- char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+ char sbuild_id[SBUILD_ID_SIZE];
build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
return fprintf(fp, "%s", sbuild_id);
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 577e600c8eb1..a347b19c961a 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -915,8 +915,7 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf)
tmp = "*";
else if (tag == DW_TAG_subroutine_type) {
/* Function pointer */
- strbuf_add(buf, "(function_type)", 15);
- return 0;
+ return strbuf_add(buf, "(function_type)", 15);
} else {
if (!dwarf_diename(&type))
return -ENOENT;
@@ -927,14 +926,10 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf)
else if (tag == DW_TAG_enumeration_type)
tmp = "enum ";
/* Write a base name */
- strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type));
- return 0;
+ return strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type));
}
ret = die_get_typename(&type, buf);
- if (ret == 0)
- strbuf_addstr(buf, tmp);
-
- return ret;
+ return ret ? ret : strbuf_addstr(buf, tmp);
}
/**
@@ -951,14 +946,13 @@ int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf)
ret = die_get_typename(vr_die, buf);
if (ret < 0) {
pr_debug("Failed to get type, make it unknown.\n");
- strbuf_add(buf, " (unknown_type)", 14);
+ ret = strbuf_add(buf, " (unknown_type)", 14);
}
- strbuf_addf(buf, "\t%s", dwarf_diename(vr_die));
-
- return 0;
+ return ret < 0 ? ret : strbuf_addf(buf, "\t%s", dwarf_diename(vr_die));
}
+#ifdef HAVE_DWARF_GETLOCATIONS
/**
* die_get_var_innermost_scope - Get innermost scope range of given variable DIE
* @sp_die: a subprogram DIE
@@ -998,22 +992,24 @@ static int die_get_var_innermost_scope(Dwarf_Die *sp_die, Dwarf_Die *vr_die,
}
while ((offset = dwarf_ranges(&scopes[1], offset, &base,
- &start, &end)) > 0) {
+ &start, &end)) > 0) {
start -= entry;
end -= entry;
if (first) {
- strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64,
- name, start, end);
+ ret = strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64,
+ name, start, end);
first = false;
} else {
- strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64,
- start, end);
+ ret = strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64,
+ start, end);
}
+ if (ret < 0)
+ goto out;
}
if (!first)
- strbuf_add(buf, "]>", 2);
+ ret = strbuf_add(buf, "]>", 2);
out:
free(scopes);
@@ -1053,30 +1049,39 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf)
if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL)
return -EINVAL;
- while ((offset = dwarf_getlocations(
- &attr, offset, &base,
- &start, &end, &op, &nops)) > 0) {
+ while ((offset = dwarf_getlocations(&attr, offset, &base,
+ &start, &end, &op, &nops)) > 0) {
if (start == 0) {
/* Single Location Descriptions */
ret = die_get_var_innermost_scope(sp_die, vr_die, buf);
- return ret;
+ goto out;
}
/* Location Lists */
start -= entry;
end -= entry;
if (first) {
- strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64,
- name, start, end);
+ ret = strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64,
+ name, start, end);
first = false;
} else {
- strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64,
- start, end);
+ ret = strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64,
+ start, end);
}
+ if (ret < 0)
+ goto out;
}
if (!first)
- strbuf_add(buf, "]>", 2);
-
+ ret = strbuf_add(buf, "]>", 2);
+out:
return ret;
}
+#else
+int die_get_var_range(Dwarf_Die *sp_die __maybe_unused,
+ Dwarf_Die *vr_die __maybe_unused,
+ struct strbuf *buf __maybe_unused)
+{
+ return -ENOTSUP;
+}
+#endif
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index dad55d04ffdd..f6fcc6832949 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -45,6 +45,7 @@ static const char *perf_event__names[] = {
[PERF_RECORD_STAT] = "STAT",
[PERF_RECORD_STAT_ROUND] = "STAT_ROUND",
[PERF_RECORD_EVENT_UPDATE] = "EVENT_UPDATE",
+ [PERF_RECORD_TIME_CONV] = "TIME_CONV",
};
const char *perf_event__name(unsigned int id)
@@ -433,7 +434,7 @@ static int __event__synthesize_thread(union perf_event *comm_event,
{
char filename[PATH_MAX];
DIR *tasks;
- struct dirent dirent, *next;
+ struct dirent *dirent;
pid_t tgid, ppid;
int rc = 0;
@@ -462,11 +463,11 @@ static int __event__synthesize_thread(union perf_event *comm_event,
return 0;
}
- while (!readdir_r(tasks, &dirent, &next) && next) {
+ while ((dirent = readdir(tasks)) != NULL) {
char *end;
pid_t _pid;
- _pid = strtol(dirent.d_name, &end, 10);
+ _pid = strtol(dirent->d_name, &end, 10);
if (*end)
continue;
@@ -575,7 +576,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
{
DIR *proc;
char proc_path[PATH_MAX];
- struct dirent dirent, *next;
+ struct dirent *dirent;
union perf_event *comm_event, *mmap_event, *fork_event;
int err = -1;
@@ -600,9 +601,9 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
if (proc == NULL)
goto out_free_fork;
- while (!readdir_r(proc, &dirent, &next) && next) {
+ while ((dirent = readdir(proc)) != NULL) {
char *end;
- pid_t pid = strtol(dirent.d_name, &end, 10);
+ pid_t pid = strtol(dirent->d_name, &end, 10);
if (*end) /* only interested in proper numerical dirents */
continue;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 6bb1c928350d..8d363d5e65a2 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -233,6 +233,7 @@ enum perf_user_event_type { /* above any possible kernel type */
PERF_RECORD_STAT = 76,
PERF_RECORD_STAT_ROUND = 77,
PERF_RECORD_EVENT_UPDATE = 78,
+ PERF_RECORD_TIME_CONV = 79,
PERF_RECORD_HEADER_MAX
};
@@ -469,6 +470,13 @@ struct stat_round_event {
u64 time;
};
+struct time_conv_event {
+ struct perf_event_header header;
+ u64 time_shift;
+ u64 time_mult;
+ u64 time_zero;
+};
+
union perf_event {
struct perf_event_header header;
struct mmap_event mmap;
@@ -497,6 +505,7 @@ union perf_event {
struct stat_config_event stat_config;
struct stat_event stat;
struct stat_round_event stat_round;
+ struct time_conv_event time_conv;
};
void perf_event__print_totals(void);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 86a03836a83f..c4bfe11479a0 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -679,53 +679,52 @@ static struct perf_evsel *perf_evlist__event2evsel(struct perf_evlist *evlist,
return NULL;
}
-union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
+/* When check_messup is true, 'end' must point to a good entry */
+static union perf_event *
+perf_mmap__read(struct perf_mmap *md, bool check_messup, u64 start,
+ u64 end, u64 *prev)
{
- struct perf_mmap *md = &evlist->mmap[idx];
- u64 head;
- u64 old = md->prev;
unsigned char *data = md->base + page_size;
union perf_event *event = NULL;
+ int diff = end - start;
- /*
- * Check if event was unmapped due to a POLLHUP/POLLERR.
- */
- if (!atomic_read(&md->refcnt))
- return NULL;
-
- head = perf_mmap__read_head(md);
- if (evlist->overwrite) {
+ if (check_messup) {
/*
* If we're further behind than half the buffer, there's a chance
* the writer will bite our tail and mess up the samples under us.
*
- * If we somehow ended up ahead of the head, we got messed up.
+ * If we somehow ended up ahead of the 'end', we got messed up.
*
- * In either case, truncate and restart at head.
+ * In either case, truncate and restart at 'end'.
*/
- int diff = head - old;
if (diff > md->mask / 2 || diff < 0) {
fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
/*
- * head points to a known good entry, start there.
+ * 'end' points to a known good entry, start there.
*/
- old = head;
+ start = end;
+ diff = 0;
}
}
- if (old != head) {
+ if (diff >= (int)sizeof(event->header)) {
size_t size;
- event = (union perf_event *)&data[old & md->mask];
+ event = (union perf_event *)&data[start & md->mask];
size = event->header.size;
+ if (size < sizeof(event->header) || diff < (int)size) {
+ event = NULL;
+ goto broken_event;
+ }
+
/*
* Event straddles the mmap boundary -- header should always
* be inside due to u64 alignment of output.
*/
- if ((old & md->mask) + size != ((old + size) & md->mask)) {
- unsigned int offset = old;
+ if ((start & md->mask) + size != ((start + size) & md->mask)) {
+ unsigned int offset = start;
unsigned int len = min(sizeof(*event), size), cpy;
void *dst = md->event_copy;
@@ -740,14 +739,83 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
event = (union perf_event *) md->event_copy;
}
- old += size;
+ start += size;
}
- md->prev = old;
+broken_event:
+ if (prev)
+ *prev = start;
return event;
}
+union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
+{
+ struct perf_mmap *md = &evlist->mmap[idx];
+ u64 head;
+ u64 old = md->prev;
+
+ /*
+ * Check if event was unmapped due to a POLLHUP/POLLERR.
+ */
+ if (!atomic_read(&md->refcnt))
+ return NULL;
+
+ head = perf_mmap__read_head(md);
+
+ return perf_mmap__read(md, evlist->overwrite, old, head, &md->prev);
+}
+
+union perf_event *
+perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx)
+{
+ struct perf_mmap *md = &evlist->mmap[idx];
+ u64 head, end;
+ u64 start = md->prev;
+
+ /*
+ * Check if event was unmapped due to a POLLHUP/POLLERR.
+ */
+ if (!atomic_read(&md->refcnt))
+ return NULL;
+
+ head = perf_mmap__read_head(md);
+ if (!head)
+ return NULL;
+
+ /*
+ * The 'head' pointer starts from 0. The kernel subtracts
+ * sizeof(record) from it each time it writes a record, so in fact
+ * 'head' is negative. The 'end' pointer is computed manually by adding
+ * the size of the ring buffer to 'head', which means the valid data
+ * that can be read is the whole ring buffer. If 'end' is positive, the
+ * ring buffer has not been fully filled, so we must adjust 'end' to 0.
+ *
+ * However, since both 'head' and 'end' are unsigned, we can't
+ * simply compare 'end' against 0. Instead we compare '-head' with
+ * the size of the ring buffer, where '-head' is the number of bytes
+ * the kernel has written to the ring buffer.
+ */
+ if (-head < (u64)(md->mask + 1))
+ end = 0;
+ else
+ end = head + md->mask + 1;
+
+ return perf_mmap__read(md, false, start, end, &md->prev);
+}
+
+void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx)
+{
+ struct perf_mmap *md = &evlist->mmap[idx];
+ u64 head;
+
+ if (!atomic_read(&md->refcnt))
+ return;
+
+ head = perf_mmap__read_head(md);
+ md->prev = head;
+}
+
static bool perf_mmap__empty(struct perf_mmap *md)
{
return perf_mmap__read_head(md) == md->prev && !md->auxtrace_mmap.base;
@@ -986,26 +1054,34 @@ out_unmap:
return -1;
}
-static size_t perf_evlist__mmap_size(unsigned long pages)
+unsigned long perf_event_mlock_kb_in_pages(void)
{
- if (pages == UINT_MAX) {
- int max;
+ unsigned long pages;
+ int max;
- if (sysctl__read_int("kernel/perf_event_mlock_kb", &max) < 0) {
- /*
- * Pick a once upon a time good value, i.e. things look
- * strange since we can't read a sysctl value, but lets not
- * die yet...
- */
- max = 512;
- } else {
- max -= (page_size / 1024);
- }
+ if (sysctl__read_int("kernel/perf_event_mlock_kb", &max) < 0) {
+ /*
+ * Pick a once upon a time good value, i.e. things look
+ * strange since we can't read a sysctl value, but let's not
+ * die yet...
+ */
+ max = 512;
+ } else {
+ max -= (page_size / 1024);
+ }
- pages = (max * 1024) / page_size;
- if (!is_power_of_2(pages))
- pages = rounddown_pow_of_two(pages);
- } else if (!is_power_of_2(pages))
+ pages = (max * 1024) / page_size;
+ if (!is_power_of_2(pages))
+ pages = rounddown_pow_of_two(pages);
+
+ return pages;
+}
+
+static size_t perf_evlist__mmap_size(unsigned long pages)
+{
+ if (pages == UINT_MAX)
+ pages = perf_event_mlock_kb_in_pages();
+ else if (!is_power_of_2(pages))
return 0;
return (pages + 1) * page_size;
@@ -1192,6 +1268,24 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
perf_evlist__propagate_maps(evlist);
}
+void __perf_evlist__set_sample_bit(struct perf_evlist *evlist,
+ enum perf_event_sample_format bit)
+{
+ struct perf_evsel *evsel;
+
+ evlist__for_each(evlist, evsel)
+ __perf_evsel__set_sample_bit(evsel, bit);
+}
+
+void __perf_evlist__reset_sample_bit(struct perf_evlist *evlist,
+ enum perf_event_sample_format bit)
+{
+ struct perf_evsel *evsel;
+
+ evlist__for_each(evlist, evsel)
+ __perf_evsel__reset_sample_bit(evsel, bit);
+}
+
int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel)
{
struct perf_evsel *evsel;
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index a0d15221db6e..85d1b59802e8 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -87,6 +87,17 @@ int perf_evlist__add_dummy(struct perf_evlist *evlist);
int perf_evlist__add_newtp(struct perf_evlist *evlist,
const char *sys, const char *name, void *handler);
+void __perf_evlist__set_sample_bit(struct perf_evlist *evlist,
+ enum perf_event_sample_format bit);
+void __perf_evlist__reset_sample_bit(struct perf_evlist *evlist,
+ enum perf_event_sample_format bit);
+
+#define perf_evlist__set_sample_bit(evlist, bit) \
+ __perf_evlist__set_sample_bit(evlist, PERF_SAMPLE_##bit)
+
+#define perf_evlist__reset_sample_bit(evlist, bit) \
+ __perf_evlist__reset_sample_bit(evlist, PERF_SAMPLE_##bit)
+
int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter);
int perf_evlist__set_filter_pid(struct perf_evlist *evlist, pid_t pid);
int perf_evlist__set_filter_pids(struct perf_evlist *evlist, size_t npids, pid_t *pids);
@@ -118,16 +129,23 @@ struct perf_sample_id *perf_evlist__id2sid(struct perf_evlist *evlist, u64 id);
union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx);
+union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist,
+ int idx);
+void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
+
void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx);
int perf_evlist__open(struct perf_evlist *evlist);
void perf_evlist__close(struct perf_evlist *evlist);
+struct callchain_param;
+
void perf_evlist__set_id_pos(struct perf_evlist *evlist);
bool perf_can_sample_identifier(void);
bool perf_can_record_switch_events(void);
bool perf_can_record_cpu_wide(void);
-void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts);
+void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts,
+ struct callchain_param *callchain);
int record_opts__config(struct record_opts *opts);
int perf_evlist__prepare_workload(struct perf_evlist *evlist,
@@ -144,6 +162,8 @@ int perf_evlist__parse_mmap_pages(const struct option *opt,
const char *str,
int unset);
+unsigned long perf_event_mlock_kb_in_pages(void);
+
int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
bool overwrite, unsigned int auxtrace_pages,
bool auxtrace_overwrite);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 738ce226002b..964c7c3602c0 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -226,7 +226,8 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
perf_evsel__init(evsel, attr, idx);
if (perf_evsel__is_bpf_output(evsel)) {
- evsel->attr.sample_type |= PERF_SAMPLE_RAW;
+ evsel->attr.sample_type |= (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME |
+ PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD),
evsel->attr.sample_period = 1;
}
@@ -561,10 +562,9 @@ int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size)
return ret;
}
-static void
-perf_evsel__config_callgraph(struct perf_evsel *evsel,
- struct record_opts *opts,
- struct callchain_param *param)
+void perf_evsel__config_callchain(struct perf_evsel *evsel,
+ struct record_opts *opts,
+ struct callchain_param *param)
{
bool function = perf_evsel__is_function_event(evsel);
struct perf_event_attr *attr = &evsel->attr;
@@ -704,7 +704,7 @@ static void apply_config_terms(struct perf_evsel *evsel,
/* set perf-event callgraph */
if (param.enabled)
- perf_evsel__config_callgraph(evsel, opts, &param);
+ perf_evsel__config_callchain(evsel, opts, &param);
}
}
@@ -736,7 +736,8 @@ static void apply_config_terms(struct perf_evsel *evsel,
* enable/disable events specifically, as there's no
* initial traced exec call.
*/
-void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
+void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
+ struct callchain_param *callchain)
{
struct perf_evsel *leader = evsel->leader;
struct perf_event_attr *attr = &evsel->attr;
@@ -811,8 +812,8 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
if (perf_evsel__is_function_event(evsel))
evsel->attr.exclude_callchain_user = 1;
- if (callchain_param.enabled && !evsel->no_aux_samples)
- perf_evsel__config_callgraph(evsel, opts, &callchain_param);
+ if (callchain && callchain->enabled && !evsel->no_aux_samples)
+ perf_evsel__config_callchain(evsel, opts, callchain);
if (opts->sample_intr_regs) {
attr->sample_regs_intr = opts->sample_intr_regs;
@@ -1230,6 +1231,21 @@ static void __p_sample_type(char *buf, size_t size, u64 value)
__p_bits(buf, size, value, bits);
}
+static void __p_branch_sample_type(char *buf, size_t size, u64 value)
+{
+#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n }
+ struct bit_names bits[] = {
+ bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY),
+ bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL),
+ bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX),
+ bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP),
+ bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES),
+ { .name = NULL, }
+ };
+#undef bit_name
+ __p_bits(buf, size, value, bits);
+}
+
static void __p_read_format(char *buf, size_t size, u64 value)
{
#define bit_name(n) { PERF_FORMAT_##n, #n }
@@ -1248,6 +1264,7 @@ static void __p_read_format(char *buf, size_t size, u64 value)
#define p_unsigned(val) snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val))
#define p_signed(val) snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val))
#define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val)
+#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val)
#define p_read_format(val) __p_read_format(buf, BUF_SIZE, val)
#define PRINT_ATTRn(_n, _f, _p) \
@@ -1299,12 +1316,13 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
PRINT_ATTRf(comm_exec, p_unsigned);
PRINT_ATTRf(use_clockid, p_unsigned);
PRINT_ATTRf(context_switch, p_unsigned);
+ PRINT_ATTRf(write_backward, p_unsigned);
PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
PRINT_ATTRf(bp_type, p_unsigned);
PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex);
PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex);
- PRINT_ATTRf(branch_sample_type, p_unsigned);
+ PRINT_ATTRf(branch_sample_type, p_branch_sample_type);
PRINT_ATTRf(sample_regs_user, p_hex);
PRINT_ATTRf(sample_stack_user, p_unsigned);
PRINT_ATTRf(clockid, p_signed);
@@ -2253,98 +2271,11 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
return 0;
}
-static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...)
-{
- va_list args;
- int ret = 0;
-
- if (!*first) {
- ret += fprintf(fp, ",");
- } else {
- ret += fprintf(fp, ":");
- *first = false;
- }
-
- va_start(args, fmt);
- ret += vfprintf(fp, fmt, args);
- va_end(args);
- return ret;
-}
-
-static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv)
-{
- return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val);
-}
-
-int perf_evsel__fprintf(struct perf_evsel *evsel,
- struct perf_attr_details *details, FILE *fp)
-{
- bool first = true;
- int printed = 0;
-
- if (details->event_group) {
- struct perf_evsel *pos;
-
- if (!perf_evsel__is_group_leader(evsel))
- return 0;
-
- if (evsel->nr_members > 1)
- printed += fprintf(fp, "%s{", evsel->group_name ?: "");
-
- printed += fprintf(fp, "%s", perf_evsel__name(evsel));
- for_each_group_member(pos, evsel)
- printed += fprintf(fp, ",%s", perf_evsel__name(pos));
-
- if (evsel->nr_members > 1)
- printed += fprintf(fp, "}");
- goto out;
- }
-
- printed += fprintf(fp, "%s", perf_evsel__name(evsel));
-
- if (details->verbose) {
- printed += perf_event_attr__fprintf(fp, &evsel->attr,
- __print_attr__fprintf, &first);
- } else if (details->freq) {
- const char *term = "sample_freq";
-
- if (!evsel->attr.freq)
- term = "sample_period";
-
- printed += comma_fprintf(fp, &first, " %s=%" PRIu64,
- term, (u64)evsel->attr.sample_freq);
- }
-
- if (details->trace_fields) {
- struct format_field *field;
-
- if (evsel->attr.type != PERF_TYPE_TRACEPOINT) {
- printed += comma_fprintf(fp, &first, " (not a tracepoint)");
- goto out;
- }
-
- field = evsel->tp_format->format.fields;
- if (field == NULL) {
- printed += comma_fprintf(fp, &first, " (no trace field)");
- goto out;
- }
-
- printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name);
-
- field = field->next;
- while (field) {
- printed += comma_fprintf(fp, &first, "%s", field->name);
- field = field->next;
- }
- }
-out:
- fputc('\n', fp);
- return ++printed;
-}
-
bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
char *msg, size_t msgsize)
{
+ int paranoid;
+
if ((err == ENOENT || err == ENXIO || err == ENODEV) &&
evsel->attr.type == PERF_TYPE_HARDWARE &&
evsel->attr.config == PERF_COUNT_HW_CPU_CYCLES) {
@@ -2364,6 +2295,22 @@ bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
zfree(&evsel->name);
return true;
+ } else if (err == EACCES && !evsel->attr.exclude_kernel &&
+ (paranoid = perf_event_paranoid()) > 1) {
+ const char *name = perf_evsel__name(evsel);
+ char *new_name;
+
+ if (asprintf(&new_name, "%s%su", name, strchr(name, ':') ? "" : ":") < 0)
+ return false;
+
+ if (evsel->name)
+ free(evsel->name);
+ evsel->name = new_name;
+ scnprintf(msg, msgsize,
+"kernel.perf_event_paranoid=%d, trying to fall back to excluding kernel samples", paranoid);
+ evsel->attr.exclude_kernel = 1;
+
+ return true;
}
return false;
@@ -2382,12 +2329,13 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
"Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
"which controls use of the performance events system by\n"
"unprivileged users (without CAP_SYS_ADMIN).\n\n"
- "The default value is 1:\n\n"
+ "The current value is %d:\n\n"
" -1: Allow use of (almost) all events by all users\n"
">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n"
">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n"
">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN",
- target->system_wide ? "system-wide " : "");
+ target->system_wide ? "system-wide " : "",
+ perf_event_paranoid());
case ENOENT:
return scnprintf(msg, size, "The %s event is not supported.",
perf_evsel__name(evsel));
@@ -2397,10 +2345,18 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
"Probably the maximum number of open file descriptors has been reached.\n"
"Hint: Try again after reducing the number of events.\n"
"Hint: Try increasing the limit with 'ulimit -n <limit>'");
+ case ENOMEM:
+ if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0 &&
+ access("/proc/sys/kernel/perf_event_max_stack", F_OK) == 0)
+ return scnprintf(msg, size,
+ "Not enough memory to setup event with callchain.\n"
+ "Hint: Try tweaking /proc/sys/kernel/perf_event_max_stack\n"
+ "Hint: Current value: %d", sysctl_perf_event_max_stack);
+ break;
case ENODEV:
if (target->cpu_list)
return scnprintf(msg, size, "%s",
- "No such device - did you specify an out-of-range profile CPU?\n");
+ "No such device - did you specify an out-of-range profile CPU?");
break;
case EOPNOTSUPP:
if (evsel->attr.precise_ip)
@@ -2432,7 +2388,7 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
return scnprintf(msg, size,
"The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n"
"/bin/dmesg may provide additional information.\n"
- "No CONFIG_PERF_EVENTS=y kernel support configured?\n",
+ "No CONFIG_PERF_EVENTS=y kernel support configured?",
err, strerror_r(err, sbuf, sizeof(sbuf)),
perf_evsel__name(evsel));
}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 501ea6e565f1..8a644fef452c 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -178,8 +178,14 @@ void perf_evsel__init(struct perf_evsel *evsel,
void perf_evsel__exit(struct perf_evsel *evsel);
void perf_evsel__delete(struct perf_evsel *evsel);
+struct callchain_param;
+
void perf_evsel__config(struct perf_evsel *evsel,
- struct record_opts *opts);
+ struct record_opts *opts,
+ struct callchain_param *callchain);
+void perf_evsel__config_callchain(struct perf_evsel *evsel,
+ struct record_opts *opts,
+ struct callchain_param *callchain);
int __perf_evsel__sample_size(u64 sample_type);
void perf_evsel__calc_id_pos(struct perf_evsel *evsel);
@@ -381,6 +387,24 @@ struct perf_attr_details {
int perf_evsel__fprintf(struct perf_evsel *evsel,
struct perf_attr_details *details, FILE *fp);
+#define EVSEL__PRINT_IP (1<<0)
+#define EVSEL__PRINT_SYM (1<<1)
+#define EVSEL__PRINT_DSO (1<<2)
+#define EVSEL__PRINT_SYMOFFSET (1<<3)
+#define EVSEL__PRINT_ONELINE (1<<4)
+#define EVSEL__PRINT_SRCLINE (1<<5)
+#define EVSEL__PRINT_UNKNOWN_AS_ADDR (1<<6)
+
+struct callchain_cursor;
+
+int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
+ unsigned int print_opts,
+ struct callchain_cursor *cursor, FILE *fp);
+
+int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al,
+ int left_alignment, unsigned int print_opts,
+ struct callchain_cursor *cursor, FILE *fp);
+
bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
char *msg, size_t msgsize);
int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
@@ -396,7 +420,7 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); \
(_evsel) && (_evsel)->leader == (_leader); \
(_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node))
-static inline bool has_branch_callstack(struct perf_evsel *evsel)
+/*
+ * Return true when the PERF_SAMPLE_BRANCH_CALL_STACK bit is set in the
+ * event's attr.branch_sample_type. The evsel is only read.
+ */
+static inline bool perf_evsel__has_branch_callstack(const struct perf_evsel *evsel)
{
return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
}
diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c
new file mode 100644
index 000000000000..3674e77ad640
--- /dev/null
+++ b/tools/perf/util/evsel_fprintf.c
@@ -0,0 +1,212 @@
+#include <stdio.h>
+#include <stdbool.h>
+#include <traceevent/event-parse.h>
+#include "evsel.h"
+#include "callchain.h"
+#include "map.h"
+#include "symbol.h"
+
+/*
+ * Print one item of a separator-delimited list to @fp.
+ *
+ * The first item (while *@first is true) is preceded by ":" and clears the
+ * flag; every later item is preceded by ",". The printf-style @fmt and its
+ * arguments are then emitted with vfprintf().
+ *
+ * Returns the sum of the fprintf()/vfprintf() return values.
+ */
+static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...)
+{
+ va_list args;
+ int ret = 0;
+
+ if (!*first) {
+ ret += fprintf(fp, ",");
+ } else {
+ ret += fprintf(fp, ":");
+ *first = false;
+ }
+
+ va_start(args, fmt);
+ ret += vfprintf(fp, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+/*
+ * Attribute-printing callback: emits " name: val" through comma_fprintf().
+ * @priv is the caller's "first item" flag and is cast back to bool *.
+ */
+static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv)
+{
+ return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val);
+}
+
+/*
+ * Print a one-line description of @evsel to @fp, shaped by @details:
+ *
+ *  - event_group: print the whole group from its leader (name, then each
+ *    member after a comma, wrapped in "group_name{...}" when the group has
+ *    more than one member). Non-leaders print nothing and return 0.
+ *  - verbose: append every attribute via perf_event_attr__fprintf().
+ *  - freq (only when not verbose): append sample_freq or sample_period.
+ *  - trace_fields: append the tracepoint's field names, or a parenthesized
+ *    note when the event is not a tracepoint or has no fields.
+ *
+ * The line is always terminated with '\n'. Returns the accumulated
+ * fprintf() counts plus one for that newline.
+ */
+int perf_evsel__fprintf(struct perf_evsel *evsel,
+ struct perf_attr_details *details, FILE *fp)
+{
+ bool first = true;
+ int printed = 0;
+
+ if (details->event_group) {
+ struct perf_evsel *pos;
+
+ /* Members are printed as part of their leader's line. */
+ if (!perf_evsel__is_group_leader(evsel))
+ return 0;
+
+ if (evsel->nr_members > 1)
+ printed += fprintf(fp, "%s{", evsel->group_name ?: "");
+
+ printed += fprintf(fp, "%s", perf_evsel__name(evsel));
+ for_each_group_member(pos, evsel)
+ printed += fprintf(fp, ",%s", perf_evsel__name(pos));
+
+ if (evsel->nr_members > 1)
+ printed += fprintf(fp, "}");
+ goto out;
+ }
+
+ printed += fprintf(fp, "%s", perf_evsel__name(evsel));
+
+ if (details->verbose) {
+ printed += perf_event_attr__fprintf(fp, &evsel->attr,
+ __print_attr__fprintf, &first);
+ } else if (details->freq) {
+ const char *term = "sample_freq";
+
+ /* attr.freq clear: the same field is labelled as a period. */
+ if (!evsel->attr.freq)
+ term = "sample_period";
+
+ printed += comma_fprintf(fp, &first, " %s=%" PRIu64,
+ term, (u64)evsel->attr.sample_freq);
+ }
+
+ if (details->trace_fields) {
+ struct format_field *field;
+
+ if (evsel->attr.type != PERF_TYPE_TRACEPOINT) {
+ printed += comma_fprintf(fp, &first, " (not a tracepoint)");
+ goto out;
+ }
+
+ /* NOTE(review): tp_format is dereferenced unchecked; presumably always set for tracepoints - confirm. */
+ field = evsel->tp_format->format.fields;
+ if (field == NULL) {
+ printed += comma_fprintf(fp, &first, " (no trace field)");
+ goto out;
+ }
+
+ printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name);
+
+ /* Remaining fields follow, each preceded by comma_fprintf()'s ",". */
+ field = field->next;
+ while (field) {
+ printed += comma_fprintf(fp, &first, "%s", field->name);
+ field = field->next;
+ }
+ }
+out:
+ fputc('\n', fp);
+ return ++printed;
+}
+
+/*
+ * Print the callchain held in @cursor for @sample to @fp.
+ *
+ * Nothing is printed unless sample->callchain is set. The cursor is
+ * committed and then walked node by node; nodes whose symbol is flagged
+ * "ignore" are skipped. For each remaining node the line start is padded to
+ * @left_alignment columns and, depending on the EVSEL__PRINT_* bits in
+ * @print_opts, the ip, symbol (optionally with offset), DSO name and source
+ * line are printed. Each entry ends with '\n' unless EVSEL__PRINT_ONELINE is
+ * set, in which case entries are separated by a space instead of a tab
+ * before the ip.
+ *
+ * Returns the accumulated fprintf()-style counts.
+ */
+int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
+ unsigned int print_opts, struct callchain_cursor *cursor,
+ FILE *fp)
+{
+ int printed = 0;
+ struct callchain_cursor_node *node;
+ int print_ip = print_opts & EVSEL__PRINT_IP;
+ int print_sym = print_opts & EVSEL__PRINT_SYM;
+ int print_dso = print_opts & EVSEL__PRINT_DSO;
+ int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET;
+ int print_oneline = print_opts & EVSEL__PRINT_ONELINE;
+ int print_srcline = print_opts & EVSEL__PRINT_SRCLINE;
+ int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR;
+ char s = print_oneline ? ' ' : '\t';
+
+ if (sample->callchain) {
+ /*
+ * NOTE(review): only .addr and .map are filled in below; the symbol
+ * printers presumably read nothing else from it - confirm.
+ */
+ struct addr_location node_al;
+
+ callchain_cursor_commit(cursor);
+
+ while (1) {
+ u64 addr = 0;
+
+ node = callchain_cursor_current(cursor);
+ if (!node)
+ break;
+
+ if (node->sym && node->sym->ignore)
+ goto next;
+
+ printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " ");
+
+ if (print_ip)
+ printed += fprintf(fp, "%c%16" PRIx64, s, node->ip);
+
+ /* Translate the ip through the node's map; stays 0 without a map. */
+ if (node->map)
+ addr = node->map->map_ip(node->map, node->ip);
+
+ if (print_sym) {
+ printed += fprintf(fp, " ");
+ node_al.addr = addr;
+ node_al.map = node->map;
+
+ if (print_symoffset) {
+ printed += __symbol__fprintf_symname_offs(node->sym, &node_al,
+ print_unknown_as_addr, fp);
+ } else {
+ printed += __symbol__fprintf_symname(node->sym, &node_al,
+ print_unknown_as_addr, fp);
+ }
+ }
+
+ if (print_dso) {
+ printed += fprintf(fp, " (");
+ printed += map__fprintf_dsoname(node->map, fp);
+ printed += fprintf(fp, ")");
+ }
+
+ if (print_srcline)
+ printed += map__fprintf_srcline(node->map, addr, "\n ", fp);
+
+ if (!print_oneline)
+ printed += fprintf(fp, "\n");
+next:
+ callchain_cursor_advance(cursor);
+ }
+ }
+
+ return printed;
+}
+
+/*
+ * Print the location of @sample to @fp.
+ *
+ * With a non-NULL @cursor the whole callchain is printed through
+ * sample__fprintf_callchain(). Otherwise a single entry is printed from
+ * sample->ip and @al, unless al->sym is flagged "ignore": the line start is
+ * padded to @left_alignment columns, then the ip, symbol (optionally with
+ * offset), DSO name and source line follow as selected by the
+ * EVSEL__PRINT_* bits in @print_opts. The single-entry path does not emit a
+ * trailing newline and does not consult EVSEL__PRINT_ONELINE.
+ *
+ * Returns the accumulated fprintf()-style counts.
+ */
+int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al,
+ int left_alignment, unsigned int print_opts,
+ struct callchain_cursor *cursor, FILE *fp)
+{
+ int printed = 0;
+ int print_ip = print_opts & EVSEL__PRINT_IP;
+ int print_sym = print_opts & EVSEL__PRINT_SYM;
+ int print_dso = print_opts & EVSEL__PRINT_DSO;
+ int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET;
+ int print_srcline = print_opts & EVSEL__PRINT_SRCLINE;
+ int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR;
+
+ if (cursor != NULL) {
+ printed += sample__fprintf_callchain(sample, left_alignment,
+ print_opts, cursor, fp);
+ } else if (!(al->sym && al->sym->ignore)) {
+ printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " ");
+
+ if (print_ip)
+ printed += fprintf(fp, "%16" PRIx64, sample->ip);
+
+ if (print_sym) {
+ printed += fprintf(fp, " ");
+ if (print_symoffset) {
+ printed += __symbol__fprintf_symname_offs(al->sym, al,
+ print_unknown_as_addr, fp);
+ } else {
+ printed += __symbol__fprintf_symname(al->sym, al,
+ print_unknown_as_addr, fp);
+ }
+ }
+
+ if (print_dso) {
+ printed += fprintf(fp, " (");
+ printed += map__fprintf_dsoname(al->map, fp);
+ printed += fprintf(fp, ")");
+ }
+
+ if (print_srcline)
+ printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp);
+ }
+
+ return printed;
+}
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 90680ec9f8b8..08852dde1378 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1474,7 +1474,7 @@ static int __event_process_build_id(struct build_id_event *bev,
dso = machine__findnew_dso(machine, filename);
if (dso != NULL) {
- char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+ char sbuild_id[SBUILD_ID_SIZE];
dso__set_build_id(dso, &bev->build_id);
@@ -1819,7 +1819,8 @@ static int process_cpu_topology(struct perf_file_section *section,
ph->env.nr_sibling_cores = nr;
size += sizeof(u32);
- strbuf_init(&sb, 128);
+ if (strbuf_init(&sb, 128) < 0)
+ goto free_cpu;
for (i = 0; i < nr; i++) {
str = do_read_string(fd, ph);
@@ -1827,7 +1828,8 @@ static int process_cpu_topology(struct perf_file_section *section,
goto error;
/* include a NULL character at the end */
- strbuf_add(&sb, str, strlen(str) + 1);
+ if (strbuf_add(&sb, str, strlen(str) + 1) < 0)
+ goto error;
size += string_size(str);
free(str);
}
@@ -1849,7 +1851,8 @@ static int process_cpu_topology(struct perf_file_section *section,
goto error;
/* include a NULL character at the end */
- strbuf_add(&sb, str, strlen(str) + 1);
+ if (strbuf_add(&sb, str, strlen(str) + 1) < 0)
+ goto error;
size += string_size(str);
free(str);
}
@@ -1912,13 +1915,14 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse
/* nr nodes */
ret = readn(fd, &nr, sizeof(nr));
if (ret != sizeof(nr))
- goto error;
+ return -1;
if (ph->needs_swap)
nr = bswap_32(nr);
ph->env.nr_numa_nodes = nr;
- strbuf_init(&sb, 256);
+ if (strbuf_init(&sb, 256) < 0)
+ return -1;
for (i = 0; i < nr; i++) {
/* node number */
@@ -1940,15 +1944,17 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse
mem_free = bswap_64(mem_free);
}
- strbuf_addf(&sb, "%u:%"PRIu64":%"PRIu64":",
- node, mem_total, mem_free);
+ if (strbuf_addf(&sb, "%u:%"PRIu64":%"PRIu64":",
+ node, mem_total, mem_free) < 0)
+ goto error;
str = do_read_string(fd, ph);
if (!str)
goto error;
/* include a NULL character at the end */
- strbuf_add(&sb, str, strlen(str) + 1);
+ if (strbuf_add(&sb, str, strlen(str) + 1) < 0)
+ goto error;
free(str);
}
ph->env.numa_nodes = strbuf_detach(&sb, NULL);
@@ -1982,7 +1988,8 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused
}
ph->env.nr_pmu_mappings = pmu_num;
- strbuf_init(&sb, 128);
+ if (strbuf_init(&sb, 128) < 0)
+ return -1;
while (pmu_num) {
if (readn(fd, &type, sizeof(type)) != sizeof(type))
@@ -1994,9 +2001,11 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused
if (!name)
goto error;
- strbuf_addf(&sb, "%u:%s", type, name);
+ if (strbuf_addf(&sb, "%u:%s", type, name) < 0)
+ goto error;
/* include a NULL character at the end */
- strbuf_add(&sb, "", 1);
+ if (strbuf_add(&sb, "", 1) < 0)
+ goto error;
if (!strcmp(name, "msr"))
ph->env.msr_pmu_type = type;
diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c
index 43a98a4dc1e1..d62ccaeeadd6 100644
--- a/tools/perf/util/help-unknown-cmd.c
+++ b/tools/perf/util/help-unknown-cmd.c
@@ -27,16 +27,27 @@ static int levenshtein_compare(const void *p1, const void *p2)
return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
}
-static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
+/*
+ * Move every entry of @old onto the end of @cmds.
+ *
+ * cmds->names is grown with realloc() when it cannot hold both lists; the
+ * new capacity is the larger of the required count and alloc_nr() of the
+ * current capacity. On success @old's array is released and its count reset.
+ *
+ * Returns 0 on success, or -1 when realloc() fails, in which case neither
+ * list has been modified (the capacity is only recorded once the new buffer
+ * is in hand).
+ */
+static int add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
{
- unsigned int i;
-
- ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
-
+ unsigned int i, nr = cmds->cnt + old->cnt;
+ void *tmp;
+
+ if (nr > cmds->alloc) {
+ /* Choose bigger one to alloc */
+ size_t new_alloc = alloc_nr(cmds->alloc);
+
+ if (new_alloc < nr)
+ new_alloc = nr;
+ tmp = realloc(cmds->names, new_alloc * sizeof(*cmds->names));
+ if (!tmp)
+ return -1;
+ cmds->names = tmp;
+ /* Commit the capacity only after the buffer really grew. */
+ cmds->alloc = new_alloc;
+ }
for (i = 0; i < old->cnt; i++)
cmds->names[cmds->cnt++] = old->names[i];
zfree(&old->names);
old->cnt = 0;
+ return 0;
}
const char *help_unknown_cmd(const char *cmd)
@@ -52,8 +63,11 @@ const char *help_unknown_cmd(const char *cmd)
load_command_list("perf-", &main_cmds, &other_cmds);
- add_cmd_list(&main_cmds, &aliases);
- add_cmd_list(&main_cmds, &other_cmds);
+ if (add_cmd_list(&main_cmds, &aliases) < 0 ||
+ add_cmd_list(&main_cmds, &other_cmds) < 0) {
+ fprintf(stderr, "ERROR: Failed to allocate command list for unknown command.\n");
+ goto end;
+ }
qsort(main_cmds.names, main_cmds.cnt,
sizeof(main_cmds.names), cmdname_compare);
uniq(&main_cmds);
@@ -99,6 +113,6 @@ const char *help_unknown_cmd(const char *cmd)
for (i = 0; i < n; i++)
fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
}
-
+end:
exit(1);
}
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 31c4641fe5ff..cfab531437c7 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -295,7 +295,7 @@ static void hists__delete_entry(struct hists *hists, struct hist_entry *he)
root_in = &he->parent_he->hroot_in;
root_out = &he->parent_he->hroot_out;
} else {
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root_in = &hists->entries_collapsed;
else
root_in = hists->entries_in;
@@ -953,7 +953,7 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
{
int err, err2;
- err = sample__resolve_callchain(iter->sample, &iter->parent,
+ err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent,
iter->evsel, al, max_stack_depth);
if (err)
return err;
@@ -1295,8 +1295,9 @@ static int hists__hierarchy_insert_entry(struct hists *hists,
return ret;
}
-int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root,
- struct hist_entry *he)
+static int hists__collapse_insert_entry(struct hists *hists,
+ struct rb_root *root,
+ struct hist_entry *he)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
@@ -1372,7 +1373,7 @@ int hists__collapse_resort(struct hists *hists, struct ui_progress *prog)
struct hist_entry *n;
int ret;
- if (!sort__need_collapse)
+ if (!hists__has(hists, need_collapse))
return 0;
hists->nr_entries = 0;
@@ -1631,7 +1632,7 @@ static void output_resort(struct hists *hists, struct ui_progress *prog,
return;
}
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root = &hists->entries_collapsed;
else
root = hists->entries_in;
@@ -2035,7 +2036,7 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists,
struct hist_entry *he;
int64_t cmp;
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
root = &hists->entries_collapsed;
else
root = hists->entries_in;
@@ -2061,6 +2062,8 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists,
if (he) {
memset(&he->stat, 0, sizeof(he->stat));
he->hists = hists;
+ if (symbol_conf.cumulate_callchain)
+ memset(he->stat_acc, 0, sizeof(he->stat));
rb_link_node(&he->rb_node_in, parent, p);
rb_insert_color(&he->rb_node_in, root);
hists__inc_stats(hists, he);
@@ -2075,7 +2078,7 @@ static struct hist_entry *hists__find_entry(struct hists *hists,
{
struct rb_node *n;
- if (sort__need_collapse)
+ if (hists__has(hists, need_collapse))
n = hists->entries_collapsed.rb_node;
else
n = hists->entries_in->rb_node;
@@ -2104,7 +2107,7 @@ void hists__match(struct hists *leader, struct hists *other)
struct rb_node *nd;
struct hist_entry *pos, *pair;
- if (sort__need_collapse)
+ if (hists__has(leader, need_collapse))
root = &leader->entries_collapsed;
else
root = leader->entries_in;
@@ -2129,7 +2132,7 @@ int hists__link(struct hists *leader, struct hists *other)
struct rb_node *nd;
struct hist_entry *pos, *pair;
- if (sort__need_collapse)
+ if (hists__has(other, need_collapse))
root = &other->entries_collapsed;
else
root = other->entries_in;
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index bec0cd660fbd..0f84bfb42bb1 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -82,6 +82,8 @@ struct hists {
int nr_hpp_node;
};
+/* Expands to member __f of the perf_hpp_list that hists __h points to. */
+#define hists__has(__h, __f) (__h)->hpp_list->__f
+
struct hist_entry_iter;
struct hist_iter_ops {
@@ -199,8 +201,6 @@ int hists__init(void);
int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list);
struct rb_root *hists__get_rotate_entries_in(struct hists *hists);
-int hists__collapse_insert_entry(struct hists *hists,
- struct rb_root *root, struct hist_entry *he);
struct perf_hpp {
char *buf;
@@ -240,6 +240,14 @@ struct perf_hpp_fmt {
struct perf_hpp_list {
struct list_head fields;
struct list_head sorts;
+
+ int need_collapse;
+ int parent;
+ int sym;
+ int dso;
+ int socket;
+ int thread;
+ int comm;
};
extern struct perf_hpp_list perf_hpp_list;
diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
index abf1366e2a24..9df996085563 100644
--- a/tools/perf/util/intel-bts.c
+++ b/tools/perf/util/intel-bts.c
@@ -66,6 +66,7 @@ struct intel_bts {
u64 branches_id;
size_t branches_event_size;
bool synth_needs_swap;
+ unsigned long num_events;
};
struct intel_bts_queue {
@@ -275,6 +276,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq,
union perf_event event;
struct perf_sample sample = { .ip = 0, };
+ if (bts->synth_opts.initial_skip &&
+ bts->num_events++ <= bts->synth_opts.initial_skip)
+ return 0;
+
event.sample.header.type = PERF_RECORD_SAMPLE;
event.sample.header.misc = PERF_RECORD_MISC_USER;
event.sample.header.size = sizeof(struct perf_event_header);
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 9409d014b46c..9c8f15da86ce 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -356,7 +356,7 @@ static const char *intel_pt_err_msgs[] = {
int intel_pt__strerror(int code, char *buf, size_t buflen)
{
- if (code < 1 || code > INTEL_PT_ERR_MAX)
+ if (code < 1 || code >= INTEL_PT_ERR_MAX)
code = INTEL_PT_ERR_UNK;
strlcpy(buf, intel_pt_err_msgs[code], buflen);
return 0;
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 617578440989..137196990012 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -100,6 +100,8 @@ struct intel_pt {
u64 cyc_bit;
u64 noretcomp_bit;
unsigned max_non_turbo_ratio;
+
+ unsigned long num_events;
};
enum switch_state {
@@ -972,6 +974,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
return 0;
+ if (pt->synth_opts.initial_skip &&
+ pt->num_events++ < pt->synth_opts.initial_skip)
+ return 0;
+
event->sample.header.type = PERF_RECORD_SAMPLE;
event->sample.header.misc = PERF_RECORD_MISC_USER;
event->sample.header.size = sizeof(struct perf_event_header);
@@ -1029,6 +1035,10 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
union perf_event *event = ptq->event_buf;
struct perf_sample sample = { .ip = 0, };
+ if (pt->synth_opts.initial_skip &&
+ pt->num_events++ < pt->synth_opts.initial_skip)
+ return 0;
+
event->sample.header.type = PERF_RECORD_SAMPLE;
event->sample.header.misc = PERF_RECORD_MISC_USER;
event->sample.header.size = sizeof(struct perf_event_header);
@@ -1087,6 +1097,10 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
union perf_event *event = ptq->event_buf;
struct perf_sample sample = { .ip = 0, };
+ if (pt->synth_opts.initial_skip &&
+ pt->num_events++ < pt->synth_opts.initial_skip)
+ return 0;
+
event->sample.header.type = PERF_RECORD_SAMPLE;
event->sample.header.misc = PERF_RECORD_MISC_USER;
event->sample.header.size = sizeof(struct perf_event_header);
@@ -1199,14 +1213,18 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
ptq->have_sample = false;
if (pt->sample_instructions &&
- (state->type & INTEL_PT_INSTRUCTION)) {
+ (state->type & INTEL_PT_INSTRUCTION) &&
+ (!pt->synth_opts.initial_skip ||
+ pt->num_events++ >= pt->synth_opts.initial_skip)) {
err = intel_pt_synth_instruction_sample(ptq);
if (err)
return err;
}
if (pt->sample_transactions &&
- (state->type & INTEL_PT_TRANSACTION)) {
+ (state->type & INTEL_PT_TRANSACTION) &&
+ (!pt->synth_opts.initial_skip ||
+ pt->num_events++ >= pt->synth_opts.initial_skip)) {
err = intel_pt_synth_transaction_sample(ptq);
if (err)
return err;
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index ad0c0bb1fbc7..86afe9618bb0 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -17,6 +17,7 @@
#include "strlist.h"
#include <elf.h>
+#include "tsc.h"
#include "session.h"
#include "jit.h"
#include "jitdump.h"
@@ -33,6 +34,7 @@ struct jit_buf_desc {
size_t bufsize;
FILE *in;
bool needs_bswap; /* handles cross-endianess */
+ bool use_arch_timestamp;
void *debug_data;
size_t nr_debug_entries;
uint32_t code_load_count;
@@ -158,13 +160,16 @@ jit_open(struct jit_buf_desc *jd, const char *name)
header.flags = bswap_64(header.flags);
}
+ jd->use_arch_timestamp = header.flags & JITDUMP_FLAGS_ARCH_TIMESTAMP;
+
if (verbose > 2)
- pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\n",
+ pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\nuse_arch_timestamp=%d\n",
header.version,
header.total_size,
(unsigned long long)header.timestamp,
header.pid,
- header.elf_mach);
+ header.elf_mach,
+ jd->use_arch_timestamp);
if (header.flags & JITDUMP_FLAGS_RESERVED) {
pr_err("jitdump file contains invalid or unsupported flags 0x%llx\n",
@@ -172,10 +177,15 @@ jit_open(struct jit_buf_desc *jd, const char *name)
goto error;
}
+ if (jd->use_arch_timestamp && !jd->session->time_conv.time_mult) {
+ pr_err("jitdump file uses arch timestamps but there is no timestamp conversion\n");
+ goto error;
+ }
+
/*
* validate event is using the correct clockid
*/
- if (jit_validate_events(jd->session)) {
+ if (!jd->use_arch_timestamp && jit_validate_events(jd->session)) {
pr_err("error, jitted code must be sampled with perf record -k 1\n");
goto error;
}
@@ -329,6 +339,23 @@ jit_inject_event(struct jit_buf_desc *jd, union perf_event *event)
return 0;
}
+/*
+ * Convert a jitdump record timestamp for use in synthesized events.
+ *
+ * When the jitdump does not use arch timestamps (jd->use_arch_timestamp is
+ * false) @timestamp is returned unchanged. Otherwise the session's
+ * time_conv shift/mult/zero values are copied into a perf_tsc_conversion
+ * and the value is passed through tsc_to_perf_time(). Returns 0 when the
+ * session's time_mult is zero, i.e. no conversion data is available.
+ */
+static uint64_t convert_timestamp(struct jit_buf_desc *jd, uint64_t timestamp)
+{
+ struct perf_tsc_conversion tc;
+
+ if (!jd->use_arch_timestamp)
+ return timestamp;
+
+ tc.time_shift = jd->session->time_conv.time_shift;
+ tc.time_mult = jd->session->time_conv.time_mult;
+ tc.time_zero = jd->session->time_conv.time_zero;
+
+ if (!tc.time_mult)
+ return 0;
+
+ return tsc_to_perf_time(timestamp, &tc);
+}
+
static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr)
{
struct perf_sample sample;
@@ -385,7 +412,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr)
return -1;
}
if (stat(filename, &st))
- memset(&st, 0, sizeof(stat));
+ memset(&st, 0, sizeof(st));
event->mmap2.header.type = PERF_RECORD_MMAP2;
event->mmap2.header.misc = PERF_RECORD_MISC_USER;
@@ -410,7 +437,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr)
id->tid = tid;
}
if (jd->sample_type & PERF_SAMPLE_TIME)
- id->time = jr->load.p.timestamp;
+ id->time = convert_timestamp(jd, jr->load.p.timestamp);
/*
* create pseudo sample to induce dso hit increment
@@ -473,7 +500,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr)
size++; /* for \0 */
if (stat(filename, &st))
- memset(&st, 0, sizeof(stat));
+ memset(&st, 0, sizeof(st));
size = PERF_ALIGN(size, sizeof(u64));
@@ -499,7 +526,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr)
id->tid = tid;
}
if (jd->sample_type & PERF_SAMPLE_TIME)
- id->time = jr->load.p.timestamp;
+ id->time = convert_timestamp(jd, jr->load.p.timestamp);
/*
* create pseudo sample to induce dso hit increment
diff --git a/tools/perf/util/jitdump.h b/tools/perf/util/jitdump.h
index b66c1f503d9e..bcacd20d0c1c 100644
--- a/tools/perf/util/jitdump.h
+++ b/tools/perf/util/jitdump.h
@@ -23,9 +23,12 @@
#define JITHEADER_VERSION 1
enum jitdump_flags_bits {
+ JITDUMP_FLAGS_ARCH_TIMESTAMP_BIT,
JITDUMP_FLAGS_MAX_BIT,
};
+#define JITDUMP_FLAGS_ARCH_TIMESTAMP (1ULL << JITDUMP_FLAGS_ARCH_TIMESTAMP_BIT)
+
#define JITDUMP_FLAGS_RESERVED (JITDUMP_FLAGS_MAX_BIT < 64 ? \
(~((1ULL << JITDUMP_FLAGS_MAX_BIT) - 1)) : 0)
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 80b9b6a87990..639a2903065e 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -32,6 +32,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
machine->threads = RB_ROOT;
pthread_rwlock_init(&machine->threads_lock, NULL);
+ machine->nr_threads = 0;
INIT_LIST_HEAD(&machine->dead_threads);
machine->last_match = NULL;
@@ -430,6 +431,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine,
*/
thread__get(th);
machine->last_match = th;
+ ++machine->nr_threads;
}
return th;
@@ -681,11 +683,13 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp)
size_t machine__fprintf(struct machine *machine, FILE *fp)
{
- size_t ret = 0;
+ size_t ret;
struct rb_node *nd;
pthread_rwlock_rdlock(&machine->threads_lock);
+ ret = fprintf(fp, "Threads: %u\n", machine->nr_threads);
+
for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) {
struct thread *pos = rb_entry(nd, struct thread, rb_node);
@@ -908,11 +912,11 @@ int machines__create_kernel_maps(struct machines *machines, pid_t pid)
return machine__create_kernel_maps(machine);
}
-int machine__load_kallsyms(struct machine *machine, const char *filename,
- enum map_type type, symbol_filter_t filter)
+int __machine__load_kallsyms(struct machine *machine, const char *filename,
+ enum map_type type, bool no_kcore, symbol_filter_t filter)
{
struct map *map = machine__kernel_map(machine);
- int ret = dso__load_kallsyms(map->dso, filename, map, filter);
+ int ret = __dso__load_kallsyms(map->dso, filename, map, no_kcore, filter);
if (ret > 0) {
dso__set_loaded(map->dso, type);
@@ -927,6 +931,12 @@ int machine__load_kallsyms(struct machine *machine, const char *filename,
return ret;
}
+/*
+ * Load kallsyms from @filename into the kernel map of @machine.
+ * Thin wrapper around __machine__load_kallsyms() that passes false
+ * for its no_kcore argument.
+ */
+int machine__load_kallsyms(struct machine *machine, const char *filename,
+ enum map_type type, symbol_filter_t filter)
+{
+ return __machine__load_kallsyms(machine, filename, type, false, filter);
+}
+
int machine__load_vmlinux_path(struct machine *machine, enum map_type type,
symbol_filter_t filter)
{
@@ -1413,6 +1423,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th,
pthread_rwlock_wrlock(&machine->threads_lock);
rb_erase_init(&th->rb_node, &machine->threads);
RB_CLEAR_NODE(&th->rb_node);
+ --machine->nr_threads;
/*
* Move it first to the dead_threads list, then drop the reference,
* if this is the last reference, then the thread__delete destructor
@@ -1599,6 +1610,7 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample,
}
static int add_callchain_ip(struct thread *thread,
+ struct callchain_cursor *cursor,
struct symbol **parent,
struct addr_location *root_al,
u8 *cpumode,
@@ -1630,7 +1642,7 @@ static int add_callchain_ip(struct thread *thread,
* It seems the callchain is corrupted.
* Discard all.
*/
- callchain_cursor_reset(&callchain_cursor);
+ callchain_cursor_reset(cursor);
return 1;
}
return 0;
@@ -1640,7 +1652,7 @@ static int add_callchain_ip(struct thread *thread,
}
if (al.sym != NULL) {
- if (sort__has_parent && !*parent &&
+ if (perf_hpp_list.parent && !*parent &&
symbol__match_regex(al.sym, &parent_regex))
*parent = al.sym;
else if (have_ignore_callees && root_al &&
@@ -1648,13 +1660,13 @@ static int add_callchain_ip(struct thread *thread,
/* Treat this symbol as the root,
forgetting its callees. */
*root_al = al;
- callchain_cursor_reset(&callchain_cursor);
+ callchain_cursor_reset(cursor);
}
}
if (symbol_conf.hide_unresolved && al.sym == NULL)
return 0;
- return callchain_cursor_append(&callchain_cursor, al.addr, al.map, al.sym);
+ return callchain_cursor_append(cursor, al.addr, al.map, al.sym);
}
struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
@@ -1724,6 +1736,7 @@ static int remove_loops(struct branch_entry *l, int nr)
* negative error code on other errors.
*/
static int resolve_lbr_callchain_sample(struct thread *thread,
+ struct callchain_cursor *cursor,
struct perf_sample *sample,
struct symbol **parent,
struct addr_location *root_al,
@@ -1756,7 +1769,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
*/
int mix_chain_nr = i + 1 + lbr_nr + 1;
- if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) {
+ if (mix_chain_nr > (int)sysctl_perf_event_max_stack + PERF_MAX_BRANCH_DEPTH) {
pr_warning("corrupted callchain. skipping...\n");
return 0;
}
@@ -1778,7 +1791,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
ip = lbr_stack->entries[0].to;
}
- err = add_callchain_ip(thread, parent, root_al, &cpumode, ip);
+ err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip);
if (err)
return (err < 0) ? err : 0;
}
@@ -1789,6 +1802,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
}
static int thread__resolve_callchain_sample(struct thread *thread,
+ struct callchain_cursor *cursor,
struct perf_evsel *evsel,
struct perf_sample *sample,
struct symbol **parent,
@@ -1803,10 +1817,8 @@ static int thread__resolve_callchain_sample(struct thread *thread,
int skip_idx = -1;
int first_call = 0;
- callchain_cursor_reset(&callchain_cursor);
-
- if (has_branch_callstack(evsel)) {
- err = resolve_lbr_callchain_sample(thread, sample, parent,
+ if (perf_evsel__has_branch_callstack(evsel)) {
+ err = resolve_lbr_callchain_sample(thread, cursor, sample, parent,
root_al, max_stack);
if (err)
return (err < 0) ? err : 0;
@@ -1816,7 +1828,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
* Based on DWARF debug information, some architectures skip
* a callchain entry saved by the kernel.
*/
- if (chain->nr < PERF_MAX_STACK_DEPTH)
+ if (chain->nr < sysctl_perf_event_max_stack)
skip_idx = arch_skip_callchain_idx(thread, chain);
/*
@@ -1863,10 +1875,10 @@ static int thread__resolve_callchain_sample(struct thread *thread,
nr = remove_loops(be, nr);
for (i = 0; i < nr; i++) {
- err = add_callchain_ip(thread, parent, root_al,
+ err = add_callchain_ip(thread, cursor, parent, root_al,
NULL, be[i].to);
if (!err)
- err = add_callchain_ip(thread, parent, root_al,
+ err = add_callchain_ip(thread, cursor, parent, root_al,
NULL, be[i].from);
if (err == -EINVAL)
break;
@@ -1877,7 +1889,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
}
check_calls:
- if (chain->nr > PERF_MAX_STACK_DEPTH && (int)chain->nr > max_stack) {
+ if (chain->nr > sysctl_perf_event_max_stack && (int)chain->nr > max_stack) {
pr_warning("corrupted callchain. skipping...\n");
return 0;
}
@@ -1896,7 +1908,7 @@ check_calls:
#endif
ip = chain->ips[j];
- err = add_callchain_ip(thread, parent, root_al, &cpumode, ip);
+ err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip);
if (err)
return (err < 0) ? err : 0;
@@ -1915,19 +1927,12 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
entry->map, entry->sym);
}
-int thread__resolve_callchain(struct thread *thread,
- struct perf_evsel *evsel,
- struct perf_sample *sample,
- struct symbol **parent,
- struct addr_location *root_al,
- int max_stack)
+static int thread__resolve_callchain_unwind(struct thread *thread,
+ struct callchain_cursor *cursor,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ int max_stack)
{
- int ret = thread__resolve_callchain_sample(thread, evsel,
- sample, parent,
- root_al, max_stack);
- if (ret)
- return ret;
-
/* Can we do dwarf post unwind? */
if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) &&
(evsel->attr.sample_type & PERF_SAMPLE_STACK_USER)))
@@ -1938,9 +1943,45 @@ int thread__resolve_callchain(struct thread *thread,
(!sample->user_stack.size))
return 0;
- return unwind__get_entries(unwind_entry, &callchain_cursor,
+ return unwind__get_entries(unwind_entry, cursor,
thread, sample, max_stack);
+}
+
+/*
+ * Fill @cursor with the callchain of @sample for @thread.
+ *
+ * @cursor is reset first and then handed to two resolvers,
+ * thread__resolve_callchain_sample() and thread__resolve_callchain_unwind().
+ * With ORDER_CALLEE the sample resolver runs first, otherwise the unwinder
+ * runs first.  A non-zero return from the first resolver is returned
+ * immediately and the second one is skipped; otherwise the return value
+ * of the second resolver is passed through.
+ */
+int thread__resolve_callchain(struct thread *thread,
+			      struct callchain_cursor *cursor,
+			      struct perf_evsel *evsel,
+			      struct perf_sample *sample,
+			      struct symbol **parent,
+			      struct addr_location *root_al,
+			      int max_stack)
+{
+	int ret = 0;
+
+	/*
+	 * Reset the cursor that is about to be filled.  Resetting the global
+	 * callchain_cursor here would leave stale entries in any other cursor
+	 * a caller passes in.
+	 */
+	callchain_cursor_reset(cursor);
+
+	if (callchain_param.order == ORDER_CALLEE) {
+		ret = thread__resolve_callchain_sample(thread, cursor,
+						       evsel, sample,
+						       parent, root_al,
+						       max_stack);
+		if (ret)
+			return ret;
+		ret = thread__resolve_callchain_unwind(thread, cursor,
+						       evsel, sample,
+						       max_stack);
+	} else {
+		ret = thread__resolve_callchain_unwind(thread, cursor,
+						       evsel, sample,
+						       max_stack);
+		if (ret)
+			return ret;
+		ret = thread__resolve_callchain_sample(thread, cursor,
+						       evsel, sample,
+						       parent, root_al,
+						       max_stack);
+	}
+
+	return ret;
 }
int machine__for_each_thread(struct machine *machine,
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 8499db281158..83f46790c52f 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -31,6 +31,7 @@ struct machine {
char *root_dir;
struct rb_root threads;
pthread_rwlock_t threads_lock;
+ unsigned int nr_threads;
struct list_head dead_threads;
struct thread *last_match;
struct vdso_info *vdso_info;
@@ -141,7 +142,11 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
struct addr_location *al);
struct mem_info *sample__resolve_mem(struct perf_sample *sample,
struct addr_location *al);
+
+struct callchain_cursor;
+
int thread__resolve_callchain(struct thread *thread,
+ struct callchain_cursor *cursor,
struct perf_evsel *evsel,
struct perf_sample *sample,
struct symbol **parent,
@@ -211,6 +216,8 @@ struct symbol *machine__find_kernel_function_by_name(struct machine *machine,
struct map *machine__findnew_module_map(struct machine *machine, u64 start,
const char *filename);
+int __machine__load_kallsyms(struct machine *machine, const char *filename,
+ enum map_type type, bool no_kcore, symbol_filter_t filter);
int machine__load_kallsyms(struct machine *machine, const char *filename,
enum map_type type, symbol_filter_t filter);
int machine__load_vmlinux_path(struct machine *machine, enum map_type type,
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 171b6d10a04b..b19bcd3b7128 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -289,7 +289,7 @@ int map__load(struct map *map, symbol_filter_t filter)
nr = dso__load(map->dso, map, filter);
if (nr < 0) {
if (map->dso->has_build_id) {
- char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+ char sbuild_id[SBUILD_ID_SIZE];
build_id__sprintf(map->dso->build_id,
sizeof(map->dso->build_id),
@@ -431,6 +431,13 @@ u64 map__rip_2objdump(struct map *map, u64 rip)
if (map->dso->rel)
return rip - map->pgoff;
+ /*
+ * kernel modules also have DSO_TYPE_USER in dso->kernel,
+ * but all kernel modules are ET_REL, so won't get here.
+ */
+ if (map->dso->kernel == DSO_TYPE_USER)
+ return rip + map->dso->text_offset;
+
return map->unmap_ip(map, rip) - map->reloc;
}
@@ -454,6 +461,13 @@ u64 map__objdump_2mem(struct map *map, u64 ip)
if (map->dso->rel)
return map->unmap_ip(map, ip + map->pgoff);
+ /*
+ * kernel modules also have DSO_TYPE_USER in dso->kernel,
+ * but all kernel modules are ET_REL, so won't get here.
+ */
+ if (map->dso->kernel == DSO_TYPE_USER)
+ return map->unmap_ip(map, ip - map->dso->text_offset);
+
return ip + map->reloc;
}
diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c
index b1b9e2385f4b..fe84df1875aa 100644
--- a/tools/perf/util/ordered-events.c
+++ b/tools/perf/util/ordered-events.c
@@ -308,3 +308,12 @@ void ordered_events__free(struct ordered_events *oe)
free(event);
}
}
+
+/*
+ * Return @oe to a freshly initialized state while keeping its deliver
+ * callback: free it, zero the whole structure and run
+ * ordered_events__init() again.
+ */
+void ordered_events__reinit(struct ordered_events *oe)
+{
+ /* Saved up front because the memset() below wipes oe->deliver. */
+ ordered_events__deliver_t old_deliver = oe->deliver;
+
+ ordered_events__free(oe);
+ memset(oe, '\0', sizeof(*oe));
+ ordered_events__init(oe, old_deliver);
+}
diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h
index f403991e3bfd..e11468a9a6e4 100644
--- a/tools/perf/util/ordered-events.h
+++ b/tools/perf/util/ordered-events.h
@@ -49,6 +49,7 @@ void ordered_events__delete(struct ordered_events *oe, struct ordered_event *eve
int ordered_events__flush(struct ordered_events *oe, enum oe_flush how);
void ordered_events__init(struct ordered_events *oe, ordered_events__deliver_t deliver);
void ordered_events__free(struct ordered_events *oe);
+void ordered_events__reinit(struct ordered_events *oe);
static inline
void ordered_events__set_alloc_size(struct ordered_events *oe, u64 size)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 4c19d5e79d8c..bcbc983d4b12 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -138,11 +138,11 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE)
#define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT)
-#define for_each_subsystem(sys_dir, sys_dirent, sys_next) \
- while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next) \
- if (sys_dirent.d_type == DT_DIR && \
- (strcmp(sys_dirent.d_name, ".")) && \
- (strcmp(sys_dirent.d_name, "..")))
+/*
+ * Iterate over the entries of @sys_dir with readdir(), assigning each one
+ * to @sys_dirent (a struct dirent *), and run the statement that follows
+ * only for directories other than "." and "..".
+ * Expands to a while/if pair, so "continue" in the body moves on to the
+ * next readdir() call.
+ */
+#define for_each_subsystem(sys_dir, sys_dirent) \
+ while ((sys_dirent = readdir(sys_dir)) != NULL) \
+ if (sys_dirent->d_type == DT_DIR && \
+ (strcmp(sys_dirent->d_name, ".")) && \
+ (strcmp(sys_dirent->d_name, "..")))
static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
{
@@ -159,12 +159,12 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
return 0;
}
-#define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) \
- while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next) \
- if (evt_dirent.d_type == DT_DIR && \
- (strcmp(evt_dirent.d_name, ".")) && \
- (strcmp(evt_dirent.d_name, "..")) && \
- (!tp_event_has_id(&sys_dirent, &evt_dirent)))
+/*
+ * Iterate over the entries of @evt_dir with readdir(), assigning each one
+ * to @evt_dirent (a struct dirent *), and run the statement that follows
+ * only for directories other than "." and ".." for which
+ * tp_event_has_id(@sys_dirent, @evt_dirent) returns zero.
+ */
+#define for_each_event(sys_dirent, evt_dir, evt_dirent) \
+ while ((evt_dirent = readdir(evt_dir)) != NULL) \
+ if (evt_dirent->d_type == DT_DIR && \
+ (strcmp(evt_dirent->d_name, ".")) && \
+ (strcmp(evt_dirent->d_name, "..")) && \
+ (!tp_event_has_id(sys_dirent, evt_dirent)))
#define MAX_EVENT_LENGTH 512
@@ -173,7 +173,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
{
struct tracepoint_path *path = NULL;
DIR *sys_dir, *evt_dir;
- struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
+ struct dirent *sys_dirent, *evt_dirent;
char id_buf[24];
int fd;
u64 id;
@@ -184,18 +184,18 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
if (!sys_dir)
return NULL;
- for_each_subsystem(sys_dir, sys_dirent, sys_next) {
+ for_each_subsystem(sys_dir, sys_dirent) {
snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
- sys_dirent.d_name);
+ sys_dirent->d_name);
evt_dir = opendir(dir_path);
if (!evt_dir)
continue;
- for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
+ for_each_event(sys_dirent, evt_dir, evt_dirent) {
snprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path,
- evt_dirent.d_name);
+ evt_dirent->d_name);
fd = open(evt_path, O_RDONLY);
if (fd < 0)
continue;
@@ -220,9 +220,9 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
free(path);
return NULL;
}
- strncpy(path->system, sys_dirent.d_name,
+ strncpy(path->system, sys_dirent->d_name,
MAX_EVENT_LENGTH);
- strncpy(path->name, evt_dirent.d_name,
+ strncpy(path->name, evt_dirent->d_name,
MAX_EVENT_LENGTH);
return path;
}
@@ -1812,7 +1812,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
bool name_only)
{
DIR *sys_dir, *evt_dir;
- struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
+ struct dirent *sys_dirent, *evt_dirent;
char evt_path[MAXPATHLEN];
char dir_path[MAXPATHLEN];
char **evt_list = NULL;
@@ -1830,20 +1830,20 @@ restart:
goto out_close_sys_dir;
}
- for_each_subsystem(sys_dir, sys_dirent, sys_next) {
+ for_each_subsystem(sys_dir, sys_dirent) {
if (subsys_glob != NULL &&
- !strglobmatch(sys_dirent.d_name, subsys_glob))
+ !strglobmatch(sys_dirent->d_name, subsys_glob))
continue;
snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
- sys_dirent.d_name);
+ sys_dirent->d_name);
evt_dir = opendir(dir_path);
if (!evt_dir)
continue;
- for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
+ for_each_event(sys_dirent, evt_dir, evt_dirent) {
if (event_glob != NULL &&
- !strglobmatch(evt_dirent.d_name, event_glob))
+ !strglobmatch(evt_dirent->d_name, event_glob))
continue;
if (!evt_num_known) {
@@ -1852,7 +1852,7 @@ restart:
}
snprintf(evt_path, MAXPATHLEN, "%s:%s",
- sys_dirent.d_name, evt_dirent.d_name);
+ sys_dirent->d_name, evt_dirent->d_name);
evt_list[evt_i] = strdup(evt_path);
if (evt_list[evt_i] == NULL)
@@ -1905,7 +1905,7 @@ out_close_sys_dir:
int is_valid_tracepoint(const char *event_string)
{
DIR *sys_dir, *evt_dir;
- struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
+ struct dirent *sys_dirent, *evt_dirent;
char evt_path[MAXPATHLEN];
char dir_path[MAXPATHLEN];
@@ -1913,17 +1913,17 @@ int is_valid_tracepoint(const char *event_string)
if (!sys_dir)
return 0;
- for_each_subsystem(sys_dir, sys_dirent, sys_next) {
+ for_each_subsystem(sys_dir, sys_dirent) {
snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
- sys_dirent.d_name);
+ sys_dirent->d_name);
evt_dir = opendir(dir_path);
if (!evt_dir)
continue;
- for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
+ for_each_event(sys_dirent, evt_dir, evt_dirent) {
snprintf(evt_path, MAXPATHLEN, "%s:%s",
- sys_dirent.d_name, evt_dirent.d_name);
+ sys_dirent->d_name, evt_dirent->d_name);
if (!strcmp(evt_path, event_string)) {
closedir(evt_dir);
closedir(sys_dir);
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index adef23b1352e..ddb0261b2577 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -602,14 +602,13 @@ static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v,
static __u64 pmu_format_max_value(const unsigned long *format)
{
- int w;
+ __u64 w = 0;
+ int fbit;
- w = bitmap_weight(format, PERF_PMU_FORMAT_BITS);
- if (!w)
- return 0;
- if (w < 64)
- return (1ULL << w) - 1;
- return -1;
+ for_each_set_bit(fbit, format, PERF_PMU_FORMAT_BITS)
+ w |= (1ULL << fbit);
+
+ return w;
}
/*
@@ -644,20 +643,20 @@ static int pmu_resolve_param_term(struct parse_events_term *term,
static char *pmu_formats_string(struct list_head *formats)
{
struct perf_pmu_format *format;
- char *str;
- struct strbuf buf;
+ char *str = NULL;
+ struct strbuf buf = STRBUF_INIT;
unsigned i = 0;
if (!formats)
return NULL;
- strbuf_init(&buf, 0);
/* sysfs exported terms */
list_for_each_entry(format, formats, list)
- strbuf_addf(&buf, i++ ? ",%s" : "%s",
- format->name);
+ if (strbuf_addf(&buf, i++ ? ",%s" : "%s", format->name) < 0)
+ goto error;
str = strbuf_detach(&buf, NULL);
+error:
strbuf_release(&buf);
return str;
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 8319fbb08636..74401a20106d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -265,6 +265,65 @@ static bool kprobe_warn_out_range(const char *symbol, unsigned long address)
return true;
}
+/*
+ * NOTE:
+ * '.gnu.linkonce.this_module' section of kernel module elf directly
+ * maps to 'struct module' from linux/module.h. This section contains
+ * actual module name which will be used by kernel after loading it.
+ * But, we cannot use 'struct module' here since linux/module.h is not
+ * exposed to user-space. Offset of 'name' has remained the same for a
+ * long time, so it is hardcoded here.
+ */
+#ifdef __LP64__
+#define MOD_NAME_OFFSET 24
+#else
+#define MOD_NAME_OFFSET 12
+#endif
+
+/*
+ * @module can be a module name or a module file path. In case of a path,
+ * inspect the ELF file and find out what the actual module name is.
+ *
+ * Returns a newly allocated copy of the name, which the caller has to
+ * free, or NULL when @module cannot be opened or read as an ELF file,
+ * has no usable '.gnu.linkonce.this_module' section, or the copy cannot
+ * be allocated.
+ */
+static char *find_module_name(const char *module)
+{
+	int fd;
+	Elf *elf;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	Elf_Data *data;
+	Elf_Scn *sec;
+	char *mod_name = NULL;
+
+	fd = open(module, O_RDONLY);
+	if (fd < 0)
+		return NULL;
+
+	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
+	if (elf == NULL)
+		goto elf_err;
+
+	if (gelf_getehdr(elf, &ehdr) == NULL)
+		goto ret_err;
+
+	sec = elf_section_by_name(elf, &ehdr, &shdr,
+				  ".gnu.linkonce.this_module", NULL);
+	if (!sec)
+		goto ret_err;
+
+	data = elf_getdata(sec, NULL);
+	if (!data || !data->d_buf)
+		goto ret_err;
+
+	/*
+	 * The section comes from a file on disk, so its size is not trusted:
+	 * the name has to start inside the section data, and the copy must
+	 * not run past the end of it if the name is not NUL-terminated.
+	 */
+	if (data->d_size <= MOD_NAME_OFFSET)
+		goto ret_err;
+
+	mod_name = strndup((char *)data->d_buf + MOD_NAME_OFFSET,
+			   data->d_size - MOD_NAME_OFFSET);
+
+ret_err:
+	elf_end(elf);
+elf_err:
+	close(fd);
+	return mod_name;
+}
+
#ifdef HAVE_DWARF_SUPPORT
static int kernel_get_module_dso(const char *module, struct dso **pdso)
@@ -486,8 +545,10 @@ static int get_text_start_address(const char *exec, unsigned long *address)
return -errno;
elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
- if (elf == NULL)
- return -EINVAL;
+ if (elf == NULL) {
+ ret = -EINVAL;
+ goto out_close;
+ }
if (gelf_getehdr(elf, &ehdr) == NULL)
goto out;
@@ -499,6 +560,9 @@ static int get_text_start_address(const char *exec, unsigned long *address)
ret = 0;
out:
elf_end(elf);
+out_close:
+ close(fd);
+
return ret;
}
@@ -583,32 +647,23 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs,
int ntevs, const char *module)
{
int i, ret = 0;
- char *tmp;
+ char *mod_name = NULL;
if (!module)
return 0;
- tmp = strrchr(module, '/');
- if (tmp) {
- /* This is a module path -- get the module name */
- module = strdup(tmp + 1);
- if (!module)
- return -ENOMEM;
- tmp = strchr(module, '.');
- if (tmp)
- *tmp = '\0';
- tmp = (char *)module; /* For free() */
- }
+ mod_name = find_module_name(module);
for (i = 0; i < ntevs; i++) {
- tevs[i].point.module = strdup(module);
+ tevs[i].point.module =
+ strdup(mod_name ? mod_name : module);
if (!tevs[i].point.module) {
ret = -ENOMEM;
break;
}
}
- free(tmp);
+ free(mod_name);
return ret;
}
@@ -1618,69 +1673,65 @@ out:
}
/* Compose only probe arg */
-int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len)
+char *synthesize_perf_probe_arg(struct perf_probe_arg *pa)
{
struct perf_probe_arg_field *field = pa->field;
- int ret;
- char *tmp = buf;
+ struct strbuf buf;
+ char *ret = NULL;
+ int err;
+
+ if (strbuf_init(&buf, 64) < 0)
+ return NULL;
if (pa->name && pa->var)
- ret = e_snprintf(tmp, len, "%s=%s", pa->name, pa->var);
+ err = strbuf_addf(&buf, "%s=%s", pa->name, pa->var);
else
- ret = e_snprintf(tmp, len, "%s", pa->name ? pa->name : pa->var);
- if (ret <= 0)
- goto error;
- tmp += ret;
- len -= ret;
+ err = strbuf_addstr(&buf, pa->name ?: pa->var);
+ if (err)
+ goto out;
while (field) {
if (field->name[0] == '[')
- ret = e_snprintf(tmp, len, "%s", field->name);
+ err = strbuf_addstr(&buf, field->name);
else
- ret = e_snprintf(tmp, len, "%s%s",
- field->ref ? "->" : ".", field->name);
- if (ret <= 0)
- goto error;
- tmp += ret;
- len -= ret;
+ err = strbuf_addf(&buf, "%s%s", field->ref ? "->" : ".",
+ field->name);
field = field->next;
+ if (err)
+ goto out;
}
- if (pa->type) {
- ret = e_snprintf(tmp, len, ":%s", pa->type);
- if (ret <= 0)
- goto error;
- tmp += ret;
- len -= ret;
- }
+ if (pa->type)
+ if (strbuf_addf(&buf, ":%s", pa->type) < 0)
+ goto out;
- return tmp - buf;
-error:
- pr_debug("Failed to synthesize perf probe argument: %d\n", ret);
+ ret = strbuf_detach(&buf, NULL);
+out:
+ strbuf_release(&buf);
return ret;
}
/* Compose only probe point (not argument) */
static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
{
- char *buf, *tmp;
- char offs[32] = "", line[32] = "", file[32] = "";
- int ret, len;
+ struct strbuf buf;
+ char *tmp, *ret = NULL;
+ int len, err = 0;
- buf = zalloc(MAX_CMDLEN);
- if (buf == NULL) {
- ret = -ENOMEM;
- goto error;
- }
- if (pp->offset) {
- ret = e_snprintf(offs, 32, "+%lu", pp->offset);
- if (ret <= 0)
- goto error;
- }
- if (pp->line) {
- ret = e_snprintf(line, 32, ":%d", pp->line);
- if (ret <= 0)
- goto error;
+ if (strbuf_init(&buf, 64) < 0)
+ return NULL;
+
+ if (pp->function) {
+ if (strbuf_addstr(&buf, pp->function) < 0)
+ goto out;
+ if (pp->offset)
+ err = strbuf_addf(&buf, "+%lu", pp->offset);
+ else if (pp->line)
+ err = strbuf_addf(&buf, ":%d", pp->line);
+ else if (pp->retprobe)
+ err = strbuf_addstr(&buf, "%return");
+ if (err)
+ goto out;
}
if (pp->file) {
tmp = pp->file;
@@ -1689,25 +1740,15 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
tmp = strchr(pp->file + len - 30, '/');
tmp = tmp ? tmp + 1 : pp->file + len - 30;
}
- ret = e_snprintf(file, 32, "@%s", tmp);
- if (ret <= 0)
- goto error;
+ err = strbuf_addf(&buf, "@%s", tmp);
+ if (!err && !pp->function && pp->line)
+ err = strbuf_addf(&buf, ":%d", pp->line);
}
-
- if (pp->function)
- ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s%s", pp->function,
- offs, pp->retprobe ? "%return" : "", line,
- file);
- else
- ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", file, line);
- if (ret <= 0)
- goto error;
-
- return buf;
-error:
- pr_debug("Failed to synthesize perf probe point: %d\n", ret);
- free(buf);
- return NULL;
+ if (!err)
+ ret = strbuf_detach(&buf, NULL);
+out:
+ strbuf_release(&buf);
+ return ret;
}
#if 0
@@ -1736,45 +1777,32 @@ char *synthesize_perf_probe_command(struct perf_probe_event *pev)
#endif
static int __synthesize_probe_trace_arg_ref(struct probe_trace_arg_ref *ref,
- char **buf, size_t *buflen,
- int depth)
+ struct strbuf *buf, int depth)
{
- int ret;
+ int err;
if (ref->next) {
depth = __synthesize_probe_trace_arg_ref(ref->next, buf,
- buflen, depth + 1);
+ depth + 1);
if (depth < 0)
- goto out;
- }
-
- ret = e_snprintf(*buf, *buflen, "%+ld(", ref->offset);
- if (ret < 0)
- depth = ret;
- else {
- *buf += ret;
- *buflen -= ret;
+ return depth;
}
-out:
- return depth;
-
+ err = strbuf_addf(buf, "%+ld(", ref->offset);
+ return (err < 0) ? err : depth;
}
static int synthesize_probe_trace_arg(struct probe_trace_arg *arg,
- char *buf, size_t buflen)
+ struct strbuf *buf)
{
struct probe_trace_arg_ref *ref = arg->ref;
- int ret, depth = 0;
- char *tmp = buf;
+ int depth = 0, err;
/* Argument name or separator */
if (arg->name)
- ret = e_snprintf(buf, buflen, " %s=", arg->name);
+ err = strbuf_addf(buf, " %s=", arg->name);
else
- ret = e_snprintf(buf, buflen, " ");
- if (ret < 0)
- return ret;
- buf += ret;
- buflen -= ret;
+ err = strbuf_addch(buf, ' ');
+ if (err)
+ return err;
/* Special case: @XXX */
if (arg->value[0] == '@' && arg->ref)
@@ -1782,59 +1810,44 @@ static int synthesize_probe_trace_arg(struct probe_trace_arg *arg,
/* Dereferencing arguments */
if (ref) {
- depth = __synthesize_probe_trace_arg_ref(ref, &buf,
- &buflen, 1);
+ depth = __synthesize_probe_trace_arg_ref(ref, buf, 1);
if (depth < 0)
return depth;
}
/* Print argument value */
if (arg->value[0] == '@' && arg->ref)
- ret = e_snprintf(buf, buflen, "%s%+ld", arg->value,
- arg->ref->offset);
+ err = strbuf_addf(buf, "%s%+ld", arg->value, arg->ref->offset);
else
- ret = e_snprintf(buf, buflen, "%s", arg->value);
- if (ret < 0)
- return ret;
- buf += ret;
- buflen -= ret;
+ err = strbuf_addstr(buf, arg->value);
/* Closing */
- while (depth--) {
- ret = e_snprintf(buf, buflen, ")");
- if (ret < 0)
- return ret;
- buf += ret;
- buflen -= ret;
- }
+ while (!err && depth--)
+ err = strbuf_addch(buf, ')');
+
/* Print argument type */
- if (arg->type) {
- ret = e_snprintf(buf, buflen, ":%s", arg->type);
- if (ret <= 0)
- return ret;
- buf += ret;
- }
+ if (!err && arg->type)
+ err = strbuf_addf(buf, ":%s", arg->type);
- return buf - tmp;
+ return err;
}
char *synthesize_probe_trace_command(struct probe_trace_event *tev)
{
struct probe_trace_point *tp = &tev->point;
- char *buf;
- int i, len, ret;
+ struct strbuf buf;
+ char *ret = NULL;
+ int i, err;
- buf = zalloc(MAX_CMDLEN);
- if (buf == NULL)
+ /* Uprobes must have tp->module */
+ if (tev->uprobes && !tp->module)
return NULL;
- len = e_snprintf(buf, MAX_CMDLEN, "%c:%s/%s ", tp->retprobe ? 'r' : 'p',
- tev->group, tev->event);
- if (len <= 0)
- goto error;
+ if (strbuf_init(&buf, 32) < 0)
+ return NULL;
- /* Uprobes must have tp->module */
- if (tev->uprobes && !tp->module)
+ if (strbuf_addf(&buf, "%c:%s/%s ", tp->retprobe ? 'r' : 'p',
+ tev->group, tev->event) < 0)
goto error;
/*
* If tp->address == 0, then this point must be a
@@ -1849,34 +1862,25 @@ char *synthesize_probe_trace_command(struct probe_trace_event *tev)
/* Use the tp->address for uprobes */
if (tev->uprobes)
- ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s:0x%lx",
- tp->module, tp->address);
+ err = strbuf_addf(&buf, "%s:0x%lx", tp->module, tp->address);
else if (!strncmp(tp->symbol, "0x", 2))
/* Absolute address. See try_to_find_absolute_address() */
- ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s%s0x%lx",
- tp->module ?: "", tp->module ? ":" : "",
- tp->address);
+ err = strbuf_addf(&buf, "%s%s0x%lx", tp->module ?: "",
+ tp->module ? ":" : "", tp->address);
else
- ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s%s%s+%lu",
- tp->module ?: "", tp->module ? ":" : "",
- tp->symbol, tp->offset);
-
- if (ret <= 0)
+ err = strbuf_addf(&buf, "%s%s%s+%lu", tp->module ?: "",
+ tp->module ? ":" : "", tp->symbol, tp->offset);
+ if (err)
goto error;
- len += ret;
- for (i = 0; i < tev->nargs; i++) {
- ret = synthesize_probe_trace_arg(&tev->args[i], buf + len,
- MAX_CMDLEN - len);
- if (ret <= 0)
+ for (i = 0; i < tev->nargs; i++)
+ if (synthesize_probe_trace_arg(&tev->args[i], &buf) < 0)
goto error;
- len += ret;
- }
- return buf;
+ ret = strbuf_detach(&buf, NULL);
error:
- free(buf);
- return NULL;
+ strbuf_release(&buf);
+ return ret;
}
static int find_perf_probe_point_from_map(struct probe_trace_point *tp,
@@ -1958,7 +1962,7 @@ static int convert_to_perf_probe_point(struct probe_trace_point *tp,
static int convert_to_perf_probe_event(struct probe_trace_event *tev,
struct perf_probe_event *pev, bool is_kprobe)
{
- char buf[64] = "";
+ struct strbuf buf = STRBUF_INIT;
int i, ret;
/* Convert event/group name */
@@ -1981,14 +1985,15 @@ static int convert_to_perf_probe_event(struct probe_trace_event *tev,
if (tev->args[i].name)
pev->args[i].name = strdup(tev->args[i].name);
else {
- ret = synthesize_probe_trace_arg(&tev->args[i],
- buf, 64);
- pev->args[i].name = strdup(buf);
+ if ((ret = strbuf_init(&buf, 32)) < 0)
+ goto error;
+ ret = synthesize_probe_trace_arg(&tev->args[i], &buf);
+ pev->args[i].name = strbuf_detach(&buf, NULL);
}
if (pev->args[i].name == NULL && ret >= 0)
ret = -ENOMEM;
}
-
+error:
if (ret < 0)
clear_perf_probe_event(pev);
@@ -2162,35 +2167,38 @@ static int perf_probe_event__sprintf(const char *group, const char *event,
struct strbuf *result)
{
int i, ret;
- char buf[128];
- char *place;
+ char *buf;
- /* Synthesize only event probe point */
- place = synthesize_perf_probe_point(&pev->point);
- if (!place)
- return -EINVAL;
+ if (asprintf(&buf, "%s:%s", group, event) < 0)
+ return -errno;
+ ret = strbuf_addf(result, " %-20s (on ", buf);
+ free(buf);
+ if (ret)
+ return ret;
- ret = e_snprintf(buf, 128, "%s:%s", group, event);
- if (ret < 0)
- goto out;
+ /* Synthesize only event probe point */
+ buf = synthesize_perf_probe_point(&pev->point);
+ if (!buf)
+ return -ENOMEM;
+ ret = strbuf_addstr(result, buf);
+ free(buf);
- strbuf_addf(result, " %-20s (on %s", buf, place);
- if (module)
- strbuf_addf(result, " in %s", module);
+ if (!ret && module)
+ ret = strbuf_addf(result, " in %s", module);
- if (pev->nargs > 0) {
- strbuf_add(result, " with", 5);
- for (i = 0; i < pev->nargs; i++) {
- ret = synthesize_perf_probe_arg(&pev->args[i],
- buf, 128);
- if (ret < 0)
- goto out;
- strbuf_addf(result, " %s", buf);
+ if (!ret && pev->nargs > 0) {
+ ret = strbuf_add(result, " with", 5);
+ for (i = 0; !ret && i < pev->nargs; i++) {
+ buf = synthesize_perf_probe_arg(&pev->args[i]);
+ if (!buf)
+ return -ENOMEM;
+ ret = strbuf_addf(result, " %s", buf);
+ free(buf);
}
}
- strbuf_addch(result, ')');
-out:
- free(place);
+ if (!ret)
+ ret = strbuf_addch(result, ')');
+
return ret;
}
@@ -2498,7 +2506,8 @@ static int find_probe_functions(struct map *map, char *name,
void __weak arch__fix_tev_from_maps(struct perf_probe_event *pev __maybe_unused,
struct probe_trace_event *tev __maybe_unused,
- struct map *map __maybe_unused) { }
+ struct map *map __maybe_unused,
+ struct symbol *sym __maybe_unused) { }
/*
* Find probe function addresses from map.
@@ -2516,6 +2525,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
struct probe_trace_point *tp;
int num_matched_functions;
int ret, i, j, skipped = 0;
+ char *mod_name;
map = get_target_map(pev->target, pev->uprobes);
if (!map) {
@@ -2600,9 +2610,19 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
tp->realname = strdup_or_goto(sym->name, nomem_out);
tp->retprobe = pp->retprobe;
- if (pev->target)
- tev->point.module = strdup_or_goto(pev->target,
- nomem_out);
+ if (pev->target) {
+ if (pev->uprobes) {
+ tev->point.module = strdup_or_goto(pev->target,
+ nomem_out);
+ } else {
+ mod_name = find_module_name(pev->target);
+ tev->point.module =
+ strdup(mod_name ? mod_name : pev->target);
+ free(mod_name);
+ if (!tev->point.module)
+ goto nomem_out;
+ }
+ }
tev->uprobes = pev->uprobes;
tev->nargs = pev->nargs;
if (tev->nargs) {
@@ -2624,7 +2644,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
strdup_or_goto(pev->args[i].type,
nomem_out);
}
- arch__fix_tev_from_maps(pev, tev, map);
+ arch__fix_tev_from_maps(pev, tev, map, sym);
}
if (ret == skipped) {
ret = -ENOENT;
@@ -2743,9 +2763,13 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev,
{
int ret;
- if (pev->uprobes && !pev->group) {
- /* Replace group name if not given */
- ret = convert_exec_to_group(pev->target, &pev->group);
+ if (!pev->group) {
+ /* Set group name if not given */
+ if (!pev->uprobes) {
+ pev->group = strdup(PERFPROBE_GROUP);
+ ret = pev->group ? 0 : -ENOMEM;
+ } else
+ ret = convert_exec_to_group(pev->target, &pev->group);
if (ret != 0) {
pr_warning("Failed to make a group name.\n");
return ret;
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index e54e7b011577..5a27eb4fad05 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -120,7 +120,7 @@ int parse_probe_trace_command(const char *cmd, struct probe_trace_event *tev);
/* Events to command string */
char *synthesize_perf_probe_command(struct perf_probe_event *pev);
char *synthesize_probe_trace_command(struct probe_trace_event *tev);
-int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len);
+char *synthesize_perf_probe_arg(struct perf_probe_arg *pa);
/* Check the perf_probe_event needs debuginfo */
bool perf_probe_event_need_dwarf(struct perf_probe_event *pev);
@@ -154,7 +154,8 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
int show_available_funcs(const char *module, struct strfilter *filter, bool user);
bool arch__prefers_symtab(void);
void arch__fix_tev_from_maps(struct perf_probe_event *pev,
- struct probe_trace_event *tev, struct map *map);
+ struct probe_trace_event *tev, struct map *map,
+ struct symbol *sym);
/* If there is no space to write, returns -E2BIG. */
int e_snprintf(char *str, size_t size, const char *format, ...)
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index e3b3b92e4458..3fe6214970e6 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -220,8 +220,7 @@ int probe_file__add_event(int fd, struct probe_trace_event *tev)
pr_debug("Writing event: %s\n", buf);
if (!probe_event_dry_run) {
- ret = write(fd, buf, strlen(buf));
- if (ret <= 0) {
+ if (write(fd, buf, strlen(buf)) < (int)strlen(buf)) {
ret = -errno;
pr_warning("Failed to write event: %s\n",
strerror_r(errno, sbuf, sizeof(sbuf)));
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index b3bd0fba0237..1259839dbf6d 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -553,7 +553,7 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
static int find_variable(Dwarf_Die *sc_die, struct probe_finder *pf)
{
Dwarf_Die vr_die;
- char buf[32], *ptr;
+ char *buf, *ptr;
int ret = 0;
/* Copy raw parameters */
@@ -563,13 +563,13 @@ static int find_variable(Dwarf_Die *sc_die, struct probe_finder *pf)
if (pf->pvar->name)
pf->tvar->name = strdup(pf->pvar->name);
else {
- ret = synthesize_perf_probe_arg(pf->pvar, buf, 32);
- if (ret < 0)
- return ret;
+ buf = synthesize_perf_probe_arg(pf->pvar);
+ if (!buf)
+ return -ENOMEM;
ptr = strchr(buf, ':'); /* Change type separator to _ */
if (ptr)
*ptr = '_';
- pf->tvar->name = strdup(buf);
+ pf->tvar->name = buf;
}
if (pf->tvar->name == NULL)
return -ENOMEM;
@@ -1294,6 +1294,7 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
{
struct available_var_finder *af = data;
struct variable_list *vl;
+ struct strbuf buf = STRBUF_INIT;
int tag, ret;
vl = &af->vls[af->nvls - 1];
@@ -1307,25 +1308,26 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
if (ret == 0 || ret == -ERANGE) {
int ret2;
bool externs = !af->child;
- struct strbuf buf;
- strbuf_init(&buf, 64);
+ if (strbuf_init(&buf, 64) < 0)
+ goto error;
if (probe_conf.show_location_range) {
- if (!externs) {
- if (ret)
- strbuf_add(&buf, "[INV]\t", 6);
- else
- strbuf_add(&buf, "[VAL]\t", 6);
- } else
- strbuf_add(&buf, "[EXT]\t", 6);
+ if (!externs)
+ ret2 = strbuf_add(&buf,
+ ret ? "[INV]\t" : "[VAL]\t", 6);
+ else
+ ret2 = strbuf_add(&buf, "[EXT]\t", 6);
+ if (ret2)
+ goto error;
}
ret2 = die_get_varname(die_mem, &buf);
if (!ret2 && probe_conf.show_location_range &&
!externs) {
- strbuf_addch(&buf, '\t');
+ if (strbuf_addch(&buf, '\t') < 0)
+ goto error;
ret2 = die_get_var_range(&af->pf.sp_die,
die_mem, &buf);
}
@@ -1343,6 +1345,10 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
return DIE_FIND_CB_CONTINUE;
else
return DIE_FIND_CB_SIBLING;
+error:
+ strbuf_release(&buf);
+ pr_debug("Error in strbuf\n");
+ return DIE_FIND_CB_END;
}
/* Add a found vars into available variables list */
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources
index 8162ba0e2e57..36c6862119e3 100644
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -23,3 +23,4 @@ util/strlist.c
util/trace-event.c
../lib/rbtree.c
util/string.c
+util/symbol_fprintf.c
diff --git a/tools/perf/util/quote.c b/tools/perf/util/quote.c
index 01f03242b86a..c6d4ee2de752 100644
--- a/tools/perf/util/quote.c
+++ b/tools/perf/util/quote.c
@@ -17,38 +17,42 @@ static inline int need_bs_quote(char c)
return (c == '\'' || c == '!');
}
-static void sq_quote_buf(struct strbuf *dst, const char *src)
+static int sq_quote_buf(struct strbuf *dst, const char *src)
{
char *to_free = NULL;
+ int ret;
if (dst->buf == src)
to_free = strbuf_detach(dst, NULL);
- strbuf_addch(dst, '\'');
- while (*src) {
+ ret = strbuf_addch(dst, '\'');
+ while (!ret && *src) {
size_t len = strcspn(src, "'!");
- strbuf_add(dst, src, len);
+ ret = strbuf_add(dst, src, len);
src += len;
- while (need_bs_quote(*src)) {
- strbuf_addstr(dst, "'\\");
- strbuf_addch(dst, *src++);
- strbuf_addch(dst, '\'');
- }
+ while (!ret && need_bs_quote(*src))
+ ret = strbuf_addf(dst, "'\\%c\'", *src++);
}
- strbuf_addch(dst, '\'');
+ if (!ret)
+ ret = strbuf_addch(dst, '\'');
free(to_free);
+
+ return ret;
}
-void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
+int sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
{
- int i;
+ int i, ret;
/* Copy into destination buffer. */
- strbuf_grow(dst, 255);
- for (i = 0; argv[i]; ++i) {
- strbuf_addch(dst, ' ');
- sq_quote_buf(dst, argv[i]);
+ ret = strbuf_grow(dst, 255);
+ for (i = 0; !ret && argv[i]; ++i) {
+ ret = strbuf_addch(dst, ' ');
+ if (ret)
+ break;
+ ret = sq_quote_buf(dst, argv[i]);
if (maxlen && dst->len > maxlen)
die("Too many or long arguments");
}
+ return ret;
}
diff --git a/tools/perf/util/quote.h b/tools/perf/util/quote.h
index 3340c9c4a6ca..e1ec19146fb0 100644
--- a/tools/perf/util/quote.h
+++ b/tools/perf/util/quote.h
@@ -24,6 +24,6 @@
* sq_quote() in a real application.
*/
-void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
+int sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
#endif /* __PERF_QUOTE_H */
diff --git a/tools/perf/util/rb_resort.h b/tools/perf/util/rb_resort.h
new file mode 100644
index 000000000000..abc76e3d3098
--- /dev/null
+++ b/tools/perf/util/rb_resort.h
@@ -0,0 +1,149 @@
+#ifndef _PERF_RESORT_RB_H_
+#define _PERF_RESORT_RB_H_
+/*
+ * Template for creating a class to resort an existing rb_tree according to
+ * a new sort criteria, that must be present in the entries of the source
+ * rb_tree.
+ *
+ * (c) 2016 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Quick example, resorting threads by its shortname:
+ *
+ * First define the prefix (threads) to be used for the functions and data
+ * structures created, and provide an expression for the sorting, then the
+ * fields to be present in each of the entries in the new, sorted, rb_tree.
+ *
+ * The body of the init function should collect the fields, maybe
+ * pre-calculating them from multiple entries in the original 'entry' from
+ * the rb_tree used as a source for the entries to be sorted:
+
+DEFINE_RESORT_RB(threads, strcmp(a->thread->shortname,
+ b->thread->shortname) < 0,
+ struct thread *thread;
+)
+{
+ entry->thread = rb_entry(nd, struct thread, rb_node);
+}
+
+ * After this it is just a matter of instantiating it and iterating it.
+ * For a few data structures with existing rb_trees, such as 'struct machine',
+ * helpers are available to get the rb_root and the nr_entries:
+
+ DECLARE_RESORT_RB_MACHINE_THREADS(threads, machine_ptr);
+
+ * This will instantiate the new rb_tree and a cursor for it, that can be used as:
+
+ struct rb_node *nd;
+
+ resort_rb__for_each(nd, threads) {
+ struct thread *t = threads_entry->thread;
+ printf("%s: %d\n", t->shortname, t->tid);
+ }
+
+ * Then delete it:
+
+ resort_rb__delete(threads);
+
+ * The name of the data structures and functions will have a _sorted suffix
+ * right before the method names, i.e. will look like:
+ *
+ * struct threads_sorted_entry {}
+ * threads_sorted__insert()
+ */
+
+#define DEFINE_RESORT_RB(__name, __comp, ...) \
+struct __name##_sorted_entry { \
+ struct rb_node rb_node; \
+ __VA_ARGS__ \
+}; \
+static void __name##_sorted__init_entry(struct rb_node *nd, \
+ struct __name##_sorted_entry *entry); \
+ \
+static int __name##_sorted__cmp(struct rb_node *nda, struct rb_node *ndb) \
+{ \
+ struct __name##_sorted_entry *a, *b; \
+ a = rb_entry(nda, struct __name##_sorted_entry, rb_node); \
+ b = rb_entry(ndb, struct __name##_sorted_entry, rb_node); \
+ return __comp; \
+} \
+ \
+struct __name##_sorted { \
+ struct rb_root entries; \
+ struct __name##_sorted_entry nd[0]; \
+}; \
+ \
+static void __name##_sorted__insert(struct __name##_sorted *sorted, \
+ struct rb_node *sorted_nd) \
+{ \
+ struct rb_node **p = &sorted->entries.rb_node, *parent = NULL; \
+ while (*p != NULL) { \
+ parent = *p; \
+ if (__name##_sorted__cmp(sorted_nd, parent)) \
+ p = &(*p)->rb_left; \
+ else \
+ p = &(*p)->rb_right; \
+ } \
+ rb_link_node(sorted_nd, parent, p); \
+ rb_insert_color(sorted_nd, &sorted->entries); \
+} \
+ \
+static void __name##_sorted__sort(struct __name##_sorted *sorted, \
+ struct rb_root *entries) \
+{ \
+ struct rb_node *nd; \
+ unsigned int i = 0; \
+ for (nd = rb_first(entries); nd; nd = rb_next(nd)) { \
+ struct __name##_sorted_entry *snd = &sorted->nd[i++]; \
+ __name##_sorted__init_entry(nd, snd); \
+ __name##_sorted__insert(sorted, &snd->rb_node); \
+ } \
+} \
+ \
+static struct __name##_sorted *__name##_sorted__new(struct rb_root *entries, \
+ int nr_entries) \
+{ \
+ struct __name##_sorted *sorted; \
+ sorted = malloc(sizeof(*sorted) + sizeof(sorted->nd[0]) * nr_entries); \
+ if (sorted) { \
+ sorted->entries = RB_ROOT; \
+ __name##_sorted__sort(sorted, entries); \
+ } \
+ return sorted; \
+} \
+ \
+static void __name##_sorted__delete(struct __name##_sorted *sorted) \
+{ \
+ free(sorted); \
+} \
+ \
+static void __name##_sorted__init_entry(struct rb_node *nd, \
+ struct __name##_sorted_entry *entry)
+
+#define DECLARE_RESORT_RB(__name) \
+struct __name##_sorted_entry *__name##_entry; \
+struct __name##_sorted *__name = __name##_sorted__new
+
+#define resort_rb__for_each(__nd, __name) \
+ for (__nd = rb_first(&__name->entries); \
+ __name##_entry = rb_entry(__nd, struct __name##_sorted_entry, \
+ rb_node), __nd; \
+ __nd = rb_next(__nd))
+
+#define resort_rb__delete(__name) \
+ __name##_sorted__delete(__name), __name = NULL
+
+/*
+ * Helpers for other classes that contain both an rbtree and the
+ * number of entries in it:
+ */
+
+/* For 'struct intlist' */
+#define DECLARE_RESORT_RB_INTLIST(__name, __ilist) \
+ DECLARE_RESORT_RB(__name)(&__ilist->rblist.entries, \
+ __ilist->rblist.nr_entries)
+
+/* For 'struct machine->threads' */
+#define DECLARE_RESORT_RB_MACHINE_THREADS(__name, __machine) \
+ DECLARE_RESORT_RB(__name)(&__machine->threads, __machine->nr_threads)
+
+#endif /* _PERF_RESORT_RB_H_ */
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index 0467367dc315..481792c7484b 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -129,7 +129,8 @@ bool perf_can_record_cpu_wide(void)
return true;
}
-void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts)
+void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts,
+ struct callchain_param *callchain)
{
struct perf_evsel *evsel;
bool use_sample_identifier = false;
@@ -148,7 +149,7 @@ void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts)
use_comm_exec = perf_can_comm_exec();
evlist__for_each(evlist, evsel) {
- perf_evsel__config(evsel, opts);
+ perf_evsel__config(evsel, opts, callchain);
if (evsel->tracking && use_comm_exec)
evsel->attr.comm_exec = 1;
}
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index b3aabc0d4eb0..62c7f6988e0e 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -31,6 +31,8 @@
#include <perl.h>
#include "../../perf.h"
+#include "../callchain.h"
+#include "../machine.h"
#include "../thread.h"
#include "../event.h"
#include "../trace-event.h"
@@ -248,10 +250,90 @@ static void define_event_symbols(struct event_format *event,
define_event_symbols(event, ev_name, args->next);
}
+static SV *perl_process_callchain(struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct addr_location *al)
+{
+ AV *list;
+
+ list = newAV();
+ if (!list)
+ goto exit;
+
+ if (!symbol_conf.use_callchain || !sample->callchain)
+ goto exit;
+
+ if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
+ sample, NULL, NULL,
+ sysctl_perf_event_max_stack) != 0) {
+ pr_err("Failed to resolve callchain. Skipping\n");
+ goto exit;
+ }
+ callchain_cursor_commit(&callchain_cursor);
+
+
+ while (1) {
+ HV *elem;
+ struct callchain_cursor_node *node;
+ node = callchain_cursor_current(&callchain_cursor);
+ if (!node)
+ break;
+
+ elem = newHV();
+ if (!elem)
+ goto exit;
+
+ if (!hv_stores(elem, "ip", newSVuv(node->ip))) {
+ hv_undef(elem);
+ goto exit;
+ }
+
+ if (node->sym) {
+ HV *sym = newHV();
+ if (!sym) {
+ hv_undef(elem);
+ goto exit;
+ }
+ if (!hv_stores(sym, "start", newSVuv(node->sym->start)) ||
+ !hv_stores(sym, "end", newSVuv(node->sym->end)) ||
+ !hv_stores(sym, "binding", newSVuv(node->sym->binding)) ||
+ !hv_stores(sym, "name", newSVpvn(node->sym->name,
+ node->sym->namelen)) ||
+ !hv_stores(elem, "sym", newRV_noinc((SV*)sym))) {
+ hv_undef(sym);
+ hv_undef(elem);
+ goto exit;
+ }
+ }
+
+ if (node->map) {
+ struct map *map = node->map;
+ const char *dsoname = "[unknown]";
+ if (map && map->dso && (map->dso->name || map->dso->long_name)) {
+ if (symbol_conf.show_kernel_path && map->dso->long_name)
+ dsoname = map->dso->long_name;
+ else if (map->dso->name)
+ dsoname = map->dso->name;
+ }
+ if (!hv_stores(elem, "dso", newSVpv(dsoname,0))) {
+ hv_undef(elem);
+ goto exit;
+ }
+ }
+
+ callchain_cursor_advance(&callchain_cursor);
+ av_push(list, newRV_noinc((SV*)elem));
+ }
+
+exit:
+ return newRV_noinc((SV*)list);
+}
+
static void perl_process_tracepoint(struct perf_sample *sample,
struct perf_evsel *evsel,
- struct thread *thread)
+ struct addr_location *al)
{
+ struct thread *thread = al->thread;
struct event_format *event = evsel->tp_format;
struct format_field *field;
static char handler[256];
@@ -295,6 +377,7 @@ static void perl_process_tracepoint(struct perf_sample *sample,
XPUSHs(sv_2mortal(newSVuv(ns)));
XPUSHs(sv_2mortal(newSViv(pid)));
XPUSHs(sv_2mortal(newSVpv(comm, 0)));
+ XPUSHs(sv_2mortal(perl_process_callchain(sample, evsel, al)));
/* common fields other than pid can be accessed via xsub fns */
@@ -329,6 +412,7 @@ static void perl_process_tracepoint(struct perf_sample *sample,
XPUSHs(sv_2mortal(newSVuv(nsecs)));
XPUSHs(sv_2mortal(newSViv(pid)));
XPUSHs(sv_2mortal(newSVpv(comm, 0)));
+ XPUSHs(sv_2mortal(perl_process_callchain(sample, evsel, al)));
call_pv("main::trace_unhandled", G_SCALAR);
}
SPAGAIN;
@@ -366,7 +450,7 @@ static void perl_process_event(union perf_event *event,
struct perf_evsel *evsel,
struct addr_location *al)
{
- perl_process_tracepoint(sample, evsel, al->thread);
+ perl_process_tracepoint(sample, evsel, al);
perl_process_event_generic(event, sample, evsel);
}
@@ -490,7 +574,27 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile)
fprintf(ofp, "use Perf::Trace::Util;\n\n");
fprintf(ofp, "sub trace_begin\n{\n\t# optional\n}\n\n");
- fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n\n");
+ fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n");
+
+
+ fprintf(ofp, "\n\
+sub print_backtrace\n\
+{\n\
+ my $callchain = shift;\n\
+ for my $node (@$callchain)\n\
+ {\n\
+ if(exists $node->{sym})\n\
+ {\n\
+ printf( \"\\t[\\%%x] \\%%s\\n\", $node->{ip}, $node->{sym}{name});\n\
+ }\n\
+ else\n\
+ {\n\
+ printf( \"\\t[\\%%x]\\n\", $node{ip});\n\
+ }\n\
+ }\n\
+}\n\n\
+");
+
while ((event = trace_find_next_event(pevent, event))) {
fprintf(ofp, "sub %s::%s\n{\n", event->system, event->name);
@@ -502,7 +606,8 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile)
fprintf(ofp, "$common_secs, ");
fprintf(ofp, "$common_nsecs,\n");
fprintf(ofp, "\t $common_pid, ");
- fprintf(ofp, "$common_comm,\n\t ");
+ fprintf(ofp, "$common_comm, ");
+ fprintf(ofp, "$common_callchain,\n\t ");
not_first = 0;
count = 0;
@@ -519,7 +624,7 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile)
fprintf(ofp, "\tprint_header($event_name, $common_cpu, "
"$common_secs, $common_nsecs,\n\t "
- "$common_pid, $common_comm);\n\n");
+ "$common_pid, $common_comm, $common_callchain);\n\n");
fprintf(ofp, "\tprintf(\"");
@@ -581,17 +686,22 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile)
fprintf(ofp, "$%s", f->name);
}
- fprintf(ofp, ");\n");
+ fprintf(ofp, ");\n\n");
+
+ fprintf(ofp, "\tprint_backtrace($common_callchain);\n");
+
fprintf(ofp, "}\n\n");
}
fprintf(ofp, "sub trace_unhandled\n{\n\tmy ($event_name, $context, "
"$common_cpu, $common_secs, $common_nsecs,\n\t "
- "$common_pid, $common_comm) = @_;\n\n");
+ "$common_pid, $common_comm, $common_callchain) = @_;\n\n");
fprintf(ofp, "\tprint_header($event_name, $common_cpu, "
"$common_secs, $common_nsecs,\n\t $common_pid, "
- "$common_comm);\n}\n\n");
+ "$common_comm, $common_callchain);\n");
+ fprintf(ofp, "\tprint_backtrace($common_callchain);\n");
+ fprintf(ofp, "}\n\n");
fprintf(ofp, "sub print_header\n{\n"
"\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n"
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index fbd05242b4e5..ff134700bf30 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -41,6 +41,7 @@
#include "../thread-stack.h"
#include "../trace-event.h"
#include "../machine.h"
+#include "../call-path.h"
#include "thread_map.h"
#include "cpumap.h"
#include "stat.h"
@@ -323,7 +324,7 @@ static PyObject *python_process_callchain(struct perf_sample *sample,
if (!symbol_conf.use_callchain || !sample->callchain)
goto exit;
- if (thread__resolve_callchain(al->thread, evsel,
+ if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
sample, NULL, NULL,
scripting_max_stack) != 0) {
pr_err("Failed to resolve callchain. Skipping\n");
@@ -407,8 +408,11 @@ static void python_process_tracepoint(struct perf_sample *sample,
if (!t)
Py_FatalError("couldn't create Python tuple");
- if (!event)
- die("ug! no event found for type %d", (int)evsel->attr.config);
+ if (!event) {
+ snprintf(handler_name, sizeof(handler_name),
+ "ug! no event found for type %" PRIu64, (u64)evsel->attr.config);
+ Py_FatalError(handler_name);
+ }
pid = raw_field_value(event, "common_pid", data);
@@ -614,7 +618,7 @@ static int python_export_dso(struct db_export *dbe, struct dso *dso,
struct machine *machine)
{
struct tables *tables = container_of(dbe, struct tables, dbe);
- char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+ char sbuild_id[SBUILD_ID_SIZE];
PyObject *t;
build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
@@ -681,7 +685,7 @@ static int python_export_sample(struct db_export *dbe,
struct tables *tables = container_of(dbe, struct tables, dbe);
PyObject *t;
- t = tuple_new(21);
+ t = tuple_new(22);
tuple_set_u64(t, 0, es->db_id);
tuple_set_u64(t, 1, es->evsel->db_id);
@@ -704,6 +708,7 @@ static int python_export_sample(struct db_export *dbe,
tuple_set_u64(t, 18, es->sample->data_src);
tuple_set_s32(t, 19, es->sample->flags & PERF_BRANCH_MASK);
tuple_set_s32(t, 20, !!(es->sample->flags & PERF_IP_FLAG_IN_TX));
+ tuple_set_u64(t, 21, es->call_path_id);
call_object(tables->sample_handler, t, "sample_table");
@@ -998,8 +1003,10 @@ static void set_table_handlers(struct tables *tables)
{
const char *perf_db_export_mode = "perf_db_export_mode";
const char *perf_db_export_calls = "perf_db_export_calls";
- PyObject *db_export_mode, *db_export_calls;
+ const char *perf_db_export_callchains = "perf_db_export_callchains";
+ PyObject *db_export_mode, *db_export_calls, *db_export_callchains;
bool export_calls = false;
+ bool export_callchains = false;
int ret;
memset(tables, 0, sizeof(struct tables));
@@ -1016,6 +1023,7 @@ static void set_table_handlers(struct tables *tables)
if (!ret)
return;
+ /* handle export calls */
tables->dbe.crp = NULL;
db_export_calls = PyDict_GetItemString(main_dict, perf_db_export_calls);
if (db_export_calls) {
@@ -1033,6 +1041,33 @@ static void set_table_handlers(struct tables *tables)
Py_FatalError("failed to create calls processor");
}
+ /* handle export callchains */
+ tables->dbe.cpr = NULL;
+ db_export_callchains = PyDict_GetItemString(main_dict,
+ perf_db_export_callchains);
+ if (db_export_callchains) {
+ ret = PyObject_IsTrue(db_export_callchains);
+ if (ret == -1)
+ handler_call_die(perf_db_export_callchains);
+ export_callchains = !!ret;
+ }
+
+ if (export_callchains) {
+ /*
+ * Attempt to use the call path root from the call return
+ * processor, if the call return processor is in use. Otherwise,
+ * we allocate a new call path root. This prevents exporting
+ * duplicate call path ids when both are in use simultaneously.
+ */
+ if (tables->dbe.crp)
+ tables->dbe.cpr = tables->dbe.crp->cpr;
+ else
+ tables->dbe.cpr = call_path_root__new();
+
+ if (!tables->dbe.cpr)
+ Py_FatalError("failed to create call path root");
+ }
+
tables->db_export_mode = true;
/*
* Reserve per symbol space for symbol->db_id via symbol__priv()
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 4abd85c6346d..2335b2824d8a 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -409,6 +409,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
tool->stat = process_stat_stub;
if (tool->stat_round == NULL)
tool->stat_round = process_stat_round_stub;
+ if (tool->time_conv == NULL)
+ tool->time_conv = process_event_op2_stub;
}
static void swap_sample_id_all(union perf_event *event, void *data)
@@ -794,6 +796,7 @@ static perf_event__swap_op perf_event__swap_ops[] = {
[PERF_RECORD_STAT] = perf_event__stat_swap,
[PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap,
[PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap,
+ [PERF_RECORD_TIME_CONV] = perf_event__all64_swap,
[PERF_RECORD_HEADER_MAX] = NULL,
};
@@ -904,7 +907,7 @@ static void callchain__printf(struct perf_evsel *evsel,
unsigned int i;
struct ip_callchain *callchain = sample->callchain;
- if (has_branch_callstack(evsel))
+ if (perf_evsel__has_branch_callstack(evsel))
callchain__lbr_callstack_printf(sample);
printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
@@ -1078,7 +1081,7 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
if (sample_type & PERF_SAMPLE_CALLCHAIN)
callchain__printf(evsel, sample);
- if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel))
+ if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !perf_evsel__has_branch_callstack(evsel))
branch_stack__printf(sample);
if (sample_type & PERF_SAMPLE_REGS_USER)
@@ -1341,6 +1344,9 @@ static s64 perf_session__process_user_event(struct perf_session *session,
return tool->stat(tool, event, session);
case PERF_RECORD_STAT_ROUND:
return tool->stat_round(tool, event, session);
+ case PERF_RECORD_TIME_CONV:
+ session->time_conv = event->time_conv;
+ return tool->time_conv(tool, event, session);
default:
return -EINVAL;
}
@@ -1830,7 +1836,11 @@ out:
out_err:
ui_progress__finish();
perf_session__warn_about_errors(session);
- ordered_events__free(&session->ordered_events);
+ /*
+ * We may be switching perf.data output, so make ordered_events
+ * reusable.
+ */
+ ordered_events__reinit(&session->ordered_events);
auxtrace__free_events(session);
session->one_mmap = false;
return err;
@@ -1947,105 +1957,6 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
return NULL;
}
-void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample,
- struct addr_location *al,
- unsigned int print_opts, unsigned int stack_depth)
-{
- struct callchain_cursor_node *node;
- int print_ip = print_opts & PRINT_IP_OPT_IP;
- int print_sym = print_opts & PRINT_IP_OPT_SYM;
- int print_dso = print_opts & PRINT_IP_OPT_DSO;
- int print_symoffset = print_opts & PRINT_IP_OPT_SYMOFFSET;
- int print_oneline = print_opts & PRINT_IP_OPT_ONELINE;
- int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE;
- char s = print_oneline ? ' ' : '\t';
-
- if (symbol_conf.use_callchain && sample->callchain) {
- struct addr_location node_al;
-
- if (thread__resolve_callchain(al->thread, evsel,
- sample, NULL, NULL,
- stack_depth) != 0) {
- if (verbose)
- error("Failed to resolve callchain. Skipping\n");
- return;
- }
- callchain_cursor_commit(&callchain_cursor);
-
- if (print_symoffset)
- node_al = *al;
-
- while (stack_depth) {
- u64 addr = 0;
-
- node = callchain_cursor_current(&callchain_cursor);
- if (!node)
- break;
-
- if (node->sym && node->sym->ignore)
- goto next;
-
- if (print_ip)
- printf("%c%16" PRIx64, s, node->ip);
-
- if (node->map)
- addr = node->map->map_ip(node->map, node->ip);
-
- if (print_sym) {
- printf(" ");
- if (print_symoffset) {
- node_al.addr = addr;
- node_al.map = node->map;
- symbol__fprintf_symname_offs(node->sym, &node_al, stdout);
- } else
- symbol__fprintf_symname(node->sym, stdout);
- }
-
- if (print_dso) {
- printf(" (");
- map__fprintf_dsoname(node->map, stdout);
- printf(")");
- }
-
- if (print_srcline)
- map__fprintf_srcline(node->map, addr, "\n ",
- stdout);
-
- if (!print_oneline)
- printf("\n");
-
- stack_depth--;
-next:
- callchain_cursor_advance(&callchain_cursor);
- }
-
- } else {
- if (al->sym && al->sym->ignore)
- return;
-
- if (print_ip)
- printf("%16" PRIx64, sample->ip);
-
- if (print_sym) {
- printf(" ");
- if (print_symoffset)
- symbol__fprintf_symname_offs(al->sym, al,
- stdout);
- else
- symbol__fprintf_symname(al->sym, stdout);
- }
-
- if (print_dso) {
- printf(" (");
- map__fprintf_dsoname(al->map, stdout);
- printf(")");
- }
-
- if (print_srcline)
- map__fprintf_srcline(al->map, al->addr, "\n ", stdout);
- }
-}
-
int perf_session__cpu_bitmap(struct perf_session *session,
const char *cpu_list, unsigned long *cpu_bitmap)
{
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 5f792e35d4c1..4bd758553450 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -26,6 +26,7 @@ struct perf_session {
struct itrace_synth_opts *itrace_synth_opts;
struct list_head auxtrace_index;
struct trace_event tevent;
+ struct time_conv_event time_conv;
bool repipe;
bool one_mmap;
void *one_mmap_addr;
@@ -35,13 +36,6 @@ struct perf_session {
struct perf_tool *tool;
};
-#define PRINT_IP_OPT_IP (1<<0)
-#define PRINT_IP_OPT_SYM (1<<1)
-#define PRINT_IP_OPT_DSO (1<<2)
-#define PRINT_IP_OPT_SYMOFFSET (1<<3)
-#define PRINT_IP_OPT_ONELINE (1<<4)
-#define PRINT_IP_OPT_SRCLINE (1<<5)
-
struct perf_tool;
struct perf_session *perf_session__new(struct perf_data_file *file,
@@ -103,10 +97,6 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp);
struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
unsigned int type);
-void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample,
- struct addr_location *al,
- unsigned int print_opts, unsigned int stack_depth);
-
int perf_session__cpu_bitmap(struct perf_session *session,
const char *cpu_list, unsigned long *cpu_bitmap);
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 47966a1618c7..20e69edd5006 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -21,13 +21,6 @@ const char *sort_order;
const char *field_order;
regex_t ignore_callees_regex;
int have_ignore_callees = 0;
-int sort__need_collapse = 0;
-int sort__has_parent = 0;
-int sort__has_sym = 0;
-int sort__has_dso = 0;
-int sort__has_socket = 0;
-int sort__has_thread = 0;
-int sort__has_comm = 0;
enum sort_mode sort__mode = SORT_MODE__NORMAL;
/*
@@ -244,7 +237,7 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
* comparing symbol address alone is not enough since it's a
* relative address within a dso.
*/
- if (!sort__has_dso) {
+ if (!hists__has(left->hists, dso) || hists__has(right->hists, dso)) {
ret = sort__dso_cmp(left, right);
if (ret != 0)
return ret;
@@ -2163,7 +2156,7 @@ static int __sort_dimension__add(struct sort_dimension *sd,
return -1;
if (sd->entry->se_collapse)
- sort__need_collapse = 1;
+ list->need_collapse = 1;
sd->taken = 1;
@@ -2245,9 +2238,9 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
pr_err("Invalid regex: %s\n%s", parent_pattern, err);
return -EINVAL;
}
- sort__has_parent = 1;
+ list->parent = 1;
} else if (sd->entry == &sort_sym) {
- sort__has_sym = 1;
+ list->sym = 1;
/*
* perf diff displays the performance difference amongst
* two or more perf.data files. Those files could come
@@ -2258,13 +2251,13 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
sd->entry->se_collapse = sort__sym_sort;
} else if (sd->entry == &sort_dso) {
- sort__has_dso = 1;
+ list->dso = 1;
} else if (sd->entry == &sort_socket) {
- sort__has_socket = 1;
+ list->socket = 1;
} else if (sd->entry == &sort_thread) {
- sort__has_thread = 1;
+ list->thread = 1;
} else if (sd->entry == &sort_comm) {
- sort__has_comm = 1;
+ list->comm = 1;
}
return __sort_dimension__add(sd, list, level);
@@ -2289,7 +2282,7 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
return -EINVAL;
if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to)
- sort__has_sym = 1;
+ list->sym = 1;
__sort_dimension__add(sd, list, level);
return 0;
@@ -2305,7 +2298,7 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
return -EINVAL;
if (sd->entry == &sort_mem_daddr_sym)
- sort__has_sym = 1;
+ list->sym = 1;
__sort_dimension__add(sd, list, level);
return 0;
@@ -2445,6 +2438,9 @@ static char *prefix_if_not_in(const char *pre, char *str)
static char *setup_overhead(char *keys)
{
+ if (sort__mode == SORT_MODE__DIFF)
+ return keys;
+
keys = prefix_if_not_in("overhead", keys);
if (symbol_conf.cumulate_callchain)
@@ -2746,10 +2742,10 @@ int setup_sorting(struct perf_evlist *evlist)
void reset_output_field(void)
{
- sort__need_collapse = 0;
- sort__has_parent = 0;
- sort__has_sym = 0;
- sort__has_dso = 0;
+ perf_hpp_list.need_collapse = 0;
+ perf_hpp_list.parent = 0;
+ perf_hpp_list.sym = 0;
+ perf_hpp_list.dso = 0;
field_order = NULL;
sort_order = NULL;
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 3f4e35998119..42927f448bcb 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -31,13 +31,6 @@ extern const char *parent_pattern;
extern const char default_sort_order[];
extern regex_t ignore_callees_regex;
extern int have_ignore_callees;
-extern int sort__need_collapse;
-extern int sort__has_dso;
-extern int sort__has_parent;
-extern int sort__has_sym;
-extern int sort__has_socket;
-extern int sort__has_thread;
-extern int sort__has_comm;
extern enum sort_mode sort__mode;
extern struct sort_entry sort_comm;
extern struct sort_entry sort_dso;
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 4d9b481cf3b6..ffa1d0653861 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -307,6 +307,7 @@ int perf_stat_process_counter(struct perf_stat_config *config,
struct perf_counts_values *aggr = &counter->counts->aggr;
struct perf_stat_evsel *ps = counter->priv;
u64 *count = counter->counts->aggr.values;
+ u64 val;
int i, ret;
aggr->val = aggr->ena = aggr->run = 0;
@@ -346,7 +347,8 @@ int perf_stat_process_counter(struct perf_stat_config *config,
/*
* Save the full runtime - to allow normalization during printout:
*/
- perf_stat__update_shadow_stats(counter, count, 0);
+ val = counter->scale * *count;
+ perf_stat__update_shadow_stats(counter, &val, 0);
return 0;
}
diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c
index 8fb73295ec34..f95f682aa2b2 100644
--- a/tools/perf/util/strbuf.c
+++ b/tools/perf/util/strbuf.c
@@ -1,3 +1,4 @@
+#include "debug.h"
#include "cache.h"
#include <linux/kernel.h>
@@ -17,12 +18,13 @@ int prefixcmp(const char *str, const char *prefix)
*/
char strbuf_slopbuf[1];
-void strbuf_init(struct strbuf *sb, ssize_t hint)
+int strbuf_init(struct strbuf *sb, ssize_t hint)
{
sb->alloc = sb->len = 0;
sb->buf = strbuf_slopbuf;
if (hint)
- strbuf_grow(sb, hint);
+ return strbuf_grow(sb, hint);
+ return 0;
}
void strbuf_release(struct strbuf *sb)
@@ -42,67 +44,104 @@ char *strbuf_detach(struct strbuf *sb, size_t *sz)
return res;
}
-void strbuf_grow(struct strbuf *sb, size_t extra)
+int strbuf_grow(struct strbuf *sb, size_t extra)
{
- if (sb->len + extra + 1 <= sb->len)
- die("you want to use way too much memory");
- if (!sb->alloc)
- sb->buf = NULL;
- ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
+ char *buf;
+ size_t nr = sb->len + extra + 1;
+
+ if (nr < sb->alloc)
+ return 0;
+
+ if (nr <= sb->len)
+ return -E2BIG;
+
+ if (alloc_nr(sb->alloc) > nr)
+ nr = alloc_nr(sb->alloc);
+
+ /*
+ * Note that sb->buf == strbuf_slopbuf if sb->alloc == 0, and it is
+ * a static variable. Thus we have to avoid passing it to realloc.
+ */
+ buf = realloc(sb->alloc ? sb->buf : NULL, nr * sizeof(*buf));
+ if (!buf)
+ return -ENOMEM;
+
+ sb->buf = buf;
+ sb->alloc = nr;
+ return 0;
}
-void strbuf_addch(struct strbuf *sb, int c)
+int strbuf_addch(struct strbuf *sb, int c)
{
- strbuf_grow(sb, 1);
+ int ret = strbuf_grow(sb, 1);
+ if (ret)
+ return ret;
+
sb->buf[sb->len++] = c;
sb->buf[sb->len] = '\0';
+ return 0;
}
-void strbuf_add(struct strbuf *sb, const void *data, size_t len)
+int strbuf_add(struct strbuf *sb, const void *data, size_t len)
{
- strbuf_grow(sb, len);
+ int ret = strbuf_grow(sb, len);
+ if (ret)
+ return ret;
+
memcpy(sb->buf + sb->len, data, len);
- strbuf_setlen(sb, sb->len + len);
+ return strbuf_setlen(sb, sb->len + len);
}
-static void strbuf_addv(struct strbuf *sb, const char *fmt, va_list ap)
+static int strbuf_addv(struct strbuf *sb, const char *fmt, va_list ap)
{
- int len;
+ int len, ret;
va_list ap_saved;
- if (!strbuf_avail(sb))
- strbuf_grow(sb, 64);
+ if (!strbuf_avail(sb)) {
+ ret = strbuf_grow(sb, 64);
+ if (ret)
+ return ret;
+ }
va_copy(ap_saved, ap);
len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
if (len < 0)
- die("your vsnprintf is broken");
+ return len;
if (len > strbuf_avail(sb)) {
- strbuf_grow(sb, len);
+ ret = strbuf_grow(sb, len);
+ if (ret)
+ return ret;
len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap_saved);
va_end(ap_saved);
if (len > strbuf_avail(sb)) {
- die("this should not happen, your vsnprintf is broken");
+ pr_debug("this should not happen, your vsnprintf is broken");
+ return -EINVAL;
}
}
- strbuf_setlen(sb, sb->len + len);
+ return strbuf_setlen(sb, sb->len + len);
}
-void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
+int strbuf_addf(struct strbuf *sb, const char *fmt, ...)
{
va_list ap;
+ int ret;
va_start(ap, fmt);
- strbuf_addv(sb, fmt, ap);
+ ret = strbuf_addv(sb, fmt, ap);
va_end(ap);
+ return ret;
}
ssize_t strbuf_read(struct strbuf *sb, int fd, ssize_t hint)
{
size_t oldlen = sb->len;
size_t oldalloc = sb->alloc;
+ int ret;
+
+ ret = strbuf_grow(sb, hint ? hint : 8192);
+ if (ret)
+ return ret;
- strbuf_grow(sb, hint ? hint : 8192);
for (;;) {
ssize_t cnt;
@@ -112,12 +151,14 @@ ssize_t strbuf_read(struct strbuf *sb, int fd, ssize_t hint)
strbuf_release(sb);
else
strbuf_setlen(sb, oldlen);
- return -1;
+ return cnt;
}
if (!cnt)
break;
sb->len += cnt;
- strbuf_grow(sb, 8192);
+ ret = strbuf_grow(sb, 8192);
+ if (ret)
+ return ret;
}
sb->buf[sb->len] = '\0';
diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h
index ab9be0fbbd40..54b409297d4a 100644
--- a/tools/perf/util/strbuf.h
+++ b/tools/perf/util/strbuf.h
@@ -51,7 +51,7 @@ struct strbuf {
#define STRBUF_INIT { 0, 0, strbuf_slopbuf }
/*----- strbuf life cycle -----*/
-void strbuf_init(struct strbuf *buf, ssize_t hint);
+int strbuf_init(struct strbuf *buf, ssize_t hint);
void strbuf_release(struct strbuf *buf);
char *strbuf_detach(struct strbuf *buf, size_t *);
@@ -60,26 +60,31 @@ static inline ssize_t strbuf_avail(const struct strbuf *sb) {
return sb->alloc ? sb->alloc - sb->len - 1 : 0;
}
-void strbuf_grow(struct strbuf *buf, size_t);
+int strbuf_grow(struct strbuf *buf, size_t);
-static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
- if (!sb->alloc)
- strbuf_grow(sb, 0);
+static inline int strbuf_setlen(struct strbuf *sb, size_t len) {
+ int ret;
+ if (!sb->alloc) {
+ ret = strbuf_grow(sb, 0);
+ if (ret)
+ return ret;
+ }
assert(len < sb->alloc);
sb->len = len;
sb->buf[len] = '\0';
+ return 0;
}
/*----- add data in your buffer -----*/
-void strbuf_addch(struct strbuf *sb, int c);
+int strbuf_addch(struct strbuf *sb, int c);
-void strbuf_add(struct strbuf *buf, const void *, size_t);
-static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
- strbuf_add(sb, s, strlen(s));
+int strbuf_add(struct strbuf *buf, const void *, size_t);
+static inline int strbuf_addstr(struct strbuf *sb, const char *s) {
+ return strbuf_add(sb, s, strlen(s));
}
__attribute__((format(printf,2,3)))
-void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
+int strbuf_addf(struct strbuf *sb, const char *fmt, ...);
/* XXX: if read fails, any partial read is undone */
ssize_t strbuf_read(struct strbuf *, int fd, ssize_t hint);
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index bc229a74c6a9..87a297dd8901 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -709,17 +709,10 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
if (ss->opdshdr.sh_type != SHT_PROGBITS)
ss->opdsec = NULL;
- if (dso->kernel == DSO_TYPE_USER) {
- GElf_Shdr shdr;
- ss->adjust_symbols = (ehdr.e_type == ET_EXEC ||
- ehdr.e_type == ET_REL ||
- dso__is_vdso(dso) ||
- elf_section_by_name(elf, &ehdr, &shdr,
- ".gnu.prelink_undo",
- NULL) != NULL);
- } else {
+ if (dso->kernel == DSO_TYPE_USER)
+ ss->adjust_symbols = true;
+ else
ss->adjust_symbols = elf__needs_adjust_symbols(ehdr);
- }
ss->name = strdup(name);
if (!ss->name) {
@@ -777,7 +770,8 @@ static bool want_demangle(bool is_kernel_sym)
return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle;
}
-void __weak arch__elf_sym_adjust(GElf_Sym *sym __maybe_unused) { }
+void __weak arch__sym_update(struct symbol *s __maybe_unused,
+ GElf_Sym *sym __maybe_unused) { }
int dso__load_sym(struct dso *dso, struct map *map,
struct symsrc *syms_ss, struct symsrc *runtime_ss,
@@ -954,8 +948,6 @@ int dso__load_sym(struct dso *dso, struct map *map,
(sym.st_value & 1))
--sym.st_value;
- arch__elf_sym_adjust(&sym);
-
if (dso->kernel || kmodule) {
char dso_name[PATH_MAX];
@@ -1089,6 +1081,8 @@ new_symbol:
if (!f)
goto out_elf_end;
+ arch__sym_update(f, &sym);
+
if (filter && filter(curr_map, f))
symbol__delete(f);
else {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index e7588dc91518..7fb33304fb4e 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -255,40 +255,6 @@ void symbol__delete(struct symbol *sym)
free(((void *)sym) - symbol_conf.priv_size);
}
-size_t symbol__fprintf(struct symbol *sym, FILE *fp)
-{
- return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n",
- sym->start, sym->end,
- sym->binding == STB_GLOBAL ? 'g' :
- sym->binding == STB_LOCAL ? 'l' : 'w',
- sym->name);
-}
-
-size_t symbol__fprintf_symname_offs(const struct symbol *sym,
- const struct addr_location *al, FILE *fp)
-{
- unsigned long offset;
- size_t length;
-
- if (sym && sym->name) {
- length = fprintf(fp, "%s", sym->name);
- if (al) {
- if (al->addr < sym->end)
- offset = al->addr - sym->start;
- else
- offset = al->addr - al->map->start - sym->start;
- length += fprintf(fp, "+0x%lx", offset);
- }
- return length;
- } else
- return fprintf(fp, "[unknown]");
-}
-
-size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp)
-{
- return symbol__fprintf_symname_offs(sym, NULL, fp);
-}
-
void symbols__delete(struct rb_root *symbols)
{
struct symbol *pos;
@@ -335,7 +301,7 @@ static struct symbol *symbols__find(struct rb_root *symbols, u64 ip)
if (ip < s->start)
n = n->rb_left;
- else if (ip >= s->end)
+ else if (ip > s->end || (ip == s->end && ip != s->start))
n = n->rb_right;
else
return s;
@@ -364,11 +330,6 @@ static struct symbol *symbols__next(struct symbol *sym)
return NULL;
}
-struct symbol_name_rb_node {
- struct rb_node rb_node;
- struct symbol sym;
-};
-
static void symbols__insert_by_name(struct rb_root *symbols, struct symbol *sym)
{
struct rb_node **p = &symbols->rb_node;
@@ -452,6 +413,18 @@ void dso__reset_find_symbol_cache(struct dso *dso)
}
}
+void dso__insert_symbol(struct dso *dso, enum map_type type, struct symbol *sym)
+{
+ symbols__insert(&dso->symbols[type], sym);
+
+ /* update the symbol cache if necessary */
+ if (dso->last_find_result[type].addr >= sym->start &&
+ (dso->last_find_result[type].addr < sym->end ||
+ sym->start == sym->end)) {
+ dso->last_find_result[type].symbol = sym;
+ }
+}
+
struct symbol *dso__find_symbol(struct dso *dso,
enum map_type type, u64 addr)
{
@@ -497,21 +470,6 @@ void dso__sort_by_name(struct dso *dso, enum map_type type)
&dso->symbols[type]);
}
-size_t dso__fprintf_symbols_by_name(struct dso *dso,
- enum map_type type, FILE *fp)
-{
- size_t ret = 0;
- struct rb_node *nd;
- struct symbol_name_rb_node *pos;
-
- for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) {
- pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
- fprintf(fp, "%s\n", pos->sym.name);
- }
-
- return ret;
-}
-
int modules__parse(const char *filename, void *arg,
int (*process_module)(void *arg, const char *name,
u64 start))
@@ -1262,8 +1220,8 @@ static int kallsyms__delta(struct map *map, const char *filename, u64 *delta)
return 0;
}
-int dso__load_kallsyms(struct dso *dso, const char *filename,
- struct map *map, symbol_filter_t filter)
+int __dso__load_kallsyms(struct dso *dso, const char *filename,
+ struct map *map, bool no_kcore, symbol_filter_t filter)
{
u64 delta = 0;
@@ -1284,12 +1242,18 @@ int dso__load_kallsyms(struct dso *dso, const char *filename,
else
dso->symtab_type = DSO_BINARY_TYPE__KALLSYMS;
- if (!dso__load_kcore(dso, map, filename))
+ if (!no_kcore && !dso__load_kcore(dso, map, filename))
return dso__split_kallsyms_for_kcore(dso, map, filter);
else
return dso__split_kallsyms(dso, map, delta, filter);
}
+int dso__load_kallsyms(struct dso *dso, const char *filename,
+ struct map *map, symbol_filter_t filter)
+{
+ return __dso__load_kallsyms(dso, filename, map, false, filter);
+}
+
static int dso__load_perf_map(struct dso *dso, struct map *map,
symbol_filter_t filter)
{
@@ -1644,25 +1608,27 @@ out:
return err;
}
+static bool visible_dir_filter(const char *name, struct dirent *d)
+{
+ if (d->d_type != DT_DIR)
+ return false;
+ return lsdir_no_dot_filter(name, d);
+}
+
static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz)
{
char kallsyms_filename[PATH_MAX];
- struct dirent *dent;
int ret = -1;
- DIR *d;
+ struct strlist *dirs;
+ struct str_node *nd;
- d = opendir(dir);
- if (!d)
+ dirs = lsdir(dir, visible_dir_filter);
+ if (!dirs)
return -1;
- while (1) {
- dent = readdir(d);
- if (!dent)
- break;
- if (dent->d_type != DT_DIR)
- continue;
+ strlist__for_each(nd, dirs) {
scnprintf(kallsyms_filename, sizeof(kallsyms_filename),
- "%s/%s/kallsyms", dir, dent->d_name);
+ "%s/%s/kallsyms", dir, nd->s);
if (!validate_kcore_addresses(kallsyms_filename, map)) {
strlcpy(dir, kallsyms_filename, dir_sz);
ret = 0;
@@ -1670,7 +1636,7 @@ static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz)
}
}
- closedir(d);
+ strlist__delete(dirs);
return ret;
}
@@ -1678,7 +1644,7 @@ static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz)
static char *dso__find_kallsyms(struct dso *dso, struct map *map)
{
u8 host_build_id[BUILD_ID_SIZE];
- char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+ char sbuild_id[SBUILD_ID_SIZE];
bool is_host = false;
char path[PATH_MAX];
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index c8b7544d9267..2b5e4ed76fcb 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -55,6 +55,7 @@ struct symbol {
u16 namelen;
u8 binding;
bool ignore;
+ u8 arch_sym;
char name[0];
};
@@ -140,6 +141,11 @@ struct symbol_conf {
extern struct symbol_conf symbol_conf;
+struct symbol_name_rb_node {
+ struct rb_node rb_node;
+ struct symbol sym;
+};
+
static inline int __symbol__join_symfs(char *bf, size_t size, const char *path)
{
return path__join(bf, size, symbol_conf.symfs, path);
@@ -235,9 +241,14 @@ int dso__load_vmlinux(struct dso *dso, struct map *map,
symbol_filter_t filter);
int dso__load_vmlinux_path(struct dso *dso, struct map *map,
symbol_filter_t filter);
+int __dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map,
+ bool no_kcore, symbol_filter_t filter);
int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map,
symbol_filter_t filter);
+void dso__insert_symbol(struct dso *dso, enum map_type type,
+ struct symbol *sym);
+
struct symbol *dso__find_symbol(struct dso *dso, enum map_type type,
u64 addr);
struct symbol *dso__find_symbol_by_name(struct dso *dso, enum map_type type,
@@ -262,8 +273,14 @@ int symbol__init(struct perf_env *env);
void symbol__exit(void);
void symbol__elf_init(void);
struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name);
+size_t __symbol__fprintf_symname_offs(const struct symbol *sym,
+ const struct addr_location *al,
+ bool unknown_as_addr, FILE *fp);
size_t symbol__fprintf_symname_offs(const struct symbol *sym,
const struct addr_location *al, FILE *fp);
+size_t __symbol__fprintf_symname(const struct symbol *sym,
+ const struct addr_location *al,
+ bool unknown_as_addr, FILE *fp);
size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp);
size_t symbol__fprintf(struct symbol *sym, FILE *fp);
bool symbol_type__is_a(char symbol_type, enum map_type map_type);
@@ -310,7 +327,7 @@ int setup_intlist(struct intlist **list, const char *list_str,
#ifdef HAVE_LIBELF_SUPPORT
bool elf__needs_adjust_symbols(GElf_Ehdr ehdr);
-void arch__elf_sym_adjust(GElf_Sym *sym);
+void arch__sym_update(struct symbol *s, GElf_Sym *sym);
#endif
#define SYMBOL_A 0
diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c
new file mode 100644
index 000000000000..a680bdaa65dc
--- /dev/null
+++ b/tools/perf/util/symbol_fprintf.c
@@ -0,0 +1,71 @@
+#include <elf.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "symbol.h"
+
+size_t symbol__fprintf(struct symbol *sym, FILE *fp)
+{
+ return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n",
+ sym->start, sym->end,
+ sym->binding == STB_GLOBAL ? 'g' :
+ sym->binding == STB_LOCAL ? 'l' : 'w',
+ sym->name);
+}
+
+size_t __symbol__fprintf_symname_offs(const struct symbol *sym,
+ const struct addr_location *al,
+ bool unknown_as_addr, FILE *fp)
+{
+ unsigned long offset;
+ size_t length;
+
+ if (sym && sym->name) {
+ length = fprintf(fp, "%s", sym->name);
+ if (al) {
+ if (al->addr < sym->end)
+ offset = al->addr - sym->start;
+ else
+ offset = al->addr - al->map->start - sym->start;
+ length += fprintf(fp, "+0x%lx", offset);
+ }
+ return length;
+ } else if (al && unknown_as_addr)
+ return fprintf(fp, "[%#" PRIx64 "]", al->addr);
+ else
+ return fprintf(fp, "[unknown]");
+}
+
+size_t symbol__fprintf_symname_offs(const struct symbol *sym,
+ const struct addr_location *al,
+ FILE *fp)
+{
+ return __symbol__fprintf_symname_offs(sym, al, false, fp);
+}
+
+size_t __symbol__fprintf_symname(const struct symbol *sym,
+ const struct addr_location *al,
+ bool unknown_as_addr, FILE *fp)
+{
+ return __symbol__fprintf_symname_offs(sym, al, unknown_as_addr, fp);
+}
+
+size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp)
+{
+ return __symbol__fprintf_symname_offs(sym, NULL, false, fp);
+}
+
+size_t dso__fprintf_symbols_by_name(struct dso *dso,
+ enum map_type type, FILE *fp)
+{
+ size_t ret = 0;
+ struct rb_node *nd;
+ struct symbol_name_rb_node *pos;
+
+ for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) {
+ pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
+ fprintf(fp, "%s\n", pos->sym.name);
+ }
+
+ return ret;
+}
diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c
new file mode 100644
index 000000000000..bbb4c1957578
--- /dev/null
+++ b/tools/perf/util/syscalltbl.c
@@ -0,0 +1,134 @@
+/*
+ * System call table mapper
+ *
+ * (C) 2016 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include "syscalltbl.h"
+#include <stdlib.h>
+
+#ifdef HAVE_SYSCALL_TABLE
+#include <linux/compiler.h>
+#include <string.h>
+#include "util.h"
+
+#if defined(__x86_64__)
+#include <asm/syscalls_64.c>
+const int syscalltbl_native_max_id = SYSCALLTBL_x86_64_MAX_ID;
+static const char **syscalltbl_native = syscalltbl_x86_64;
+#endif
+
+struct syscall {
+ int id;
+ const char *name;
+};
+
+static int syscallcmpname(const void *vkey, const void *ventry)
+{
+ const char *key = vkey;
+ const struct syscall *entry = ventry;
+
+ return strcmp(key, entry->name);
+}
+
+static int syscallcmp(const void *va, const void *vb)
+{
+ const struct syscall *a = va, *b = vb;
+
+ return strcmp(a->name, b->name);
+}
+
+static int syscalltbl__init_native(struct syscalltbl *tbl)
+{
+ int nr_entries = 0, i, j;
+ struct syscall *entries;
+
+ for (i = 0; i <= syscalltbl_native_max_id; ++i)
+ if (syscalltbl_native[i])
+ ++nr_entries;
+
+ entries = tbl->syscalls.entries = malloc(sizeof(struct syscall) * nr_entries);
+ if (tbl->syscalls.entries == NULL)
+ return -1;
+
+ for (i = 0, j = 0; i <= syscalltbl_native_max_id; ++i) {
+ if (syscalltbl_native[i]) {
+ entries[j].name = syscalltbl_native[i];
+ entries[j].id = i;
+ ++j;
+ }
+ }
+
+ qsort(tbl->syscalls.entries, nr_entries, sizeof(struct syscall), syscallcmp);
+ tbl->syscalls.nr_entries = nr_entries;
+ return 0;
+}
+
+struct syscalltbl *syscalltbl__new(void)
+{
+ struct syscalltbl *tbl = malloc(sizeof(*tbl));
+ if (tbl) {
+ if (syscalltbl__init_native(tbl)) {
+ free(tbl);
+ return NULL;
+ }
+ }
+ return tbl;
+}
+
+void syscalltbl__delete(struct syscalltbl *tbl)
+{
+ zfree(&tbl->syscalls.entries);
+ free(tbl);
+}
+
+const char *syscalltbl__name(const struct syscalltbl *tbl __maybe_unused, int id)
+{
+ return id <= syscalltbl_native_max_id ? syscalltbl_native[id]: NULL;
+}
+
+int syscalltbl__id(struct syscalltbl *tbl, const char *name)
+{
+ struct syscall *sc = bsearch(name, tbl->syscalls.entries,
+ tbl->syscalls.nr_entries, sizeof(*sc),
+ syscallcmpname);
+
+ return sc ? sc->id : -1;
+}
+
+#else /* HAVE_SYSCALL_TABLE */
+
+#include <libaudit.h>
+
+struct syscalltbl *syscalltbl__new(void)
+{
+ struct syscalltbl *tbl = malloc(sizeof(*tbl));
+ if (tbl)
+ tbl->audit_machine = audit_detect_machine();
+ return tbl;
+}
+
+void syscalltbl__delete(struct syscalltbl *tbl)
+{
+ free(tbl);
+}
+
+const char *syscalltbl__name(const struct syscalltbl *tbl, int id)
+{
+ return audit_syscall_to_name(id, tbl->audit_machine);
+}
+
+int syscalltbl__id(struct syscalltbl *tbl, const char *name)
+{
+ return audit_name_to_syscall(name, tbl->audit_machine);
+}
+#endif /* HAVE_SYSCALL_TABLE */
diff --git a/tools/perf/util/syscalltbl.h b/tools/perf/util/syscalltbl.h
new file mode 100644
index 000000000000..e2951510484f
--- /dev/null
+++ b/tools/perf/util/syscalltbl.h
@@ -0,0 +1,20 @@
+#ifndef __PERF_SYSCALLTBL_H
+#define __PERF_SYSCALLTBL_H
+
+struct syscalltbl {
+ union {
+ int audit_machine;
+ struct {
+ int nr_entries;
+ void *entries;
+ } syscalls;
+ };
+};
+
+struct syscalltbl *syscalltbl__new(void);
+void syscalltbl__delete(struct syscalltbl *tbl);
+
+const char *syscalltbl__name(const struct syscalltbl *tbl, int id);
+int syscalltbl__id(struct syscalltbl *tbl, const char *name);
+
+#endif /* __PERF_SYSCALLTBL_H */
diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c
index 679688e70ae7..825086aa9a08 100644
--- a/tools/perf/util/thread-stack.c
+++ b/tools/perf/util/thread-stack.c
@@ -22,44 +22,9 @@
#include "debug.h"
#include "symbol.h"
#include "comm.h"
+#include "call-path.h"
#include "thread-stack.h"
-#define CALL_PATH_BLOCK_SHIFT 8
-#define CALL_PATH_BLOCK_SIZE (1 << CALL_PATH_BLOCK_SHIFT)
-#define CALL_PATH_BLOCK_MASK (CALL_PATH_BLOCK_SIZE - 1)
-
-struct call_path_block {
- struct call_path cp[CALL_PATH_BLOCK_SIZE];
- struct list_head node;
-};
-
-/**
- * struct call_path_root - root of all call paths.
- * @call_path: root call path
- * @blocks: list of blocks to store call paths
- * @next: next free space
- * @sz: number of spaces
- */
-struct call_path_root {
- struct call_path call_path;
- struct list_head blocks;
- size_t next;
- size_t sz;
-};
-
-/**
- * struct call_return_processor - provides a call-back to consume call-return
- * information.
- * @cpr: call path root
- * @process: call-back that accepts call/return information
- * @data: anonymous data for call-back
- */
-struct call_return_processor {
- struct call_path_root *cpr;
- int (*process)(struct call_return *cr, void *data);
- void *data;
-};
-
#define STACK_GROWTH 2048
/**
@@ -335,108 +300,6 @@ void thread_stack__sample(struct thread *thread, struct ip_callchain *chain,
chain->ips[i] = thread->ts->stack[thread->ts->cnt - i].ret_addr;
}
-static void call_path__init(struct call_path *cp, struct call_path *parent,
- struct symbol *sym, u64 ip, bool in_kernel)
-{
- cp->parent = parent;
- cp->sym = sym;
- cp->ip = sym ? 0 : ip;
- cp->db_id = 0;
- cp->in_kernel = in_kernel;
- RB_CLEAR_NODE(&cp->rb_node);
- cp->children = RB_ROOT;
-}
-
-static struct call_path_root *call_path_root__new(void)
-{
- struct call_path_root *cpr;
-
- cpr = zalloc(sizeof(struct call_path_root));
- if (!cpr)
- return NULL;
- call_path__init(&cpr->call_path, NULL, NULL, 0, false);
- INIT_LIST_HEAD(&cpr->blocks);
- return cpr;
-}
-
-static void call_path_root__free(struct call_path_root *cpr)
-{
- struct call_path_block *pos, *n;
-
- list_for_each_entry_safe(pos, n, &cpr->blocks, node) {
- list_del(&pos->node);
- free(pos);
- }
- free(cpr);
-}
-
-static struct call_path *call_path__new(struct call_path_root *cpr,
- struct call_path *parent,
- struct symbol *sym, u64 ip,
- bool in_kernel)
-{
- struct call_path_block *cpb;
- struct call_path *cp;
- size_t n;
-
- if (cpr->next < cpr->sz) {
- cpb = list_last_entry(&cpr->blocks, struct call_path_block,
- node);
- } else {
- cpb = zalloc(sizeof(struct call_path_block));
- if (!cpb)
- return NULL;
- list_add_tail(&cpb->node, &cpr->blocks);
- cpr->sz += CALL_PATH_BLOCK_SIZE;
- }
-
- n = cpr->next++ & CALL_PATH_BLOCK_MASK;
- cp = &cpb->cp[n];
-
- call_path__init(cp, parent, sym, ip, in_kernel);
-
- return cp;
-}
-
-static struct call_path *call_path__findnew(struct call_path_root *cpr,
- struct call_path *parent,
- struct symbol *sym, u64 ip, u64 ks)
-{
- struct rb_node **p;
- struct rb_node *node_parent = NULL;
- struct call_path *cp;
- bool in_kernel = ip >= ks;
-
- if (sym)
- ip = 0;
-
- if (!parent)
- return call_path__new(cpr, parent, sym, ip, in_kernel);
-
- p = &parent->children.rb_node;
- while (*p != NULL) {
- node_parent = *p;
- cp = rb_entry(node_parent, struct call_path, rb_node);
-
- if (cp->sym == sym && cp->ip == ip)
- return cp;
-
- if (sym < cp->sym || (sym == cp->sym && ip < cp->ip))
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
-
- cp = call_path__new(cpr, parent, sym, ip, in_kernel);
- if (!cp)
- return NULL;
-
- rb_link_node(&cp->rb_node, node_parent, p);
- rb_insert_color(&cp->rb_node, &parent->children);
-
- return cp;
-}
-
struct call_return_processor *
call_return_processor__new(int (*process)(struct call_return *cr, void *data),
void *data)
diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h
index e1528f1374c3..ad44c7944b8e 100644
--- a/tools/perf/util/thread-stack.h
+++ b/tools/perf/util/thread-stack.h
@@ -19,17 +19,16 @@
#include <sys/types.h>
#include <linux/types.h>
-#include <linux/rbtree.h>
struct thread;
struct comm;
struct ip_callchain;
struct symbol;
struct dso;
-struct call_return_processor;
struct comm;
struct perf_sample;
struct addr_location;
+struct call_path;
/*
* Call/Return flags.
@@ -69,26 +68,16 @@ struct call_return {
};
/**
- * struct call_path - node in list of calls leading to a function call.
- * @parent: call path to the parent function call
- * @sym: symbol of function called
- * @ip: only if sym is null, the ip of the function
- * @db_id: id used for db-export
- * @in_kernel: whether function is a in the kernel
- * @rb_node: node in parent's tree of called functions
- * @children: tree of call paths of functions called
- *
- * In combination with the call_return structure, the call_path structure
- * defines a context-sensitve call-graph.
+ * struct call_return_processor - provides a call-back to consume call-return
+ * information.
+ * @cpr: call path root
+ * @process: call-back that accepts call/return information
+ * @data: anonymous data for call-back
*/
-struct call_path {
- struct call_path *parent;
- struct symbol *sym;
- u64 ip;
- u64 db_id;
- bool in_kernel;
- struct rb_node rb_node;
- struct rb_root children;
+struct call_return_processor {
+ struct call_path_root *cpr;
+ int (*process)(struct call_return *cr, void *data);
+ void *data;
};
int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip,
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index dfd00c6dad6e..45fcb715a36b 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -10,6 +10,8 @@
#include "comm.h"
#include "unwind.h"
+#include <api/fs/fs.h>
+
int thread__init_map_groups(struct thread *thread, struct machine *machine)
{
struct thread *leader;
@@ -153,6 +155,23 @@ int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp,
return 0;
}
+int thread__set_comm_from_proc(struct thread *thread)
+{
+ char path[64];
+ char *comm = NULL;
+ size_t sz;
+ int err = -1;
+
+ if (!(snprintf(path, sizeof(path), "%d/task/%d/comm",
+ thread->pid_, thread->tid) >= (int)sizeof(path)) &&
+ procfs__read_str(path, &comm, &sz) == 0) {
+ comm[sz - 1] = '\0';
+ err = thread__set_comm(thread, comm, 0);
+ }
+
+ return err;
+}
+
const char *thread__comm_str(const struct thread *thread)
{
const struct comm *comm = thread__comm(thread);
@@ -233,7 +252,7 @@ void thread__find_cpumode_addr_location(struct thread *thread,
struct addr_location *al)
{
size_t i;
- const u8 const cpumodes[] = {
+ const u8 cpumodes[] = {
PERF_RECORD_MISC_USER,
PERF_RECORD_MISC_KERNEL,
PERF_RECORD_MISC_GUEST_USER,
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index a0ac0317affb..45fba13c800b 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -9,6 +9,9 @@
#include "symbol.h"
#include <strlist.h>
#include <intlist.h>
+#ifdef HAVE_LIBUNWIND_SUPPORT
+#include <libunwind.h>
+#endif
struct thread_stack;
@@ -32,6 +35,9 @@ struct thread {
void *priv;
struct thread_stack *ts;
+#ifdef HAVE_LIBUNWIND_SUPPORT
+ unw_addr_space_t addr_space;
+#endif
};
struct machine;
@@ -65,6 +71,8 @@ static inline int thread__set_comm(struct thread *thread, const char *comm,
return __thread__set_comm(thread, comm, timestamp, false);
}
+int thread__set_comm_from_proc(struct thread *thread);
+
int thread__comm_len(struct thread *thread);
struct comm *thread__comm(const struct thread *thread);
struct comm *thread__exec_comm(const struct thread *thread);
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
index 08afc6909953..5654fe15e036 100644
--- a/tools/perf/util/thread_map.c
+++ b/tools/perf/util/thread_map.c
@@ -94,7 +94,7 @@ struct thread_map *thread_map__new_by_uid(uid_t uid)
DIR *proc;
int max_threads = 32, items, i;
char path[256];
- struct dirent dirent, *next, **namelist = NULL;
+ struct dirent *dirent, **namelist = NULL;
struct thread_map *threads = thread_map__alloc(max_threads);
if (threads == NULL)
@@ -107,16 +107,16 @@ struct thread_map *thread_map__new_by_uid(uid_t uid)
threads->nr = 0;
atomic_set(&threads->refcnt, 1);
- while (!readdir_r(proc, &dirent, &next) && next) {
+ while ((dirent = readdir(proc)) != NULL) {
char *end;
bool grow = false;
struct stat st;
- pid_t pid = strtol(dirent.d_name, &end, 10);
+ pid_t pid = strtol(dirent->d_name, &end, 10);
if (*end) /* only interested in proper numerical dirents */
continue;
- snprintf(path, sizeof(path), "/proc/%s", dirent.d_name);
+ snprintf(path, sizeof(path), "/proc/%s", dirent->d_name);
if (stat(path, &st) != 0)
continue;
@@ -260,7 +260,7 @@ struct thread_map *thread_map__new_dummy(void)
return threads;
}
-static struct thread_map *thread_map__new_by_tid_str(const char *tid_str)
+struct thread_map *thread_map__new_by_tid_str(const char *tid_str)
{
struct thread_map *threads = NULL, *nt;
int ntasks = 0;
@@ -436,3 +436,15 @@ struct thread_map *thread_map__new_event(struct thread_map_event *event)
return threads;
}
+
+bool thread_map__has(struct thread_map *threads, pid_t pid)
+{
+ int i;
+
+ for (i = 0; i < threads->nr; ++i) {
+ if (threads->map[i].pid == pid)
+ return true;
+ }
+
+ return false;
+}
diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h
index 85e4c7c4fbde..bd3b971588da 100644
--- a/tools/perf/util/thread_map.h
+++ b/tools/perf/util/thread_map.h
@@ -31,6 +31,8 @@ void thread_map__put(struct thread_map *map);
struct thread_map *thread_map__new_str(const char *pid,
const char *tid, uid_t uid);
+struct thread_map *thread_map__new_by_tid_str(const char *tid_str);
+
size_t thread_map__fprintf(struct thread_map *threads, FILE *fp);
static inline int thread_map__nr(struct thread_map *threads)
@@ -55,4 +57,5 @@ static inline char *thread_map__comm(struct thread_map *map, int thread)
}
void thread_map__read_comms(struct thread_map *threads);
+bool thread_map__has(struct thread_map *threads, pid_t pid);
#endif /* __PERF_THREAD_MAP_H */
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 55de4cffcd4e..ac2590a3de2d 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -57,6 +57,7 @@ struct perf_tool {
id_index,
auxtrace_info,
auxtrace_error,
+ time_conv,
thread_map,
cpu_map,
stat_config,
diff --git a/tools/perf/util/trigger.h b/tools/perf/util/trigger.h
new file mode 100644
index 000000000000..e97d7016d771
--- /dev/null
+++ b/tools/perf/util/trigger.h
@@ -0,0 +1,94 @@
+#ifndef __TRIGGER_H_
+#define __TRIGGER_H_ 1
+
+#include "util/debug.h"
+#include "asm/bug.h"
+
+/*
+ * Use trigger to model operations which need to be executed when
+ * an event (a signal, for example) is observed.
+ *
+ * States and transitions:
+ *
+ *
+ *      OFF--(on)--> READY --(hit)--> HIT
+ *                     ^               |
+ *                     |            (ready)
+ *                     |               |
+ *                      \_____________/
+ *
+ * is_hit and is_ready are two key functions to query the state of
+ * a trigger. is_hit means the event has already happened; is_ready means
+ * the trigger is waiting for the event.
+ */
+
+struct trigger {
+ volatile enum {
+ TRIGGER_ERROR = -2,
+ TRIGGER_OFF = -1,
+ TRIGGER_READY = 0,
+ TRIGGER_HIT = 1,
+ } state;
+ const char *name;
+};
+
+#define TRIGGER_WARN_ONCE(t, exp) \
+ WARN_ONCE(t->state != exp, "trigger '%s' state transist error: %d in %s()\n", \
+ t->name, t->state, __func__)
+
+static inline bool trigger_is_available(struct trigger *t)
+{
+ return t->state >= 0;
+}
+
+static inline bool trigger_is_error(struct trigger *t)
+{
+ return t->state <= TRIGGER_ERROR;
+}
+
+static inline void trigger_on(struct trigger *t)
+{
+ TRIGGER_WARN_ONCE(t, TRIGGER_OFF);
+ t->state = TRIGGER_READY;
+}
+
+static inline void trigger_ready(struct trigger *t)
+{
+ if (!trigger_is_available(t))
+ return;
+ t->state = TRIGGER_READY;
+}
+
+static inline void trigger_hit(struct trigger *t)
+{
+ if (!trigger_is_available(t))
+ return;
+ TRIGGER_WARN_ONCE(t, TRIGGER_READY);
+ t->state = TRIGGER_HIT;
+}
+
+static inline void trigger_off(struct trigger *t)
+{
+ if (!trigger_is_available(t))
+ return;
+ t->state = TRIGGER_OFF;
+}
+
+static inline void trigger_error(struct trigger *t)
+{
+ t->state = TRIGGER_ERROR;
+}
+
+static inline bool trigger_is_ready(struct trigger *t)
+{
+ return t->state == TRIGGER_READY;
+}
+
+static inline bool trigger_is_hit(struct trigger *t)
+{
+ return t->state == TRIGGER_HIT;
+}
+
+#define DEFINE_TRIGGER(n) \
+struct trigger n = {.state = TRIGGER_OFF, .name = #n}
+#endif
diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h
index a8b78f1b3243..d5b11e2b85e0 100644
--- a/tools/perf/util/tsc.h
+++ b/tools/perf/util/tsc.h
@@ -3,10 +3,29 @@
#include <linux/types.h>
-#include "../arch/x86/util/tsc.h"
+#include "event.h"
+
+struct perf_tsc_conversion {
+ u16 time_shift;
+ u32 time_mult;
+ u64 time_zero;
+};
+struct perf_event_mmap_page;
+
+int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
+ struct perf_tsc_conversion *tc);
u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc);
u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc);
u64 rdtsc(void);
+struct perf_event_mmap_page;
+struct perf_tool;
+struct machine;
+
+int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc,
+ struct perf_tool *tool,
+ perf_event__handler_t process,
+ struct machine *machine);
+
#endif
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index ee7e372297e5..63687d3a344e 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -32,6 +32,7 @@
#include "symbol.h"
#include "util.h"
#include "debug.h"
+#include "asm/bug.h"
extern int
UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
@@ -580,43 +581,33 @@ static unw_accessors_t accessors = {
int unwind__prepare_access(struct thread *thread)
{
- unw_addr_space_t addr_space;
-
if (callchain_param.record_mode != CALLCHAIN_DWARF)
return 0;
- addr_space = unw_create_addr_space(&accessors, 0);
- if (!addr_space) {
+ thread->addr_space = unw_create_addr_space(&accessors, 0);
+ if (!thread->addr_space) {
pr_err("unwind: Can't create unwind address space.\n");
return -ENOMEM;
}
- unw_set_caching_policy(addr_space, UNW_CACHE_GLOBAL);
- thread__set_priv(thread, addr_space);
-
+ unw_set_caching_policy(thread->addr_space, UNW_CACHE_GLOBAL);
return 0;
}
void unwind__flush_access(struct thread *thread)
{
- unw_addr_space_t addr_space;
-
if (callchain_param.record_mode != CALLCHAIN_DWARF)
return;
- addr_space = thread__priv(thread);
- unw_flush_cache(addr_space, 0, 0);
+ unw_flush_cache(thread->addr_space, 0, 0);
}
void unwind__finish_access(struct thread *thread)
{
- unw_addr_space_t addr_space;
-
if (callchain_param.record_mode != CALLCHAIN_DWARF)
return;
- addr_space = thread__priv(thread);
- unw_destroy_addr_space(addr_space);
+ unw_destroy_addr_space(thread->addr_space);
}
static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
@@ -639,7 +630,9 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
* unwind itself.
*/
if (max_stack - 1 > 0) {
- addr_space = thread__priv(ui->thread);
+ WARN_ONCE(!ui->thread, "WARNING: ui->thread is NULL");
+ addr_space = ui->thread->addr_space;
+
if (addr_space == NULL)
return -1;
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index b7766c577b01..eab077ad6ca9 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -33,6 +33,8 @@ struct callchain_param callchain_param = {
unsigned int page_size;
int cacheline_size;
+unsigned int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
+
bool test_attr__enabled;
bool perf_host = true;
@@ -117,6 +119,40 @@ int rm_rf(char *path)
return rmdir(path);
}
+/* A filter which removes dot files */
+bool lsdir_no_dot_filter(const char *name __maybe_unused, struct dirent *d)
+{
+ return d->d_name[0] != '.';
+}
+
+/*
+ * lsdir reads directory @name and returns its entries as a strlist.
+ *
+ * When @filter is non-NULL, only entries for which it returns true are
+ * added. Returns NULL if the directory cannot be opened or if the list
+ * cannot be allocated (errno is set to ENOMEM in the latter case).
+ */
+struct strlist *lsdir(const char *name,
+		      bool (*filter)(const char *, struct dirent *))
+{
+	struct strlist *entries = NULL;
+	struct dirent *ent;
+	DIR *dirp = opendir(name);
+
+	if (dirp == NULL)
+		return NULL;
+
+	entries = strlist__new(NULL, NULL);
+	if (entries != NULL) {
+		for (ent = readdir(dirp); ent != NULL; ent = readdir(dirp)) {
+			/* A NULL filter accepts every entry. */
+			if (filter != NULL && !filter(name, ent))
+				continue;
+			strlist__add(entries, ent->d_name);
+		}
+	} else {
+		errno = ENOMEM;
+	}
+
+	closedir(dirp);
+	return entries;
+}
+
static int slow_copyfile(const char *from, const char *to)
{
int err = -1;
@@ -471,7 +507,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param)
"needed for --call-graph fp\n");
break;
-#ifdef HAVE_DWARF_UNWIND_SUPPORT
/* Dwarf style */
} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
const unsigned long default_stack_dump_size = 8192;
@@ -487,7 +522,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param)
ret = get_stack_size(tok, &size);
param->dump_size = size;
}
-#endif /* HAVE_DWARF_UNWIND_SUPPORT */
} else if (!strncmp(name, "lbr", sizeof("lbr"))) {
if (!strtok_r(NULL, ",", &saveptr)) {
param->record_mode = CALLCHAIN_LBR;
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 8298d607c738..7651633a8dc7 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -79,6 +79,7 @@
#include <termios.h>
#include <linux/bitops.h>
#include <termios.h>
+#include "strlist.h"
extern const char *graph_line;
extern const char *graph_dotted_line;
@@ -159,12 +160,6 @@ static inline char *gitstrchrnul(const char *s, int c)
}
#endif
-/*
- * Wrappers:
- */
-void *xrealloc(void *ptr, size_t size) __attribute__((weak));
-
-
static inline void *zalloc(size_t size)
{
return calloc(1, size);
@@ -222,6 +217,8 @@ static inline int sane_case(int x, int high)
int mkdir_p(char *path, mode_t mode);
int rm_rf(char *path);
+struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *));
+bool lsdir_no_dot_filter(const char *name, struct dirent *d);
int copyfile(const char *from, const char *to);
int copyfile_mode(const char *from, const char *to, mode_t mode);
int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size);
@@ -254,11 +251,17 @@ int hex2u64(const char *ptr, u64 *val);
char *ltrim(char *s);
char *rtrim(char *s);
+static inline char *trim(char *s)
+{
+ return ltrim(rtrim(s));
+}
+
void dump_stack(void);
void sighandler_dump_stack(int sig);
extern unsigned int page_size;
extern int cacheline_size;
+extern unsigned int sysctl_perf_event_max_stack;
struct parse_tag {
char tag;
diff --git a/tools/perf/util/wrapper.c b/tools/perf/util/wrapper.c
deleted file mode 100644
index 5f1a07c4b87b..000000000000
--- a/tools/perf/util/wrapper.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Various trivial helper wrappers around standard functions
- */
-#include "cache.h"
-
-/*
- * There's no pack memory to release - but stay close to the Git
- * version so wrap this away:
- */
-static inline void release_pack_memory(size_t size __maybe_unused,
- int flag __maybe_unused)
-{
-}
-
-void *xrealloc(void *ptr, size_t size)
-{
- void *ret = realloc(ptr, size);
- if (!ret && !size)
- ret = realloc(ptr, 1);
- if (!ret) {
- release_pack_memory(size, -1);
- ret = realloc(ptr, size);
- if (!ret && !size)
- ret = realloc(ptr, 1);
- if (!ret)
- die("Out of memory, realloc failed");
- }
- return ret;
-}
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index b04afc3295df..ff9e5f20a5a7 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -19,6 +19,7 @@ TARGETS += powerpc
TARGETS += pstore
TARGETS += ptrace
TARGETS += seccomp
+TARGETS += sigaltstack
TARGETS += size
TARGETS += static_keys
TARGETS += sysctl
diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
new file mode 100755
index 000000000000..3633828375e3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+# Alternate sleeping and spinning on randomly selected CPUs. The purpose
+# of this script is to inflict random OS jitter on a concurrently running
+# test.
+#
+# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ]
+#
+# me: Random-number-generator seed salt.
+# duration: Time to run in seconds.
+# sleepmax: Maximum microseconds to sleep, defaults to one second.
+# spinmax: Maximum microseconds to spin, defaults to one millisecond.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+me=$(($1 * 1000))
+duration=$2
+sleepmax=${3-1000000}
+spinmax=${4-1000}
+
+n=1
+
+starttime=`awk 'BEGIN { print systime(); }' < /dev/null`
+
+while :
+do
+ # Check for done.
+ t=`awk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null`
+ if test "$t" -gt "$duration"
+ then
+ exit 0;
+ fi
+
+ # Set affinity to randomly selected CPU
+ cpus=`ls /sys/devices/system/cpu/*/online |
+ sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' |
+ grep -v '^0*$'`
+ cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN {
+ srand(n + me + systime());
+ ncpus = split(cpus, ca);
+ curcpu = ca[int(rand() * ncpus + 1)];
+ mask = lshift(1, curcpu);
+ if (mask + 0 <= 0)
+ mask = 1;
+ printf("%#x\n", mask);
+ }' < /dev/null`
+ n=$(($n+1))
+ if ! taskset -p $cpumask $$ > /dev/null 2>&1
+ then
+ echo taskset failure: '"taskset -p ' $cpumask $$ '"'
+ exit 1
+ fi
+
+ # Sleep a random duration
+ sleeptime=`awk -v me=$me -v n=$n -v sleepmax=$sleepmax 'BEGIN {
+ srand(n + me + systime());
+ printf("%06d", int(rand() * sleepmax));
+ }' < /dev/null`
+ n=$(($n+1))
+ sleep .$sleeptime
+
+	# Spin a random duration
+	limit=`awk -v me=$me -v n=$n -v spinmax=$spinmax 'BEGIN {
+		srand(n + me + systime());
+		printf("%06d", int(rand() * spinmax));
+		}' < /dev/null`
+	n=$(($n+1))
+	# Brace expansion is performed before parameter expansion, so
+	# "{1..$limit}" is never turned into a sequence and the loop body
+	# would run exactly once.  Use an arithmetic loop instead; the 10#
+	# prefix makes bash read the zero-padded value as decimal, not octal.
+	for ((i = 0; i < 10#$limit; i++))
+	do
+		echo > /dev/null
+	done
+done
+
+exit 1
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh
new file mode 100755
index 000000000000..f79b0e9e84fc
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+#
+# Analyze a given results directory for rcuperf performance measurements,
+# looking for ftrace data. Exits with 0 if data was found, analyzed, and
+# printed. Intended to be invoked from kvm-recheck-rcuperf.sh after
+# argument checking.
+#
+# Usage: kvm-recheck-rcuperf-ftrace.sh resdir
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+i="$1"
+. tools/testing/selftests/rcutorture/bin/functions.sh
+
+if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100
+then
+ exit 10
+fi
+
+sed -e 's/^\[[^]]*]//' < $i/console.log |
+grep 'us : rcu_exp_grace_period' |
+sed -e 's/us : / : /' |
+tr -d '\015' |
+awk '
+$8 == "start" {
+ if (starttask != "")
+ nlost++;
+ starttask = $1;
+ starttime = $3;
+ startseq = $7;
+}
+
+$8 == "end" {
+ if (starttask == $1 && startseq == $7) {
+ curgpdur = $3 - starttime;
+ gptimes[++n] = curgpdur;
+ gptaskcnt[starttask]++;
+ sum += curgpdur;
+ if (curgpdur > 1000)
+ print "Long GP " starttime "us to " $3 "us (" curgpdur "us)";
+ starttask = "";
+ } else {
+ # Lost a message or some such, reset.
+ starttask = "";
+ nlost++;
+ }
+}
+
+$8 == "done" {
+ piggybackcnt[$1]++;
+}
+
+END {
+ newNR = asort(gptimes);
+ if (newNR <= 0) {
+ print "No ftrace records found???"
+ exit 10;
+ }
+ pct50 = int(newNR * 50 / 100);
+ if (pct50 < 1)
+ pct50 = 1;
+ pct90 = int(newNR * 90 / 100);
+ if (pct90 < 1)
+ pct90 = 1;
+ pct99 = int(newNR * 99 / 100);
+ if (pct99 < 1)
+ pct99 = 1;
+ div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100;
+ print "Histogram bucket size: " div;
+ last = gptimes[1] - 10;
+ count = 0;
+ for (i = 1; i <= newNR; i++) {
+ current = div * int(gptimes[i] / div);
+ if (last == current) {
+ count++;
+ } else {
+ if (count > 0)
+ print last, count;
+ count = 1;
+ last = current;
+ }
+ }
+ if (count > 0)
+ print last, count;
+ print "Distribution of grace periods across tasks:";
+ for (i in gptaskcnt) {
+ print "\t" i, gptaskcnt[i];
+ nbatches += gptaskcnt[i];
+ }
+ ngps = nbatches;
+ print "Distribution of piggybacking across tasks:";
+ for (i in piggybackcnt) {
+ print "\t" i, piggybackcnt[i];
+ ngps += piggybackcnt[i];
+ }
+ print "Average grace-period duration: " sum / newNR " microseconds";
+ print "Minimum grace-period duration: " gptimes[1];
+ print "50th percentile grace-period duration: " gptimes[pct50];
+ print "90th percentile grace-period duration: " gptimes[pct90];
+ print "99th percentile grace-period duration: " gptimes[pct99];
+ print "Maximum grace-period duration: " gptimes[newNR];
+ print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0;
+ print "Computed from ftrace data.";
+}'
+# Propagate the status of the analysis pipeline: its awk stage exits with 10
+# when it finds no usable ftrace records, and an unconditional "exit 0" would
+# report that case to the caller as a successful analysis.
+exit $?
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh
new file mode 100755
index 000000000000..8f3121afc716
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# Analyze a given results directory for rcuperf performance measurements.
+#
+# Usage: kvm-recheck-rcuperf.sh resdir
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+i="$1"
+if test -d $i
+then
+ :
+else
+ echo Unreadable results directory: $i
+ exit 1
+fi
+PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH
+. tools/testing/selftests/rcutorture/bin/functions.sh
+
+if kvm-recheck-rcuperf-ftrace.sh $i
+then
+ # ftrace data was successfully analyzed, call it good!
+ exit 0
+fi
+
+configfile=`echo $i | sed -e 's/^.*\///'`
+
+sed -e 's/^\[[^]]*]//' < $i/console.log |
+awk '
+/-perf: .* gps: .* batches:/ {
+ ngps = $9;
+ nbatches = $11;
+}
+
+/-perf: .*writer-duration/ {
+ gptimes[++n] = $5 / 1000.;
+ sum += $5 / 1000.;
+}
+
+# Summarize the collected writer durations: histogram, percentiles and the
+# grace-period/batch ratio.
+END {
+	newNR = asort(gptimes);
+	if (newNR <= 0) {
+		print "No rcuperf records found???"
+		exit;
+	}
+	pct50 = int(newNR * 50 / 100);
+	if (pct50 < 1)
+		pct50 = 1;
+	pct90 = int(newNR * 90 / 100);
+	if (pct90 < 1)
+		pct90 = 1;
+	pct99 = int(newNR * 99 / 100);
+	if (pct99 < 1)
+		pct99 = 1;
+	div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100;
+	print "Histogram bucket size: " div;
+	last = gptimes[1] - 10;
+	count = 0;
+	for (i = 1; i <= newNR; i++) {
+		current = div * int(gptimes[i] / div);
+		if (last == current) {
+			count++;
+		} else {
+			if (count > 0)
+				print last, count;
+			count = 1;
+			last = current;
+		}
+	}
+	if (count > 0)
+		print last, count;
+	print "Average grace-period duration: " sum / newNR " microseconds";
+	print "Minimum grace-period duration: " gptimes[1];
+	print "50th percentile grace-period duration: " gptimes[pct50];
+	print "90th percentile grace-period duration: " gptimes[pct90];
+	print "99th percentile grace-period duration: " gptimes[pct99];
+	print "Maximum grace-period duration: " gptimes[newNR];
+	# The "gps: ... batches:" summary line may be absent from the console
+	# log, which leaves nbatches at zero.  Dividing by it would abort awk
+	# with a fatal division-by-zero error, so report the ratio as n/a.
+	if (nbatches + 0 > 0)
+		ratio = ngps / nbatches;
+	else
+		ratio = "n/a";
+	print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ratio;
+	print "Computed from rcuperf printk output.";
+}'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index d86bdd6b6cc2..f659346d3358 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -48,7 +48,10 @@ do
cat $i/Make.oldconfig.err
fi
parse-build.sh $i/Make.out $configfile
- parse-torture.sh $i/console.log $configfile
+ if test "$TORTURE_SUITE" != rcuperf
+ then
+ parse-torture.sh $i/console.log $configfile
+ fi
parse-console.sh $i/console.log $configfile
if test -r $i/Warnings
then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 0f80eefb0bfd..4109f306d855 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -6,7 +6,7 @@
# Execute this in the source tree. Do not run it as a background task
# because qemu does not seem to like that much.
#
-# Usage: kvm-test-1-run.sh config builddir resdir minutes qemu-args boot_args
+# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args
#
# qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with
# arguments specifying the number of CPUs and other
@@ -91,25 +91,33 @@ fi
# CONFIG_PCMCIA=n
# CONFIG_CARDBUS=n
# CONFIG_YENTA=n
-if kvm-build.sh $config_template $builddir $T
+base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'`
+if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux
then
+ # Rerunning previous test, so use that test's kernel.
+ QEMU="`identify_qemu $base_resdir/vmlinux`"
+ KERNEL=$base_resdir/bzImage
+ ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh
+ ln -s $base_resdir/.config $resdir # for kvm-recheck.sh
+elif kvm-build.sh $config_template $builddir $T
+then
+ # Had to build a kernel for this test.
QEMU="`identify_qemu $builddir/vmlinux`"
BOOT_IMAGE="`identify_boot_image $QEMU`"
cp $builddir/Make*.out $resdir
+ cp $builddir/vmlinux $resdir
cp $builddir/.config $resdir
if test -n "$BOOT_IMAGE"
then
cp $builddir/$BOOT_IMAGE $resdir
+ KERNEL=$resdir/bzImage
else
echo No identifiable boot image, not running KVM, see $resdir.
echo Do the torture scripts know about your architecture?
fi
parse-build.sh $resdir/Make.out $title
- if test -f $builddir.wait
- then
- mv $builddir.wait $builddir.ready
- fi
else
+ # Build failed.
cp $builddir/Make*.out $resdir
cp $builddir/.config $resdir || :
echo Build failed, not running KVM, see $resdir.
@@ -119,12 +127,15 @@ else
fi
exit 1
fi
+if test -f $builddir.wait
+then
+ mv $builddir.wait $builddir.ready
+fi
while test -f $builddir.ready
do
sleep 1
done
-minutes=$4
-seconds=$(($minutes * 60))
+seconds=$4
qemu_args=$5
boot_args=$6
@@ -167,15 +178,26 @@ then
exit 0
fi
echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
-echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
-( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) &
-qemu_pid=$!
+echo $QEMU $qemu_args -m 512 -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
+( $QEMU $qemu_args -m 512 -kernel $KERNEL -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) &
commandcompleted=0
-echo Monitoring qemu job at pid $qemu_pid
+sleep 10 # Give qemu's pid a chance to reach the file
+if test -s "$resdir/qemu_pid"
+then
+ qemu_pid=`cat "$resdir/qemu_pid"`
+ echo Monitoring qemu job at pid $qemu_pid
+else
+ qemu_pid=""
+ echo Monitoring qemu job at yet-as-unknown pid
+fi
while :
do
+ if test -z "$qemu_pid" -a -s "$resdir/qemu_pid"
+ then
+ qemu_pid=`cat "$resdir/qemu_pid"`
+ fi
kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
- if kill -0 $qemu_pid > /dev/null 2>&1
+ if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1
then
if test $kruntime -ge $seconds
then
@@ -195,12 +217,16 @@ do
ps -fp $killpid >> $resdir/Warnings 2>&1
fi
else
- echo ' ---' `date`: Kernel done
+ echo ' ---' `date`: "Kernel done"
fi
break
fi
done
-if test $commandcompleted -eq 0
+if test -z "$qemu_pid" -a -s "$resdir/qemu_pid"
+then
+ qemu_pid=`cat "$resdir/qemu_pid"`
+fi
+if test $commandcompleted -eq 0 -a -n "$qemu_pid"
then
echo Grace period for qemu job at pid $qemu_pid
while :
@@ -220,6 +246,9 @@ then
fi
sleep 1
done
+elif test -z "$qemu_pid"
+then
+ echo Unknown PID, cannot kill qemu command
fi
parse-torture.sh $resdir/console.log $title
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 4a431767f77a..0d598145873e 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -34,7 +34,7 @@ T=/tmp/kvm.sh.$$
trap 'rm -rf $T' 0
mkdir $T
-dur=30
+dur=$((30*60))
dryrun=""
KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
PATH=${KVM}/bin:$PATH; export PATH
@@ -48,6 +48,7 @@ resdir=""
configs=""
cpus=0
ds=`date +%Y.%m.%d-%H:%M:%S`
+jitter=0
. functions.sh
@@ -63,6 +64,7 @@ usage () {
echo " --dryrun sched|script"
echo " --duration minutes"
echo " --interactive"
+ echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]"
echo " --kmake-arg kernel-make-arguments"
echo " --mac nn:nn:nn:nn:nn:nn"
echo " --no-initrd"
@@ -116,12 +118,17 @@ do
;;
--duration)
checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error'
- dur=$2
+ dur=$(($2*60))
shift
;;
--interactive)
TORTURE_QEMU_INTERACTIVE=1; export TORTURE_QEMU_INTERACTIVE
;;
+ --jitter)
+ checkarg --jitter "(# threads [ sleep [ spin ] ])" $# "$2" '^-\{,1\}[0-9]\+\( \+[0-9]\+\)\{,2\} *$' '^error$'
+ jitter="$2"
+ shift
+ ;;
--kmake-arg)
checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
TORTURE_KMAKE_ARG="$2"
@@ -156,7 +163,7 @@ do
shift
;;
--torture)
- checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--'
+ checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--'
TORTURE_SUITE=$2
shift
;;
@@ -299,6 +306,7 @@ awk < $T/cfgcpu.pack \
-v CONFIGDIR="$CONFIGFRAG/" \
-v KVM="$KVM" \
-v ncpus=$cpus \
+ -v jitter="$jitter" \
-v rd=$resdir/$ds/ \
-v dur=$dur \
-v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \
@@ -359,6 +367,16 @@ function dump(first, pastlast, batchnum)
print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log";
print "fi"
}
+ njitter = 0;
+ split(jitter, ja);
+ if (ja[1] == -1 && ncpus == 0)
+ njitter = 1;
+ else if (ja[1] == -1)
+ njitter = ncpus;
+ else
+ njitter = ja[1];
+ for (j = 0; j < njitter; j++)
+ print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&"
print "wait"
print "if test -z \"$TORTURE_BUILDONLY\""
print "then"
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
index 39a2c6d7d7ec..17cbe098b115 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -14,7 +14,7 @@ CONFIG_HOTPLUG_CPU=n
CONFIG_SUSPEND=n
CONFIG_HIBERNATION=n
CONFIG_RCU_FANOUT=4
-CONFIG_RCU_FANOUT_LEAF=4
+CONFIG_RCU_FANOUT_LEAF=3
CONFIG_RCU_NOCB_CPU=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot
index 0fc8a3428938..e34c33430447 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot
@@ -1 +1 @@
-rcutorture.torture_type=rcu_bh
+rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST
new file mode 100644
index 000000000000..c9f56cf20775
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST
@@ -0,0 +1 @@
+TREE
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon
new file mode 100644
index 000000000000..a09816b8c0f3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon
@@ -0,0 +1,2 @@
+CONFIG_RCU_PERF_TEST=y
+CONFIG_PRINTK_TIME=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE
new file mode 100644
index 000000000000..a312f671a29a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE
@@ -0,0 +1,20 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_TRACE=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_TRACE=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54
new file mode 100644
index 000000000000..985fb170d13c
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54
@@ -0,0 +1,23 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=54
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_TRACE=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_FANOUT=3
+CONFIG_RCU_FANOUT_LEAF=2
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_TRACE=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh
new file mode 100644
index 000000000000..34f2a1b35ee5
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#
+# Torture-suite-dependent shell functions for the rest of the scripts.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright (C) IBM Corporation, 2015
+#
+# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+# rcuperf_param_nreaders bootparam-string
+#
+# Prints the default rcuperf.nreaders setting unless bootparam-string
+# already mentions rcuperf.nreaders.
+rcuperf_param_nreaders () {
+	echo "$1" | grep -q "rcuperf.nreaders" || echo rcuperf.nreaders=-1
+}
+
+# rcuperf_param_nwriters bootparam-string
+#
+# Prints the default rcuperf.nwriters setting unless bootparam-string
+# already mentions rcuperf.nwriters.
+rcuperf_param_nwriters () {
+	echo "$1" | grep -q "rcuperf.nwriters" || echo rcuperf.nwriters=-1
+}
+
+# per_version_boot_params bootparam-string config-file seconds
+#
+# Prints bootparam-string followed by the rcuperf module parameters: the
+# nreaders/nwriters defaults (when not already given) and the fixed
+# perf_runnable, shutdown and verbose settings.  Only the first argument
+# is used here.
+per_version_boot_params () {
+	echo $1 $(rcuperf_param_nreaders "$1") \
+		$(rcuperf_param_nwriters "$1") \
+		rcuperf.perf_runnable=1 \
+		rcuperf.shutdown=1 \
+		rcuperf.verbose=1
+}
diff --git a/tools/testing/selftests/sigaltstack/Makefile b/tools/testing/selftests/sigaltstack/Makefile
new file mode 100644
index 000000000000..56af56eda6fa
--- /dev/null
+++ b/tools/testing/selftests/sigaltstack/Makefile
@@ -0,0 +1,8 @@
+CFLAGS = -Wall
+BINARIES = sas
+all: $(BINARIES)
+
+include ../lib.mk
+
+clean:
+ rm -rf $(BINARIES)
diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c
new file mode 100644
index 000000000000..1bb01258e559
--- /dev/null
+++ b/tools/testing/selftests/sigaltstack/sas.c
@@ -0,0 +1,176 @@
+/*
+ * Stas Sergeev <stsp@users.sourceforge.net>
+ *
+ * test sigaltstack(SS_ONSTACK | SS_AUTODISARM)
+ * If that succeeds, then swapcontext() can be used inside sighandler safely.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <ucontext.h>
+#include <alloca.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+
+#ifndef SS_AUTODISARM
+#define SS_AUTODISARM (1U << 31)
+#endif
+
+static void *sstack, *ustack; /* alt signal stack; stack of the uc context */
+static ucontext_t uc, sc; /* uc runs switch_fn; sc is saved inside my_usr1 */
+static const char *msg = "[OK]\tStack preserved"; /* marker my_usr1 stores */
+static const char *msg2 = "[FAIL]\tStack corrupted"; /* my_usr2's overwrite */
+struct stk_data { /* marker record my_usr1 places inside its stack frame */
+ char msg[128]; /* holds msg, or msg2 once my_usr2 has found the record */
+ int flag; /* set to 1 by my_usr1, cleared to 0 by my_usr2 */
+};
+
+/*
+ * SIGUSR1 handler (installed by main() with SA_ONSTACK).
+ *
+ * Checks that the stack pointer lies inside the sstack mapping, stores a
+ * marker record in its own frame, reports whether sigaltstack() now shows
+ * SS_DISABLE, then swaps to the user context (uc) while saving the
+ * current one in sc.  When control comes back through sc, the marker
+ * text is printed and the process exits with EXIT_FAILURE if the marker's
+ * flag has been cleared in the meantime.
+ *
+ * sig, si and u are the standard SA_SIGINFO handler arguments; unused.
+ */
+void my_usr1(int sig, siginfo_t *si, void *u)
+{
+	struct stk_data *rec;
+	stack_t cur;
+	char *area;
+
+	register unsigned long sp asm("sp");
+
+	if (!(sp >= (unsigned long)sstack &&
+	      sp < (unsigned long)sstack + SIGSTKSZ)) {
+		printf("[FAIL]\tSP is not on sigaltstack\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Leave a marker in this frame; my_usr2 will try to find and clobber it. */
+	area = alloca(1024);
+	assert(area);
+	rec = (struct stk_data *)(area + 512);
+	strcpy(rec->msg, msg);
+	rec->flag = 1;
+	printf("[RUN]\tsignal USR1\n");
+
+	if (sigaltstack(NULL, &cur)) {
+		perror("[FAIL]\tsigaltstack()");
+		exit(EXIT_FAILURE);
+	}
+	if (cur.ss_flags == SS_DISABLE)
+		printf("[OK]\tsigaltstack is disabled in sighandler\n");
+	else
+		printf("[FAIL]\tss_flags=%i, should be SS_DISABLE\n",
+		       cur.ss_flags);
+
+	/* Hand over to switch_fn; it returns here with setcontext(&sc). */
+	swapcontext(&sc, &uc);
+
+	printf("%s\n", rec->msg);
+	if (rec->flag)
+		return;
+
+	printf("[RUN]\tAborting\n");
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * SIGUSR2 handler (installed by main() with SA_ONSTACK).
+ *
+ * Searches 1024 bytes of its own stack for the marker text that my_usr1
+ * stored.  A hit means this handler is running on top of my_usr1's frame,
+ * i.e. the alternate stack was re-used; the marker is then overwritten
+ * with msg2 and its flag cleared so that my_usr1 can detect the damage.
+ *
+ * sig, si and u are the standard SA_SIGINFO handler arguments; unused.
+ */
+void my_usr2(int sig, siginfo_t *si, void *u)
+{
+	struct stk_data *hit;
+	char *area;
+
+	printf("[RUN]\tsignal USR2\n");
+	area = alloca(1024);
+	/*
+	 * The scan below reads stack memory that was never initialised
+	 * here, so don't run this under valgrind.
+	 */
+	hit = memmem(area, 1024, msg, strlen(msg));
+	if (!hit)
+		return;
+
+	printf("[FAIL]\tsigaltstack re-used\n");
+	/* Corrupt the marker, then tell my_usr1 that its data is gone. */
+	strcpy(hit->msg, msg2);
+	hit->flag = 0;
+}
+
+static void switch_fn(void) /* body of the uc context built in main() */
+{
+ printf("[RUN]\tswitched to user ctx\n");
+ raise(SIGUSR2); /* runs my_usr2 while my_usr1 is parked in swapcontext() */
+ setcontext(&sc); /* resume my_usr1 right after its swapcontext() call */
+}
+
+/*
+ * Test driver.
+ *
+ * Installs my_usr1/my_usr2 for SIGUSR1/SIGUSR2 with SA_ONSTACK, registers
+ * an mmap()ed alternate stack using SS_ONSTACK | SS_AUTODISARM, builds the
+ * user context (switch_fn on a second mmap()ed stack) and raises SIGUSR1.
+ * After the handler has returned, sigaltstack() must still report
+ * SS_AUTODISARM.
+ *
+ * Returns 0 on success, and also when sigaltstack() rejects the
+ * SS_AUTODISARM request with EINVAL.  Every other failure ends the
+ * process with EXIT_FAILURE.
+ */
+int main(void)
+{
+	struct sigaction sa;
+	stack_t ss;
+
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
+	sa.sa_sigaction = my_usr1;
+	sigaction(SIGUSR1, &sa, NULL);
+	sa.sa_sigaction = my_usr2;
+	sigaction(SIGUSR2, &sa, NULL);
+
+	sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+	if (sstack == MAP_FAILED) {
+		perror("mmap()");
+		return EXIT_FAILURE;
+	}
+
+	/* Before anything is registered the state has to be SS_DISABLE. */
+	if (sigaltstack(NULL, &ss)) {
+		perror("[FAIL]\tsigaltstack()");
+		exit(EXIT_FAILURE);
+	}
+	if (ss.ss_flags != SS_DISABLE) {
+		printf("[FAIL]\tInitial sigaltstack state was %i; should have been SS_DISABLE\n", ss.ss_flags);
+		return EXIT_FAILURE;
+	}
+	printf("[OK]\tInitial sigaltstack state was SS_DISABLE\n");
+
+	ss.ss_sp = sstack;
+	ss.ss_size = SIGSTKSZ;
+	ss.ss_flags = SS_ONSTACK | SS_AUTODISARM;
+	if (sigaltstack(&ss, NULL)) {
+		if (errno != EINVAL) {
+			perror("[FAIL]\tsigaltstack(SS_ONSTACK | SS_AUTODISARM)");
+			return EXIT_FAILURE;
+		}
+		printf("[NOTE]\tThe running kernel doesn't support SS_AUTODISARM\n");
+		/*
+		 * Test cases for the !SS_AUTODISARM variant could still be
+		 * run at this point, but none exist yet, so simply report
+		 * success.
+		 */
+		return 0;
+	}
+
+	ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+	if (ustack == MAP_FAILED) {
+		perror("mmap()");
+		return EXIT_FAILURE;
+	}
+
+	/* Context that my_usr1 swaps to: switch_fn running on ustack. */
+	getcontext(&uc);
+	uc.uc_link = NULL;
+	uc.uc_stack.ss_sp = ustack;
+	uc.uc_stack.ss_size = SIGSTKSZ;
+	makecontext(&uc, switch_fn, 0);
+	raise(SIGUSR1);
+
+	/* The handler has returned; the registration must have survived. */
+	if (sigaltstack(NULL, &ss)) {
+		perror("[FAIL]\tsigaltstack()");
+		exit(EXIT_FAILURE);
+	}
+	if (ss.ss_flags != SS_AUTODISARM) {
+		printf("[FAIL]\tss_flags=%i, should be SS_AUTODISARM\n",
+		       ss.ss_flags);
+		exit(EXIT_FAILURE);
+	}
+	printf("[OK]\tsigaltstack is still SS_AUTODISARM after signal\n");
+
+	printf("[OK]\tTest passed\n");
+	return 0;
+}