summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/conf.py2
-rw-r--r--Documentation/hwmon/ftsteutates4
-rw-r--r--Documentation/kernel-documentation.rst6
-rw-r--r--Documentation/power/basic-pm-debugging.txt27
-rw-r--r--Documentation/power/interface.txt151
-rw-r--r--Documentation/sphinx-static/theme_overrides.css3
-rw-r--r--MAINTAINERS6
-rw-r--r--Makefile2
-rw-r--r--arch/arm/include/asm/kvm_host.h12
-rw-r--r--arch/arm/kernel/entry-armv.S1
-rw-r--r--arch/arm/kvm/arm.c10
-rw-r--r--arch/arm/mach-imx/gpc.c6
-rw-r--r--arch/arm/mm/mmu.c21
-rw-r--r--arch/arm64/include/asm/kvm_host.h12
-rw-r--r--arch/arm64/kernel/sleep.S10
-rw-r--r--arch/arm64/mm/dump.c6
-rw-r--r--arch/arm64/mm/numa.c2
-rw-r--r--arch/mips/include/asm/kvm_host.h46
-rw-r--r--arch/mips/kvm/mips.c10
-rw-r--r--arch/parisc/include/uapi/asm/errno.h4
-rw-r--r--arch/parisc/kernel/processor.c8
-rw-r--r--arch/parisc/kernel/time.c12
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu-hash.h37
-rw-r--r--arch/powerpc/include/asm/hmi.h2
-rw-r--r--arch/powerpc/include/asm/io.h29
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h10
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h37
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h87
-rw-r--r--arch/powerpc/include/asm/kvm_host.h123
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h28
-rw-r--r--arch/powerpc/include/asm/mmu.h1
-rw-r--r--arch/powerpc/include/asm/opal.h1
-rw-r--r--arch/powerpc/include/asm/paca.h12
-rw-r--r--arch/powerpc/include/asm/pnv-pci.h3
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kvm/Kconfig3
-rw-r--r--arch/powerpc/kvm/Makefile20
-rw-r--r--arch/powerpc/kvm/book3s.c7
-rw-r--r--arch/powerpc/kvm/book3s_hv.c401
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c156
-rw-r--r--arch/powerpc/kvm/book3s_hv_hmi.c (renamed from arch/powerpc/kernel/hmi.c)0
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c120
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S183
-rw-r--r--arch/powerpc/kvm/book3s_xics.c57
-rw-r--r--arch/powerpc/kvm/book3s_xics.h2
-rw-r--r--arch/powerpc/kvm/e500_mmu.c73
-rw-r--r--arch/powerpc/kvm/powerpc.c61
-rw-r--r--arch/powerpc/kvm/trace_hv.h22
-rw-r--r--arch/powerpc/mm/hash_native_64.c42
-rw-r--r--arch/powerpc/mm/hash_utils_64.c55
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S1
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c24
-rw-r--r--arch/s390/include/asm/facilities_src.h24
-rw-r--r--arch/s390/include/asm/kvm_host.h136
-rw-r--r--arch/s390/kernel/asm-offsets.c1
-rw-r--r--arch/s390/kvm/gaccess.c37
-rw-r--r--arch/s390/kvm/guestdbg.c59
-rw-r--r--arch/s390/kvm/intercept.c1
-rw-r--r--arch/s390/kvm/interrupt.c98
-rw-r--r--arch/s390/kvm/kvm-s390.c80
-rw-r--r--arch/s390/kvm/kvm-s390.h14
-rw-r--r--arch/s390/kvm/priv.c21
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c2
-rw-r--r--arch/x86/include/asm/kvm_host.h78
-rw-r--r--arch/x86/include/asm/pvclock.h5
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c9
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/smpboot.c25
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/debugfs.c69
-rw-r--r--arch/x86/kvm/hyperv.c157
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/svm.c417
-rw-r--r--arch/x86/kvm/vmx.c171
-rw-r--r--arch/x86/kvm/x86.c170
-rw-r--r--arch/x86/kvm/x86.h6
-rw-r--r--arch/x86/power/hibernate_64.c2
-rw-r--r--drivers/clocksource/bcm_kona_timer.c16
-rw-r--r--drivers/clocksource/mips-gic-timer.c2
-rw-r--r--drivers/clocksource/time-armada-370-xp.c1
-rw-r--r--drivers/edac/Kconfig8
-rw-r--r--drivers/edac/Makefile1
-rw-r--r--drivers/edac/skx_edac.c1121
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c5
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c2
-rw-r--r--drivers/gpu/drm/drm_fb_helper.c2
-rw-r--r--drivers/gpu/drm/etnaviv/etnaviv_gpu.c10
-rw-r--r--drivers/gpu/drm/i915/i915_drv.h1
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c10
-rw-r--r--drivers/gpu/drm/i915/i915_gem_gtt.c1
-rw-r--r--drivers/gpu/drm/i915/i915_reg.h1
-rw-r--r--drivers/gpu/drm/i915/intel_audio.c6
-rw-r--r--drivers/gpu/drm/i915/intel_ddi.c91
-rw-r--r--drivers/gpu/drm/i915/intel_display.c170
-rw-r--r--drivers/gpu/drm/i915/intel_fbc.c20
-rw-r--r--drivers/gpu/drm/i915/intel_pm.c6
-rw-r--r--drivers/gpu/drm/i915/intel_ringbuffer.c8
-rw-r--r--drivers/gpu/drm/mediatek/Kconfig3
-rw-r--r--drivers/gpu/drm/radeon/radeon_atpx_handler.c9
-rw-r--r--drivers/hwmon/it87.c2
-rw-r--r--drivers/i2c/busses/i2c-at91.c24
-rw-r--r--drivers/i2c/busses/i2c-bcm-iproc.c2
-rw-r--r--drivers/i2c/busses/i2c-bcm-kona.c2
-rw-r--r--drivers/i2c/busses/i2c-brcmstb.c2
-rw-r--r--drivers/i2c/busses/i2c-cros-ec-tunnel.c2
-rw-r--r--drivers/i2c/busses/i2c-meson.c6
-rw-r--r--drivers/i2c/busses/i2c-ocores.c14
-rw-r--r--drivers/i2c/muxes/i2c-demux-pinctrl.c4
-rw-r--r--drivers/md/dm-crypt.c2
-rw-r--r--drivers/md/dm-raid.c82
-rw-r--r--drivers/md/dm-round-robin.c7
-rw-r--r--drivers/of/base.c14
-rw-r--r--drivers/of/fdt.c2
-rw-r--r--drivers/of/irq.c5
-rw-r--r--drivers/of/platform.c2
-rw-r--r--drivers/scsi/aacraid/commctrl.c13
-rw-r--r--drivers/scsi/fcoe/fcoe_ctlr.c2
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_base.c6
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_fusion.c2
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_base.c22
-rw-r--r--drivers/scsi/ses.c3
-rw-r--r--drivers/usb/class/cdc-acm.c5
-rw-r--r--drivers/usb/class/cdc-acm.h1
-rw-r--r--drivers/usb/core/config.c66
-rw-r--r--drivers/usb/core/devio.c7
-rw-r--r--drivers/usb/core/hub.c23
-rw-r--r--drivers/usb/dwc3/dwc3-of-simple.c1
-rw-r--r--drivers/usb/dwc3/dwc3-pci.c2
-rw-r--r--drivers/usb/dwc3/gadget.c55
-rw-r--r--drivers/usb/gadget/composite.c6
-rw-r--r--drivers/usb/gadget/configfs.c2
-rw-r--r--drivers/usb/gadget/function/rndis.c6
-rw-r--r--drivers/usb/gadget/function/u_ether.c3
-rw-r--r--drivers/usb/gadget/function/uvc_configfs.c2
-rw-r--r--drivers/usb/gadget/legacy/inode.c4
-rw-r--r--drivers/usb/gadget/udc/core.c5
-rw-r--r--drivers/usb/gadget/udc/fsl_qe_udc.c2
-rw-r--r--drivers/usb/host/ehci-hcd.c4
-rw-r--r--drivers/usb/host/max3421-hcd.c2
-rw-r--r--drivers/usb/host/xhci-hub.c3
-rw-r--r--drivers/usb/host/xhci-pci.c3
-rw-r--r--drivers/usb/host/xhci-ring.c16
-rw-r--r--drivers/usb/misc/ftdi-elan.c10
-rw-r--r--drivers/usb/misc/usbtest.c9
-rw-r--r--drivers/usb/phy/phy-omap-otg.c2
-rw-r--r--drivers/usb/renesas_usbhs/common.c3
-rw-r--r--drivers/usb/renesas_usbhs/fifo.c4
-rw-r--r--drivers/usb/renesas_usbhs/mod_gadget.c7
-rw-r--r--drivers/usb/serial/ftdi_sio.c3
-rw-r--r--drivers/usb/serial/ftdi_sio_ids.h12
-rw-r--r--drivers/usb/serial/option.c22
-rw-r--r--drivers/usb/serial/usb-serial.c4
-rw-r--r--fs/iomap.c21
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c14
-rw-r--r--fs/xfs/libxfs/xfs_format.h11
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c6
-rw-r--r--fs/xfs/xfs_buf.c1
-rw-r--r--fs/xfs/xfs_file.c13
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_iomap.c69
-rw-r--r--fs/xfs/xfs_iomap.h1
-rw-r--r--fs/xfs/xfs_iops.c9
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--include/linux/compiler.h6
-rw-r--r--include/linux/kvm_host.h4
-rw-r--r--kernel/configs/kvm_guest.config (renamed from arch/x86/configs/kvm_guest.config)1
-rw-r--r--kernel/events/core.c95
-rw-r--r--kernel/events/uprobes.c5
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/sched/cputime.c33
-rw-r--r--security/Kconfig1
-rw-r--r--tools/arch/arm64/include/uapi/asm/kvm.h2
-rw-r--r--tools/arch/s390/include/uapi/asm/kvm.h41
-rw-r--r--tools/arch/s390/include/uapi/asm/sie.h1
-rw-r--r--tools/perf/arch/powerpc/util/sym-handling.c2
-rw-r--r--tools/perf/arch/x86/util/intel-pt.c6
-rw-r--r--tools/perf/builtin-mem.c3
-rw-r--r--tools/perf/builtin-script.c13
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-decoder.c44
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c24
-rw-r--r--tools/perf/util/jitdump.c1
-rw-r--r--tools/perf/util/probe-file.c36
-rw-r--r--tools/perf/util/symbol-elf.c3
-rw-r--r--virt/kvm/kvm_main.c44
188 files changed, 4748 insertions, 1351 deletions
diff --git a/Documentation/conf.py b/Documentation/conf.py
index 96b7aa66c89c..106ae9c740b9 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -131,7 +131,7 @@ pygments_style = 'sphinx'
todo_include_todos = False
primary_domain = 'C'
-highlight_language = 'C'
+highlight_language = 'guess'
# -- Options for HTML output ----------------------------------------------
diff --git a/Documentation/hwmon/ftsteutates b/Documentation/hwmon/ftsteutates
index 2a1bf69c6a26..8c10a916de20 100644
--- a/Documentation/hwmon/ftsteutates
+++ b/Documentation/hwmon/ftsteutates
@@ -19,5 +19,5 @@ enhancements. It can monitor up to 4 voltages, 16 temperatures and
implemented in this driver.
Specification of the chip can be found here:
-ftp:///pub/Mainboard-OEM-Sales/Services/Software&Tools/Linux_SystemMonitoring&Watchdog&GPIO/BMC-Teutates_Specification_V1.21.pdf
-ftp:///pub/Mainboard-OEM-Sales/Services/Software&Tools/Linux_SystemMonitoring&Watchdog&GPIO/Fujitsu_mainboards-1-Sensors_HowTo-en-US.pdf
+ftp://ftp.ts.fujitsu.com/pub/Mainboard-OEM-Sales/Services/Software&Tools/Linux_SystemMonitoring&Watchdog&GPIO/BMC-Teutates_Specification_V1.21.pdf
+ftp://ftp.ts.fujitsu.com/pub/Mainboard-OEM-Sales/Services/Software&Tools/Linux_SystemMonitoring&Watchdog&GPIO/Fujitsu_mainboards-1-Sensors_HowTo-en-US.pdf
diff --git a/Documentation/kernel-documentation.rst b/Documentation/kernel-documentation.rst
index c4eb5049da39..391decc66a18 100644
--- a/Documentation/kernel-documentation.rst
+++ b/Documentation/kernel-documentation.rst
@@ -366,8 +366,6 @@ Domain`_ references.
Cross-referencing from reStructuredText
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. highlight:: none
-
To cross-reference the functions and types defined in the kernel-doc comments
from reStructuredText documents, please use the `Sphinx C Domain`_
references. For example::
@@ -390,8 +388,6 @@ For further details, please refer to the `Sphinx C Domain`_ documentation.
Function documentation
----------------------
-.. highlight:: c
-
The general format of a function and function-like macro kernel-doc comment is::
/**
@@ -572,8 +568,6 @@ DocBook XML [DEPRECATED]
Converting DocBook to Sphinx
----------------------------
-.. highlight:: none
-
Over time, we expect all of the documents under ``Documentation/DocBook`` to be
converted to Sphinx and reStructuredText. For most DocBook XML documents, a good
enough solution is to use the simple ``Documentation/sphinx/tmplcvt`` script,
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
index b96098ccfe69..708f87f78a75 100644
--- a/Documentation/power/basic-pm-debugging.txt
+++ b/Documentation/power/basic-pm-debugging.txt
@@ -164,7 +164,32 @@ load n/2 modules more and try again.
Again, if you find the offending module(s), it(they) must be unloaded every time
before hibernation, and please report the problem with it(them).
-c) Advanced debugging
+c) Using the "test_resume" hibernation option
+
+/sys/power/disk generally tells the kernel what to do after creating a
+hibernation image. One of the available options is "test_resume" which
+causes the just created image to be used for immediate restoration. Namely,
+after doing:
+
+# echo test_resume > /sys/power/disk
+# echo disk > /sys/power/state
+
+a hibernation image will be created and a resume from it will be triggered
+immediately without involving the platform firmware in any way.
+
+That test can be used to check if failures to resume from hibernation are
+related to bad interactions with the platform firmware. That is, if the above
+works every time, but resume from actual hibernation does not work or is
+unreliable, the platform firmware may be responsible for the failures.
+
+On architectures and platforms that support using different kernels to restore
+hibernation images (that is, the kernel used to read the image from storage and
+load it into memory is different from the one included in the image) or support
+kernel address space randomization, it also can be used to check if failures
+to resume may be related to the differences between the restore and image
+kernels.
+
+d) Advanced debugging
In case that hibernation does not work on your system even in the minimal
configuration and compiling more drivers as modules is not practical or some
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index f1f0f59a7c47..974916ff6608 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -1,75 +1,76 @@
-Power Management Interface
-
-
-The power management subsystem provides a unified sysfs interface to
-userspace, regardless of what architecture or platform one is
-running. The interface exists in /sys/power/ directory (assuming sysfs
-is mounted at /sys).
-
-/sys/power/state controls system power state. Reading from this file
-returns what states are supported, which is hard-coded to 'freeze',
-'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 'disk'
-(Suspend-to-Disk).
-
-Writing to this file one of those strings causes the system to
-transition into that state. Please see the file
-Documentation/power/states.txt for a description of each of those
-states.
-
-
-/sys/power/disk controls the operating mode of the suspend-to-disk
-mechanism. Suspend-to-disk can be handled in several ways. We have a
-few options for putting the system to sleep - using the platform driver
-(e.g. ACPI or other suspend_ops), powering off the system or rebooting the
-system (for testing).
-
-Additionally, /sys/power/disk can be used to turn on one of the two testing
-modes of the suspend-to-disk mechanism: 'testproc' or 'test'. If the
-suspend-to-disk mechanism is in the 'testproc' mode, writing 'disk' to
-/sys/power/state will cause the kernel to disable nonboot CPUs and freeze
-tasks, wait for 5 seconds, unfreeze tasks and enable nonboot CPUs. If it is
-in the 'test' mode, writing 'disk' to /sys/power/state will cause the kernel
-to disable nonboot CPUs and freeze tasks, shrink memory, suspend devices, wait
-for 5 seconds, resume devices, unfreeze tasks and enable nonboot CPUs. Then,
-we are able to look in the log messages and work out, for example, which code
-is being slow and which device drivers are misbehaving.
-
-Reading from this file will display all supported modes and the currently
-selected one in brackets, for example
-
- [shutdown] reboot test testproc
-
-Writing to this file will accept one of
-
- 'platform' (only if the platform supports it)
- 'shutdown'
- 'reboot'
- 'testproc'
- 'test'
-
-/sys/power/image_size controls the size of the image created by
-the suspend-to-disk mechanism. It can be written a string
-representing a non-negative integer that will be used as an upper
-limit of the image size, in bytes. The suspend-to-disk mechanism will
-do its best to ensure the image size will not exceed that number. However,
-if this turns out to be impossible, it will try to suspend anyway using the
-smallest image possible. In particular, if "0" is written to this file, the
-suspend image will be as small as possible.
-
-Reading from this file will display the current image size limit, which
-is set to 2/5 of available RAM by default.
-
-/sys/power/pm_trace controls the code which saves the last PM event point in
-the RTC across reboots, so that you can debug a machine that just hangs
-during suspend (or more commonly, during resume). Namely, the RTC is only
-used to save the last PM event point if this file contains '1'. Initially it
-contains '0' which may be changed to '1' by writing a string representing a
-nonzero integer into it.
-
-To use this debugging feature you should attempt to suspend the machine, then
-reboot it and run
-
- dmesg -s 1000000 | grep 'hash matches'
-
-CAUTION: Using it will cause your machine's real-time (CMOS) clock to be
-set to a random invalid time after a resume.
+Power Management Interface for System Sleep
+
+Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+The power management subsystem provides userspace with a unified sysfs interface
+for system sleep regardless of the underlying system architecture or platform.
+The interface is located in the /sys/power/ directory (assuming that sysfs is
+mounted at /sys).
+
+/sys/power/state is the system sleep state control file.
+
+Reading from it returns a list of supported sleep states, encoded as:
+
+'freeze' (Suspend-to-Idle)
+'standby' (Power-On Suspend)
+'mem' (Suspend-to-RAM)
+'disk' (Suspend-to-Disk)
+
+Suspend-to-Idle is always supported. Suspend-to-Disk is always supported
+too as long the kernel has been configured to support hibernation at all
+(ie. CONFIG_HIBERNATION is set in the kernel configuration file). Support
+for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
+platform.
+
+If one of the strings listed in /sys/power/state is written to it, the system
+will attempt to transition into the corresponding sleep state. Refer to
+Documentation/power/states.txt for a description of each of those states.
+
+/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
+Specifically, it tells the kernel what to do after creating a hibernation image.
+
+Reading from it returns a list of supported options encoded as:
+
+'platform' (put the system into sleep using a platform-provided method)
+'shutdown' (shut the system down)
+'reboot' (reboot the system)
+'suspend' (trigger a Suspend-to-RAM transition)
+'test_resume' (resume-after-hibernation test mode)
+
+The currently selected option is printed in square brackets.
+
+The 'platform' option is only available if the platform provides a special
+mechanism to put the system to sleep after creating a hibernation image (ACPI
+does that, for example). The 'suspend' option is available if Suspend-to-RAM
+is supported. Refer to Documentation/power/basic_pm_debugging.txt for the
+description of the 'test_resume' option.
+
+To select an option, write the string representing it to /sys/power/disk.
+
+/sys/power/image_size controls the size of hibernation images.
+
+It can be written a string representing a non-negative integer that will be
+used as a best-effort upper limit of the image size, in bytes. The hibernation
+core will do its best to ensure that the image size will not exceed that number.
+However, if that turns out to be impossible to achieve, a hibernation image will
+still be created and its size will be as small as possible. In particular,
+writing '0' to this file will enforce hibernation images to be as small as
+possible.
+
+Reading from this file returns the current image size limit, which is set to
+around 2/5 of available RAM by default.
+
+/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
+or resume event point in the RTC across reboots.
+
+It helps to debug hard lockups or reboots due to device driver failures that
+occur during system suspend or resume (which is more common) more effectively.
+
+If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
+event point in turn will be stored in the RTC memory (overwriting the actual
+RTC information), so it will survive a system crash if one occurs right after
+storing it and it can be used later to identify the driver that caused the crash
+to happen (see Documentation/power/s2ram.txt for more information).
+
+Initially it contains '0' which may be changed to '1' by writing a string
+representing a nonzero integer into it.
diff --git a/Documentation/sphinx-static/theme_overrides.css b/Documentation/sphinx-static/theme_overrides.css
index 3a2ac4bcfd78..e88461c4c1e6 100644
--- a/Documentation/sphinx-static/theme_overrides.css
+++ b/Documentation/sphinx-static/theme_overrides.css
@@ -42,11 +42,12 @@
caption a.headerlink { opacity: 0; }
caption a.headerlink:hover { opacity: 1; }
- /* inline literal: drop the borderbox and red color */
+ /* inline literal: drop the borderbox, padding and red color */
code, .rst-content tt, .rst-content code {
color: inherit;
border: none;
+ padding: unset;
background: inherit;
font-size: 85%;
}
diff --git a/MAINTAINERS b/MAINTAINERS
index a306795a7450..0bbe4b105c34 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4525,6 +4525,12 @@ L: linux-edac@vger.kernel.org
S: Maintained
F: drivers/edac/sb_edac.c
+EDAC-SKYLAKE
+M: Tony Luck <tony.luck@intel.com>
+L: linux-edac@vger.kernel.org
+S: Maintained
+F: drivers/edac/skx_edac.c
+
EDAC-XGENE
APPLIED MICRO (APM) X-GENE SOC EDAC
M: Loc Ho <lho@apm.com>
diff --git a/Makefile b/Makefile
index 5c18baad7218..3537aa23905f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 4
PATCHLEVEL = 8
SUBLEVEL = 0
-EXTRAVERSION = -rc2
+EXTRAVERSION = -rc3
NAME = Psychotic Stoned Sheep
# *DOCUMENTATION*
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index c2c40a7ee738..2d19e02d03fd 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -188,15 +188,15 @@ struct kvm_vcpu_arch {
};
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_vcpu_stat {
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 hvc_exit_stat;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 hvc_exit_stat;
u64 wfe_exit_stat;
u64 wfi_exit_stat;
u64 mmio_exit_user;
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index bc5f50799d75..9f157e7c51e7 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -295,6 +295,7 @@ __und_svc_fault:
bl __und_fault
__und_svc_finish:
+ get_thread_info tsk
ldr r5, [sp, #S_PSR] @ Get SVC cpsr
svc_exit r5 @ return from exception
UNWIND(.fnend )
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 85a3f90cb8ec..8a4a5637fab1 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -144,6 +144,16 @@ out_fail_alloc:
return ret;
}
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
return VM_FAULT_SIGBUS;
diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c
index fd8720532471..0df062d8b2c9 100644
--- a/arch/arm/mach-imx/gpc.c
+++ b/arch/arm/mach-imx/gpc.c
@@ -271,6 +271,12 @@ static int __init imx_gpc_init(struct device_node *node,
for (i = 0; i < IMR_NUM; i++)
writel_relaxed(~0, gpc_base + GPC_IMR1 + i * 4);
+ /*
+ * Clear the OF_POPULATED flag set in of_irq_init so that
+ * later the GPC power domain driver will not be skipped.
+ */
+ of_node_clear_flag(node, OF_POPULATED);
+
return 0;
}
IRQCHIP_DECLARE(imx_gpc, "fsl,imx6q-gpc", imx_gpc_init);
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 62f4d01941f7..6344913f0804 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -728,7 +728,8 @@ static void *__init late_alloc(unsigned long sz)
{
void *ptr = (void *)__get_free_pages(PGALLOC_GFP, get_order(sz));
- BUG_ON(!ptr);
+ if (!ptr || !pgtable_page_ctor(virt_to_page(ptr)))
+ BUG();
return ptr;
}
@@ -1155,10 +1156,19 @@ void __init sanity_check_meminfo(void)
{
phys_addr_t memblock_limit = 0;
int highmem = 0;
- phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1;
+ u64 vmalloc_limit;
struct memblock_region *reg;
bool should_use_highmem = false;
+ /*
+ * Let's use our own (unoptimized) equivalent of __pa() that is
+ * not affected by wrap-arounds when sizeof(phys_addr_t) == 4.
+ * The result is used as the upper bound on physical memory address
+ * and may itself be outside the valid range for which phys_addr_t
+ * and therefore __pa() is defined.
+ */
+ vmalloc_limit = (u64)(uintptr_t)vmalloc_min - PAGE_OFFSET + PHYS_OFFSET;
+
for_each_memblock(memory, reg) {
phys_addr_t block_start = reg->base;
phys_addr_t block_end = reg->base + reg->size;
@@ -1183,10 +1193,11 @@ void __init sanity_check_meminfo(void)
if (reg->size > size_limit) {
phys_addr_t overlap_size = reg->size - size_limit;
- pr_notice("Truncating RAM at %pa-%pa to -%pa",
- &block_start, &block_end, &vmalloc_limit);
- memblock_remove(vmalloc_limit, overlap_size);
+ pr_notice("Truncating RAM at %pa-%pa",
+ &block_start, &block_end);
block_end = vmalloc_limit;
+ pr_cont(" to -%pa", &block_end);
+ memblock_remove(vmalloc_limit, overlap_size);
should_use_highmem = true;
}
}
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3eda975837d0..bd94e6766759 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -290,15 +290,15 @@ struct kvm_vcpu_arch {
#endif
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_vcpu_stat {
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 hvc_exit_stat;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 hvc_exit_stat;
u64 wfe_exit_stat;
u64 wfi_exit_stat;
u64 mmio_exit_user;
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index 9a3aec97ac09..ccf79d849e0a 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -101,12 +101,20 @@ ENTRY(cpu_resume)
bl el2_setup // if in EL2 drop to EL1 cleanly
/* enable the MMU early - so we can access sleep_save_stash by va */
adr_l lr, __enable_mmu /* __cpu_setup will return here */
- ldr x27, =_cpu_resume /* __enable_mmu will branch here */
+ adr_l x27, _resume_switched /* __enable_mmu will branch here */
adrp x25, idmap_pg_dir
adrp x26, swapper_pg_dir
b __cpu_setup
ENDPROC(cpu_resume)
+ .pushsection ".idmap.text", "ax"
+_resume_switched:
+ ldr x8, =_cpu_resume
+ br x8
+ENDPROC(_resume_switched)
+ .ltorg
+ .popsection
+
ENTRY(_cpu_resume)
mrs x1, mpidr_el1
adrp x8, mpidr_hash
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index f94b80eb295d..9c3e75df2180 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -242,7 +242,7 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
{
- pte_t *pte = pte_offset_kernel(pmd, 0);
+ pte_t *pte = pte_offset_kernel(pmd, 0UL);
unsigned long addr;
unsigned i;
@@ -254,7 +254,7 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
{
- pmd_t *pmd = pmd_offset(pud, 0);
+ pmd_t *pmd = pmd_offset(pud, 0UL);
unsigned long addr;
unsigned i;
@@ -271,7 +271,7 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
{
- pud_t *pud = pud_offset(pgd, 0);
+ pud_t *pud = pud_offset(pgd, 0UL);
unsigned long addr;
unsigned i;
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index c7fe3ec70774..5bb15eab6f00 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -23,6 +23,8 @@
#include <linux/module.h>
#include <linux/of.h>
+#include <asm/acpi.h>
+
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
nodemask_t numa_nodes_parsed __initdata;
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b54bcadd8aec..5f488dc8a7d5 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -110,32 +110,32 @@
extern atomic_t kvm_mips_instance;
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_vcpu_stat {
- u32 wait_exits;
- u32 cache_exits;
- u32 signal_exits;
- u32 int_exits;
- u32 cop_unusable_exits;
- u32 tlbmod_exits;
- u32 tlbmiss_ld_exits;
- u32 tlbmiss_st_exits;
- u32 addrerr_st_exits;
- u32 addrerr_ld_exits;
- u32 syscall_exits;
- u32 resvd_inst_exits;
- u32 break_inst_exits;
- u32 trap_inst_exits;
- u32 msa_fpe_exits;
- u32 fpe_exits;
- u32 msa_disabled_exits;
- u32 flush_dcache_exits;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
+ u64 wait_exits;
+ u64 cache_exits;
+ u64 signal_exits;
+ u64 int_exits;
+ u64 cop_unusable_exits;
+ u64 tlbmod_exits;
+ u64 tlbmiss_ld_exits;
+ u64 tlbmiss_st_exits;
+ u64 addrerr_st_exits;
+ u64 addrerr_ld_exits;
+ u64 syscall_exits;
+ u64 resvd_inst_exits;
+ u64 break_inst_exits;
+ u64 trap_inst_exits;
+ u64 msa_fpe_exits;
+ u64 fpe_exits;
+ u64 msa_disabled_exits;
+ u64 flush_dcache_exits;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
};
struct kvm_arch_memory_slot {
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a6ea084b4d9d..49b25e74d0c7 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -140,6 +140,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return 0;
}
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
void kvm_mips_free_vcpus(struct kvm *kvm)
{
unsigned int i;
diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
index c0ae62520d15..274d5bc6ecce 100644
--- a/arch/parisc/include/uapi/asm/errno.h
+++ b/arch/parisc/include/uapi/asm/errno.h
@@ -97,10 +97,10 @@
#define ENOTCONN 235 /* Transport endpoint is not connected */
#define ESHUTDOWN 236 /* Cannot send after transport endpoint shutdown */
#define ETOOMANYREFS 237 /* Too many references: cannot splice */
-#define EREFUSED ECONNREFUSED /* for HP's NFS apparently */
#define ETIMEDOUT 238 /* Connection timed out */
#define ECONNREFUSED 239 /* Connection refused */
-#define EREMOTERELEASE 240 /* Remote peer released connection */
+#define EREFUSED ECONNREFUSED /* for HP's NFS apparently */
+#define EREMOTERELEASE 240 /* Remote peer released connection */
#define EHOSTDOWN 241 /* Host is down */
#define EHOSTUNREACH 242 /* No route to host */
diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c
index 5adc339eb7c8..0c2a94a0f751 100644
--- a/arch/parisc/kernel/processor.c
+++ b/arch/parisc/kernel/processor.c
@@ -51,8 +51,6 @@ EXPORT_SYMBOL(_parisc_requires_coherency);
DEFINE_PER_CPU(struct cpuinfo_parisc, cpu_data);
-extern int update_cr16_clocksource(void); /* from time.c */
-
/*
** PARISC CPU driver - claim "device" and initialize CPU data structures.
**
@@ -228,12 +226,6 @@ static int processor_probe(struct parisc_device *dev)
}
#endif
- /* If we've registered more than one cpu,
- * we'll use the jiffies clocksource since cr16
- * is not synchronized between CPUs.
- */
- update_cr16_clocksource();
-
return 0;
}
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 505cf1ac5af2..4b0b963d52a7 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -221,18 +221,6 @@ static struct clocksource clocksource_cr16 = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-int update_cr16_clocksource(void)
-{
- /* since the cr16 cycle counters are not synchronized across CPUs,
- we'll check if we should switch to a safe clocksource: */
- if (clocksource_cr16.rating != 0 && num_online_cpus() > 1) {
- clocksource_change_rating(&clocksource_cr16, 0);
- return 1;
- }
-
- return 0;
-}
-
void __init start_cpu_itimer(void)
{
unsigned int cpu = smp_processor_id();
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 287a656ceb57..e407af2b7333 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -245,6 +245,43 @@ static inline int segment_shift(int ssize)
}
/*
+ * This array is indexed by the LP field of the HPTE second dword.
+ * Since this field may contain some RPN bits, some entries are
+ * replicated so that we get the same value irrespective of RPN.
+ * The top 4 bits are the page size index (MMU_PAGE_*) for the
+ * actual page size, the bottom 4 bits are the base page size.
+ */
+extern u8 hpte_page_sizes[1 << LP_BITS];
+
+static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
+ bool is_base_size)
+{
+ unsigned int i, lp;
+
+ if (!(h & HPTE_V_LARGE))
+ return 1ul << 12;
+
+ /* Look at the 8 bit LP value */
+ lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+ i = hpte_page_sizes[lp];
+ if (!i)
+ return 0;
+ if (!is_base_size)
+ i >>= 4;
+ return 1ul << mmu_psize_defs[i & 0xf].shift;
+}
+
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+ return __hpte_page_size(h, l, 0);
+}
+
+static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
+{
+ return __hpte_page_size(h, l, 1);
+}
+
+/*
* The current system page and segment sizes
*/
extern int mmu_kernel_ssize;
diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
index 88b4901ac4ee..85b7a1a21e22 100644
--- a/arch/powerpc/include/asm/hmi.h
+++ b/arch/powerpc/include/asm/hmi.h
@@ -21,7 +21,7 @@
#ifndef __ASM_PPC64_HMI_H__
#define __ASM_PPC64_HMI_H__
-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
#define CORE_TB_RESYNC_REQ_BIT 63
#define MAX_SUBCORE_PER_CORE 4
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 2fd1690b79d2..f6fda8482f60 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
#endif
#endif /* __powerpc64__ */
+
+/*
+ * Simple Cache inhibited accessors
+ * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
+ * barriers, callers need to manage memory barriers on their own.
+ * These can only be used in hypervisor real mode.
+ */
+
+static inline u32 _lwzcix(unsigned long addr)
+{
+ u32 ret;
+
+ __asm__ __volatile__("lwzcix %0,0, %1"
+ : "=r" (ret) : "r" (addr) : "memory");
+ return ret;
+}
+
+static inline void _stbcix(u64 addr, u8 val)
+{
+ __asm__ __volatile__("stbcix %0,0,%1"
+ : : "r" (val), "r" (addr) : "memory");
+}
+
+static inline void _stwcix(u64 addr, u32 val)
+{
+ __asm__ __volatile__("stwcix %0,0,%1"
+ : : "r" (val), "r" (addr) : "memory");
+}
+
/*
* Low level IO stream instructions are defined out of line for now
*/
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 5bca220bbb60..05cabed3d1bd 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -105,6 +105,15 @@
#define BOOK3S_INTERRUPT_FAC_UNAVAIL 0xf60
#define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80
+/* book3s_hv */
+
+/*
+ * Special trap used to indicate to host that this is a
+ * passthrough interrupt that could not be handled
+ * completely in the guest.
+ */
+#define BOOK3S_INTERRUPT_HV_RM_HARD 0x5555
+
#define BOOK3S_IRQPRIO_SYSTEM_RESET 0
#define BOOK3S_IRQPRIO_DATA_SEGMENT 1
#define BOOK3S_IRQPRIO_INST_SEGMENT 2
@@ -136,6 +145,7 @@
#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */
#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
#define RESUME_FLAG_ARCH1 (1<<2)
+#define RESUME_FLAG_ARCH2 (1<<3)
#define RESUME_GUEST 0
#define RESUME_GUEST_NV RESUME_FLAG_NV
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 8f39796c9da8..cef2b892245c 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -69,6 +69,42 @@ struct hpte_cache {
int pagesize;
};
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits. This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
+ */
+struct kvmppc_vcore {
+ int n_runnable;
+ int num_threads;
+ int entry_exit_map;
+ int napping_threads;
+ int first_vcpuid;
+ u16 pcpu;
+ u16 last_cpu;
+ u8 vcore_state;
+ u8 in_guest;
+ struct kvmppc_vcore *master_vcore;
+ struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
+ struct list_head preempt_list;
+ spinlock_t lock;
+ struct swait_queue_head wq;
+ spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
+ u64 stolen_tb;
+ u64 preempt_tb;
+ struct kvm_vcpu *runner;
+ struct kvm *kvm;
+ u64 tb_offset; /* guest timebase - host timebase */
+ ulong lpcr;
+ u32 arch_compat;
+ ulong pcr;
+ ulong dpdes; /* doorbell state (POWER8) */
+ ulong conferring_threads;
+ unsigned int halt_poll_ns;
+};
+
struct kvmppc_vcpu_book3s {
struct kvmppc_sid_map sid_map[SID_MAP_NUM];
struct {
@@ -191,6 +227,7 @@ extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
struct kvm_vcpu *vcpu);
extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
struct kvmppc_book3s_shadow_vcpu *svcpu);
+extern int kvm_irq_bypass;
static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
{
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 88d17b4ea9c8..4ffd5a1e788d 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,6 +20,8 @@
#ifndef __ASM_KVM_BOOK3S_64_H__
#define __ASM_KVM_BOOK3S_64_H__
+#include <asm/book3s/64/mmu-hash.h>
+
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
{
@@ -97,56 +99,20 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
hpte[0] = cpu_to_be64(hpte_v);
}
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
- int i, shift;
- unsigned int mask;
-
- /* start from 1 ignoring MMU_PAGE_4K */
- for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
- /* invalid penc */
- if (mmu_psize_defs[psize].penc[i] == -1)
- continue;
- /*
- * encoding bits per actual page size
- * PTE LP actual page size
- * rrrr rrrz >=8KB
- * rrrr rrzz >=16KB
- * rrrr rzzz >=32KB
- * rrrr zzzz >=64KB
- * .......
- */
- shift = mmu_psize_defs[i].shift - LP_SHIFT;
- if (shift > LP_BITS)
- shift = LP_BITS;
- mask = (1 << shift) - 1;
- if ((lp & mask) == mmu_psize_defs[psize].penc[i])
- return i;
- }
- return -1;
-}
-
static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
unsigned long pte_index)
{
- int b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
+ int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
unsigned int penc;
unsigned long rb = 0, va_low, sllp;
unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
if (v & HPTE_V_LARGE) {
- for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) {
-
- /* valid entries have a shift value */
- if (!mmu_psize_defs[b_psize].shift)
- continue;
-
- a_psize = __hpte_actual_psize(lp, b_psize);
- if (a_psize != -1)
- break;
- }
+ i = hpte_page_sizes[lp];
+ b_psize = i & 0xf;
+ a_psize = i >> 4;
}
+
/*
* Ignore the top 14 bits of va
* v have top two bits covering segment size, hence move
@@ -215,45 +181,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
return rb;
}
-static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
- bool is_base_size)
-{
-
- int size, a_psize;
- /* Look at the 8 bit LP value */
- unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
- /* only handle 4k, 64k and 16M pages for now */
- if (!(h & HPTE_V_LARGE))
- return 1ul << 12;
- else {
- for (size = 0; size < MMU_PAGE_COUNT; size++) {
- /* valid entries have a shift value */
- if (!mmu_psize_defs[size].shift)
- continue;
-
- a_psize = __hpte_actual_psize(lp, size);
- if (a_psize != -1) {
- if (is_base_size)
- return 1ul << mmu_psize_defs[size].shift;
- return 1ul << mmu_psize_defs[a_psize].shift;
- }
- }
-
- }
- return 0;
-}
-
-static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
-{
- return __hpte_page_size(h, l, 0);
-}
-
-static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
-{
- return __hpte_page_size(h, l, 1);
-}
-
static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
{
return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index ec35af34a3fb..ed30d2ea21b7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -43,6 +43,8 @@
#include <asm/cputhreads.h>
#define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES)
+#define __KVM_HAVE_ARCH_INTC_INITIALIZED
+
#ifdef CONFIG_KVM_MMIO
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
#endif
@@ -95,42 +97,49 @@ struct kvmppc_vcpu_book3s;
struct kvmppc_book3s_shadow_vcpu;
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_vcpu_stat {
- u32 sum_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 light_exits;
+ u64 sum_exits;
+ u64 mmio_exits;
+ u64 signal_exits;
+ u64 light_exits;
/* Account for special types of light exits: */
- u32 itlb_real_miss_exits;
- u32 itlb_virt_miss_exits;
- u32 dtlb_real_miss_exits;
- u32 dtlb_virt_miss_exits;
- u32 syscall_exits;
- u32 isi_exits;
- u32 dsi_exits;
- u32 emulated_inst_exits;
- u32 dec_exits;
- u32 ext_intr_exits;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 dbell_exits;
- u32 gdbell_exits;
- u32 ld;
- u32 st;
+ u64 itlb_real_miss_exits;
+ u64 itlb_virt_miss_exits;
+ u64 dtlb_real_miss_exits;
+ u64 dtlb_virt_miss_exits;
+ u64 syscall_exits;
+ u64 isi_exits;
+ u64 dsi_exits;
+ u64 emulated_inst_exits;
+ u64 dec_exits;
+ u64 ext_intr_exits;
+ u64 halt_poll_success_ns;
+ u64 halt_poll_fail_ns;
+ u64 halt_wait_ns;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_successful_wait;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 dbell_exits;
+ u64 gdbell_exits;
+ u64 ld;
+ u64 st;
#ifdef CONFIG_PPC_BOOK3S
- u32 pf_storage;
- u32 pf_instruc;
- u32 sp_storage;
- u32 sp_instruc;
- u32 queue_intr;
- u32 ld_slow;
- u32 st_slow;
+ u64 pf_storage;
+ u64 pf_instruc;
+ u64 sp_storage;
+ u64 sp_instruc;
+ u64 queue_intr;
+ u64 ld_slow;
+ u64 st_slow;
#endif
+ u64 pthru_all;
+ u64 pthru_host;
+ u64 pthru_bad_aff;
};
enum kvm_exit_types {
@@ -197,6 +206,8 @@ struct kvmppc_spapr_tce_table {
struct kvmppc_xics;
struct kvmppc_icp;
+struct kvmppc_passthru_irqmap;
+
/*
* The reverse mapping array has one entry for each HPTE,
* which stores the guest's view of the second word of the HPTE
@@ -267,6 +278,7 @@ struct kvm_arch {
#endif
#ifdef CONFIG_KVM_XICS
struct kvmppc_xics *xics;
+ struct kvmppc_passthru_irqmap *pimap;
#endif
struct kvmppc_ops *kvm_ops;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -275,41 +287,6 @@ struct kvm_arch {
#endif
};
-/*
- * Struct for a virtual core.
- * Note: entry_exit_map combines a bitmap of threads that have entered
- * in the bottom 8 bits and a bitmap of threads that have exited in the
- * next 8 bits. This is so that we can atomically set the entry bit
- * iff the exit map is 0 without taking a lock.
- */
-struct kvmppc_vcore {
- int n_runnable;
- int num_threads;
- int entry_exit_map;
- int napping_threads;
- int first_vcpuid;
- u16 pcpu;
- u16 last_cpu;
- u8 vcore_state;
- u8 in_guest;
- struct kvmppc_vcore *master_vcore;
- struct list_head runnable_threads;
- struct list_head preempt_list;
- spinlock_t lock;
- struct swait_queue_head wq;
- spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
- u64 stolen_tb;
- u64 preempt_tb;
- struct kvm_vcpu *runner;
- struct kvm *kvm;
- u64 tb_offset; /* guest timebase - host timebase */
- ulong lpcr;
- u32 arch_compat;
- ulong pcr;
- ulong dpdes; /* doorbell state (POWER8) */
- ulong conferring_threads;
-};
-
#define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff)
#define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8)
#define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0)
@@ -329,6 +306,7 @@ struct kvmppc_vcore {
#define VCORE_SLEEPING 3
#define VCORE_RUNNING 4
#define VCORE_EXITING 5
+#define VCORE_POLLING 6
/*
* Struct used to manage memory for a virtual processor area
@@ -397,6 +375,20 @@ struct kvmhv_tb_accumulator {
u64 tb_max; /* max time */
};
+#ifdef CONFIG_PPC_BOOK3S_64
+struct kvmppc_irq_map {
+ u32 r_hwirq;
+ u32 v_hwirq;
+ struct irq_desc *desc;
+};
+
+#define KVMPPC_PIRQ_MAPPED 1024
+struct kvmppc_passthru_irqmap {
+ int n_mapped;
+ struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED];
+};
+#endif
+
# ifdef CONFIG_PPC_FSL_BOOK3E
#define KVMPPC_BOOKE_IAC_NUM 2
#define KVMPPC_BOOKE_DAC_NUM 2
@@ -668,7 +660,6 @@ struct kvm_vcpu_arch {
long pgfault_index;
unsigned long pgfault_hpte[2];
- struct list_head run_list;
struct task_struct *run_task;
struct kvm_run *kvm_run;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2544edabe7f3..f6e49640dbe1 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -287,6 +287,10 @@ struct kvmppc_ops {
long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl,
unsigned long arg);
int (*hcall_implemented)(unsigned long hcall);
+ int (*irq_bypass_add_producer)(struct irq_bypass_consumer *,
+ struct irq_bypass_producer *);
+ void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
+ struct irq_bypass_producer *);
};
extern struct kvmppc_ops *kvmppc_hv_ops;
@@ -453,8 +457,19 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{
return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
}
+
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+ struct kvm *kvm)
+{
+ if (kvm && kvm_irq_bypass)
+ return kvm->arch.pimap;
+ return NULL;
+}
+
extern void kvmppc_alloc_host_rm_ops(void);
extern void kvmppc_free_host_rm_ops(void);
+extern void kvmppc_free_pimap(struct kvm *kvm);
+extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -464,10 +479,23 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
extern void kvmppc_xics_ipi_action(void);
+extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+ unsigned long host_irq);
+extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+ unsigned long host_irq);
+extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
+ struct kvmppc_irq_map *irq_map,
+ struct kvmppc_passthru_irqmap *pimap);
extern int h_ipi_redirect;
#else
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+ struct kvm *kvm)
+ { return NULL; }
static inline void kvmppc_alloc_host_rm_ops(void) {};
static inline void kvmppc_free_host_rm_ops(void) {};
+static inline void kvmppc_free_pimap(struct kvm *kvm) {};
+static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+ { return 0; }
static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index e2fb408f8398..b78e8d3377f6 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -271,6 +271,7 @@ static inline bool early_radix_enabled(void)
#define MMU_PAGE_16G 13
#define MMU_PAGE_64G 14
+/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
#define MMU_PAGE_COUNT 15
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ee05bd203630..e958b7096f19 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func,
int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func,
uint64_t offset, uint32_t data);
int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
+int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority);
int64_t opal_register_exception_handler(uint64_t opal_exception,
uint64_t handler_address,
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 148303e7771f..6a6792bb39fb 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -183,11 +183,6 @@ struct paca_struct {
*/
u16 in_mce;
u8 hmi_event_available; /* HMI event is available */
- /*
- * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
- * more details
- */
- struct sibling_subcore_state *sibling_subcore_state;
#endif
/* Stuff for accurate time accounting */
@@ -202,6 +197,13 @@ struct paca_struct {
struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
#endif
struct kvmppc_host_state kvm_hstate;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ /*
+ * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+ * more details
+ */
+ struct sibling_subcore_state *sibling_subcore_state;
+#endif
#endif
};
diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h
index 0cbd8134ce81..1b46b52d3212 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <linux/pci_hotplug.h>
+#include <linux/irq.h>
#include <misc/cxl-base.h>
#include <asm/opal-api.h>
@@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
int pnv_cxl_get_irq_count(struct pci_dev *dev);
struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+bool is_pnv_opal_msi(struct irq_chip *chip);
#ifdef CONFIG_CXL_BASE
int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b2027a5cf508..fe4c075bcf50 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o
+obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o
obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o
obj-$(CONFIG_PPC64) += vdso64/
obj-$(CONFIG_ALTIVEC) += vecemu.o
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index c2024ac9d4e8..029be26b5a17 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -22,6 +22,9 @@ config KVM
select ANON_INODES
select HAVE_KVM_EVENTFD
select SRCU
+ select KVM_VFIO
+ select IRQ_BYPASS_MANAGER
+ select HAVE_KVM_IRQ_BYPASS
config KVM_BOOK3S_HANDLER
bool
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1f9e5529e692..7dd89b79d038 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -7,16 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
KVM := ../../../virt/kvm
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
+common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
CFLAGS_e500_mmu.o := -I.
CFLAGS_e500_mmu_host.o := -I.
CFLAGS_emulate.o := -I.
CFLAGS_emulate_loadstore.o := -I.
-common-objs-y += powerpc.o emulate.o emulate_loadstore.o
+common-objs-y += powerpc.o emulate_loadstore.o
obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
@@ -24,6 +24,7 @@ AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
kvm-e500-objs := \
$(common-objs-y) \
+ emulate.o \
booke.o \
booke_emulate.o \
booke_interrupts.o \
@@ -35,6 +36,7 @@ kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
kvm-e500mc-objs := \
$(common-objs-y) \
+ emulate.o \
booke.o \
booke_emulate.o \
bookehv_interrupts.o \
@@ -61,9 +63,6 @@ kvm-pr-y := \
book3s_32_mmu.o
ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-kvm-book3s_64-module-objs := \
- $(KVM)/coalesced_mmio.o
-
kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
book3s_rmhandlers.o
endif
@@ -78,6 +77,7 @@ kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+ book3s_hv_hmi.o \
book3s_hv_rmhandlers.o \
book3s_hv_rm_mmu.o \
book3s_hv_ras.o \
@@ -88,11 +88,8 @@ endif
kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
book3s_xics.o
-kvm-book3s_64-module-objs += \
- $(KVM)/kvm_main.o \
- $(KVM)/eventfd.o \
- powerpc.o \
- emulate_loadstore.o \
+kvm-book3s_64-module-objs := \
+ $(common-objs-y) \
book3s.o \
book3s_64_vio.o \
book3s_rtas.o \
@@ -102,6 +99,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
kvm-book3s_32-objs := \
$(common-objs-y) \
+ emulate.o \
fpu.o \
book3s_paired_singles.o \
book3s.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 47018fcbf7d6..ba231a1d43d4 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "dec", VCPU_STAT(dec_exits) },
{ "ext_intr", VCPU_STAT(ext_intr_exits) },
{ "queue_intr", VCPU_STAT(queue_intr) },
+ { "halt_poll_success_ns", VCPU_STAT(halt_poll_success_ns) },
+ { "halt_poll_fail_ns", VCPU_STAT(halt_poll_fail_ns) },
+ { "halt_wait_ns", VCPU_STAT(halt_wait_ns) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
+ { "halt_successful_wait", VCPU_STAT(halt_successful_wait) },
{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "pf_storage", VCPU_STAT(pf_storage) },
@@ -64,6 +68,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "ld_slow", VCPU_STAT(ld_slow) },
{ "st", VCPU_STAT(st) },
{ "st_slow", VCPU_STAT(st_slow) },
+ { "pthru_all", VCPU_STAT(pthru_all) },
+ { "pthru_host", VCPU_STAT(pthru_host) },
+ { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) },
{ NULL }
};
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2fd5580c8f6e..9b3bba643b43 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -53,11 +53,15 @@
#include <asm/smp.h>
#include <asm/dbell.h>
#include <asm/hmi.h>
+#include <asm/pnv-pci.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
#include <linux/module.h>
+#include <linux/compiler.h>
#include "book3s.h"
@@ -70,6 +74,8 @@
/* Used to indicate that a guest page fault needs to be handled */
#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1)
+/* Used to indicate that a guest passthrough interrupt needs to be handled */
+#define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2)
/* Used as a "null" value for timebase values */
#define TB_NIL (~(u64)0)
@@ -89,14 +95,55 @@ static struct kernel_param_ops module_param_ops = {
.get = param_get_int,
};
+module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
+ S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
+
module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif
+/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
+static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
+module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
+
+/* Factor by which the vcore halt poll interval is grown, default is to double
+ */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
+
+/* Factor by which the vcore halt poll interval is shrunk, default is to reset
+ */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
+
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
+ int *ip)
+{
+ int i = *ip;
+ struct kvm_vcpu *vcpu;
+
+ while (++i < MAX_SMT_THREADS) {
+ vcpu = READ_ONCE(vc->runnable_threads[i]);
+ if (vcpu) {
+ *ip = i;
+ return vcpu;
+ }
+ }
+ return NULL;
+}
+
+/* Used to traverse the list of runnable threads for a given vcore */
+#define for_each_runnable_thread(i, vcpu, vc) \
+ for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
+
static bool kvmppc_ipi_thread(int cpu)
{
/* On POWER8 for IPIs to threads in the same core, use msgsnd */
@@ -991,6 +1038,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
break;
+ case BOOK3S_INTERRUPT_HV_RM_HARD:
+ r = RESUME_PASSTHROUGH;
+ break;
default:
kvmppc_dump_regs(vcpu);
printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
@@ -1493,7 +1543,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
if (vcore == NULL)
return NULL;
- INIT_LIST_HEAD(&vcore->runnable_threads);
spin_lock_init(&vcore->lock);
spin_lock_init(&vcore->stoltb_lock);
init_swait_queue_head(&vcore->wq);
@@ -1802,7 +1851,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
spin_unlock_irq(&vcpu->arch.tbacct_lock);
--vc->n_runnable;
- list_del(&vcpu->arch.run_list);
+ WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
}
static int kvmppc_grab_hwthread(int cpu)
@@ -2209,10 +2258,10 @@ static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
static void prepare_threads(struct kvmppc_vcore *vc)
{
- struct kvm_vcpu *vcpu, *vnext;
+ int i;
+ struct kvm_vcpu *vcpu;
- list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, vcpu, vc) {
if (signal_pending(vcpu->arch.run_task))
vcpu->arch.ret = -EINTR;
else if (vcpu->arch.vpa.update_pending ||
@@ -2259,15 +2308,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
{
- int still_running = 0;
+ int still_running = 0, i;
u64 now;
long ret;
- struct kvm_vcpu *vcpu, *vnext;
+ struct kvm_vcpu *vcpu;
spin_lock(&vc->lock);
now = get_tb();
- list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, vcpu, vc) {
/* cancel pending dec exception if dec is positive */
if (now < vcpu->arch.dec_expires &&
kvmppc_core_pending_dec(vcpu))
@@ -2307,8 +2355,8 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
}
if (vc->n_runnable > 0 && vc->runner == NULL) {
/* make sure there's a candidate runner awake */
- vcpu = list_first_entry(&vc->runnable_threads,
- struct kvm_vcpu, arch.run_list);
+ i = -1;
+ vcpu = next_runnable_thread(vc, &i);
wake_up(&vcpu->arch.cpu_run);
}
}
@@ -2361,7 +2409,7 @@ static inline void kvmppc_set_host_core(int cpu)
*/
static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
{
- struct kvm_vcpu *vcpu, *vnext;
+ struct kvm_vcpu *vcpu;
int i;
int srcu_idx;
struct core_info core_info;
@@ -2397,8 +2445,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
*/
if ((threads_per_core > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
- list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
kvmppc_remove_runnable(vc, vcpu);
wake_up(&vcpu->arch.cpu_run);
@@ -2477,8 +2524,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
active |= 1 << thr;
list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
pvc->pcpu = pcpu + thr;
- list_for_each_entry(vcpu, &pvc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, vcpu, pvc) {
kvmppc_start_thread(vcpu, pvc);
kvmppc_create_dtl_entry(vcpu, pvc);
trace_kvm_guest_enter(vcpu);
@@ -2604,34 +2650,92 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
finish_wait(&vcpu->arch.cpu_run, &wait);
}
+static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+ /* 10us base */
+ if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
+ vc->halt_poll_ns = 10000;
+ else
+ vc->halt_poll_ns *= halt_poll_ns_grow;
+
+ if (vc->halt_poll_ns > halt_poll_max_ns)
+ vc->halt_poll_ns = halt_poll_max_ns;
+}
+
+static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+ if (halt_poll_ns_shrink == 0)
+ vc->halt_poll_ns = 0;
+ else
+ vc->halt_poll_ns /= halt_poll_ns_shrink;
+}
+
+/* Check to see if any of the runnable vcpus on the vcore have pending
+ * exceptions or are no longer ceded
+ */
+static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ for_each_runnable_thread(i, vcpu, vc) {
+ if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+ return 1;
+ }
+
+ return 0;
+}
+
/*
* All the vcpus in this vcore are idle, so wait for a decrementer
* or external interrupt to one of the vcpus. vc->lock is held.
*/
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
- struct kvm_vcpu *vcpu;
+ ktime_t cur, start_poll, start_wait;
int do_sleep = 1;
+ u64 block_ns;
DECLARE_SWAITQUEUE(wait);
- prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+ /* Poll for pending exceptions and ceded state */
+ cur = start_poll = ktime_get();
+ if (vc->halt_poll_ns) {
+ ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
+ ++vc->runner->stat.halt_attempted_poll;
- /*
- * Check one last time for pending exceptions and ceded state after
- * we put ourselves on the wait queue
- */
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
- if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
- do_sleep = 0;
- break;
+ vc->vcore_state = VCORE_POLLING;
+ spin_unlock(&vc->lock);
+
+ do {
+ if (kvmppc_vcore_check_block(vc)) {
+ do_sleep = 0;
+ break;
+ }
+ cur = ktime_get();
+ } while (single_task_running() && ktime_before(cur, stop));
+
+ spin_lock(&vc->lock);
+ vc->vcore_state = VCORE_INACTIVE;
+
+ if (!do_sleep) {
+ ++vc->runner->stat.halt_successful_poll;
+ goto out;
}
}
- if (!do_sleep) {
+ prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+
+ if (kvmppc_vcore_check_block(vc)) {
finish_swait(&vc->wq, &wait);
- return;
+ do_sleep = 0;
+ /* If we polled, count this as a successful poll */
+ if (vc->halt_poll_ns)
+ ++vc->runner->stat.halt_successful_poll;
+ goto out;
}
+ start_wait = ktime_get();
+
vc->vcore_state = VCORE_SLEEPING;
trace_kvmppc_vcore_blocked(vc, 0);
spin_unlock(&vc->lock);
@@ -2640,13 +2744,52 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
spin_lock(&vc->lock);
vc->vcore_state = VCORE_INACTIVE;
trace_kvmppc_vcore_blocked(vc, 1);
+ ++vc->runner->stat.halt_successful_wait;
+
+ cur = ktime_get();
+
+out:
+ block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
+
+ /* Attribute wait time */
+ if (do_sleep) {
+ vc->runner->stat.halt_wait_ns +=
+ ktime_to_ns(cur) - ktime_to_ns(start_wait);
+ /* Attribute failed poll time */
+ if (vc->halt_poll_ns)
+ vc->runner->stat.halt_poll_fail_ns +=
+ ktime_to_ns(start_wait) -
+ ktime_to_ns(start_poll);
+ } else {
+ /* Attribute successful poll time */
+ if (vc->halt_poll_ns)
+ vc->runner->stat.halt_poll_success_ns +=
+ ktime_to_ns(cur) -
+ ktime_to_ns(start_poll);
+ }
+
+ /* Adjust poll time */
+ if (halt_poll_max_ns) {
+ if (block_ns <= vc->halt_poll_ns)
+ ;
+ /* We slept and blocked for longer than the max halt time */
+ else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
+ shrink_halt_poll_ns(vc);
+ /* We slept and our poll time is too small */
+ else if (vc->halt_poll_ns < halt_poll_max_ns &&
+ block_ns < halt_poll_max_ns)
+ grow_halt_poll_ns(vc);
+ } else
+ vc->halt_poll_ns = 0;
+
+ trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
- int n_ceded;
+ int n_ceded, i;
struct kvmppc_vcore *vc;
- struct kvm_vcpu *v, *vn;
+ struct kvm_vcpu *v;
trace_kvmppc_run_vcpu_enter(vcpu);
@@ -2666,7 +2809,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
vcpu->arch.busy_preempt = TB_NIL;
- list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+ WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
++vc->n_runnable;
/*
@@ -2706,8 +2849,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
continue;
}
- list_for_each_entry_safe(v, vn, &vc->runnable_threads,
- arch.run_list) {
+ for_each_runnable_thread(i, v, vc) {
kvmppc_core_prepare_to_enter(v);
if (signal_pending(v->arch.run_task)) {
kvmppc_remove_runnable(vc, v);
@@ -2720,7 +2862,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
break;
n_ceded = 0;
- list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+ for_each_runnable_thread(i, v, vc) {
if (!v->arch.pending_exceptions)
n_ceded += v->arch.ceded;
else
@@ -2759,8 +2901,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
/* Wake up some vcpu to run the core */
- v = list_first_entry(&vc->runnable_threads,
- struct kvm_vcpu, arch.run_list);
+ i = -1;
+ v = next_runnable_thread(vc, &i);
wake_up(&v->arch.cpu_run);
}
@@ -2818,7 +2960,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
r = kvmppc_book3s_hv_page_fault(run, vcpu,
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
- }
+ } else if (r == RESUME_PASSTHROUGH)
+ r = kvmppc_xics_rm_complete(vcpu, 0);
} while (is_kvmppc_resume_guest(r));
out:
@@ -3247,6 +3390,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
kvmppc_free_vcores(kvm);
kvmppc_free_hpt(kvm);
+
+ kvmppc_free_pimap(kvm);
}
/* We don't need to emulate any privileged instructions or dcbz */
@@ -3282,6 +3427,184 @@ static int kvmppc_core_check_processor_compat_hv(void)
return 0;
}
+#ifdef CONFIG_KVM_XICS
+
+void kvmppc_free_pimap(struct kvm *kvm)
+{
+ kfree(kvm->arch.pimap);
+}
+
+static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+{
+ return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
+}
+
+static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+ struct irq_desc *desc;
+ struct kvmppc_irq_map *irq_map;
+ struct kvmppc_passthru_irqmap *pimap;
+ struct irq_chip *chip;
+ int i;
+
+ if (!kvm_irq_bypass)
+ return 1;
+
+ desc = irq_to_desc(host_irq);
+ if (!desc)
+ return -EIO;
+
+ mutex_lock(&kvm->lock);
+
+ pimap = kvm->arch.pimap;
+ if (pimap == NULL) {
+ /* First call, allocate structure to hold IRQ map */
+ pimap = kvmppc_alloc_pimap();
+ if (pimap == NULL) {
+ mutex_unlock(&kvm->lock);
+ return -ENOMEM;
+ }
+ kvm->arch.pimap = pimap;
+ }
+
+ /*
+ * For now, we only support interrupts for which the EOI operation
+ * is an OPAL call followed by a write to XIRR, since that's
+ * what our real-mode EOI code does.
+ */
+ chip = irq_data_get_irq_chip(&desc->irq_data);
+ if (!chip || !is_pnv_opal_msi(chip)) {
+ pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
+ host_irq, guest_gsi);
+ mutex_unlock(&kvm->lock);
+ return -ENOENT;
+ }
+
+ /*
+ * See if we already have an entry for this guest IRQ number.
+ * If it's mapped to a hardware IRQ number, that's an error,
+ * otherwise re-use this entry.
+ */
+ for (i = 0; i < pimap->n_mapped; i++) {
+ if (guest_gsi == pimap->mapped[i].v_hwirq) {
+ if (pimap->mapped[i].r_hwirq) {
+ mutex_unlock(&kvm->lock);
+ return -EINVAL;
+ }
+ break;
+ }
+ }
+
+ if (i == KVMPPC_PIRQ_MAPPED) {
+ mutex_unlock(&kvm->lock);
+ return -EAGAIN; /* table is full */
+ }
+
+ irq_map = &pimap->mapped[i];
+
+ irq_map->v_hwirq = guest_gsi;
+ irq_map->desc = desc;
+
+ /*
+ * Order the above two stores before the next to serialize with
+ * the KVM real mode handler.
+ */
+ smp_wmb();
+ irq_map->r_hwirq = desc->irq_data.hwirq;
+
+ if (i == pimap->n_mapped)
+ pimap->n_mapped++;
+
+ kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+
+ mutex_unlock(&kvm->lock);
+
+ return 0;
+}
+
+static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+ struct irq_desc *desc;
+ struct kvmppc_passthru_irqmap *pimap;
+ int i;
+
+ if (!kvm_irq_bypass)
+ return 0;
+
+ desc = irq_to_desc(host_irq);
+ if (!desc)
+ return -EIO;
+
+ mutex_lock(&kvm->lock);
+
+ if (kvm->arch.pimap == NULL) {
+ mutex_unlock(&kvm->lock);
+ return 0;
+ }
+ pimap = kvm->arch.pimap;
+
+ for (i = 0; i < pimap->n_mapped; i++) {
+ if (guest_gsi == pimap->mapped[i].v_hwirq)
+ break;
+ }
+
+ if (i == pimap->n_mapped) {
+ mutex_unlock(&kvm->lock);
+ return -ENODEV;
+ }
+
+ kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+
+ /* invalidate the entry */
+ pimap->mapped[i].r_hwirq = 0;
+
+ /*
+ * We don't free this structure even when the count goes to
+ * zero. The structure is freed when we destroy the VM.
+ */
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret = 0;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ irqfd->producer = prod;
+
+ ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+ if (ret)
+ pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
+ prod->irq, irqfd->gsi, ret);
+
+ return ret;
+}
+
+static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ irqfd->producer = NULL;
+
+ /*
+ * When producer of consumer is unregistered, we change back to
+ * default external interrupt handling mode - KVM real mode
+ * will switch back to host.
+ */
+ ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+ if (ret)
+ pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
+ prod->irq, irqfd->gsi, ret);
+}
+#endif
+
static long kvm_arch_vm_ioctl_hv(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -3400,6 +3723,10 @@ static struct kvmppc_ops kvm_ops_hv = {
.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
.arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
.hcall_implemented = kvmppc_hcall_impl_hv,
+#ifdef CONFIG_KVM_XICS
+ .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
+ .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
+#endif
};
static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5f0380db3eab..0c84d6bc8356 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,7 @@
#include <asm/xics.h>
#include <asm/dbell.h>
#include <asm/cputhreads.h>
+#include <asm/io.h>
#define KVM_CMA_CHUNK_ORDER 18
@@ -286,3 +287,158 @@ void kvmhv_commence_exit(int trap)
struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
+
+#ifdef CONFIG_KVM_XICS
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+ u32 xisr)
+{
+ int i;
+
+ /*
+ * We access the mapped array here without a lock. That
+ * is safe because we never reduce the number of entries
+ * in the array and we never change the v_hwirq field of
+ * an entry once it is set.
+ *
+ * We have also carefully ordered the stores in the writer
+ * and the loads here in the reader, so that if we find a matching
+ * hwirq here, the associated GSI and irq_desc fields are valid.
+ */
+ for (i = 0; i < pimap->n_mapped; i++) {
+ if (xisr == pimap->mapped[i].r_hwirq) {
+ /*
+ * Order subsequent reads in the caller to serialize
+ * with the writer.
+ */
+ smp_rmb();
+ return &pimap->mapped[i];
+ }
+ }
+ return NULL;
+}
+
+/*
+ * If we have an interrupt that's not an IPI, check if we have a
+ * passthrough adapter and if so, check if this external interrupt
+ * is for the adapter.
+ * We will attempt to deliver the IRQ directly to the target VCPU's
+ * ICP, the virtual ICP (based on affinity - the xive value in ICS).
+ *
+ * If the delivery fails or if this is not for a passthrough adapter,
+ * return to the host to handle this interrupt. We earlier
+ * saved a copy of the XIRR in the PACA, it will be picked up by
+ * the host ICP driver.
+ */
+static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+ struct kvmppc_passthru_irqmap *pimap;
+ struct kvmppc_irq_map *irq_map;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = local_paca->kvm_hstate.kvm_vcpu;
+ if (!vcpu)
+ return 1;
+ pimap = kvmppc_get_passthru_irqmap(vcpu->kvm);
+ if (!pimap)
+ return 1;
+ irq_map = get_irqmap(pimap, xisr);
+ if (!irq_map)
+ return 1;
+
+ /* We're handling this interrupt, generic code doesn't need to */
+ local_paca->kvm_hstate.saved_xirr = 0;
+
+ return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap);
+}
+
+#else
+static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+ return 1;
+}
+#endif
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ * 0 if no interrupt is pending
+ * 1 if an interrupt is pending that needs to be handled by the host
+ * 2 Passthrough that needs completion in the host
+ * -1 if there was a guest wakeup IPI (which has now been cleared)
+ * -2 if there is PCI passthrough external interrupt that was handled
+ */
+
+long kvmppc_read_intr(void)
+{
+ unsigned long xics_phys;
+ u32 h_xirr;
+ __be32 xirr;
+ u32 xisr;
+ u8 host_ipi;
+
+ /* see if a host IPI is pending */
+ host_ipi = local_paca->kvm_hstate.host_ipi;
+ if (host_ipi)
+ return 1;
+
+ /* Now read the interrupt from the ICP */
+ xics_phys = local_paca->kvm_hstate.xics_phys;
+ if (unlikely(!xics_phys))
+ return 1;
+
+ /*
+ * Save XIRR for later. Since we get control in reverse endian
+ * on LE systems, save it byte reversed and fetch it back in
+ * host endian. Note that xirr is the value read from the
+ * XIRR register, while h_xirr is the host endian version.
+ */
+ xirr = _lwzcix(xics_phys + XICS_XIRR);
+ h_xirr = be32_to_cpu(xirr);
+ local_paca->kvm_hstate.saved_xirr = h_xirr;
+ xisr = h_xirr & 0xffffff;
+ /*
+ * Ensure that the store/load complete to guarantee all side
+ * effects of loading from XIRR has completed
+ */
+ smp_mb();
+
+ /* if nothing pending in the ICP */
+ if (!xisr)
+ return 0;
+
+ /* We found something in the ICP...
+ *
+ * If it is an IPI, clear the MFRR and EOI it.
+ */
+ if (xisr == XICS_IPI) {
+ _stbcix(xics_phys + XICS_MFRR, 0xff);
+ _stwcix(xics_phys + XICS_XIRR, xirr);
+ /*
+ * Need to ensure side effects of above stores
+ * complete before proceeding.
+ */
+ smp_mb();
+
+ /*
+ * We need to re-check host IPI now in case it got set in the
+ * meantime. If it's clear, we bounce the interrupt to the
+ * guest
+ */
+ host_ipi = local_paca->kvm_hstate.host_ipi;
+ if (unlikely(host_ipi != 0)) {
+ /* We raced with the host,
+ * we need to resend that IPI, bummer
+ */
+ _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+ /* Let side effects complete */
+ smp_mb();
+ return 1;
+ }
+
+ /* OK, it's an IPI for us */
+ local_paca->kvm_hstate.saved_xirr = 0;
+ return -1;
+ }
+
+ return kvmppc_check_passthru(xisr, xirr);
+}
diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
index e3f738eb1cac..e3f738eb1cac 100644
--- a/arch/powerpc/kernel/hmi.c
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 980d8a6f7284..82ff5de8b1e7 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -10,6 +10,7 @@
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/err.h>
+#include <linux/kernel_stat.h>
#include <asm/kvm_book3s.h>
#include <asm/kvm_ppc.h>
@@ -18,7 +19,10 @@
#include <asm/debug.h>
#include <asm/synch.h>
#include <asm/cputhreads.h>
+#include <asm/pgtable.h>
#include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
#include "book3s_xics.h"
@@ -26,9 +30,12 @@
int h_ipi_redirect = 1;
EXPORT_SYMBOL(h_ipi_redirect);
+int kvm_irq_bypass = 1;
+EXPORT_SYMBOL(kvm_irq_bypass);
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
u32 new_irq);
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
/* -- ICS routines -- */
static void ics_rm_check_resend(struct kvmppc_xics *xics,
@@ -708,10 +715,123 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
icp->rm_action |= XICS_RM_NOTIFY_EOI;
icp->rm_eoied_irq = irq;
}
+
+ if (state->host_irq) {
+ ++vcpu->stat.pthru_all;
+ if (state->intr_cpu != -1) {
+ int pcpu = raw_smp_processor_id();
+
+ pcpu = cpu_first_thread_sibling(pcpu);
+ ++vcpu->stat.pthru_host;
+ if (state->intr_cpu != pcpu) {
+ ++vcpu->stat.pthru_bad_aff;
+ xics_opal_rm_set_server(state->host_irq, pcpu);
+ }
+ state->intr_cpu = -1;
+ }
+ }
bail:
return check_too_hard(xics, icp);
}
+unsigned long eoi_rc;
+
+static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
+{
+ unsigned long xics_phys;
+ int64_t rc;
+
+ rc = pnv_opal_pci_msi_eoi(c, hwirq);
+
+ if (rc)
+ eoi_rc = rc;
+
+ iosync();
+
+ /* EOI it */
+ xics_phys = local_paca->kvm_hstate.xics_phys;
+ _stwcix(xics_phys + XICS_XIRR, xirr);
+}
+
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
+{
+ unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
+
+ return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+}
+
+/*
+ * Increment a per-CPU 32-bit unsigned integer variable.
+ * Safe to call in real-mode. Handles vmalloc'ed addresses
+ *
+ * ToDo: Make this work for any integral type
+ */
+
+static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
+{
+ unsigned long l;
+ unsigned int *raddr;
+ int cpu = smp_processor_id();
+
+ raddr = per_cpu_ptr(addr, cpu);
+ l = (unsigned long)raddr;
+
+ if (REGION_ID(l) == VMALLOC_REGION_ID) {
+ l = vmalloc_to_phys(raddr);
+ raddr = (unsigned int *)l;
+ }
+ ++*raddr;
+}
+
+/*
+ * We don't try to update the flags in the irq_desc 'istate' field in
+ * here as would happen in the normal IRQ handling path for several reasons:
+ * - state flags represent internal IRQ state and are not expected to be
+ * updated outside the IRQ subsystem
+ * - more importantly, these are useful for edge triggered interrupts,
+ * IRQ probing, etc., but we are only handling MSI/MSIx interrupts here
+ * and these states shouldn't apply to us.
+ *
+ * However, we do update irq_stats - we somewhat duplicate the code in
+ * kstat_incr_irqs_this_cpu() for this since this function is defined
+ * in irq/internal.h which we don't want to include here.
+ * The only difference is that desc->kstat_irqs is an allocated per CPU
+ * variable and could have been vmalloc'ed, so we can't directly
+ * call __this_cpu_inc() on it. The kstat structure is a static
+ * per CPU variable and it should be accessible by real-mode KVM.
+ *
+ */
+static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
+{
+ this_cpu_inc_rm(desc->kstat_irqs);
+ __this_cpu_inc(kstat.irqs_sum);
+}
+
+long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
+ u32 xirr,
+ struct kvmppc_irq_map *irq_map,
+ struct kvmppc_passthru_irqmap *pimap)
+{
+ struct kvmppc_xics *xics;
+ struct kvmppc_icp *icp;
+ u32 irq;
+
+ irq = irq_map->v_hwirq;
+ xics = vcpu->kvm->arch.xics;
+ icp = vcpu->arch.icp;
+
+ kvmppc_rm_handle_irq_desc(irq_map->desc);
+ icp_rm_deliver_irq(xics, icp, irq);
+
+ /* EOI the interrupt */
+ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
+
+ if (check_too_hard(xics, icp) == H_TOO_HARD)
+ return 2;
+ else
+ return -2;
+}
+
/* --- Non-real mode XICS-related built-in routines --- */
/**
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 975655573844..7cc924b5eea2 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -221,6 +221,13 @@ kvmppc_primary_no_guest:
li r3, 0 /* Don't wake on privileged (OS) doorbell */
b kvm_do_nap
+/*
+ * kvm_novcpu_wakeup
+ * Entered from kvm_start_guest if kvm_hstate.napping is set
+ * to NAPPING_NOVCPU
+ * r2 = kernel TOC
+ * r13 = paca
+ */
kvm_novcpu_wakeup:
ld r1, HSTATE_HOST_R1(r13)
ld r5, HSTATE_KVM_VCORE(r13)
@@ -230,6 +237,13 @@ kvm_novcpu_wakeup:
/* check the wake reason */
bl kvmppc_check_wake_reason
+ /*
+ * Restore volatile registers since we could have called
+ * a C routine in kvmppc_check_wake_reason.
+ * r5 = VCORE
+ */
+ ld r5, HSTATE_KVM_VCORE(r13)
+
/* see if any other thread is already exiting */
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
@@ -322,6 +336,11 @@ kvm_start_guest:
/* Check the wake reason in SRR1 to see why we got here */
bl kvmppc_check_wake_reason
+ /*
+ * kvmppc_check_wake_reason could invoke a C routine, but we
+ * have no volatile registers to restore when we return.
+ */
+
cmpdi r3, 0
bge kvm_no_guest
@@ -881,6 +900,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
cmpwi r3, 512 /* 1 microsecond */
blt hdec_soon
+deliver_guest_interrupt:
ld r6, VCPU_CTR(r4)
ld r7, VCPU_XER(r4)
@@ -895,7 +915,6 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
mtspr SPRN_SRR0, r6
mtspr SPRN_SRR1, r7
-deliver_guest_interrupt:
/* r11 = vcpu->arch.msr & ~MSR_HV */
rldicl r11, r11, 63 - MSR_HV_LG, 1
rotldi r11, r11, 1 + MSR_HV_LG
@@ -1155,10 +1174,54 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
* set, we know the host wants us out so let's do it now
*/
bl kvmppc_read_intr
+
+ /*
+ * Restore the active volatile registers after returning from
+ * a C function.
+ */
+ ld r9, HSTATE_KVM_VCPU(r13)
+ li r12, BOOK3S_INTERRUPT_EXTERNAL
+
+ /*
+ * kvmppc_read_intr return codes:
+ *
+ * Exit to host (r3 > 0)
+ * 1 An interrupt is pending that needs to be handled by the host
+ * Exit guest and return to host by branching to guest_exit_cont
+ *
+ * 2 Passthrough that needs completion in the host
+ * Exit guest and return to host by branching to guest_exit_cont
+ * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+ * to indicate to the host to complete handling the interrupt
+ *
+ * Before returning to guest, we check if any CPU is heading out
+ * to the host and if so, we head out also. If no CPUs are heading
+ * check return values <= 0.
+ *
+ * Return to guest (r3 <= 0)
+ * 0 No external interrupt is pending
+ * -1 A guest wakeup IPI (which has now been cleared)
+ * In either case, we return to guest to deliver any pending
+ * guest interrupts.
+ *
+ * -2 A PCI passthrough external interrupt was handled
+ * (interrupt was delivered directly to guest)
+ * Return to guest to deliver any pending guest interrupts.
+ */
+
+ cmpdi r3, 1
+ ble 1f
+
+ /* Return code = 2 */
+ li r12, BOOK3S_INTERRUPT_HV_RM_HARD
+ stw r12, VCPU_TRAP(r9)
+ b guest_exit_cont
+
+1: /* Return code <= 1 */
cmpdi r3, 0
bgt guest_exit_cont
- /* Check if any CPU is heading out to the host, if so head out too */
+ /* Return code <= 0 */
4: ld r5, HSTATE_KVM_VCORE(r13)
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
@@ -2213,10 +2276,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
ld r29, VCPU_GPR(R29)(r4)
ld r30, VCPU_GPR(R30)(r4)
ld r31, VCPU_GPR(R31)(r4)
-
+
/* Check the wake reason in SRR1 to see why we got here */
bl kvmppc_check_wake_reason
+ /*
+ * Restore volatile registers since we could have called a
+ * C routine in kvmppc_check_wake_reason
+ * r4 = VCPU
+ * r3 tells us whether we need to return to host or not
+ * WARNING: it gets checked further down:
+ * should not modify r3 until this check is done.
+ */
+ ld r4, HSTATE_KVM_VCPU(r13)
+
/* clear our bit in vcore->napping_threads */
34: ld r5,HSTATE_KVM_VCORE(r13)
lbz r7,HSTATE_PTID(r13)
@@ -2230,7 +2303,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
li r0,0
stb r0,HSTATE_NAPPING(r13)
- /* See if the wake reason means we need to exit */
+ /* See if the wake reason saved in r3 means we need to exit */
stw r12, VCPU_TRAP(r4)
mr r9, r4
cmpdi r3, 0
@@ -2297,10 +2370,14 @@ machine_check_realmode:
* 0 if nothing needs to be done
* 1 if something happened that needs to be handled by the host
* -1 if there was a guest wakeup (IPI or msgsnd)
+ * -2 if we handled a PCI passthrough interrupt (returned by
+ * kvmppc_read_intr only)
*
* Also sets r12 to the interrupt vector for any interrupt that needs
* to be handled now by the host (0x500 for external interrupt), or zero.
- * Modifies r0, r6, r7, r8.
+ * Modifies all volatile registers (since it may call a C function).
+ * This routine calls kvmppc_read_intr, a C function, if an external
+ * interrupt is pending.
*/
kvmppc_check_wake_reason:
mfspr r6, SPRN_SRR1
@@ -2310,8 +2387,7 @@ FTR_SECTION_ELSE
rlwinm r6, r6, 45-31, 0xe /* P7 wake reason field is 3 bits */
ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
cmpwi r6, 8 /* was it an external interrupt? */
- li r12, BOOK3S_INTERRUPT_EXTERNAL
- beq kvmppc_read_intr /* if so, see what it was */
+ beq 7f /* if so, see what it was */
li r3, 0
li r12, 0
cmpwi r6, 6 /* was it the decrementer? */
@@ -2350,83 +2426,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
blr
-/*
- * Determine what sort of external interrupt is pending (if any).
- * Returns:
- * 0 if no interrupt is pending
- * 1 if an interrupt is pending that needs to be handled by the host
- * -1 if there was a guest wakeup IPI (which has now been cleared)
- * Modifies r0, r6, r7, r8, returns value in r3.
- */
-kvmppc_read_intr:
- /* see if a host IPI is pending */
- li r3, 1
- lbz r0, HSTATE_HOST_IPI(r13)
- cmpwi r0, 0
- bne 1f
+ /* external interrupt - create a stack frame so we can call C */
+7: mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+ stdu r1, -PPC_MIN_STKFRM(r1)
+ bl kvmppc_read_intr
+ nop
+ li r12, BOOK3S_INTERRUPT_EXTERNAL
+ cmpdi r3, 1
+ ble 1f
- /* Now read the interrupt from the ICP */
- ld r6, HSTATE_XICS_PHYS(r13)
- li r7, XICS_XIRR
- cmpdi r6, 0
- beq- 1f
- lwzcix r0, r6, r7
/*
- * Save XIRR for later. Since we get in in reverse endian on LE
- * systems, save it byte reversed and fetch it back in host endian.
- */
- li r3, HSTATE_SAVED_XIRR
- STWX_BE r0, r3, r13
-#ifdef __LITTLE_ENDIAN__
- lwz r3, HSTATE_SAVED_XIRR(r13)
-#else
- mr r3, r0
-#endif
- rlwinm. r3, r3, 0, 0xffffff
- sync
- beq 1f /* if nothing pending in the ICP */
-
- /* We found something in the ICP...
- *
- * If it's not an IPI, stash it in the PACA and return to
- * the host, we don't (yet) handle directing real external
- * interrupts directly to the guest
+ * Return code of 2 means PCI passthrough interrupt, but
+ * we need to return back to host to complete handling the
+ * interrupt. Trap reason is expected in r12 by guest
+ * exit code.
*/
- cmpwi r3, XICS_IPI /* if there is, is it an IPI? */
- bne 42f
-
- /* It's an IPI, clear the MFRR and EOI it */
- li r3, 0xff
- li r8, XICS_MFRR
- stbcix r3, r6, r8 /* clear the IPI */
- stwcix r0, r6, r7 /* EOI it */
- sync
-
- /* We need to re-check host IPI now in case it got set in the
- * meantime. If it's clear, we bounce the interrupt to the
- * guest
- */
- lbz r0, HSTATE_HOST_IPI(r13)
- cmpwi r0, 0
- bne- 43f
-
- /* OK, it's an IPI for us */
- li r12, 0
- li r3, -1
-1: blr
-
-42: /* It's not an IPI and it's for the host. We saved a copy of XIRR in
- * the PACA earlier, it will be picked up by the host ICP driver
- */
- li r3, 1
- b 1b
-
-43: /* We raced with the host, we need to resend that IPI, bummer */
- li r0, IPI_PRIORITY
- stbcix r0, r6, r8 /* set the IPI */
- sync
- li r3, 1
- b 1b
+ li r12, BOOK3S_INTERRUPT_HV_RM_HARD
+1:
+ ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
+ addi r1, r1, PPC_MIN_STKFRM
+ mtlr r0
+ blr
/*
* Save away FP, VMX and VSX registers.
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 05aa11399a78..3bdc639157c1 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -99,6 +99,10 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
return 0;
}
+ /* Record which CPU this arrived on for passed-through interrupts */
+ if (state->host_irq)
+ state->intr_cpu = raw_smp_processor_id();
+
/* Attempt delivery */
icp_deliver_irq(xics, NULL, irq);
@@ -812,7 +816,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
return H_SUCCESS;
}
-static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
{
struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
struct kvmppc_icp *icp = vcpu->arch.icp;
@@ -841,6 +845,7 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
return H_SUCCESS;
}
+EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete);
int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
{
@@ -892,6 +897,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
/* -- Initialisation code etc. -- */
+static void xics_debugfs_irqmap(struct seq_file *m,
+ struct kvmppc_passthru_irqmap *pimap)
+{
+ int i;
+
+ if (!pimap)
+ return;
+ seq_printf(m, "========\nPIRQ mappings: %d maps\n===========\n",
+ pimap->n_mapped);
+ for (i = 0; i < pimap->n_mapped; i++) {
+ seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n",
+ pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq);
+ }
+}
+
static int xics_debug_show(struct seq_file *m, void *private)
{
struct kvmppc_xics *xics = m->private;
@@ -913,6 +933,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
t_check_resend = 0;
t_reject = 0;
+ xics_debugfs_irqmap(m, kvm->arch.pimap);
+
seq_printf(m, "=========\nICP state\n=========\n");
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1252,6 +1274,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
{
struct kvmppc_xics *xics = kvm->arch.xics;
+ if (!xics)
+ return -ENODEV;
return ics_deliver_irq(xics, irq, level);
}
@@ -1418,3 +1442,34 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
return pin;
}
+
+void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
+ unsigned long host_irq)
+{
+ struct kvmppc_xics *xics = kvm->arch.xics;
+ struct kvmppc_ics *ics;
+ u16 idx;
+
+ ics = kvmppc_xics_find_ics(xics, irq, &idx);
+ if (!ics)
+ return;
+
+ ics->irq_state[idx].host_irq = host_irq;
+ ics->irq_state[idx].intr_cpu = -1;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped);
+
+void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq,
+ unsigned long host_irq)
+{
+ struct kvmppc_xics *xics = kvm->arch.xics;
+ struct kvmppc_ics *ics;
+ u16 idx;
+
+ ics = kvmppc_xics_find_ics(xics, irq, &idx);
+ if (!ics)
+ return;
+
+ ics->irq_state[idx].host_irq = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped);
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index a46b954055c4..2a50320b55ca 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -42,6 +42,8 @@ struct ics_irq_state {
u8 lsi; /* level-sensitive interrupt */
u8 asserted; /* Only for LSI */
u8 exists;
+ int intr_cpu;
+ u32 host_irq;
};
/* Atomic ICP state, updated with a single compare & swap */
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 29911a07bcdb..ddbf8f0284c0 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -743,7 +743,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
char *virt;
struct page **pages;
struct tlbe_priv *privs[2] = {};
- u64 *g2h_bitmap = NULL;
+ u64 *g2h_bitmap;
size_t array_len;
u32 sets;
int num_pages, ret, i;
@@ -779,41 +779,44 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
cfg->array / PAGE_SIZE;
- pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+ pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
if (ret < 0)
- goto err_pages;
+ goto free_pages;
if (ret != num_pages) {
num_pages = ret;
ret = -EFAULT;
- goto err_put_page;
+ goto put_pages;
}
virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
if (!virt) {
ret = -ENOMEM;
- goto err_put_page;
+ goto put_pages;
}
- privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
- GFP_KERNEL);
- privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
- GFP_KERNEL);
+ privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL);
+ if (!privs[0]) {
+ ret = -ENOMEM;
+ goto put_pages;
+ }
- if (!privs[0] || !privs[1]) {
+ privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL);
+ if (!privs[1]) {
ret = -ENOMEM;
- goto err_privs;
+ goto free_privs_first;
}
- g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
- GFP_KERNEL);
+ g2h_bitmap = kcalloc(params.tlb_sizes[1],
+ sizeof(*g2h_bitmap),
+ GFP_KERNEL);
if (!g2h_bitmap) {
ret = -ENOMEM;
- goto err_privs;
+ goto free_privs_second;
}
free_gtlb(vcpu_e500);
@@ -845,16 +848,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
kvmppc_recalc_tlb1map_range(vcpu_e500);
return 0;
-
-err_privs:
- kfree(privs[0]);
+ free_privs_second:
kfree(privs[1]);
-
-err_put_page:
+ free_privs_first:
+ kfree(privs[0]);
+ put_pages:
for (i = 0; i < num_pages; i++)
put_page(pages[i]);
-
-err_pages:
+ free_pages:
kfree(pages);
return ret;
}
@@ -904,11 +905,9 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
{
struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
- int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
- int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
if (e500_mmu_host_init(vcpu_e500))
- goto err;
+ goto free_vcpu;
vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -920,37 +919,39 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
vcpu_e500->gtlb_params[1].sets = 1;
- vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
+ vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE +
+ KVM_E500_TLB1_SIZE,
+ sizeof(*vcpu_e500->gtlb_arch),
+ GFP_KERNEL);
if (!vcpu_e500->gtlb_arch)
return -ENOMEM;
vcpu_e500->gtlb_offset[0] = 0;
vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
- vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
- vcpu_e500->gtlb_params[0].entries,
+ vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries,
+ sizeof(struct tlbe_ref),
GFP_KERNEL);
if (!vcpu_e500->gtlb_priv[0])
- goto err;
+ goto free_vcpu;
- vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
- vcpu_e500->gtlb_params[1].entries,
+ vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries,
+ sizeof(struct tlbe_ref),
GFP_KERNEL);
if (!vcpu_e500->gtlb_priv[1])
- goto err;
+ goto free_vcpu;
- vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
- vcpu_e500->gtlb_params[1].entries,
+ vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries,
+ sizeof(*vcpu_e500->g2h_tlb1_map),
GFP_KERNEL);
if (!vcpu_e500->g2h_tlb1_map)
- goto err;
+ goto free_vcpu;
vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
kvmppc_recalc_tlb1map_range(vcpu_e500);
return 0;
-
-err:
+ free_vcpu:
free_gtlb(vcpu_e500);
return -1;
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6ce40dd6fe51..70963c845e96 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -27,6 +27,8 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/module.h>
+#include <linux/irqbypass.h>
+#include <linux/kvm_irqfd.h>
#include <asm/cputable.h>
#include <asm/uaccess.h>
#include <asm/kvm_ppc.h>
@@ -436,6 +438,16 @@ err_out:
return -EINVAL;
}
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
unsigned int i;
@@ -739,6 +751,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
#endif
}
+/*
+ * irq_bypass_add_producer and irq_bypass_del_producer are only
+ * useful if the architecture supports PCI passthrough.
+ * irq_bypass_stop and irq_bypass_start are not needed and so
+ * kvm_ops are not defined for them.
+ */
+bool kvm_arch_has_irq_bypass(void)
+{
+ return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) ||
+ (kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer));
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+
+ if (kvm->arch.kvm_ops->irq_bypass_add_producer)
+ return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);
+
+ return 0;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+
+ if (kvm->arch.kvm_ops->irq_bypass_del_producer)
+ kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
+}
+
static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
struct kvm_run *run)
{
@@ -1167,6 +1215,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return r;
}
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_MPIC
+ if (kvm->arch.mpic)
+ return true;
+#endif
+#ifdef CONFIG_KVM_XICS
+ if (kvm->arch.xics)
+ return true;
+#endif
+ return false;
+}
+
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
index 33d9daff5783..fb21990c0fb4 100644
--- a/arch/powerpc/kvm/trace_hv.h
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
__entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
);
+TRACE_EVENT(kvmppc_vcore_wakeup,
+ TP_PROTO(int do_sleep, __u64 ns),
+
+ TP_ARGS(do_sleep, ns),
+
+ TP_STRUCT__entry(
+ __field(__u64, ns)
+ __field(int, waited)
+ __field(pid_t, tgid)
+ ),
+
+ TP_fast_assign(
+ __entry->ns = ns;
+ __entry->waited = do_sleep;
+ __entry->tgid = current->tgid;
+ ),
+
+ TP_printk("%s time %lld ns, tgid=%d",
+ __entry->waited ? "wait" : "poll",
+ __entry->ns, __entry->tgid)
+);
+
TRACE_EVENT(kvmppc_run_vcpu_enter,
TP_PROTO(struct kvm_vcpu *vcpu),
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 0e4e9654bd2c..83ddc0e171b0 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid,
}
#endif
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
- int i, shift;
- unsigned int mask;
-
- /* start from 1 ignoring MMU_PAGE_4K */
- for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
- /* invalid penc */
- if (mmu_psize_defs[psize].penc[i] == -1)
- continue;
- /*
- * encoding bits per actual page size
- * PTE LP actual page size
- * rrrr rrrz >=8KB
- * rrrr rrzz >=16KB
- * rrrr rzzz >=32KB
- * rrrr zzzz >=64KB
- * .......
- */
- shift = mmu_psize_defs[i].shift - LP_SHIFT;
- if (shift > LP_BITS)
- shift = LP_BITS;
- mask = (1 << shift) - 1;
- if ((lp & mask) == mmu_psize_defs[psize].penc[i])
- return i;
- }
- return -1;
-}
-
static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
int *psize, int *apsize, int *ssize, unsigned long *vpn)
{
@@ -538,16 +508,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
size = MMU_PAGE_4K;
a_size = MMU_PAGE_4K;
} else {
- for (size = 0; size < MMU_PAGE_COUNT; size++) {
-
- /* valid entries have a shift value */
- if (!mmu_psize_defs[size].shift)
- continue;
-
- a_size = __hpte_actual_psize(lp, size);
- if (a_size != -1)
- break;
- }
+ size = hpte_page_sizes[lp] & 0xf;
+ a_size = hpte_page_sizes[lp] >> 4;
}
/* This works for all page sizes, and for 256M and 1T segments */
if (cpu_has_feature(CPU_FTR_ARCH_300))
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0821556e16f4..ef3ae891a3db 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -93,6 +93,9 @@ static unsigned long _SDR1;
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
EXPORT_SYMBOL_GPL(mmu_psize_defs);
+u8 hpte_page_sizes[1 << LP_BITS];
+EXPORT_SYMBOL_GPL(hpte_page_sizes);
+
struct hash_pte *htab_address;
unsigned long htab_size_bytes;
unsigned long htab_hash_mask;
@@ -564,8 +567,60 @@ static void __init htab_scan_page_sizes(void)
#endif /* CONFIG_HUGETLB_PAGE */
}
+/*
+ * Fill in the hpte_page_sizes[] array.
+ * We go through the mmu_psize_defs[] array looking for all the
+ * supported base/actual page size combinations. Each combination
+ * has a unique pagesize encoding (penc) value in the low bits of
+ * the LP field of the HPTE. For actual page sizes less than 1MB,
+ * some of the upper LP bits are used for RPN bits, meaning that
+ * we need to fill in several entries in hpte_page_sizes[].
+ *
+ * In diagrammatic form, with r = RPN bits and z = page size bits:
+ * PTE LP actual page size
+ * rrrr rrrz >=8KB
+ * rrrr rrzz >=16KB
+ * rrrr rzzz >=32KB
+ * rrrr zzzz >=64KB
+ * ...
+ *
+ * The zzzz bits are implementation-specific but are chosen so that
+ * no encoding for a larger page size uses the same value in its
+ * low-order N bits as the encoding for the 2^(12+N) byte page size
+ * (if it exists).
+ */
+static void init_hpte_page_sizes(void)
+{
+ long int ap, bp;
+ long int shift, penc;
+
+ for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
+ if (!mmu_psize_defs[bp].shift)
+ continue; /* not a supported page size */
+ for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
+ penc = mmu_psize_defs[bp].penc[ap];
+ if (penc == -1)
+ continue;
+ shift = mmu_psize_defs[ap].shift - LP_SHIFT;
+ if (shift <= 0)
+ continue; /* should never happen */
+ /*
+ * For page sizes less than 1MB, this loop
+ * replicates the entry for all possible values
+ * of the rrrr bits.
+ */
+ while (penc < (1 << LP_BITS)) {
+ hpte_page_sizes[penc] = (ap << 4) | bp;
+ penc += 1 << shift;
+ }
+ }
+ }
+}
+
static void __init htab_init_page_sizes(void)
{
+ init_hpte_page_sizes();
+
if (!debug_pagealloc_enabled()) {
/*
* Pick a size for the linear mapping. Currently, we only
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3d29d40eb0e9..44d2d842cee7 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -208,6 +208,7 @@ OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
+OPAL_CALL_REAL(opal_rm_set_xive, OPAL_SET_XIVE);
OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index fd9444f9fb0c..9ce48ae55062 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2710,15 +2710,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
}
#ifdef CONFIG_PCI_MSI
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
{
- unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
- struct irq_chip *chip = irq_data_get_irq_chip(d);
struct pnv_phb *phb = container_of(chip, struct pnv_phb,
ioda.irq_chip);
+
+ return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
int64_t rc;
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+ struct irq_chip *chip = irq_data_get_irq_chip(d);
- rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+ rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
WARN_ON_ONCE(rc);
icp_native_eoi(d);
@@ -2748,6 +2754,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
irq_set_chip(virq, &phb->ioda.irq_chip);
}
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+ return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
+
static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
unsigned int hwirq, unsigned int virq,
unsigned int is_64, struct msi_msg *msg)
diff --git a/arch/s390/include/asm/facilities_src.h b/arch/s390/include/asm/facilities_src.h
index 4917728e5828..3b758f66e48b 100644
--- a/arch/s390/include/asm/facilities_src.h
+++ b/arch/s390/include/asm/facilities_src.h
@@ -55,4 +55,28 @@ static struct facility_def facility_defs[] = {
-1 /* END */
}
},
+ {
+ .name = "FACILITIES_KVM",
+ .bits = (int[]){
+ 0, /* N3 instructions */
+ 1, /* z/Arch mode installed */
+ 2, /* z/Arch mode active */
+ 3, /* DAT-enhancement */
+ 4, /* idte segment table */
+ 5, /* idte region table */
+ 6, /* ASN-and-LX reuse */
+ 7, /* stfle */
+ 8, /* enhanced-DAT 1 */
+ 9, /* sense-running-status */
+ 10, /* conditional sske */
+ 13, /* ipte-range */
+ 14, /* nonquiescing key-setting */
+ 73, /* transactional execution */
+ 75, /* access-exception-fetch/store indication */
+ 76, /* msa extension 3 */
+ 77, /* msa extension 4 */
+ 78, /* enhanced-DAT 2 */
+ -1 /* END */
+ }
+ },
};
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8e5daf7a76ce..a41faf34b034 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -28,7 +28,7 @@
#define KVM_S390_BSCA_CPU_SLOTS 64
#define KVM_S390_ESCA_CPU_SLOTS 248
-#define KVM_MAX_VCPUS KVM_S390_ESCA_CPU_SLOTS
+#define KVM_MAX_VCPUS 255
#define KVM_USER_MEM_SLOTS 32
/*
@@ -245,72 +245,72 @@ struct sie_page {
} __packed;
struct kvm_vcpu_stat {
- u32 exit_userspace;
- u32 exit_null;
- u32 exit_external_request;
- u32 exit_external_interrupt;
- u32 exit_stop_request;
- u32 exit_validity;
- u32 exit_instruction;
- u32 exit_pei;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 instruction_lctl;
- u32 instruction_lctlg;
- u32 instruction_stctl;
- u32 instruction_stctg;
- u32 exit_program_interruption;
- u32 exit_instr_and_program;
- u32 exit_operation_exception;
- u32 deliver_external_call;
- u32 deliver_emergency_signal;
- u32 deliver_service_signal;
- u32 deliver_virtio_interrupt;
- u32 deliver_stop_signal;
- u32 deliver_prefix_signal;
- u32 deliver_restart_signal;
- u32 deliver_program_int;
- u32 deliver_io_int;
- u32 exit_wait_state;
- u32 instruction_pfmf;
- u32 instruction_stidp;
- u32 instruction_spx;
- u32 instruction_stpx;
- u32 instruction_stap;
- u32 instruction_storage_key;
- u32 instruction_ipte_interlock;
- u32 instruction_stsch;
- u32 instruction_chsc;
- u32 instruction_stsi;
- u32 instruction_stfl;
- u32 instruction_tprot;
- u32 instruction_sie;
- u32 instruction_essa;
- u32 instruction_sthyi;
- u32 instruction_sigp_sense;
- u32 instruction_sigp_sense_running;
- u32 instruction_sigp_external_call;
- u32 instruction_sigp_emergency;
- u32 instruction_sigp_cond_emergency;
- u32 instruction_sigp_start;
- u32 instruction_sigp_stop;
- u32 instruction_sigp_stop_store_status;
- u32 instruction_sigp_store_status;
- u32 instruction_sigp_store_adtl_status;
- u32 instruction_sigp_arch;
- u32 instruction_sigp_prefix;
- u32 instruction_sigp_restart;
- u32 instruction_sigp_init_cpu_reset;
- u32 instruction_sigp_cpu_reset;
- u32 instruction_sigp_unknown;
- u32 diagnose_10;
- u32 diagnose_44;
- u32 diagnose_9c;
- u32 diagnose_258;
- u32 diagnose_308;
- u32 diagnose_500;
+ u64 exit_userspace;
+ u64 exit_null;
+ u64 exit_external_request;
+ u64 exit_external_interrupt;
+ u64 exit_stop_request;
+ u64 exit_validity;
+ u64 exit_instruction;
+ u64 exit_pei;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 instruction_lctl;
+ u64 instruction_lctlg;
+ u64 instruction_stctl;
+ u64 instruction_stctg;
+ u64 exit_program_interruption;
+ u64 exit_instr_and_program;
+ u64 exit_operation_exception;
+ u64 deliver_external_call;
+ u64 deliver_emergency_signal;
+ u64 deliver_service_signal;
+ u64 deliver_virtio_interrupt;
+ u64 deliver_stop_signal;
+ u64 deliver_prefix_signal;
+ u64 deliver_restart_signal;
+ u64 deliver_program_int;
+ u64 deliver_io_int;
+ u64 exit_wait_state;
+ u64 instruction_pfmf;
+ u64 instruction_stidp;
+ u64 instruction_spx;
+ u64 instruction_stpx;
+ u64 instruction_stap;
+ u64 instruction_storage_key;
+ u64 instruction_ipte_interlock;
+ u64 instruction_stsch;
+ u64 instruction_chsc;
+ u64 instruction_stsi;
+ u64 instruction_stfl;
+ u64 instruction_tprot;
+ u64 instruction_sie;
+ u64 instruction_essa;
+ u64 instruction_sthyi;
+ u64 instruction_sigp_sense;
+ u64 instruction_sigp_sense_running;
+ u64 instruction_sigp_external_call;
+ u64 instruction_sigp_emergency;
+ u64 instruction_sigp_cond_emergency;
+ u64 instruction_sigp_start;
+ u64 instruction_sigp_stop;
+ u64 instruction_sigp_stop_store_status;
+ u64 instruction_sigp_store_status;
+ u64 instruction_sigp_store_adtl_status;
+ u64 instruction_sigp_arch;
+ u64 instruction_sigp_prefix;
+ u64 instruction_sigp_restart;
+ u64 instruction_sigp_init_cpu_reset;
+ u64 instruction_sigp_cpu_reset;
+ u64 instruction_sigp_unknown;
+ u64 diagnose_10;
+ u64 diagnose_44;
+ u64 diagnose_9c;
+ u64 diagnose_258;
+ u64 diagnose_308;
+ u64 diagnose_500;
};
#define PGM_OPERATION 0x01
@@ -577,7 +577,7 @@ struct kvm_vcpu_arch {
};
struct kvm_vm_stat {
- u32 remote_tlb_flush;
+ ulong remote_tlb_flush;
};
struct kvm_arch_memory_slot {
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 1f95cc1faeb7..f3df9e0a5dec 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -125,6 +125,7 @@ int main(void)
OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list);
OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list);
OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code);
+ OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code);
OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address);
OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr);
OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw);
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 54200208bf24..4aa8a7e2a1da 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -495,6 +495,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
switch (code) {
+ case PGM_PROTECTION:
+ switch (prot) {
+ case PROT_TYPE_ALC:
+ tec->b60 = 1;
+ /* FALL THROUGH */
+ case PROT_TYPE_DAT:
+ tec->b61 = 1;
+ break;
+ default: /* LA and KEYC set b61 to 0, other params undefined */
+ return code;
+ }
+ /* FALL THROUGH */
case PGM_ASCE_TYPE:
case PGM_PAGE_TRANSLATION:
case PGM_REGION_FIRST_TRANS:
@@ -504,8 +516,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
/*
* op_access_id only applies to MOVE_PAGE -> set bit 61
* exc_access_id has to be set to 0 for some instructions. Both
- * cases have to be handled by the caller. We can always store
- * exc_access_id, as it is undefined for non-ar cases.
+ * cases have to be handled by the caller.
*/
tec->addr = gva >> PAGE_SHIFT;
tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
@@ -516,25 +527,13 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
case PGM_ASTE_VALIDITY:
case PGM_ASTE_SEQUENCE:
case PGM_EXTENDED_AUTHORITY:
+ /*
+ * We can always store exc_access_id, as it is
+ * undefined for non-ar cases. It is undefined for
+ * most DAT protection exceptions.
+ */
pgm->exc_access_id = ar;
break;
- case PGM_PROTECTION:
- switch (prot) {
- case PROT_TYPE_ALC:
- tec->b60 = 1;
- /* FALL THROUGH */
- case PROT_TYPE_DAT:
- tec->b61 = 1;
- tec->addr = gva >> PAGE_SHIFT;
- tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
- tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
- /* exc_access_id is undefined for most cases */
- pgm->exc_access_id = ar;
- break;
- default: /* LA and KEYC set b61 to 0, other params undefined */
- break;
- }
- break;
}
return code;
}
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 31a05330d11c..d7c6a7f53ced 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -206,7 +206,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu,
int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg)
{
- int ret = 0, nr_wp = 0, nr_bp = 0, i, size;
+ int ret = 0, nr_wp = 0, nr_bp = 0, i;
struct kvm_hw_breakpoint *bp_data = NULL;
struct kvm_hw_wp_info_arch *wp_info = NULL;
struct kvm_hw_bp_info_arch *bp_info = NULL;
@@ -216,17 +216,10 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT)
return -EINVAL;
- size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint);
- bp_data = kmalloc(size, GFP_KERNEL);
- if (!bp_data) {
- ret = -ENOMEM;
- goto error;
- }
-
- if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) {
- ret = -EFAULT;
- goto error;
- }
+ bp_data = memdup_user(dbg->arch.hw_bp,
+ sizeof(*bp_data) * dbg->arch.nr_hw_bp);
+ if (IS_ERR(bp_data))
+ return PTR_ERR(bp_data);
for (i = 0; i < dbg->arch.nr_hw_bp; i++) {
switch (bp_data[i].type) {
@@ -241,17 +234,19 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
}
}
- size = nr_wp * sizeof(struct kvm_hw_wp_info_arch);
- if (size > 0) {
- wp_info = kmalloc(size, GFP_KERNEL);
+ if (nr_wp > 0) {
+ wp_info = kmalloc_array(nr_wp,
+ sizeof(*wp_info),
+ GFP_KERNEL);
if (!wp_info) {
ret = -ENOMEM;
goto error;
}
}
- size = nr_bp * sizeof(struct kvm_hw_bp_info_arch);
- if (size > 0) {
- bp_info = kmalloc(size, GFP_KERNEL);
+ if (nr_bp > 0) {
+ bp_info = kmalloc_array(nr_bp,
+ sizeof(*bp_info),
+ GFP_KERNEL);
if (!bp_info) {
ret = -ENOMEM;
goto error;
@@ -382,14 +377,20 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu)
vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING;
}
+#define PER_CODE_MASK (PER_EVENT_MASK >> 24)
+#define PER_CODE_BRANCH (PER_EVENT_BRANCH >> 24)
+#define PER_CODE_IFETCH (PER_EVENT_IFETCH >> 24)
+#define PER_CODE_STORE (PER_EVENT_STORE >> 24)
+#define PER_CODE_STORE_REAL (PER_EVENT_STORE_REAL >> 24)
+
#define per_bp_event(code) \
- (code & (PER_EVENT_IFETCH | PER_EVENT_BRANCH))
+ (code & (PER_CODE_IFETCH | PER_CODE_BRANCH))
#define per_write_wp_event(code) \
- (code & (PER_EVENT_STORE | PER_EVENT_STORE_REAL))
+ (code & (PER_CODE_STORE | PER_CODE_STORE_REAL))
static int debug_exit_required(struct kvm_vcpu *vcpu)
{
- u32 perc = (vcpu->arch.sie_block->perc << 24);
+ u8 perc = vcpu->arch.sie_block->perc;
struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
struct kvm_hw_wp_info_arch *wp_info = NULL;
struct kvm_hw_bp_info_arch *bp_info = NULL;
@@ -444,7 +445,7 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
const u8 ilen = kvm_s390_get_ilen(vcpu);
struct kvm_s390_pgm_info pgm_info = {
.code = PGM_PER,
- .per_code = PER_EVENT_IFETCH >> 24,
+ .per_code = PER_CODE_IFETCH,
.per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
};
@@ -458,33 +459,33 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
static void filter_guest_per_event(struct kvm_vcpu *vcpu)
{
- u32 perc = vcpu->arch.sie_block->perc << 24;
+ const u8 perc = vcpu->arch.sie_block->perc;
u64 peraddr = vcpu->arch.sie_block->peraddr;
u64 addr = vcpu->arch.sie_block->gpsw.addr;
u64 cr9 = vcpu->arch.sie_block->gcr[9];
u64 cr10 = vcpu->arch.sie_block->gcr[10];
u64 cr11 = vcpu->arch.sie_block->gcr[11];
/* filter all events, demanded by the guest */
- u32 guest_perc = perc & cr9 & PER_EVENT_MASK;
+ u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK;
if (!guest_per_enabled(vcpu))
guest_perc = 0;
/* filter "successful-branching" events */
- if (guest_perc & PER_EVENT_BRANCH &&
+ if (guest_perc & PER_CODE_BRANCH &&
cr9 & PER_CONTROL_BRANCH_ADDRESS &&
!in_addr_range(addr, cr10, cr11))
- guest_perc &= ~PER_EVENT_BRANCH;
+ guest_perc &= ~PER_CODE_BRANCH;
/* filter "instruction-fetching" events */
- if (guest_perc & PER_EVENT_IFETCH &&
+ if (guest_perc & PER_CODE_IFETCH &&
!in_addr_range(peraddr, cr10, cr11))
- guest_perc &= ~PER_EVENT_IFETCH;
+ guest_perc &= ~PER_CODE_IFETCH;
/* All other PER events will be given to the guest */
/* TODO: Check altered address/address space */
- vcpu->arch.sie_block->perc = guest_perc >> 24;
+ vcpu->arch.sie_block->perc = guest_perc;
if (!guest_perc)
vcpu->arch.sie_block->iprcc &= ~PGM_PER;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index dfd0ca2638fa..1cab8a177d0e 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -29,6 +29,7 @@ static const intercept_handler_t instruction_handlers[256] = {
[0x01] = kvm_s390_handle_01,
[0x82] = kvm_s390_handle_lpsw,
[0x83] = kvm_s390_handle_diag,
+ [0xaa] = kvm_s390_handle_aa,
[0xae] = kvm_s390_handle_sigp,
[0xb2] = kvm_s390_handle_b2,
[0xb6] = kvm_s390_handle_stctl,
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 24524c0f3ef8..be4db07f70d3 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -24,6 +24,8 @@
#include <asm/sclp.h>
#include <asm/isc.h>
#include <asm/gmap.h>
+#include <asm/switch_to.h>
+#include <asm/nmi.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "trace-s390.h"
@@ -40,6 +42,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
return 0;
+ BUG_ON(!kvm_s390_use_sca_entries());
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -68,6 +71,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
{
int expect, rc;
+ BUG_ON(!kvm_s390_use_sca_entries());
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -109,6 +113,8 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
int rc, expect;
+ if (!kvm_s390_use_sca_entries())
+ return;
atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags);
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
@@ -400,12 +406,78 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu)
return rc ? -EFAULT : 0;
}
+static int __write_machine_check(struct kvm_vcpu *vcpu,
+ struct kvm_s390_mchk_info *mchk)
+{
+ unsigned long ext_sa_addr;
+ freg_t fprs[NUM_FPRS];
+ union mci mci;
+ int rc;
+
+ mci.val = mchk->mcic;
+ /* take care of lazy register loading via vcpu load/put */
+ save_fpu_regs();
+ save_access_regs(vcpu->run->s.regs.acrs);
+
+ /* Extended save area */
+ rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr,
+ sizeof(unsigned long));
+ /* Only bits 0-53 are used for address formation */
+ ext_sa_addr &= ~0x3ffUL;
+ if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) {
+ if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs,
+ 512))
+ mci.vr = 0;
+ } else {
+ mci.vr = 0;
+ }
+
+ /* General interruption information */
+ rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID);
+ rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE);
+
+ /* Register-save areas */
+ if (MACHINE_HAS_VX) {
+ convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
+ rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128);
+ } else {
+ rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA,
+ vcpu->run->s.regs.fprs, 128);
+ }
+ rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA,
+ vcpu->run->s.regs.gprs, 128);
+ rc |= put_guest_lc(vcpu, current->thread.fpu.fpc,
+ (u32 __user *) __LC_FP_CREG_SAVE_AREA);
+ rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr,
+ (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA);
+ rc |= put_guest_lc(vcpu, kvm_s390_get_cpu_timer(vcpu),
+ (u64 __user *) __LC_CPU_TIMER_SAVE_AREA);
+ rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->ckc >> 8,
+ (u64 __user *) __LC_CLOCK_COMP_SAVE_AREA);
+ rc |= write_guest_lc(vcpu, __LC_AREGS_SAVE_AREA,
+ &vcpu->run->s.regs.acrs, 64);
+ rc |= write_guest_lc(vcpu, __LC_CREGS_SAVE_AREA,
+ &vcpu->arch.sie_block->gcr, 128);
+
+ /* Extended interruption information */
+ rc |= put_guest_lc(vcpu, mchk->ext_damage_code,
+ (u32 __user *) __LC_EXT_DAMAGE_CODE);
+ rc |= put_guest_lc(vcpu, mchk->failing_storage_address,
+ (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
+ rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout,
+ sizeof(mchk->fixed_logout));
+ return rc ? -EFAULT : 0;
+}
+
static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
{
struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_mchk_info mchk = {};
- unsigned long adtl_status_addr;
int deliver = 0;
int rc = 0;
@@ -446,29 +518,9 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
KVM_S390_MCHK,
mchk.cr14, mchk.mcic);
-
- rc = kvm_s390_vcpu_store_status(vcpu,
- KVM_S390_STORE_STATUS_PREFIXED);
- rc |= read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR,
- &adtl_status_addr,
- sizeof(unsigned long));
- rc |= kvm_s390_vcpu_store_adtl_status(vcpu,
- adtl_status_addr);
- rc |= put_guest_lc(vcpu, mchk.mcic,
- (u64 __user *) __LC_MCCK_CODE);
- rc |= put_guest_lc(vcpu, mchk.failing_storage_address,
- (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR);
- rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA,
- &mchk.fixed_logout,
- sizeof(mchk.fixed_logout));
- rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
- &vcpu->arch.sie_block->gpsw,
- sizeof(psw_t));
- rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
- &vcpu->arch.sie_block->gpsw,
- sizeof(psw_t));
+ rc = __write_machine_check(vcpu, &mchk);
}
- return rc ? -EFAULT : 0;
+ return rc;
}
static int __must_check __deliver_restart(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f142215ed30d..0a9ce9dd7825 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -132,10 +132,7 @@ module_param(nested, int, S_IRUGO);
MODULE_PARM_DESC(nested, "Nested virtualization support");
/* upper facilities limit for kvm */
-unsigned long kvm_s390_fac_list_mask[16] = {
- 0xffe6000000000000UL,
- 0x005e000000000000UL,
-};
+unsigned long kvm_s390_fac_list_mask[16] = { FACILITIES_KVM };
unsigned long kvm_s390_fac_list_mask_size(void)
{
@@ -376,7 +373,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
r = KVM_S390_BSCA_CPU_SLOTS;
- if (sclp.has_esca && sclp.has_64bscao)
+ if (!kvm_s390_use_sca_entries())
+ r = KVM_MAX_VCPUS;
+ else if (sclp.has_esca && sclp.has_64bscao)
r = KVM_S390_ESCA_CPU_SLOTS;
break;
case KVM_CAP_NR_MEMSLOTS:
@@ -1490,6 +1489,16 @@ out_err:
return rc;
}
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 3, "%s", "free cpu");
@@ -1553,6 +1562,8 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
static void sca_del_vcpu(struct kvm_vcpu *vcpu)
{
+ if (!kvm_s390_use_sca_entries())
+ return;
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -1570,6 +1581,13 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu)
static void sca_add_vcpu(struct kvm_vcpu *vcpu)
{
+ if (!kvm_s390_use_sca_entries()) {
+ struct bsca_block *sca = vcpu->kvm->arch.sca;
+
+ /* we still need the basic sca for the ipte control */
+ vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
+ vcpu->arch.sie_block->scaol = (__u32)(__u64)sca;
+ }
read_lock(&vcpu->kvm->arch.sca_lock);
if (vcpu->kvm->arch.use_esca) {
struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -1650,6 +1668,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
{
int rc;
+ if (!kvm_s390_use_sca_entries()) {
+ if (id < KVM_MAX_VCPUS)
+ return true;
+ return false;
+ }
if (id < KVM_S390_BSCA_CPU_SLOTS)
return true;
if (!sclp.has_esca || !sclp.has_64bscao)
@@ -1938,8 +1961,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->eca |= 1;
if (sclp.has_sigpif)
vcpu->arch.sie_block->eca |= 0x10000000U;
- if (test_kvm_facility(vcpu->kvm, 64))
- vcpu->arch.sie_block->ecb3 |= 0x01;
if (test_kvm_facility(vcpu->kvm, 129)) {
vcpu->arch.sie_block->eca |= 0x00020000;
vcpu->arch.sie_block->ecd |= 0x20000000;
@@ -2694,6 +2715,19 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
kvm_clear_async_pf_completion_queue(vcpu);
}
+ /*
+ * If userspace sets the riccb (e.g. after migration) to a valid state,
+ * we should enable RI here instead of doing the lazy enablement.
+ */
+ if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
+ test_kvm_facility(vcpu->kvm, 64)) {
+ struct runtime_instr_cb *riccb =
+ (struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
+
+ if (riccb->valid)
+ vcpu->arch.sie_block->ecb3 |= 0x01;
+ }
+
kvm_run->kvm_dirty_regs = 0;
}
@@ -2837,38 +2871,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
return kvm_s390_store_status_unloaded(vcpu, addr);
}
-/*
- * store additional status at address
- */
-int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
- unsigned long gpa)
-{
- /* Only bits 0-53 are used for address formation */
- if (!(gpa & ~0x3ff))
- return 0;
-
- return write_guest_abs(vcpu, gpa & ~0x3ff,
- (void *)&vcpu->run->s.regs.vrs, 512);
-}
-
-int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
-{
- if (!test_kvm_facility(vcpu->kvm, 129))
- return 0;
-
- /*
- * The guest VXRS are in the host VXRs due to the lazy
- * copying in vcpu load/put. We can simply call save_fpu_regs()
- * to save the current register state because we are in the
- * middle of a load/put cycle.
- *
- * Let's update our copies before we save it into the save area.
- */
- save_fpu_regs();
-
- return kvm_s390_store_adtl_status_unloaded(vcpu, addr);
-}
-
static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
{
kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index b8432862a817..3a4e97f1a9e6 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -20,6 +20,7 @@
#include <linux/kvm_host.h>
#include <asm/facility.h>
#include <asm/processor.h>
+#include <asm/sclp.h>
typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
@@ -245,6 +246,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
/* implemented in priv.c */
int is_valid_psw(psw_t *psw);
+int kvm_s390_handle_aa(struct kvm_vcpu *vcpu);
int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
@@ -273,10 +275,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu);
void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
-int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
- unsigned long addr);
int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
-int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
@@ -389,4 +388,13 @@ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
return &sca->ipte_control;
}
+static inline int kvm_s390_use_sca_entries(void)
+{
+ /*
+ * Without SIGP interpretation, only SRS interpretation (if available)
+ * might use the entries. By not setting the entries and keeping them
+ * invalid, hardware will not access them but intercept.
+ */
+ return sclp.has_sigpif;
+}
#endif
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 46160388e996..e18435355c16 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -32,6 +32,24 @@
#include "kvm-s390.h"
#include "trace.h"
+static int handle_ri(struct kvm_vcpu *vcpu)
+{
+ if (test_kvm_facility(vcpu->kvm, 64)) {
+ vcpu->arch.sie_block->ecb3 |= 0x01;
+ kvm_s390_retry_instr(vcpu);
+ return 0;
+ } else
+ return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
+int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
+{
+ if ((vcpu->arch.sie_block->ipa & 0xf) <= 4)
+ return handle_ri(vcpu);
+ else
+ return -EOPNOTSUPP;
+}
+
/* Handle SCK (SET CLOCK) interception */
static int handle_set_clock(struct kvm_vcpu *vcpu)
{
@@ -1093,6 +1111,9 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
static const intercept_handler_t eb_handlers[256] = {
[0x2f] = handle_lctlg,
[0x25] = handle_stctg,
+ [0x60] = handle_ri,
+ [0x61] = handle_ri,
+ [0x62] = handle_ri,
};
int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 94d54d0defa7..02223cb4bcfd 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
return 0;
}
- ret = __pvclock_read_cycles(pvti);
+ ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
/* refer to vread_tsc() comment for rationale */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33ae3a4d0159..4b20f7304b9c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -568,6 +568,7 @@ struct kvm_vcpu_arch {
struct kvm_steal_time steal;
} st;
+ u64 tsc_offset;
u64 last_guest_tsc;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
@@ -701,6 +702,8 @@ struct kvm_hv {
/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
u64 hv_crash_ctl;
+
+ HV_REFERENCE_TSC_PAGE tsc_ref;
};
struct kvm_arch {
@@ -781,54 +784,56 @@ struct kvm_arch {
bool disabled_lapic_found;
/* Struct members for AVIC */
+ u32 avic_vm_id;
u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
};
struct kvm_vm_stat {
- u32 mmu_shadow_zapped;
- u32 mmu_pte_write;
- u32 mmu_pte_updated;
- u32 mmu_pde_zapped;
- u32 mmu_flooded;
- u32 mmu_recycled;
- u32 mmu_cache_miss;
- u32 mmu_unsync;
- u32 remote_tlb_flush;
- u32 lpages;
+ ulong mmu_shadow_zapped;
+ ulong mmu_pte_write;
+ ulong mmu_pte_updated;
+ ulong mmu_pde_zapped;
+ ulong mmu_flooded;
+ ulong mmu_recycled;
+ ulong mmu_cache_miss;
+ ulong mmu_unsync;
+ ulong remote_tlb_flush;
+ ulong lpages;
};
struct kvm_vcpu_stat {
- u32 pf_fixed;
- u32 pf_guest;
- u32 tlb_flush;
- u32 invlpg;
-
- u32 exits;
- u32 io_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 irq_window_exits;
- u32 nmi_window_exits;
- u32 halt_exits;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 request_irq_exits;
- u32 irq_exits;
- u32 host_state_reload;
- u32 efer_reload;
- u32 fpu_reload;
- u32 insn_emulation;
- u32 insn_emulation_fail;
- u32 hypercalls;
- u32 irq_injections;
- u32 nmi_injections;
+ u64 pf_fixed;
+ u64 pf_guest;
+ u64 tlb_flush;
+ u64 invlpg;
+
+ u64 exits;
+ u64 io_exits;
+ u64 mmio_exits;
+ u64 signal_exits;
+ u64 irq_window_exits;
+ u64 nmi_window_exits;
+ u64 halt_exits;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 request_irq_exits;
+ u64 irq_exits;
+ u64 host_state_reload;
+ u64 efer_reload;
+ u64 fpu_reload;
+ u64 insn_emulation;
+ u64 insn_emulation_fail;
+ u64 hypercalls;
+ u64 irq_injections;
+ u64 nmi_injections;
};
struct x86_instruction_info;
@@ -951,7 +956,6 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d019f0cc80ec..3ad741b84072 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
}
static __always_inline
-cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+ u64 tsc)
{
- u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+ u64 delta = tsc - src->tsc_timestamp;
cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
src->tsc_shift);
return src->system_time + offset;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 27a0228c9cae..b816971f5da4 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -355,6 +355,7 @@ void load_ucode_amd_ap(void)
unsigned int cpu = smp_processor_id();
struct equiv_cpu_entry *eq;
struct microcode_amd *mc;
+ u8 *cont = container;
u32 rev, eax;
u16 eq_id;
@@ -371,8 +372,11 @@ void load_ucode_amd_ap(void)
if (check_current_patch_level(&rev, false))
return;
+ /* Add CONFIG_RANDOMIZE_MEMORY offset. */
+ cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+
eax = cpuid_eax(0x00000001);
- eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
+ eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
eq_id = find_equiv_id(eq, eax);
if (!eq_id)
@@ -434,6 +438,9 @@ int __init save_microcode_in_initrd_amd(void)
else
container = cont_va;
+ /* Add CONFIG_RANDOMIZE_MEMORY offset. */
+ container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+
eax = cpuid_eax(0x00000001);
eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 3599404e3089..5b2cc889ce34 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
do {
version = pvclock_read_begin(src);
- ret = __pvclock_read_cycles(src);
+ ret = __pvclock_read_cycles(src, rdtsc_ordered());
flags = src->flags;
} while (pvclock_read_retry(src, version));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2a6e84a30a54..4296beb8fdd3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -100,10 +100,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
/* Logical package management. We might want to allocate that dynamically */
static int *physical_to_logical_pkg __read_mostly;
static unsigned long *physical_package_map __read_mostly;;
-static unsigned long *logical_package_map __read_mostly;
static unsigned int max_physical_pkg_id __read_mostly;
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
+static unsigned int logical_packages __read_mostly;
+static bool logical_packages_frozen __read_mostly;
/* Maximum number of SMT threads on any online core */
int __max_smt_threads __read_mostly;
@@ -277,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu)
if (test_and_set_bit(pkg, physical_package_map))
goto found;
- new = find_first_zero_bit(logical_package_map, __max_logical_packages);
- if (new >= __max_logical_packages) {
+ if (logical_packages_frozen) {
physical_to_logical_pkg[pkg] = -1;
- pr_warn("APIC(%x) Package %u exceeds logical package map\n",
+ pr_warn("APIC(%x) Package %u exceeds logical package max\n",
apicid, pkg);
return -ENOSPC;
}
- set_bit(new, logical_package_map);
+
+ new = logical_packages++;
pr_info("APIC(%x) Converting physical %u to logical package %u\n",
apicid, pkg, new);
physical_to_logical_pkg[pkg] = new;
@@ -341,6 +342,7 @@ static void __init smp_init_package_map(void)
}
__max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
+ logical_packages = 0;
/*
* Possibly larger than what we need as the number of apic ids per
@@ -352,10 +354,6 @@ static void __init smp_init_package_map(void)
memset(physical_to_logical_pkg, 0xff, size);
size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
physical_package_map = kzalloc(size, GFP_KERNEL);
- size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long);
- logical_package_map = kzalloc(size, GFP_KERNEL);
-
- pr_info("Max logical packages: %u\n", __max_logical_packages);
for_each_present_cpu(cpu) {
unsigned int apicid = apic->cpu_present_to_apicid(cpu);
@@ -369,6 +367,15 @@ static void __init smp_init_package_map(void)
set_cpu_possible(cpu, false);
set_cpu_present(cpu, false);
}
+
+ if (logical_packages > __max_logical_packages) {
+ pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n",
+ logical_packages, __max_logical_packages);
+ logical_packages_frozen = true;
+ __max_logical_packages = logical_packages;
+ }
+
+ pr_info("Max logical packages: %u\n", __max_logical_packages);
}
void __init smp_store_boot_cpu_info(void)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 464fa477afbf..3bff20710471 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o page_track.o
+ hyperv.o page_track.o debugfs.o
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000000..c19c7ede9bd6
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_offset;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_scaling_ratio;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+ *val = kvm_tsc_scaling_ratio_frac_bits;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ struct dentry *ret;
+
+ ret = debugfs_create_file("tsc-offset", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_offset_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ if (kvm_has_tsc_control) {
+ ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_fops);
+ if (!ret)
+ return -ENOMEM;
+ ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_frac_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01bd7b7a6866..42b1c83741c8 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
static u64 get_time_ref_counter(struct kvm *kvm)
{
- return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_vcpu *vcpu;
+ u64 tsc;
+
+ /*
+ * The guest has not set up the TSC page or the clock isn't
+ * stable, fall back to get_kvmclock_ns.
+ */
+ if (!hv->tsc_ref.tsc_sequence)
+ return div_u64(get_kvmclock_ns(kvm), 100);
+
+ vcpu = kvm_get_vcpu(kvm, 0);
+ tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+ + hv->tsc_ref.tsc_offset;
}
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
return 0;
}
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ *
+ * Hyper-V formula:
+ * nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ * nsec/100 = ticks * scale / 2^64
+ * - tsc_timestamp * scale / 2^64
+ * + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ * offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+ HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+ u64 max_mul;
+
+ if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+ return false;
+
+ /*
+ * check if scale would overflow, if so we use the time ref counter
+ * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+ * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+ * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+ */
+ max_mul = 100ull << (32 - hv_clock->tsc_shift);
+ if (hv_clock->tsc_to_system_mul >= max_mul)
+ return false;
+
+ /*
+ * Otherwise compute the scale and offset according to the formulas
+ * derived above.
+ */
+ tsc_ref->tsc_scale =
+ mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+ hv_clock->tsc_to_system_mul,
+ 100);
+
+ tsc_ref->tsc_offset = hv_clock->system_time;
+ do_div(tsc_ref->tsc_offset, 100);
+ tsc_ref->tsc_offset -=
+ mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+ return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ u32 tsc_seq;
+ u64 gfn;
+
+ BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+ BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ return;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ /*
+ * Because the TSC parameters only vary when there is a
+ * change in the master clock, do not bother with caching.
+ */
+ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+ &tsc_seq, sizeof(tsc_seq))))
+ return;
+
+ /*
+ * While we're computing and writing the parameters, force the
+ * guest to use the time reference count MSR.
+ */
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ return;
+
+ if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+ return;
+
+ /* Ensure sequence is zero before writing the rest of the struct. */
+ smp_wmb();
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ return;
+
+ /*
+ * Now switch to the TSC page mechanism by writing the sequence.
+ */
+ tsc_seq++;
+ if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+ tsc_seq = 1;
+
+ /* Write the struct entirely before the non-zero sequence. */
+ smp_wmb();
+
+ hv->tsc_ref.tsc_sequence = tsc_seq;
+ kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
@@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
mark_page_dirty(kvm, gfn);
break;
}
- case HV_X64_MSR_REFERENCE_TSC: {
- u64 gfn;
- HV_REFERENCE_TSC_PAGE tsc_ref;
-
- memset(&tsc_ref, 0, sizeof(tsc_ref));
+ case HV_X64_MSR_REFERENCE_TSC:
hv->hv_tsc_page = data;
- if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
- break;
- gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(
- kvm,
- gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
- &tsc_ref, sizeof(tsc_ref)))
- return 1;
- mark_page_dirty(kvm, gfn);
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
break;
- }
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
return kvm_hv_msr_set_crash_data(vcpu,
msr - HV_X64_MSR_CRASH_P0,
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4bd1d3..cd1119538add 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock);
+
#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index af523d84d102..9b4221471e3d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -34,6 +34,8 @@
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -41,6 +43,7 @@
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS 8
+#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS 24
+#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+ (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
@@ -185,6 +201,23 @@ struct vcpu_svm {
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
bool avic_is_running;
+
+ /*
+ * Per-vcpu list of struct amd_svm_iommu_ir:
+ * This is used mainly to store interrupt remapping information used
+ * when update the vcpu affinity. This avoids the need to scan for
+ * IRTE and try to match ga_tag in the IOMMU driver.
+ */
+ struct list_head ir_list;
+ spinlock_t ir_list_lock;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+ struct list_head node; /* Used by SVM for per-vcpu ir_list */
+ void *data; /* Storing pointer to struct amd_ir_data */
};
#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
@@ -242,6 +275,10 @@ static int avic;
module_param(avic, int, S_IRUGO);
#endif
+/* AVIC VM ID bit masks and lock */
+static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+static DEFINE_SPINLOCK(avic_vm_id_lock);
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_arch,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static spinlock_t svm_vm_data_hash_lock;
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_arch *ka = NULL;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct kvm_arch *vm_data = &kvm->arch;
+
+ if (vm_data->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ if (!vcpu)
+ return 0;
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu->mode == OUTSIDE_GUEST_MODE)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void)
if (avic) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_AVIC) ||
- !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+ !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
avic = false;
- else
+ } else {
pr_info("AVIC enabled\n");
+
+ hash_init(svm_vm_data_hash);
+ spin_lock_init(&svm_vm_data_hash_lock);
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ }
}
return 0;
@@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return svm->vmcb->control.tsc_offset;
-}
-
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+static inline int avic_get_next_vm_id(void)
+{
+ int id;
+
+ spin_lock(&avic_vm_id_lock);
+
+ /* AVIC VM ID is one-based. */
+ id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+ if (id <= AVIC_VM_ID_MASK)
+ __set_bit(id, avic_vm_id_bitmap);
+ else
+ id = -EAGAIN;
+
+ spin_unlock(&avic_vm_id_lock);
+ return id;
+}
+
+static inline int avic_free_vm_id(int id)
+{
+ if (id <= 0 || id > AVIC_VM_ID_MASK)
+ return -EINVAL;
+
+ spin_lock(&avic_vm_id_lock);
+ __clear_bit(id, avic_vm_id_bitmap);
+ spin_unlock(&avic_vm_id_lock);
+ return 0;
+}
+
static void avic_vm_destroy(struct kvm *kvm)
{
+ unsigned long flags;
struct kvm_arch *vm_data = &kvm->arch;
+ avic_free_vm_id(vm_data->avic_vm_id);
+
if (vm_data->avic_logical_id_table_page)
__free_page(vm_data->avic_logical_id_table_page);
if (vm_data->avic_physical_id_table_page)
__free_page(vm_data->avic_physical_id_table_page);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&vm_data->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}
static int avic_vm_init(struct kvm *kvm)
{
- int err = -ENOMEM;
+ unsigned long flags;
+ int vm_id, err = -ENOMEM;
struct kvm_arch *vm_data = &kvm->arch;
struct page *p_page;
struct page *l_page;
@@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm)
if (!avic)
return 0;
+ vm_id = avic_get_next_vm_id();
+ if (vm_id < 0)
+ return vm_id;
+ vm_data->avic_vm_id = (u32)vm_id;
+
/* Allocating physical APIC ID table (4KB) */
p_page = alloc_page(GFP_KERNEL);
if (!p_page)
@@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm)
vm_data->avic_logical_id_table_page = l_page;
clear_page(page_address(l_page));
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
return 0;
free_avic:
@@ -1323,31 +1452,34 @@ free_avic:
return err;
}
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
- u64 entry;
- int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- svm->avic_is_running = is_run;
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
- if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
- return;
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+ if (list_empty(&svm->ir_list))
+ goto out;
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- if (is_run)
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ ret = amd_iommu_update_ga(cpu, r, ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
}
static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+ svm->avic_is_running);
}
static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->avic_is_running = is_run;
+ if (is_run)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+}
+
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
err = avic_init_backing_page(&svm->vcpu);
if (err)
goto free_page4;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
}
/* We initialize this flag to true to make sure that the is_running
@@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
kvm_vcpu_wake_up(vcpu);
}
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ unsigned long flags;
+ struct amd_svm_iommu_ir *cur;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_for_each_entry(cur, &svm->ir_list, node) {
+ if (cur->data != pi->ir_data)
+ continue;
+ list_del(&cur->node);
+ kfree(cur);
+ break;
+ }
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+
+ /**
+ * In some cases, the existing irte is updaed and re-set,
+ * so we need to check here if it's already been * added
+ * to the ir_list.
+ */
+ if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+ struct kvm *kvm = svm->vcpu.kvm;
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+ struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ struct vcpu_svm *prev_svm;
+
+ if (!prev_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_svm = to_svm(prev_vcpu);
+ svm_ir_list_del(prev_svm, pi);
+ }
+
+ /**
+ * Allocating new amd_iommu_pi_data, which will get
+ * add to the per-vcpu ir_list.
+ */
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+ if (!ir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ir->data = pi->ir_data;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_add(&ir->node, &svm->ir_list);
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu = NULL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+ __func__, irq.vector);
+ return -1;
+ }
+
+ pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+ irq.vector);
+ *svm = to_svm(vcpu);
+ vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+ vcpu_info->vector = irq.vector;
+
+ return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+ __func__, host_irq, guest_irq, set);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ struct vcpu_data vcpu_info;
+ struct vcpu_svm *svm = NULL;
+
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ /**
+ * Here, we setup with legacy mode in the following cases:
+ * 1. When cannot target interrupt to a specific vcpu.
+ * 2. Unsetting posted interrupt.
+ * 3. APIC virtialization is disabled for the vcpu.
+ */
+ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+ kvm_vcpu_apicv_active(&svm->vcpu)) {
+ struct amd_iommu_pi_data pi;
+
+ /* Try to enable guest_mode in IRTE */
+ pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+ pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+ svm->vcpu.vcpu_id);
+ pi.is_guest_mode = true;
+ pi.vcpu_data = &vcpu_info;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Here, we successfully setting up vcpu affinity in
+ * IOMMU guest mode. Now, we need to store the posted
+ * interrupt information in a per-vcpu ir_list so that
+ * we can reference to them directly when we update vcpu
+ * scheduling information in IOMMU irte.
+ */
+ if (!ret && pi.is_guest_mode)
+ svm_ir_list_add(svm, &pi);
+ } else {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
+
+ if (!ret && svm) {
+ trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+ host_irq, e->gsi,
+ vcpu_info.vector,
+ vcpu_info.pi_desc_addr, set);
+ }
+
+ if (ret < 0) {
+ pr_err("%s: failed to update PI IRTE\n", __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
- .read_tsc_offset = svm_read_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
.read_l1_tsc = svm_read_l1_tsc,
@@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.pmu_ops = &amd_pmu_ops,
.deliver_posted_interrupt = svm_deliver_avic_intr,
+ .update_pi_irte = svm_update_pi_irte,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f9939d0722b4..2577183b40d9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;
@@ -2524,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
else if (cpu_has_secondary_exec_ctrls() &&
(vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
- else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ } else {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+ }
} else {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode;
@@ -2609,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
return host_tsc + tsc_offset;
}
-static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
- return vmcs_read64(TSC_OFFSET);
-}
-
/*
* writes 'offset' into guest's timestamp counter offset register
*/
@@ -4687,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
msr, MSR_TYPE_R | MSR_TYPE_W);
}
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
{
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
+ if (apicv_active) {
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+ } else {
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ }
}
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
{
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
+ if (apicv_active) {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+ } else {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ }
}
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
{
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_W);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_W);
+ if (apicv_active) {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_W);
+ } else {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_W);
+ }
}
static bool vmx_get_enable_apicv(void)
@@ -5288,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (is_guest_mode(vcpu))
- return;
+ if (!is_guest_mode(vcpu)) {
+ if (!cpu_has_virtual_nmis()) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->soft_vnmi_blocked = 1;
+ vmx->vnmi_blocked_time = 0;
+ }
- if (!cpu_has_virtual_nmis()) {
- /*
- * Tracking the NMI-blocked state in software is built upon
- * finding the next open IRQ window. This, in turn, depends on
- * well-behaving guests: They have to keep IRQs disabled at
- * least as long as the NMI handler runs. Otherwise we may
- * cause NMI nesting, maybe breaking the guest. But as this is
- * highly unlikely, we can live with the residual risk.
- */
- vmx->soft_vnmi_blocked = 1;
- vmx->vnmi_blocked_time = 0;
+ ++vcpu->stat.nmi_injections;
+ vmx->nmi_known_unmasked = false;
}
- ++vcpu->stat.nmi_injections;
- vmx->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
+
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}
@@ -6369,22 +6395,32 @@ static __init int hardware_setup(void)
if (!vmx_msr_bitmap_legacy_x2apic)
goto out2;
+ vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+ goto out3;
+
vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode)
- goto out3;
+ goto out4;
vmx_msr_bitmap_longmode_x2apic =
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
- goto out4;
+ goto out5;
+
+ vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+ goto out6;
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
- goto out6;
+ goto out7;
vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
- goto out7;
+ goto out8;
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6403,7 +6439,7 @@ static __init int hardware_setup(void)
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
- goto out8;
+ goto out9;
}
if (boot_cpu_has(X86_FEATURE_NX))
@@ -6470,20 +6506,35 @@ static __init int hardware_setup(void)
vmx_msr_bitmap_legacy, PAGE_SIZE);
memcpy(vmx_msr_bitmap_longmode_x2apic,
vmx_msr_bitmap_longmode, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ vmx_msr_bitmap_legacy, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ vmx_msr_bitmap_longmode, PAGE_SIZE);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+ /*
+ * enable_apicv && kvm_vcpu_apicv_active()
+ */
for (msr = 0x800; msr <= 0x8ff; msr++)
- vmx_disable_intercept_msr_read_x2apic(msr);
+ vmx_disable_intercept_msr_read_x2apic(msr, true);
/* TMCCT */
- vmx_enable_intercept_msr_read_x2apic(0x839);
+ vmx_enable_intercept_msr_read_x2apic(0x839, true);
/* TPR */
- vmx_disable_intercept_msr_write_x2apic(0x808);
+ vmx_disable_intercept_msr_write_x2apic(0x808, true);
/* EOI */
- vmx_disable_intercept_msr_write_x2apic(0x80b);
+ vmx_disable_intercept_msr_write_x2apic(0x80b, true);
/* SELF-IPI */
- vmx_disable_intercept_msr_write_x2apic(0x83f);
+ vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+
+ /*
+ * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+ * !enable_apicv
+ */
+ /* TPR */
+ vmx_disable_intercept_msr_read_x2apic(0x808, false);
+ vmx_disable_intercept_msr_write_x2apic(0x808, false);
if (enable_ept) {
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6530,14 +6581,18 @@ static __init int hardware_setup(void)
return alloc_kvm_area();
-out8:
+out9:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out7:
+out8:
free_page((unsigned long)vmx_vmread_bitmap);
+out7:
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
out6:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
+out5:
free_page((unsigned long)vmx_msr_bitmap_longmode);
+out4:
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
out3:
free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
out2:
@@ -6553,7 +6608,9 @@ out:
static __exit void hardware_unsetup(void)
{
free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
free_page((unsigned long)vmx_msr_bitmap_legacy);
free_page((unsigned long)vmx_msr_bitmap_longmode);
free_page((unsigned long)vmx_io_bitmap_b);
@@ -8444,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
- /*
- * There is not point to enable virtualize x2apic without enable
- * apicv
- */
- if (!cpu_has_vmx_virtualize_x2apic_mode() ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!cpu_has_vmx_virtualize_x2apic_mode())
return;
if (!cpu_need_tpr_shadow(vcpu))
@@ -10805,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
* We are now running in L2, mmu_notifier will force to reload the
* page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
*/
- kvm_vcpu_reload_apic_access_page(vcpu);
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
/*
* Exiting from L2 to L1, we're now back to L1 which thinks it just
@@ -11286,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .read_tsc_offset = vmx_read_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
.read_l1_tsc = vmx_read_l1_tsc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 57ffe7893104..3ee8a91a78c3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
{
- u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+ u64 curr_offset = vcpu->arch.tsc_offset;
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
@@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_offset = offset;
+}
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = get_kernel_ns();
+ ns = ktime_get_boot_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
@@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
update_ia32_tsc_adjust_msr(vcpu, offset);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
#endif
}
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+ struct kvm_arch *ka = &kvm->arch;
+ s64 ns;
+
+ if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+ u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+ } else {
+ ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+ }
+
+ return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ unsigned long flags;
+ s64 ns;
+
+ local_irq_save(flags);
+ ns = __get_kvmclock_ns(kvm);
+ local_irq_restore(flags);
+
+ return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct pvclock_vcpu_time_info guest_hv_clock;
+
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return;
+
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
+ */
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+}
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
@@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
- struct pvclock_vcpu_time_info guest_hv_clock;
u8 pvclock_flags;
bool use_master_clock;
@@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = get_kernel_ns();
+ kernel_ns = ktime_get_boot_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_restore(flags);
- if (!vcpu->pv_time_enabled)
- return 0;
+ /* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hw_tsc_khz = tgt_tsc_khz;
}
- /* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
- return 0;
-
- /* This VCPU is paused, but it's legal for a guest to read another
- * VCPU's kvmclock, so we really have to follow the specification where
- * it says that version is odd if data is being modified, and even after
- * it is consistent.
- *
- * Version field updates must be kept separate. This is because
- * kvm_write_guest_cached might use a "rep movs" instruction, and
- * writes within a string instruction are weakly ordered. So there
- * are three writes overall.
- *
- * As a small optimization, only write the version field in the first
- * and third write. The vcpu->pv_time cache is still valid, because the
- * version field is the first in the struct.
- */
- BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
- vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
-
- smp_wmb();
-
- /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
- if (vcpu->pvclock_set_guest_stopped_request) {
- pvclock_flags |= PVCLOCK_GUEST_STOPPED;
- vcpu->pvclock_set_guest_stopped_request = false;
- }
-
/* If the host uses TSC clocksource, then it is stable */
+ pvclock_flags = 0;
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
vcpu->hv_clock.flags = pvclock_flags;
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
-
- smp_wmb();
-
- vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ if (vcpu->pv_time_enabled)
+ kvm_setup_pvclock_page(v);
+ if (v == kvm_get_vcpu(v->kvm, 0))
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -2750,7 +2793,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
/*
@@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_CLOCK: {
struct kvm_clock_data user_ns;
u64 now_ns;
- s64 delta;
r = -EFAULT;
if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
local_irq_disable();
- now_ns = get_kernel_ns();
- delta = user_ns.clock - now_ns;
+ now_ns = __get_kvmclock_ns(kvm);
+ kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
local_irq_enable();
- kvm->arch.kvmclock_offset = delta;
kvm_gen_update_masterclock(kvm);
break;
}
@@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
struct kvm_clock_data user_ns;
u64 now_ns;
- local_irq_disable();
- now_ns = get_kernel_ns();
- user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
- local_irq_enable();
+ now_ns = get_kvmclock_ns(kvm);
+ user_ns.clock = now_ns;
user_ns.flags = 0;
memset(&user_ns.pad, 0, sizeof(user_ns.pad));
@@ -7529,7 +7568,7 @@ int kvm_arch_hardware_enable(void)
* before any KVM threads can be running. Unfortunately, we can't
* bring the TSCs fully up to date with real time, as we aren't yet far
* enough into CPU bringup that we know how much real time has actually
- * elapsed; our helper function, get_kernel_ns() will be using boot
+ * elapsed; our helper function, ktime_get_boot_ns() will be using boot
* variables that haven't been updated yet.
*
* So we simply find the maximum observed TSC above, then record the
@@ -7764,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+ kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
pvclock_update_vm_gtod_copy(kvm);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a82ca466b62e..e8ff3e4ce38a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
return kvm_register_write(vcpu, reg, val);
}
-static inline u64 get_kernel_ns(void)
-{
- return ktime_get_boot_ns();
-}
-
static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
{
return !(kvm->arch.disabled_quirks & quirk);
@@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a3e3ccc87138..9634557a5444 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -113,7 +113,7 @@ static int set_up_temporary_mappings(void)
return result;
}
- temp_level4_pgt = (unsigned long)pgd - __PAGE_OFFSET;
+ temp_level4_pgt = __pa(pgd);
return 0;
}
diff --git a/drivers/clocksource/bcm_kona_timer.c b/drivers/clocksource/bcm_kona_timer.c
index 7e3fd375a627..92f6e4deee74 100644
--- a/drivers/clocksource/bcm_kona_timer.c
+++ b/drivers/clocksource/bcm_kona_timer.c
@@ -66,10 +66,10 @@ static void kona_timer_disable_and_clear(void __iomem *base)
}
-static void
+static int
kona_timer_get_counter(void __iomem *timer_base, uint32_t *msw, uint32_t *lsw)
{
- int loop_limit = 4;
+ int loop_limit = 3;
/*
* Read 64-bit free running counter
@@ -83,18 +83,19 @@ kona_timer_get_counter(void __iomem *timer_base, uint32_t *msw, uint32_t *lsw)
* if new hi-word is equal to previously read hi-word then stop.
*/
- while (--loop_limit) {
+ do {
*msw = readl(timer_base + KONA_GPTIMER_STCHI_OFFSET);
*lsw = readl(timer_base + KONA_GPTIMER_STCLO_OFFSET);
if (*msw == readl(timer_base + KONA_GPTIMER_STCHI_OFFSET))
break;
- }
+ } while (--loop_limit);
if (!loop_limit) {
pr_err("bcm_kona_timer: getting counter failed.\n");
pr_err(" Timer will be impacted\n");
+ return -ETIMEDOUT;
}
- return;
+ return 0;
}
static int kona_timer_set_next_event(unsigned long clc,
@@ -112,8 +113,11 @@ static int kona_timer_set_next_event(unsigned long clc,
uint32_t lsw, msw;
uint32_t reg;
+ int ret;
- kona_timer_get_counter(timers.tmr_regs, &msw, &lsw);
+ ret = kona_timer_get_counter(timers.tmr_regs, &msw, &lsw);
+ if (ret)
+ return ret;
/* Load the "next" event tick value */
writel(lsw + clc, timers.tmr_regs + KONA_GPTIMER_STCM0_OFFSET);
diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c
index d91e8725917c..b4b3ab5a11ad 100644
--- a/drivers/clocksource/mips-gic-timer.c
+++ b/drivers/clocksource/mips-gic-timer.c
@@ -164,7 +164,7 @@ void __init gic_clocksource_init(unsigned int frequency)
gic_start_count();
}
-static void __init gic_clocksource_of_init(struct device_node *node)
+static int __init gic_clocksource_of_init(struct device_node *node)
{
struct clk *clk;
int ret;
diff --git a/drivers/clocksource/time-armada-370-xp.c b/drivers/clocksource/time-armada-370-xp.c
index 719b478d136e..3c39e6f45971 100644
--- a/drivers/clocksource/time-armada-370-xp.c
+++ b/drivers/clocksource/time-armada-370-xp.c
@@ -338,7 +338,6 @@ static int __init armada_xp_timer_init(struct device_node *np)
struct clk *clk = of_clk_get_by_name(np, "fixed");
int ret;
- clk = of_clk_get(np, 0);
if (IS_ERR(clk)) {
pr_err("Failed to get clock");
return PTR_ERR(clk);
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index d0c1dab9b435..dff1a4a6dc1b 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -251,6 +251,14 @@ config EDAC_SBRIDGE
Support for error detection and correction the Intel
Sandy Bridge, Ivy Bridge and Haswell Integrated Memory Controllers.
+config EDAC_SKX
+ tristate "Intel Skylake server Integrated MC"
+ depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL
+ depends on PCI_MMCONFIG
+ help
+ Support for error detection and correction the Intel
+ Skylake server Integrated Memory Controllers.
+
config EDAC_MPC85XX
tristate "Freescale MPC83xx / MPC85xx"
depends on EDAC_MM_EDAC && FSL_SOC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index f9e4a3e0e6e9..986049925b08 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_EDAC_I5400) += i5400_edac.o
obj-$(CONFIG_EDAC_I7300) += i7300_edac.o
obj-$(CONFIG_EDAC_I7CORE) += i7core_edac.o
obj-$(CONFIG_EDAC_SBRIDGE) += sb_edac.o
+obj-$(CONFIG_EDAC_SKX) += skx_edac.o
obj-$(CONFIG_EDAC_E7XXX) += e7xxx_edac.o
obj-$(CONFIG_EDAC_E752X) += e752x_edac.o
obj-$(CONFIG_EDAC_I82443BXGX) += i82443bxgx_edac.o
diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c
new file mode 100644
index 000000000000..0ff4878c2aa1
--- /dev/null
+++ b/drivers/edac/skx_edac.c
@@ -0,0 +1,1121 @@
+/*
+ * EDAC driver for Intel(R) Xeon(R) Skylake processors
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/edac.h>
+#include <linux/mmzone.h>
+#include <linux/smp.h>
+#include <linux/bitmap.h>
+#include <linux/math64.h>
+#include <linux/mod_devicetable.h>
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/mce.h>
+
+#include "edac_core.h"
+
+#define SKX_REVISION " Ver: 1.0 "
+
+/*
+ * Debug macros
+ */
+#define skx_printk(level, fmt, arg...) \
+ edac_printk(level, "skx", fmt, ##arg)
+
+#define skx_mc_printk(mci, level, fmt, arg...) \
+ edac_mc_chipset_printk(mci, level, "skx", fmt, ##arg)
+
+/*
+ * Get a bit field at register value <v>, from bit <lo> to bit <hi>
+ */
+#define GET_BITFIELD(v, lo, hi) \
+ (((v) & GENMASK_ULL((hi), (lo))) >> (lo))
+
+static LIST_HEAD(skx_edac_list);
+
+static u64 skx_tolm, skx_tohm;
+
+#define NUM_IMC 2 /* memory controllers per socket */
+#define NUM_CHANNELS 3 /* channels per memory controller */
+#define NUM_DIMMS 2 /* Max DIMMS per channel */
+
+#define MASK26 0x3FFFFFF /* Mask for 2^26 */
+#define MASK29 0x1FFFFFFF /* Mask for 2^29 */
+
+/*
+ * Each cpu socket contains some pci devices that provide global
+ * information, and also some that are local to each of the two
+ * memory controllers on the die.
+ */
+struct skx_dev {
+ struct list_head list;
+ u8 bus[4];
+ struct pci_dev *sad_all;
+ struct pci_dev *util_all;
+ u32 mcroute;
+ struct skx_imc {
+ struct mem_ctl_info *mci;
+ u8 mc; /* system wide mc# */
+ u8 lmc; /* socket relative mc# */
+ u8 src_id, node_id;
+ struct skx_channel {
+ struct pci_dev *cdev;
+ struct skx_dimm {
+ u8 close_pg;
+ u8 bank_xor_enable;
+ u8 fine_grain_bank;
+ u8 rowbits;
+ u8 colbits;
+ } dimms[NUM_DIMMS];
+ } chan[NUM_CHANNELS];
+ } imc[NUM_IMC];
+};
+static int skx_num_sockets;
+
+struct skx_pvt {
+ struct skx_imc *imc;
+};
+
+struct decoded_addr {
+ struct skx_dev *dev;
+ u64 addr;
+ int socket;
+ int imc;
+ int channel;
+ u64 chan_addr;
+ int sktways;
+ int chanways;
+ int dimm;
+ int rank;
+ int channel_rank;
+ u64 rank_address;
+ int row;
+ int column;
+ int bank_address;
+ int bank_group;
+};
+
+static struct skx_dev *get_skx_dev(u8 bus, u8 idx)
+{
+ struct skx_dev *d;
+
+ list_for_each_entry(d, &skx_edac_list, list) {
+ if (d->bus[idx] == bus)
+ return d;
+ }
+
+ return NULL;
+}
+
+enum munittype {
+ CHAN0, CHAN1, CHAN2, SAD_ALL, UTIL_ALL, SAD
+};
+
+struct munit {
+ u16 did;
+ u16 devfn[NUM_IMC];
+ u8 busidx;
+ u8 per_socket;
+ enum munittype mtype;
+};
+
+/*
+ * List of PCI device ids that we need together with some device
+ * number and function numbers to tell which memory controller the
+ * device belongs to.
+ */
+static const struct munit skx_all_munits[] = {
+ { 0x2054, { }, 1, 1, SAD_ALL },
+ { 0x2055, { }, 1, 1, UTIL_ALL },
+ { 0x2040, { PCI_DEVFN(10, 0), PCI_DEVFN(12, 0) }, 2, 2, CHAN0 },
+ { 0x2044, { PCI_DEVFN(10, 4), PCI_DEVFN(12, 4) }, 2, 2, CHAN1 },
+ { 0x2048, { PCI_DEVFN(11, 0), PCI_DEVFN(13, 0) }, 2, 2, CHAN2 },
+ { 0x208e, { }, 1, 0, SAD },
+ { }
+};
+
+/*
+ * We use the per-socket device 0x2016 to count how many sockets are present,
+ * and to detemine which PCI buses are associated with each socket. Allocate
+ * and build the full list of all the skx_dev structures that we need here.
+ */
+static int get_all_bus_mappings(void)
+{
+ struct pci_dev *pdev, *prev;
+ struct skx_dev *d;
+ u32 reg;
+ int ndev = 0;
+
+ prev = NULL;
+ for (;;) {
+ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x2016, prev);
+ if (!pdev)
+ break;
+ ndev++;
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ pci_dev_put(pdev);
+ return -ENOMEM;
+ }
+ pci_read_config_dword(pdev, 0xCC, &reg);
+ d->bus[0] = GET_BITFIELD(reg, 0, 7);
+ d->bus[1] = GET_BITFIELD(reg, 8, 15);
+ d->bus[2] = GET_BITFIELD(reg, 16, 23);
+ d->bus[3] = GET_BITFIELD(reg, 24, 31);
+ edac_dbg(2, "busses: %x, %x, %x, %x\n",
+ d->bus[0], d->bus[1], d->bus[2], d->bus[3]);
+ list_add_tail(&d->list, &skx_edac_list);
+ skx_num_sockets++;
+ prev = pdev;
+ }
+
+ return ndev;
+}
+
+static int get_all_munits(const struct munit *m)
+{
+ struct pci_dev *pdev, *prev;
+ struct skx_dev *d;
+ u32 reg;
+ int i = 0, ndev = 0;
+
+ prev = NULL;
+ for (;;) {
+ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, m->did, prev);
+ if (!pdev)
+ break;
+ ndev++;
+ if (m->per_socket == NUM_IMC) {
+ for (i = 0; i < NUM_IMC; i++)
+ if (m->devfn[i] == pdev->devfn)
+ break;
+ if (i == NUM_IMC)
+ goto fail;
+ }
+ d = get_skx_dev(pdev->bus->number, m->busidx);
+ if (!d)
+ goto fail;
+
+ /* Be sure that the device is enabled */
+ if (unlikely(pci_enable_device(pdev) < 0)) {
+ skx_printk(KERN_ERR,
+ "Couldn't enable %04x:%04x\n", PCI_VENDOR_ID_INTEL, m->did);
+ goto fail;
+ }
+
+ switch (m->mtype) {
+ case CHAN0: case CHAN1: case CHAN2:
+ pci_dev_get(pdev);
+ d->imc[i].chan[m->mtype].cdev = pdev;
+ break;
+ case SAD_ALL:
+ pci_dev_get(pdev);
+ d->sad_all = pdev;
+ break;
+ case UTIL_ALL:
+ pci_dev_get(pdev);
+ d->util_all = pdev;
+ break;
+ case SAD:
+ /*
+ * one of these devices per core, including cores
+ * that don't exist on this SKU. Ignore any that
+ * read a route table of zero, make sure all the
+ * non-zero values match.
+ */
+ pci_read_config_dword(pdev, 0xB4, &reg);
+ if (reg != 0) {
+ if (d->mcroute == 0)
+ d->mcroute = reg;
+ else if (d->mcroute != reg) {
+ skx_printk(KERN_ERR,
+ "mcroute mismatch\n");
+ goto fail;
+ }
+ }
+ ndev--;
+ break;
+ }
+
+ prev = pdev;
+ }
+
+ return ndev;
+fail:
+ pci_dev_put(pdev);
+ return -ENODEV;
+}
+
+const struct x86_cpu_id skx_cpuids[] = {
+ { X86_VENDOR_INTEL, 6, 0x55, 0, 0 }, /* Skylake */
+ { }
+};
+MODULE_DEVICE_TABLE(x86cpu, skx_cpuids);
+
+static u8 get_src_id(struct skx_dev *d)
+{
+ u32 reg;
+
+ pci_read_config_dword(d->util_all, 0xF0, &reg);
+
+ return GET_BITFIELD(reg, 12, 14);
+}
+
+static u8 skx_get_node_id(struct skx_dev *d)
+{
+ u32 reg;
+
+ pci_read_config_dword(d->util_all, 0xF4, &reg);
+
+ return GET_BITFIELD(reg, 0, 2);
+}
+
+static int get_dimm_attr(u32 reg, int lobit, int hibit, int add, int minval,
+ int maxval, char *name)
+{
+ u32 val = GET_BITFIELD(reg, lobit, hibit);
+
+ if (val < minval || val > maxval) {
+ edac_dbg(2, "bad %s = %d (raw=%x)\n", name, val, reg);
+ return -EINVAL;
+ }
+ return val + add;
+}
+
+#define IS_DIMM_PRESENT(mtr) GET_BITFIELD((mtr), 15, 15)
+
+#define numrank(reg) get_dimm_attr((reg), 12, 13, 0, 1, 2, "ranks")
+#define numrow(reg) get_dimm_attr((reg), 2, 4, 12, 1, 6, "rows")
+#define numcol(reg) get_dimm_attr((reg), 0, 1, 10, 0, 2, "cols")
+
+static int get_width(u32 mtr)
+{
+ switch (GET_BITFIELD(mtr, 8, 9)) {
+ case 0:
+ return DEV_X4;
+ case 1:
+ return DEV_X8;
+ case 2:
+ return DEV_X16;
+ }
+ return DEV_UNKNOWN;
+}
+
+static int skx_get_hi_lo(void)
+{
+ struct pci_dev *pdev;
+ u32 reg;
+
+ pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x2034, NULL);
+ if (!pdev) {
+ edac_dbg(0, "Can't get tolm/tohm\n");
+ return -ENODEV;
+ }
+
+ pci_read_config_dword(pdev, 0xD0, &reg);
+ skx_tolm = reg;
+ pci_read_config_dword(pdev, 0xD4, &reg);
+ skx_tohm = reg;
+ pci_read_config_dword(pdev, 0xD8, &reg);
+ skx_tohm |= (u64)reg << 32;
+
+ pci_dev_put(pdev);
+ edac_dbg(2, "tolm=%llx tohm=%llx\n", skx_tolm, skx_tohm);
+
+ return 0;
+}
+
+static int get_dimm_info(u32 mtr, u32 amap, struct dimm_info *dimm,
+ struct skx_imc *imc, int chan, int dimmno)
+{
+ int banks = 16, ranks, rows, cols, npages;
+ u64 size;
+
+ if (!IS_DIMM_PRESENT(mtr))
+ return 0;
+ ranks = numrank(mtr);
+ rows = numrow(mtr);
+ cols = numcol(mtr);
+
+ /*
+ * Compute size in 8-byte (2^3) words, then shift to MiB (2^20)
+ */
+ size = ((1ull << (rows + cols + ranks)) * banks) >> (20 - 3);
+ npages = MiB_TO_PAGES(size);
+
+ edac_dbg(0, "mc#%d: channel %d, dimm %d, %lld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
+ imc->mc, chan, dimmno, size, npages,
+ banks, ranks, rows, cols);
+
+ imc->chan[chan].dimms[dimmno].close_pg = GET_BITFIELD(mtr, 0, 0);
+ imc->chan[chan].dimms[dimmno].bank_xor_enable = GET_BITFIELD(mtr, 9, 9);
+ imc->chan[chan].dimms[dimmno].fine_grain_bank = GET_BITFIELD(amap, 0, 0);
+ imc->chan[chan].dimms[dimmno].rowbits = rows;
+ imc->chan[chan].dimms[dimmno].colbits = cols;
+
+ dimm->nr_pages = npages;
+ dimm->grain = 32;
+ dimm->dtype = get_width(mtr);
+ dimm->mtype = MEM_DDR4;
+ dimm->edac_mode = EDAC_SECDED; /* likely better than this */
+ snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
+ imc->src_id, imc->lmc, chan, dimmno);
+
+ return 1;
+}
+
+#define SKX_GET_MTMTR(dev, reg) \
+ pci_read_config_dword((dev), 0x87c, &reg)
+
+static bool skx_check_ecc(struct pci_dev *pdev)
+{
+ u32 mtmtr;
+
+ SKX_GET_MTMTR(pdev, mtmtr);
+
+ return !!GET_BITFIELD(mtmtr, 2, 2);
+}
+
+static int skx_get_dimm_config(struct mem_ctl_info *mci)
+{
+ struct skx_pvt *pvt = mci->pvt_info;
+ struct skx_imc *imc = pvt->imc;
+ struct dimm_info *dimm;
+ int i, j;
+ u32 mtr, amap;
+ int ndimms;
+
+ for (i = 0; i < NUM_CHANNELS; i++) {
+ ndimms = 0;
+ pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap);
+ for (j = 0; j < NUM_DIMMS; j++) {
+ dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
+ mci->n_layers, i, j, 0);
+ pci_read_config_dword(imc->chan[i].cdev,
+ 0x80 + 4*j, &mtr);
+ ndimms += get_dimm_info(mtr, amap, dimm, imc, i, j);
+ }
+ if (ndimms && !skx_check_ecc(imc->chan[0].cdev)) {
+ skx_printk(KERN_ERR, "ECC is disabled on imc %d\n", imc->mc);
+ return -ENODEV;
+ }
+ }
+
+ return 0;
+}
+
+static void skx_unregister_mci(struct skx_imc *imc)
+{
+ struct mem_ctl_info *mci = imc->mci;
+
+ if (!mci)
+ return;
+
+ edac_dbg(0, "MC%d: mci = %p\n", imc->mc, mci);
+
+ /* Remove MC sysfs nodes */
+ edac_mc_del_mc(mci->pdev);
+
+ edac_dbg(1, "%s: free mci struct\n", mci->ctl_name);
+ kfree(mci->ctl_name);
+ edac_mc_free(mci);
+}
+
+static int skx_register_mci(struct skx_imc *imc)
+{
+ struct mem_ctl_info *mci;
+ struct edac_mc_layer layers[2];
+ struct pci_dev *pdev = imc->chan[0].cdev;
+ struct skx_pvt *pvt;
+ int rc;
+
+ /* allocate a new MC control structure */
+ layers[0].type = EDAC_MC_LAYER_CHANNEL;
+ layers[0].size = NUM_CHANNELS;
+ layers[0].is_virt_csrow = false;
+ layers[1].type = EDAC_MC_LAYER_SLOT;
+ layers[1].size = NUM_DIMMS;
+ layers[1].is_virt_csrow = true;
+ mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers,
+ sizeof(struct skx_pvt));
+
+ if (unlikely(!mci))
+ return -ENOMEM;
+
+ edac_dbg(0, "MC#%d: mci = %p\n", imc->mc, mci);
+
+ /* Associate skx_dev and mci for future usage */
+ imc->mci = mci;
+ pvt = mci->pvt_info;
+ pvt->imc = imc;
+
+ mci->ctl_name = kasprintf(GFP_KERNEL, "Skylake Socket#%d IMC#%d",
+ imc->node_id, imc->lmc);
+ mci->mtype_cap = MEM_FLAG_DDR4;
+ mci->edac_ctl_cap = EDAC_FLAG_NONE;
+ mci->edac_cap = EDAC_FLAG_NONE;
+ mci->mod_name = "skx_edac.c";
+ mci->dev_name = pci_name(imc->chan[0].cdev);
+ mci->mod_ver = SKX_REVISION;
+ mci->ctl_page_to_phys = NULL;
+
+ rc = skx_get_dimm_config(mci);
+ if (rc < 0)
+ goto fail;
+
+ /* record ptr to the generic device */
+ mci->pdev = &pdev->dev;
+
+ /* add this new MC control structure to EDAC's list of MCs */
+ if (unlikely(edac_mc_add_mc(mci))) {
+ edac_dbg(0, "MC: failed edac_mc_add_mc()\n");
+ rc = -EINVAL;
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ kfree(mci->ctl_name);
+ edac_mc_free(mci);
+ imc->mci = NULL;
+ return rc;
+}
+
+#define SKX_MAX_SAD 24
+
+#define SKX_GET_SAD(d, i, reg) \
+ pci_read_config_dword((d)->sad_all, 0x60 + 8 * (i), &reg)
+#define SKX_GET_ILV(d, i, reg) \
+ pci_read_config_dword((d)->sad_all, 0x64 + 8 * (i), &reg)
+
+#define SKX_SAD_MOD3MODE(sad) GET_BITFIELD((sad), 30, 31)
+#define SKX_SAD_MOD3(sad) GET_BITFIELD((sad), 27, 27)
+#define SKX_SAD_LIMIT(sad) (((u64)GET_BITFIELD((sad), 7, 26) << 26) | MASK26)
+#define SKX_SAD_MOD3ASMOD2(sad) GET_BITFIELD((sad), 5, 6)
+#define SKX_SAD_ATTR(sad) GET_BITFIELD((sad), 3, 4)
+#define SKX_SAD_INTERLEAVE(sad) GET_BITFIELD((sad), 1, 2)
+#define SKX_SAD_ENABLE(sad) GET_BITFIELD((sad), 0, 0)
+
+#define SKX_ILV_REMOTE(tgt) (((tgt) & 8) == 0)
+#define SKX_ILV_TARGET(tgt) ((tgt) & 7)
+
+static bool skx_sad_decode(struct decoded_addr *res)
+{
+ struct skx_dev *d = list_first_entry(&skx_edac_list, typeof(*d), list);
+ u64 addr = res->addr;
+ int i, idx, tgt, lchan, shift;
+ u32 sad, ilv;
+ u64 limit, prev_limit;
+ int remote = 0;
+
+ /* Simple sanity check for I/O space or out of range */
+ if (addr >= skx_tohm || (addr >= skx_tolm && addr < BIT_ULL(32))) {
+ edac_dbg(0, "Address %llx out of range\n", addr);
+ return false;
+ }
+
+restart:
+ prev_limit = 0;
+ for (i = 0; i < SKX_MAX_SAD; i++) {
+ SKX_GET_SAD(d, i, sad);
+ limit = SKX_SAD_LIMIT(sad);
+ if (SKX_SAD_ENABLE(sad)) {
+ if (addr >= prev_limit && addr <= limit)
+ goto sad_found;
+ }
+ prev_limit = limit + 1;
+ }
+ edac_dbg(0, "No SAD entry for %llx\n", addr);
+ return false;
+
+sad_found:
+ SKX_GET_ILV(d, i, ilv);
+
+ switch (SKX_SAD_INTERLEAVE(sad)) {
+ case 0:
+ idx = GET_BITFIELD(addr, 6, 8);
+ break;
+ case 1:
+ idx = GET_BITFIELD(addr, 8, 10);
+ break;
+ case 2:
+ idx = GET_BITFIELD(addr, 12, 14);
+ break;
+ case 3:
+ idx = GET_BITFIELD(addr, 30, 32);
+ break;
+ }
+
+ tgt = GET_BITFIELD(ilv, 4 * idx, 4 * idx + 3);
+
+ /* If point to another node, find it and start over */
+ if (SKX_ILV_REMOTE(tgt)) {
+ if (remote) {
+ edac_dbg(0, "Double remote!\n");
+ return false;
+ }
+ remote = 1;
+ list_for_each_entry(d, &skx_edac_list, list) {
+ if (d->imc[0].src_id == SKX_ILV_TARGET(tgt))
+ goto restart;
+ }
+ edac_dbg(0, "Can't find node %d\n", SKX_ILV_TARGET(tgt));
+ return false;
+ }
+
+ if (SKX_SAD_MOD3(sad) == 0)
+ lchan = SKX_ILV_TARGET(tgt);
+ else {
+ switch (SKX_SAD_MOD3MODE(sad)) {
+ case 0:
+ shift = 6;
+ break;
+ case 1:
+ shift = 8;
+ break;
+ case 2:
+ shift = 12;
+ break;
+ default:
+ edac_dbg(0, "illegal mod3mode\n");
+ return false;
+ }
+ switch (SKX_SAD_MOD3ASMOD2(sad)) {
+ case 0:
+ lchan = (addr >> shift) % 3;
+ break;
+ case 1:
+ lchan = (addr >> shift) % 2;
+ break;
+ case 2:
+ lchan = (addr >> shift) % 2;
+ lchan = (lchan << 1) | ~lchan;
+ break;
+ case 3:
+ lchan = ((addr >> shift) % 2) << 1;
+ break;
+ }
+ lchan = (lchan << 1) | (SKX_ILV_TARGET(tgt) & 1);
+ }
+
+ res->dev = d;
+ res->socket = d->imc[0].src_id;
+ res->imc = GET_BITFIELD(d->mcroute, lchan * 3, lchan * 3 + 2);
+ res->channel = GET_BITFIELD(d->mcroute, lchan * 2 + 18, lchan * 2 + 19);
+
+ edac_dbg(2, "%llx: socket=%d imc=%d channel=%d\n",
+ res->addr, res->socket, res->imc, res->channel);
+ return true;
+}
+
+#define SKX_MAX_TAD 8
+
+#define SKX_GET_TADBASE(d, mc, i, reg) \
+ pci_read_config_dword((d)->imc[mc].chan[0].cdev, 0x850 + 4 * (i), &reg)
+#define SKX_GET_TADWAYNESS(d, mc, i, reg) \
+ pci_read_config_dword((d)->imc[mc].chan[0].cdev, 0x880 + 4 * (i), &reg)
+#define SKX_GET_TADCHNILVOFFSET(d, mc, ch, i, reg) \
+ pci_read_config_dword((d)->imc[mc].chan[ch].cdev, 0x90 + 4 * (i), &reg)
+
+#define SKX_TAD_BASE(b) ((u64)GET_BITFIELD((b), 12, 31) << 26)
+#define SKX_TAD_SKT_GRAN(b) GET_BITFIELD((b), 4, 5)
+#define SKX_TAD_CHN_GRAN(b) GET_BITFIELD((b), 6, 7)
+#define SKX_TAD_LIMIT(b) (((u64)GET_BITFIELD((b), 12, 31) << 26) | MASK26)
+#define SKX_TAD_OFFSET(b) ((u64)GET_BITFIELD((b), 4, 23) << 26)
+#define SKX_TAD_SKTWAYS(b) (1 << GET_BITFIELD((b), 10, 11))
+#define SKX_TAD_CHNWAYS(b) (GET_BITFIELD((b), 8, 9) + 1)
+
+/* which bit used for both socket and channel interleave */
+static int skx_granularity[] = { 6, 8, 12, 30 };
+
+static u64 skx_do_interleave(u64 addr, int shift, int ways, u64 lowbits)
+{
+ addr >>= shift;
+ addr /= ways;
+ addr <<= shift;
+
+ return addr | (lowbits & ((1ull << shift) - 1));
+}
+
+static bool skx_tad_decode(struct decoded_addr *res)
+{
+ int i;
+ u32 base, wayness, chnilvoffset;
+ int skt_interleave_bit, chn_interleave_bit;
+ u64 channel_addr;
+
+ for (i = 0; i < SKX_MAX_TAD; i++) {
+ SKX_GET_TADBASE(res->dev, res->imc, i, base);
+ SKX_GET_TADWAYNESS(res->dev, res->imc, i, wayness);
+ if (SKX_TAD_BASE(base) <= res->addr && res->addr <= SKX_TAD_LIMIT(wayness))
+ goto tad_found;
+ }
+ edac_dbg(0, "No TAD entry for %llx\n", res->addr);
+ return false;
+
+tad_found:
+ res->sktways = SKX_TAD_SKTWAYS(wayness);
+ res->chanways = SKX_TAD_CHNWAYS(wayness);
+ skt_interleave_bit = skx_granularity[SKX_TAD_SKT_GRAN(base)];
+ chn_interleave_bit = skx_granularity[SKX_TAD_CHN_GRAN(base)];
+
+ SKX_GET_TADCHNILVOFFSET(res->dev, res->imc, res->channel, i, chnilvoffset);
+ channel_addr = res->addr - SKX_TAD_OFFSET(chnilvoffset);
+
+ if (res->chanways == 3 && skt_interleave_bit > chn_interleave_bit) {
+ /* Must handle channel first, then socket */
+ channel_addr = skx_do_interleave(channel_addr, chn_interleave_bit,
+ res->chanways, channel_addr);
+ channel_addr = skx_do_interleave(channel_addr, skt_interleave_bit,
+ res->sktways, channel_addr);
+ } else {
+ /* Handle socket then channel. Preserve low bits from original address */
+ channel_addr = skx_do_interleave(channel_addr, skt_interleave_bit,
+ res->sktways, res->addr);
+ channel_addr = skx_do_interleave(channel_addr, chn_interleave_bit,
+ res->chanways, res->addr);
+ }
+
+ res->chan_addr = channel_addr;
+
+ edac_dbg(2, "%llx: chan_addr=%llx sktways=%d chanways=%d\n",
+ res->addr, res->chan_addr, res->sktways, res->chanways);
+ return true;
+}
+
+#define SKX_MAX_RIR 4
+
+#define SKX_GET_RIRWAYNESS(d, mc, ch, i, reg) \
+ pci_read_config_dword((d)->imc[mc].chan[ch].cdev, \
+ 0x108 + 4 * (i), &reg)
+#define SKX_GET_RIRILV(d, mc, ch, idx, i, reg) \
+ pci_read_config_dword((d)->imc[mc].chan[ch].cdev, \
+ 0x120 + 16 * idx + 4 * (i), &reg)
+
+#define SKX_RIR_VALID(b) GET_BITFIELD((b), 31, 31)
+#define SKX_RIR_LIMIT(b) (((u64)GET_BITFIELD((b), 1, 11) << 29) | MASK29)
+#define SKX_RIR_WAYS(b) (1 << GET_BITFIELD((b), 28, 29))
+#define SKX_RIR_CHAN_RANK(b) GET_BITFIELD((b), 16, 19)
+#define SKX_RIR_OFFSET(b) ((u64)(GET_BITFIELD((b), 2, 15) << 26))
+
+static bool skx_rir_decode(struct decoded_addr *res)
+{
+ int i, idx, chan_rank;
+ int shift;
+ u32 rirway, rirlv;
+ u64 rank_addr, prev_limit = 0, limit;
+
+ if (res->dev->imc[res->imc].chan[res->channel].dimms[0].close_pg)
+ shift = 6;
+ else
+ shift = 13;
+
+ for (i = 0; i < SKX_MAX_RIR; i++) {
+ SKX_GET_RIRWAYNESS(res->dev, res->imc, res->channel, i, rirway);
+ limit = SKX_RIR_LIMIT(rirway);
+ if (SKX_RIR_VALID(rirway)) {
+ if (prev_limit <= res->chan_addr &&
+ res->chan_addr <= limit)
+ goto rir_found;
+ }
+ prev_limit = limit;
+ }
+ edac_dbg(0, "No RIR entry for %llx\n", res->addr);
+ return false;
+
+rir_found:
+ rank_addr = res->chan_addr >> shift;
+ rank_addr /= SKX_RIR_WAYS(rirway);
+ rank_addr <<= shift;
+ rank_addr |= res->chan_addr & GENMASK_ULL(shift - 1, 0);
+
+ res->rank_address = rank_addr;
+ idx = (res->chan_addr >> shift) % SKX_RIR_WAYS(rirway);
+
+ SKX_GET_RIRILV(res->dev, res->imc, res->channel, idx, i, rirlv);
+ res->rank_address = rank_addr - SKX_RIR_OFFSET(rirlv);
+ chan_rank = SKX_RIR_CHAN_RANK(rirlv);
+ res->channel_rank = chan_rank;
+ res->dimm = chan_rank / 4;
+ res->rank = chan_rank % 4;
+
+ edac_dbg(2, "%llx: dimm=%d rank=%d chan_rank=%d rank_addr=%llx\n",
+ res->addr, res->dimm, res->rank,
+ res->channel_rank, res->rank_address);
+ return true;
+}
+
+static u8 skx_close_row[] = {
+ 15, 16, 17, 18, 20, 21, 22, 28, 10, 11, 12, 13, 29, 30, 31, 32, 33
+};
+static u8 skx_close_column[] = {
+ 3, 4, 5, 14, 19, 23, 24, 25, 26, 27
+};
+static u8 skx_open_row[] = {
+ 14, 15, 16, 20, 28, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33
+};
+static u8 skx_open_column[] = {
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+};
+static u8 skx_open_fine_column[] = {
+ 3, 4, 5, 7, 8, 9, 10, 11, 12, 13
+};
+
+static int skx_bits(u64 addr, int nbits, u8 *bits)
+{
+ int i, res = 0;
+
+ for (i = 0; i < nbits; i++)
+ res |= ((addr >> bits[i]) & 1) << i;
+ return res;
+}
+
+static int skx_bank_bits(u64 addr, int b0, int b1, int do_xor, int x0, int x1)
+{
+ int ret = GET_BITFIELD(addr, b0, b0) | (GET_BITFIELD(addr, b1, b1) << 1);
+
+ if (do_xor)
+ ret ^= GET_BITFIELD(addr, x0, x0) | (GET_BITFIELD(addr, x1, x1) << 1);
+
+ return ret;
+}
+
+static bool skx_mad_decode(struct decoded_addr *r)
+{
+ struct skx_dimm *dimm = &r->dev->imc[r->imc].chan[r->channel].dimms[r->dimm];
+ int bg0 = dimm->fine_grain_bank ? 6 : 13;
+
+ if (dimm->close_pg) {
+ r->row = skx_bits(r->rank_address, dimm->rowbits, skx_close_row);
+ r->column = skx_bits(r->rank_address, dimm->colbits, skx_close_column);
+ r->column |= 0x400; /* C10 is autoprecharge, always set */
+ r->bank_address = skx_bank_bits(r->rank_address, 8, 9, dimm->bank_xor_enable, 22, 28);
+ r->bank_group = skx_bank_bits(r->rank_address, 6, 7, dimm->bank_xor_enable, 20, 21);
+ } else {
+ r->row = skx_bits(r->rank_address, dimm->rowbits, skx_open_row);
+ if (dimm->fine_grain_bank)
+ r->column = skx_bits(r->rank_address, dimm->colbits, skx_open_fine_column);
+ else
+ r->column = skx_bits(r->rank_address, dimm->colbits, skx_open_column);
+ r->bank_address = skx_bank_bits(r->rank_address, 18, 19, dimm->bank_xor_enable, 22, 23);
+ r->bank_group = skx_bank_bits(r->rank_address, bg0, 17, dimm->bank_xor_enable, 20, 21);
+ }
+ r->row &= (1u << dimm->rowbits) - 1;
+
+ edac_dbg(2, "%llx: row=%x col=%x bank_addr=%d bank_group=%d\n",
+ r->addr, r->row, r->column, r->bank_address,
+ r->bank_group);
+ return true;
+}
+
+static bool skx_decode(struct decoded_addr *res)
+{
+
+ return skx_sad_decode(res) && skx_tad_decode(res) &&
+ skx_rir_decode(res) && skx_mad_decode(res);
+}
+
+#ifdef CONFIG_EDAC_DEBUG
+/*
+ * Debug feature. Make /sys/kernel/debug/skx_edac_test/addr.
+ * Write an address to this file to exercise the address decode
+ * logic in this driver.
+ */
+static struct dentry *skx_test;
+static u64 skx_fake_addr;
+
+static int debugfs_u64_set(void *data, u64 val)
+{
+ struct decoded_addr res;
+
+ res.addr = val;
+ skx_decode(&res);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
+
+static struct dentry *mydebugfs_create(const char *name, umode_t mode,
+ struct dentry *parent, u64 *value)
+{
+ return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
+}
+
+static void setup_skx_debug(void)
+{
+ skx_test = debugfs_create_dir("skx_edac_test", NULL);
+ mydebugfs_create("addr", S_IWUSR, skx_test, &skx_fake_addr);
+}
+
+static void teardown_skx_debug(void)
+{
+ debugfs_remove_recursive(skx_test);
+}
+#else
+static void setup_skx_debug(void)
+{
+}
+
+static void teardown_skx_debug(void)
+{
+}
+#endif /*CONFIG_EDAC_DEBUG*/
+
+static void skx_mce_output_error(struct mem_ctl_info *mci,
+ const struct mce *m,
+ struct decoded_addr *res)
+{
+ enum hw_event_mc_err_type tp_event;
+ char *type, *optype, msg[256];
+ bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
+ bool overflow = GET_BITFIELD(m->status, 62, 62);
+ bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
+ bool recoverable;
+ u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52);
+ u32 mscod = GET_BITFIELD(m->status, 16, 31);
+ u32 errcode = GET_BITFIELD(m->status, 0, 15);
+ u32 optypenum = GET_BITFIELD(m->status, 4, 6);
+
+ recoverable = GET_BITFIELD(m->status, 56, 56);
+
+ if (uncorrected_error) {
+ if (ripv) {
+ type = "FATAL";
+ tp_event = HW_EVENT_ERR_FATAL;
+ } else {
+ type = "NON_FATAL";
+ tp_event = HW_EVENT_ERR_UNCORRECTED;
+ }
+ } else {
+ type = "CORRECTED";
+ tp_event = HW_EVENT_ERR_CORRECTED;
+ }
+
+ /*
+ * According with Table 15-9 of the Intel Architecture spec vol 3A,
+ * memory errors should fit in this mask:
+ * 000f 0000 1mmm cccc (binary)
+ * where:
+ * f = Correction Report Filtering Bit. If 1, subsequent errors
+ * won't be shown
+ * mmm = error type
+ * cccc = channel
+ * If the mask doesn't match, report an error to the parsing logic
+ */
+ if (!((errcode & 0xef80) == 0x80)) {
+ optype = "Can't parse: it is not a mem";
+ } else {
+ switch (optypenum) {
+ case 0:
+ optype = "generic undef request error";
+ break;
+ case 1:
+ optype = "memory read error";
+ break;
+ case 2:
+ optype = "memory write error";
+ break;
+ case 3:
+ optype = "addr/cmd error";
+ break;
+ case 4:
+ optype = "memory scrubbing error";
+ break;
+ default:
+ optype = "reserved";
+ break;
+ }
+ }
+
+ snprintf(msg, sizeof(msg),
+ "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
+ overflow ? " OVERFLOW" : "",
+ (uncorrected_error && recoverable) ? " recoverable" : "",
+ mscod, errcode,
+ res->socket, res->imc, res->rank,
+ res->bank_group, res->bank_address, res->row, res->column);
+
+ edac_dbg(0, "%s\n", msg);
+
+ /* Call the helper to output message */
+ edac_mc_handle_error(tp_event, mci, core_err_cnt,
+ m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
+ res->channel, res->dimm, -1,
+ optype, msg);
+}
+
+static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ struct decoded_addr res;
+ struct mem_ctl_info *mci;
+ char *type;
+
+ if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+ return NOTIFY_DONE;
+
+ /* ignore unless this is memory related with an address */
+ if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
+ return NOTIFY_DONE;
+
+ res.addr = mce->addr;
+ if (!skx_decode(&res))
+ return NOTIFY_DONE;
+ mci = res.dev->imc[res.imc].mci;
+
+ if (mce->mcgstatus & MCG_STATUS_MCIP)
+ type = "Exception";
+ else
+ type = "Event";
+
+ skx_mc_printk(mci, KERN_DEBUG, "HANDLING MCE MEMORY ERROR\n");
+
+ skx_mc_printk(mci, KERN_DEBUG, "CPU %d: Machine Check %s: %Lx "
+ "Bank %d: %016Lx\n", mce->extcpu, type,
+ mce->mcgstatus, mce->bank, mce->status);
+ skx_mc_printk(mci, KERN_DEBUG, "TSC %llx ", mce->tsc);
+ skx_mc_printk(mci, KERN_DEBUG, "ADDR %llx ", mce->addr);
+ skx_mc_printk(mci, KERN_DEBUG, "MISC %llx ", mce->misc);
+
+ skx_mc_printk(mci, KERN_DEBUG, "PROCESSOR %u:%x TIME %llu SOCKET "
+ "%u APIC %x\n", mce->cpuvendor, mce->cpuid,
+ mce->time, mce->socketid, mce->apicid);
+
+ skx_mce_output_error(mci, mce, &res);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block skx_mce_dec = {
+ .notifier_call = skx_mce_check_error,
+};
+
+static void skx_remove(void)
+{
+ int i, j;
+ struct skx_dev *d, *tmp;
+
+ edac_dbg(0, "\n");
+
+ list_for_each_entry_safe(d, tmp, &skx_edac_list, list) {
+ list_del(&d->list);
+ for (i = 0; i < NUM_IMC; i++) {
+ skx_unregister_mci(&d->imc[i]);
+ for (j = 0; j < NUM_CHANNELS; j++)
+ pci_dev_put(d->imc[i].chan[j].cdev);
+ }
+ pci_dev_put(d->util_all);
+ pci_dev_put(d->sad_all);
+
+ kfree(d);
+ }
+}
+
+/*
+ * skx_init:
+ * make sure we are running on the correct cpu model
+ * search for all the devices we need
+ * check which DIMMs are present.
+ */
+int __init skx_init(void)
+{
+ const struct x86_cpu_id *id;
+ const struct munit *m;
+ int rc = 0, i;
+ u8 mc = 0, src_id, node_id;
+ struct skx_dev *d;
+
+ edac_dbg(2, "\n");
+
+ id = x86_match_cpu(skx_cpuids);
+ if (!id)
+ return -ENODEV;
+
+ rc = skx_get_hi_lo();
+ if (rc)
+ return rc;
+
+ rc = get_all_bus_mappings();
+ if (rc < 0)
+ goto fail;
+ if (rc == 0) {
+ edac_dbg(2, "No memory controllers found\n");
+ return -ENODEV;
+ }
+
+ for (m = skx_all_munits; m->did; m++) {
+ rc = get_all_munits(m);
+ if (rc < 0)
+ goto fail;
+ if (rc != m->per_socket * skx_num_sockets) {
+ edac_dbg(2, "Expected %d, got %d of %x\n",
+ m->per_socket * skx_num_sockets, rc, m->did);
+ rc = -ENODEV;
+ goto fail;
+ }
+ }
+
+ list_for_each_entry(d, &skx_edac_list, list) {
+ src_id = get_src_id(d);
+ node_id = skx_get_node_id(d);
+ edac_dbg(2, "src_id=%d node_id=%d\n", src_id, node_id);
+ for (i = 0; i < NUM_IMC; i++) {
+ d->imc[i].mc = mc++;
+ d->imc[i].lmc = i;
+ d->imc[i].src_id = src_id;
+ d->imc[i].node_id = node_id;
+ rc = skx_register_mci(&d->imc[i]);
+ if (rc < 0)
+ goto fail;
+ }
+ }
+
+ /* Ensure that the OPSTATE is set correctly for POLL or NMI */
+ opstate_init();
+
+ setup_skx_debug();
+
+ mce_register_decode_chain(&skx_mce_dec);
+
+ return 0;
+fail:
+ skx_remove();
+ return rc;
+}
+
+static void __exit skx_exit(void)
+{
+ edac_dbg(2, "\n");
+ mce_unregister_decode_chain(&skx_mce_dec);
+ skx_remove();
+ teardown_skx_debug();
+}
+
+module_init(skx_init);
+module_exit(skx_exit);
+
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tony Luck");
+MODULE_DESCRIPTION("MC Driver for Intel Skylake server processors");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8ebc5f1eb4c0..8c704c86597b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -646,9 +646,9 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
int amdgpu_gart_init(struct amdgpu_device *adev);
void amdgpu_gart_fini(struct amdgpu_device *adev);
-void amdgpu_gart_unbind(struct amdgpu_device *adev, unsigned offset,
+void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
int pages);
-int amdgpu_gart_bind(struct amdgpu_device *adev, unsigned offset,
+int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
int pages, struct page **pagelist,
dma_addr_t *dma_addr, uint32_t flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
index 49de92600074..10b5ddf2c588 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
@@ -200,16 +200,7 @@ static int amdgpu_atpx_validate(struct amdgpu_atpx *atpx)
atpx->is_hybrid = false;
if (valid_bits & ATPX_MS_HYBRID_GFX_SUPPORTED) {
printk("ATPX Hybrid Graphics\n");
-#if 1
- /* This is a temporary hack until the D3 cold support
- * makes it upstream. The ATPX power_control method seems
- * to still work on even if the system should be using
- * the new standardized hybrid D3 cold ACPI interface.
- */
- atpx->functions.power_cntl = true;
-#else
atpx->functions.power_cntl = false;
-#endif
atpx->is_hybrid = true;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index 921bce2df0b0..0feea347f680 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -221,7 +221,7 @@ void amdgpu_gart_table_vram_free(struct amdgpu_device *adev)
* Unbinds the requested pages from the gart page table and
* replaces them with the dummy page (all asics).
*/
-void amdgpu_gart_unbind(struct amdgpu_device *adev, unsigned offset,
+void amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
int pages)
{
unsigned t;
@@ -268,7 +268,7 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, unsigned offset,
* (all asics).
* Returns 0 for success, -EINVAL for failure.
*/
-int amdgpu_gart_bind(struct amdgpu_device *adev, unsigned offset,
+int amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
int pages, struct page **pagelist, dma_addr_t *dma_addr,
uint32_t flags)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index b11f4e8868d7..4aa993d19018 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -1187,7 +1187,8 @@ int amdgpu_uvd_ring_test_ib(struct amdgpu_ring *ring, long timeout)
r = 0;
}
-error:
fence_put(fence);
+
+error:
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 8e642fc48df4..80120fa4092c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1535,7 +1535,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
r = amd_sched_entity_init(&ring->sched, &vm->entity,
rq, amdgpu_sched_jobs);
if (r)
- return r;
+ goto err;
vm->page_directory_fence = NULL;
@@ -1565,6 +1565,9 @@ error_free_page_directory:
error_free_sched_entity:
amd_sched_entity_fini(&ring->sched, &vm->entity);
+err:
+ drm_free_large(vm->page_tables);
+
return r;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
index e621eba63126..a7d3cb3fead0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
@@ -184,7 +184,7 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
sizeof(u32)) + inx;
pr_debug("kfd: get kernel queue doorbell\n"
- " doorbell offset == 0x%08d\n"
+ " doorbell offset == 0x%08X\n"
" kernel address == 0x%08lX\n",
*doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx));
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index ce54e985d91b..0a06f9120b5a 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -464,7 +464,7 @@ static bool drm_fb_helper_is_bound(struct drm_fb_helper *fb_helper)
/* Sometimes user space wants everything disabled, so don't steal the
* display if there's a master. */
- if (lockless_dereference(dev->master))
+ if (READ_ONCE(dev->master))
return false;
drm_for_each_crtc(crtc, dev) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
index 87ef34150d46..b382cf505262 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
@@ -1333,8 +1333,6 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
if (ret < 0)
return ret;
- mutex_lock(&gpu->lock);
-
/*
* TODO
*
@@ -1348,16 +1346,18 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
if (unlikely(event == ~0U)) {
DRM_ERROR("no free event\n");
ret = -EBUSY;
- goto out_unlock;
+ goto out_pm_put;
}
fence = etnaviv_gpu_fence_alloc(gpu);
if (!fence) {
event_free(gpu, event);
ret = -ENOMEM;
- goto out_unlock;
+ goto out_pm_put;
}
+ mutex_lock(&gpu->lock);
+
gpu->event[event].fence = fence;
submit->fence = fence->seqno;
gpu->active_fence = submit->fence;
@@ -1395,9 +1395,9 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu,
hangcheck_timer_reset(gpu);
ret = 0;
-out_unlock:
mutex_unlock(&gpu->lock);
+out_pm_put:
etnaviv_gpu_pm_put(gpu);
return ret;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 21f939074abc..20fe9d52e256 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1854,6 +1854,7 @@ struct drm_i915_private {
enum modeset_restore modeset_restore;
struct mutex modeset_restore_lock;
struct drm_atomic_state *modeset_restore_state;
+ struct drm_modeset_acquire_ctx reset_ctx;
struct list_head vm_list; /* Global list of all address spaces */
struct i915_ggtt ggtt; /* VM representing the global address space */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 11681501d7b1..a77ce9983f69 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -879,9 +879,12 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
ret = i915_gem_shmem_pread(dev, obj, args, file);
/* pread for non shmem backed objects */
- if (ret == -EFAULT || ret == -ENODEV)
+ if (ret == -EFAULT || ret == -ENODEV) {
+ intel_runtime_pm_get(to_i915(dev));
ret = i915_gem_gtt_pread(dev, obj, args->size,
args->offset, args->data_ptr);
+ intel_runtime_pm_put(to_i915(dev));
+ }
out:
drm_gem_object_unreference(&obj->base);
@@ -1306,7 +1309,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
* textures). Fallback to the shmem path in that case. */
}
- if (ret == -EFAULT) {
+ if (ret == -EFAULT || ret == -ENOSPC) {
if (obj->phys_handle)
ret = i915_gem_phys_pwrite(obj, args, file);
else if (i915_gem_object_has_struct_page(obj))
@@ -3169,6 +3172,8 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
}
intel_ring_init_seqno(engine, engine->last_submitted_seqno);
+
+ engine->i915->gt.active_engines &= ~intel_engine_flag(engine);
}
void i915_gem_reset(struct drm_device *dev)
@@ -3186,6 +3191,7 @@ void i915_gem_reset(struct drm_device *dev)
for_each_engine(engine, dev_priv)
i915_gem_reset_engine_cleanup(engine);
+ mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
i915_gem_context_reset(dev);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 10f1e32767e6..7a30af79d799 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2873,6 +2873,7 @@ void i915_ggtt_cleanup_hw(struct drm_device *dev)
struct i915_hw_ppgtt *ppgtt = dev_priv->mm.aliasing_ppgtt;
ppgtt->base.cleanup(&ppgtt->base);
+ kfree(ppgtt);
}
i915_gem_cleanup_stolen(dev);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index ce14fe09d962..5c06413ae0e6 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -1536,6 +1536,7 @@ enum skl_disp_power_wells {
#define BALANCE_LEG_MASK(port) (7<<(8+3*(port)))
/* Balance leg disable bits */
#define BALANCE_LEG_DISABLE_SHIFT 23
+#define BALANCE_LEG_DISABLE(port) (1 << (23 + (port)))
/*
* Fence registers
diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c
index 6700a7be7f78..d32f586f9c05 100644
--- a/drivers/gpu/drm/i915/intel_audio.c
+++ b/drivers/gpu/drm/i915/intel_audio.c
@@ -600,6 +600,8 @@ static void i915_audio_component_codec_wake_override(struct device *dev,
if (!IS_SKYLAKE(dev_priv) && !IS_KABYLAKE(dev_priv))
return;
+ i915_audio_component_get_power(dev);
+
/*
* Enable/disable generating the codec wake signal, overriding the
* internal logic to generate the codec wake to controller.
@@ -615,6 +617,8 @@ static void i915_audio_component_codec_wake_override(struct device *dev,
I915_WRITE(HSW_AUD_CHICKENBIT, tmp);
usleep_range(1000, 1500);
}
+
+ i915_audio_component_put_power(dev);
}
/* Get CDCLK in kHz */
@@ -648,6 +652,7 @@ static int i915_audio_component_sync_audio_rate(struct device *dev,
!IS_HASWELL(dev_priv))
return 0;
+ i915_audio_component_get_power(dev);
mutex_lock(&dev_priv->av_mutex);
/* 1. get the pipe */
intel_encoder = dev_priv->dig_port_map[port];
@@ -698,6 +703,7 @@ static int i915_audio_component_sync_audio_rate(struct device *dev,
unlock:
mutex_unlock(&dev_priv->av_mutex);
+ i915_audio_component_put_power(dev);
return err;
}
diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c
index dd1d6fe12297..1a7efac65fd5 100644
--- a/drivers/gpu/drm/i915/intel_ddi.c
+++ b/drivers/gpu/drm/i915/intel_ddi.c
@@ -145,7 +145,7 @@ static const struct ddi_buf_trans skl_ddi_translations_dp[] = {
static const struct ddi_buf_trans skl_u_ddi_translations_dp[] = {
{ 0x0000201B, 0x000000A2, 0x0 },
{ 0x00005012, 0x00000088, 0x0 },
- { 0x80007011, 0x000000CD, 0x0 },
+ { 0x80007011, 0x000000CD, 0x1 },
{ 0x80009010, 0x000000C0, 0x1 },
{ 0x0000201B, 0x0000009D, 0x0 },
{ 0x80005012, 0x000000C0, 0x1 },
@@ -158,7 +158,7 @@ static const struct ddi_buf_trans skl_u_ddi_translations_dp[] = {
static const struct ddi_buf_trans skl_y_ddi_translations_dp[] = {
{ 0x00000018, 0x000000A2, 0x0 },
{ 0x00005012, 0x00000088, 0x0 },
- { 0x80007011, 0x000000CD, 0x0 },
+ { 0x80007011, 0x000000CD, 0x3 },
{ 0x80009010, 0x000000C0, 0x3 },
{ 0x00000018, 0x0000009D, 0x0 },
{ 0x80005012, 0x000000C0, 0x3 },
@@ -388,6 +388,40 @@ skl_get_buf_trans_hdmi(struct drm_i915_private *dev_priv, int *n_entries)
}
}
+static int intel_ddi_hdmi_level(struct drm_i915_private *dev_priv, enum port port)
+{
+ int n_hdmi_entries;
+ int hdmi_level;
+ int hdmi_default_entry;
+
+ hdmi_level = dev_priv->vbt.ddi_port_info[port].hdmi_level_shift;
+
+ if (IS_BROXTON(dev_priv))
+ return hdmi_level;
+
+ if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) {
+ skl_get_buf_trans_hdmi(dev_priv, &n_hdmi_entries);
+ hdmi_default_entry = 8;
+ } else if (IS_BROADWELL(dev_priv)) {
+ n_hdmi_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi);
+ hdmi_default_entry = 7;
+ } else if (IS_HASWELL(dev_priv)) {
+ n_hdmi_entries = ARRAY_SIZE(hsw_ddi_translations_hdmi);
+ hdmi_default_entry = 6;
+ } else {
+ WARN(1, "ddi translation table missing\n");
+ n_hdmi_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi);
+ hdmi_default_entry = 7;
+ }
+
+ /* Choose a good default if VBT is badly populated */
+ if (hdmi_level == HDMI_LEVEL_SHIFT_UNKNOWN ||
+ hdmi_level >= n_hdmi_entries)
+ hdmi_level = hdmi_default_entry;
+
+ return hdmi_level;
+}
+
/*
* Starting with Haswell, DDI port buffers must be programmed with correct
* values in advance. The buffer values are different for FDI and DP modes,
@@ -399,7 +433,7 @@ void intel_prepare_ddi_buffer(struct intel_encoder *encoder)
{
struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
u32 iboost_bit = 0;
- int i, n_hdmi_entries, n_dp_entries, n_edp_entries, hdmi_default_entry,
+ int i, n_hdmi_entries, n_dp_entries, n_edp_entries,
size;
int hdmi_level;
enum port port;
@@ -410,7 +444,7 @@ void intel_prepare_ddi_buffer(struct intel_encoder *encoder)
const struct ddi_buf_trans *ddi_translations;
port = intel_ddi_get_encoder_port(encoder);
- hdmi_level = dev_priv->vbt.ddi_port_info[port].hdmi_level_shift;
+ hdmi_level = intel_ddi_hdmi_level(dev_priv, port);
if (IS_BROXTON(dev_priv)) {
if (encoder->type != INTEL_OUTPUT_HDMI)
@@ -430,7 +464,6 @@ void intel_prepare_ddi_buffer(struct intel_encoder *encoder)
skl_get_buf_trans_edp(dev_priv, &n_edp_entries);
ddi_translations_hdmi =
skl_get_buf_trans_hdmi(dev_priv, &n_hdmi_entries);
- hdmi_default_entry = 8;
/* If we're boosting the current, set bit 31 of trans1 */
if (dev_priv->vbt.ddi_port_info[port].hdmi_boost_level ||
dev_priv->vbt.ddi_port_info[port].dp_boost_level)
@@ -456,7 +489,6 @@ void intel_prepare_ddi_buffer(struct intel_encoder *encoder)
n_dp_entries = ARRAY_SIZE(bdw_ddi_translations_dp);
n_hdmi_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi);
- hdmi_default_entry = 7;
} else if (IS_HASWELL(dev_priv)) {
ddi_translations_fdi = hsw_ddi_translations_fdi;
ddi_translations_dp = hsw_ddi_translations_dp;
@@ -464,7 +496,6 @@ void intel_prepare_ddi_buffer(struct intel_encoder *encoder)
ddi_translations_hdmi = hsw_ddi_translations_hdmi;
n_dp_entries = n_edp_entries = ARRAY_SIZE(hsw_ddi_translations_dp);
n_hdmi_entries = ARRAY_SIZE(hsw_ddi_translations_hdmi);
- hdmi_default_entry = 6;
} else {
WARN(1, "ddi translation table missing\n");
ddi_translations_edp = bdw_ddi_translations_dp;
@@ -474,7 +505,6 @@ void intel_prepare_ddi_buffer(struct intel_encoder *encoder)
n_edp_entries = ARRAY_SIZE(bdw_ddi_translations_edp);
n_dp_entries = ARRAY_SIZE(bdw_ddi_translations_dp);
n_hdmi_entries = ARRAY_SIZE(bdw_ddi_translations_hdmi);
- hdmi_default_entry = 7;
}
switch (encoder->type) {
@@ -505,11 +535,6 @@ void intel_prepare_ddi_buffer(struct intel_encoder *encoder)
if (encoder->type != INTEL_OUTPUT_HDMI)
return;
- /* Choose a good default if VBT is badly populated */
- if (hdmi_level == HDMI_LEVEL_SHIFT_UNKNOWN ||
- hdmi_level >= n_hdmi_entries)
- hdmi_level = hdmi_default_entry;
-
/* Entry 9 is for HDMI: */
I915_WRITE(DDI_BUF_TRANS_LO(port, i),
ddi_translations_hdmi[hdmi_level].trans1 | iboost_bit);
@@ -1379,14 +1404,30 @@ void intel_ddi_disable_pipe_clock(struct intel_crtc *intel_crtc)
TRANS_CLK_SEL_DISABLED);
}
-static void skl_ddi_set_iboost(struct drm_i915_private *dev_priv,
- u32 level, enum port port, int type)
+static void _skl_ddi_set_iboost(struct drm_i915_private *dev_priv,
+ enum port port, uint8_t iboost)
{
+ u32 tmp;
+
+ tmp = I915_READ(DISPIO_CR_TX_BMU_CR0);
+ tmp &= ~(BALANCE_LEG_MASK(port) | BALANCE_LEG_DISABLE(port));
+ if (iboost)
+ tmp |= iboost << BALANCE_LEG_SHIFT(port);
+ else
+ tmp |= BALANCE_LEG_DISABLE(port);
+ I915_WRITE(DISPIO_CR_TX_BMU_CR0, tmp);
+}
+
+static void skl_ddi_set_iboost(struct intel_encoder *encoder, u32 level)
+{
+ struct intel_digital_port *intel_dig_port = enc_to_dig_port(&encoder->base);
+ struct drm_i915_private *dev_priv = to_i915(intel_dig_port->base.base.dev);
+ enum port port = intel_dig_port->port;
+ int type = encoder->type;
const struct ddi_buf_trans *ddi_translations;
uint8_t iboost;
uint8_t dp_iboost, hdmi_iboost;
int n_entries;
- u32 reg;
/* VBT may override standard boost values */
dp_iboost = dev_priv->vbt.ddi_port_info[port].dp_boost_level;
@@ -1428,16 +1469,10 @@ static void skl_ddi_set_iboost(struct drm_i915_private *dev_priv,
return;
}
- reg = I915_READ(DISPIO_CR_TX_BMU_CR0);
- reg &= ~BALANCE_LEG_MASK(port);
- reg &= ~(1 << (BALANCE_LEG_DISABLE_SHIFT + port));
-
- if (iboost)
- reg |= iboost << BALANCE_LEG_SHIFT(port);
- else
- reg |= 1 << (BALANCE_LEG_DISABLE_SHIFT + port);
+ _skl_ddi_set_iboost(dev_priv, port, iboost);
- I915_WRITE(DISPIO_CR_TX_BMU_CR0, reg);
+ if (port == PORT_A && intel_dig_port->max_lanes == 4)
+ _skl_ddi_set_iboost(dev_priv, PORT_E, iboost);
}
static void bxt_ddi_vswing_sequence(struct drm_i915_private *dev_priv,
@@ -1568,7 +1603,7 @@ uint32_t ddi_signal_levels(struct intel_dp *intel_dp)
level = translate_signal_level(signal_levels);
if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
- skl_ddi_set_iboost(dev_priv, level, port, encoder->type);
+ skl_ddi_set_iboost(encoder, level);
else if (IS_BROXTON(dev_priv))
bxt_ddi_vswing_sequence(dev_priv, level, port, encoder->type);
@@ -1637,6 +1672,10 @@ static void intel_ddi_pre_enable(struct intel_encoder *intel_encoder)
intel_dp_stop_link_train(intel_dp);
} else if (type == INTEL_OUTPUT_HDMI) {
struct intel_hdmi *intel_hdmi = enc_to_intel_hdmi(encoder);
+ int level = intel_ddi_hdmi_level(dev_priv, port);
+
+ if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
+ skl_ddi_set_iboost(intel_encoder, level);
intel_hdmi->set_infoframes(encoder,
crtc->config->has_hdmi_sink,
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index dcf93b3d4fb6..2a751b6e0253 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -3093,40 +3093,110 @@ static void intel_update_primary_planes(struct drm_device *dev)
for_each_crtc(dev, crtc) {
struct intel_plane *plane = to_intel_plane(crtc->primary);
- struct intel_plane_state *plane_state;
-
- drm_modeset_lock_crtc(crtc, &plane->base);
- plane_state = to_intel_plane_state(plane->base.state);
+ struct intel_plane_state *plane_state =
+ to_intel_plane_state(plane->base.state);
if (plane_state->visible)
plane->update_plane(&plane->base,
to_intel_crtc_state(crtc->state),
plane_state);
+ }
+}
+
+static int
+__intel_display_resume(struct drm_device *dev,
+ struct drm_atomic_state *state)
+{
+ struct drm_crtc_state *crtc_state;
+ struct drm_crtc *crtc;
+ int i, ret;
+
+ intel_modeset_setup_hw_state(dev);
+ i915_redisable_vga(dev);
- drm_modeset_unlock_crtc(crtc);
+ if (!state)
+ return 0;
+
+ for_each_crtc_in_state(state, crtc, crtc_state, i) {
+ /*
+ * Force recalculation even if we restore
+ * current state. With fast modeset this may not result
+ * in a modeset when the state is compatible.
+ */
+ crtc_state->mode_changed = true;
}
+
+ /* ignore any reset values/BIOS leftovers in the WM registers */
+ to_intel_atomic_state(state)->skip_intermediate_wm = true;
+
+ ret = drm_atomic_commit(state);
+
+ WARN_ON(ret == -EDEADLK);
+ return ret;
}
void intel_prepare_reset(struct drm_i915_private *dev_priv)
{
+ struct drm_device *dev = &dev_priv->drm;
+ struct drm_modeset_acquire_ctx *ctx = &dev_priv->reset_ctx;
+ struct drm_atomic_state *state;
+ int ret;
+
/* no reset support for gen2 */
if (IS_GEN2(dev_priv))
return;
- /* reset doesn't touch the display */
+ /*
+ * Need mode_config.mutex so that we don't
+ * trample ongoing ->detect() and whatnot.
+ */
+ mutex_lock(&dev->mode_config.mutex);
+ drm_modeset_acquire_init(ctx, 0);
+ while (1) {
+ ret = drm_modeset_lock_all_ctx(dev, ctx);
+ if (ret != -EDEADLK)
+ break;
+
+ drm_modeset_backoff(ctx);
+ }
+
+ /* reset doesn't touch the display, but flips might get nuked anyway, */
if (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv))
return;
- drm_modeset_lock_all(&dev_priv->drm);
/*
* Disabling the crtcs gracefully seems nicer. Also the
* g33 docs say we should at least disable all the planes.
*/
- intel_display_suspend(&dev_priv->drm);
+ state = drm_atomic_helper_duplicate_state(dev, ctx);
+ if (IS_ERR(state)) {
+ ret = PTR_ERR(state);
+ state = NULL;
+ DRM_ERROR("Duplicating state failed with %i\n", ret);
+ goto err;
+ }
+
+ ret = drm_atomic_helper_disable_all(dev, ctx);
+ if (ret) {
+ DRM_ERROR("Suspending crtc's failed with %i\n", ret);
+ goto err;
+ }
+
+ dev_priv->modeset_restore_state = state;
+ state->acquire_ctx = ctx;
+ return;
+
+err:
+ drm_atomic_state_free(state);
}
void intel_finish_reset(struct drm_i915_private *dev_priv)
{
+ struct drm_device *dev = &dev_priv->drm;
+ struct drm_modeset_acquire_ctx *ctx = &dev_priv->reset_ctx;
+ struct drm_atomic_state *state = dev_priv->modeset_restore_state;
+ int ret;
+
/*
* Flips in the rings will be nuked by the reset,
* so complete all pending flips so that user space
@@ -3138,6 +3208,8 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
if (IS_GEN2(dev_priv))
return;
+ dev_priv->modeset_restore_state = NULL;
+
/* reset doesn't touch the display */
if (INTEL_GEN(dev_priv) >= 5 || IS_G4X(dev_priv)) {
/*
@@ -3149,29 +3221,32 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
* FIXME: Atomic will make this obsolete since we won't schedule
* CS-based flips (which might get lost in gpu resets) any more.
*/
- intel_update_primary_planes(&dev_priv->drm);
- return;
- }
-
- /*
- * The display has been reset as well,
- * so need a full re-initialization.
- */
- intel_runtime_pm_disable_interrupts(dev_priv);
- intel_runtime_pm_enable_interrupts(dev_priv);
+ intel_update_primary_planes(dev);
+ } else {
+ /*
+ * The display has been reset as well,
+ * so need a full re-initialization.
+ */
+ intel_runtime_pm_disable_interrupts(dev_priv);
+ intel_runtime_pm_enable_interrupts(dev_priv);
- intel_modeset_init_hw(&dev_priv->drm);
+ intel_modeset_init_hw(dev);
- spin_lock_irq(&dev_priv->irq_lock);
- if (dev_priv->display.hpd_irq_setup)
- dev_priv->display.hpd_irq_setup(dev_priv);
- spin_unlock_irq(&dev_priv->irq_lock);
+ spin_lock_irq(&dev_priv->irq_lock);
+ if (dev_priv->display.hpd_irq_setup)
+ dev_priv->display.hpd_irq_setup(dev_priv);
+ spin_unlock_irq(&dev_priv->irq_lock);
- intel_display_resume(&dev_priv->drm);
+ ret = __intel_display_resume(dev, state);
+ if (ret)
+ DRM_ERROR("Restoring old state failed with %i\n", ret);
- intel_hpd_init(dev_priv);
+ intel_hpd_init(dev_priv);
+ }
- drm_modeset_unlock_all(&dev_priv->drm);
+ drm_modeset_drop_locks(ctx);
+ drm_modeset_acquire_fini(ctx);
+ mutex_unlock(&dev->mode_config.mutex);
}
static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
@@ -16156,9 +16231,10 @@ void intel_display_resume(struct drm_device *dev)
struct drm_atomic_state *state = dev_priv->modeset_restore_state;
struct drm_modeset_acquire_ctx ctx;
int ret;
- bool setup = false;
dev_priv->modeset_restore_state = NULL;
+ if (state)
+ state->acquire_ctx = &ctx;
/*
* This is a cludge because with real atomic modeset mode_config.mutex
@@ -16169,43 +16245,17 @@ void intel_display_resume(struct drm_device *dev)
mutex_lock(&dev->mode_config.mutex);
drm_modeset_acquire_init(&ctx, 0);
-retry:
- ret = drm_modeset_lock_all_ctx(dev, &ctx);
-
- if (ret == 0 && !setup) {
- setup = true;
-
- intel_modeset_setup_hw_state(dev);
- i915_redisable_vga(dev);
- }
-
- if (ret == 0 && state) {
- struct drm_crtc_state *crtc_state;
- struct drm_crtc *crtc;
- int i;
-
- state->acquire_ctx = &ctx;
-
- /* ignore any reset values/BIOS leftovers in the WM registers */
- to_intel_atomic_state(state)->skip_intermediate_wm = true;
-
- for_each_crtc_in_state(state, crtc, crtc_state, i) {
- /*
- * Force recalculation even if we restore
- * current state. With fast modeset this may not result
- * in a modeset when the state is compatible.
- */
- crtc_state->mode_changed = true;
- }
-
- ret = drm_atomic_commit(state);
- }
+ while (1) {
+ ret = drm_modeset_lock_all_ctx(dev, &ctx);
+ if (ret != -EDEADLK)
+ break;
- if (ret == -EDEADLK) {
drm_modeset_backoff(&ctx);
- goto retry;
}
+ if (!ret)
+ ret = __intel_display_resume(dev, state);
+
drm_modeset_drop_locks(&ctx);
drm_modeset_acquire_fini(&ctx);
mutex_unlock(&dev->mode_config.mutex);
diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c
index 6a7ad3ed1463..3836a1c79714 100644
--- a/drivers/gpu/drm/i915/intel_fbc.c
+++ b/drivers/gpu/drm/i915/intel_fbc.c
@@ -1230,12 +1230,29 @@ static int intel_sanitize_fbc_option(struct drm_i915_private *dev_priv)
if (i915.enable_fbc >= 0)
return !!i915.enable_fbc;
+ if (!HAS_FBC(dev_priv))
+ return 0;
+
if (IS_BROADWELL(dev_priv))
return 1;
return 0;
}
+static bool need_fbc_vtd_wa(struct drm_i915_private *dev_priv)
+{
+#ifdef CONFIG_INTEL_IOMMU
+ /* WaFbcTurnOffFbcWhenHyperVisorIsUsed:skl,bxt */
+ if (intel_iommu_gfx_mapped &&
+ (IS_SKYLAKE(dev_priv) || IS_BROXTON(dev_priv))) {
+ DRM_INFO("Disabling framebuffer compression (FBC) to prevent screen flicker with VT-d enabled\n");
+ return true;
+ }
+#endif
+
+ return false;
+}
+
/**
* intel_fbc_init - Initialize FBC
* @dev_priv: the i915 device
@@ -1253,6 +1270,9 @@ void intel_fbc_init(struct drm_i915_private *dev_priv)
fbc->active = false;
fbc->work.scheduled = false;
+ if (need_fbc_vtd_wa(dev_priv))
+ mkwrite_device_info(dev_priv)->has_fbc = false;
+
i915.enable_fbc = intel_sanitize_fbc_option(dev_priv);
DRM_DEBUG_KMS("Sanitized enable_fbc value: %d\n", i915.enable_fbc);
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 97ba6c8cf907..d5deb58a2128 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3344,6 +3344,8 @@ static uint32_t skl_wm_method2(uint32_t pixel_rate, uint32_t pipe_htotal,
plane_bytes_per_line *= 4;
plane_blocks_per_line = DIV_ROUND_UP(plane_bytes_per_line, 512);
plane_blocks_per_line /= 4;
+ } else if (tiling == DRM_FORMAT_MOD_NONE) {
+ plane_blocks_per_line = DIV_ROUND_UP(plane_bytes_per_line, 512) + 1;
} else {
plane_blocks_per_line = DIV_ROUND_UP(plane_bytes_per_line, 512);
}
@@ -6574,9 +6576,7 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv)
void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv)
{
- if (IS_CHERRYVIEW(dev_priv))
- return;
- else if (IS_VALLEYVIEW(dev_priv))
+ if (IS_VALLEYVIEW(dev_priv))
valleyview_cleanup_gt_powersave(dev_priv);
if (!i915.enable_rc6)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index cca7792f26d5..1d3161bbea24 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1178,8 +1178,8 @@ static int bxt_init_workarounds(struct intel_engine_cs *engine)
I915_WRITE(GEN8_L3SQCREG1, L3_GENERAL_PRIO_CREDITS(62) |
L3_HIGH_PRIO_CREDITS(2));
- /* WaInsertDummyPushConstPs:bxt */
- if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
+ /* WaToEnableHwFixForPushConstHWBug:bxt */
+ if (IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
@@ -1222,8 +1222,8 @@ static int kbl_init_workarounds(struct intel_engine_cs *engine)
I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
GEN8_LQSC_RO_PERF_DIS);
- /* WaInsertDummyPushConstPs:kbl */
- if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
+ /* WaToEnableHwFixForPushConstHWBug:kbl */
+ if (IS_KBL_REVID(dev_priv, KBL_REVID_C0, REVID_FOREVER))
WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
diff --git a/drivers/gpu/drm/mediatek/Kconfig b/drivers/gpu/drm/mediatek/Kconfig
index 23ac8041c562..294de4549922 100644
--- a/drivers/gpu/drm/mediatek/Kconfig
+++ b/drivers/gpu/drm/mediatek/Kconfig
@@ -2,6 +2,9 @@ config DRM_MEDIATEK
tristate "DRM Support for Mediatek SoCs"
depends on DRM
depends on ARCH_MEDIATEK || (ARM && COMPILE_TEST)
+ depends on COMMON_CLK
+ depends on HAVE_ARM_SMCCC
+ depends on OF
select DRM_GEM_CMA_HELPER
select DRM_KMS_HELPER
select DRM_MIPI_DSI
diff --git a/drivers/gpu/drm/radeon/radeon_atpx_handler.c b/drivers/gpu/drm/radeon/radeon_atpx_handler.c
index 6de342861202..ddef0d494084 100644
--- a/drivers/gpu/drm/radeon/radeon_atpx_handler.c
+++ b/drivers/gpu/drm/radeon/radeon_atpx_handler.c
@@ -198,16 +198,7 @@ static int radeon_atpx_validate(struct radeon_atpx *atpx)
atpx->is_hybrid = false;
if (valid_bits & ATPX_MS_HYBRID_GFX_SUPPORTED) {
printk("ATPX Hybrid Graphics\n");
-#if 1
- /* This is a temporary hack until the D3 cold support
- * makes it upstream. The ATPX power_control method seems
- * to still work on even if the system should be using
- * the new standardized hybrid D3 cold ACPI interface.
- */
- atpx->functions.power_cntl = true;
-#else
atpx->functions.power_cntl = false;
-#endif
atpx->is_hybrid = true;
}
diff --git a/drivers/hwmon/it87.c b/drivers/hwmon/it87.c
index 730d84028260..d0203a115eff 100644
--- a/drivers/hwmon/it87.c
+++ b/drivers/hwmon/it87.c
@@ -491,7 +491,7 @@ struct it87_sio_data {
struct it87_data {
const struct attribute_group *groups[7];
enum chips type;
- u16 features;
+ u32 features;
u8 peci_mask;
u8 old_peci_mask;
diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c
index f23372669f77..1bb97f658b47 100644
--- a/drivers/i2c/busses/i2c-at91.c
+++ b/drivers/i2c/busses/i2c-at91.c
@@ -38,6 +38,7 @@
#define AT91_I2C_TIMEOUT msecs_to_jiffies(100) /* transfer timeout */
#define AT91_I2C_DMA_THRESHOLD 8 /* enable DMA if transfer size is bigger than this threshold */
#define AUTOSUSPEND_TIMEOUT 2000
+#define AT91_I2C_MAX_ALT_CMD_DATA_SIZE 256
/* AT91 TWI register definitions */
#define AT91_TWI_CR 0x0000 /* Control Register */
@@ -141,6 +142,7 @@ struct at91_twi_dev {
unsigned twi_cwgr_reg;
struct at91_twi_pdata *pdata;
bool use_dma;
+ bool use_alt_cmd;
bool recv_len_abort;
u32 fifo_size;
struct at91_twi_dma dma;
@@ -269,7 +271,7 @@ static void at91_twi_write_next_byte(struct at91_twi_dev *dev)
/* send stop when last byte has been written */
if (--dev->buf_len == 0)
- if (!dev->pdata->has_alt_cmd)
+ if (!dev->use_alt_cmd)
at91_twi_write(dev, AT91_TWI_CR, AT91_TWI_STOP);
dev_dbg(dev->dev, "wrote 0x%x, to go %d\n", *dev->buf, dev->buf_len);
@@ -292,7 +294,7 @@ static void at91_twi_write_data_dma_callback(void *data)
* we just have to enable TXCOMP one.
*/
at91_twi_write(dev, AT91_TWI_IER, AT91_TWI_TXCOMP);
- if (!dev->pdata->has_alt_cmd)
+ if (!dev->use_alt_cmd)
at91_twi_write(dev, AT91_TWI_CR, AT91_TWI_STOP);
}
@@ -410,7 +412,7 @@ static void at91_twi_read_next_byte(struct at91_twi_dev *dev)
}
/* send stop if second but last byte has been read */
- if (!dev->pdata->has_alt_cmd && dev->buf_len == 1)
+ if (!dev->use_alt_cmd && dev->buf_len == 1)
at91_twi_write(dev, AT91_TWI_CR, AT91_TWI_STOP);
dev_dbg(dev->dev, "read 0x%x, to go %d\n", *dev->buf, dev->buf_len);
@@ -426,7 +428,7 @@ static void at91_twi_read_data_dma_callback(void *data)
dma_unmap_single(dev->dev, sg_dma_address(&dev->dma.sg[0]),
dev->buf_len, DMA_FROM_DEVICE);
- if (!dev->pdata->has_alt_cmd) {
+ if (!dev->use_alt_cmd) {
/* The last two bytes have to be read without using dma */
dev->buf += dev->buf_len - 2;
dev->buf_len = 2;
@@ -443,7 +445,7 @@ static void at91_twi_read_data_dma(struct at91_twi_dev *dev)
struct dma_chan *chan_rx = dma->chan_rx;
size_t buf_len;
- buf_len = (dev->pdata->has_alt_cmd) ? dev->buf_len : dev->buf_len - 2;
+ buf_len = (dev->use_alt_cmd) ? dev->buf_len : dev->buf_len - 2;
dma->direction = DMA_FROM_DEVICE;
/* Keep in mind that we won't use dma to read the last two bytes */
@@ -651,7 +653,7 @@ static int at91_do_twi_transfer(struct at91_twi_dev *dev)
unsigned start_flags = AT91_TWI_START;
/* if only one byte is to be read, immediately stop transfer */
- if (!has_alt_cmd && dev->buf_len <= 1 &&
+ if (!dev->use_alt_cmd && dev->buf_len <= 1 &&
!(dev->msg->flags & I2C_M_RECV_LEN))
start_flags |= AT91_TWI_STOP;
at91_twi_write(dev, AT91_TWI_CR, start_flags);
@@ -745,7 +747,7 @@ static int at91_twi_xfer(struct i2c_adapter *adap, struct i2c_msg *msg, int num)
int ret;
unsigned int_addr_flag = 0;
struct i2c_msg *m_start = msg;
- bool is_read, use_alt_cmd = false;
+ bool is_read;
dev_dbg(&adap->dev, "at91_xfer: processing %d messages:\n", num);
@@ -768,14 +770,16 @@ static int at91_twi_xfer(struct i2c_adapter *adap, struct i2c_msg *msg, int num)
at91_twi_write(dev, AT91_TWI_IADR, internal_address);
}
+ dev->use_alt_cmd = false;
is_read = (m_start->flags & I2C_M_RD);
if (dev->pdata->has_alt_cmd) {
- if (m_start->len > 0) {
+ if (m_start->len > 0 &&
+ m_start->len < AT91_I2C_MAX_ALT_CMD_DATA_SIZE) {
at91_twi_write(dev, AT91_TWI_CR, AT91_TWI_ACMEN);
at91_twi_write(dev, AT91_TWI_ACR,
AT91_TWI_ACR_DATAL(m_start->len) |
((is_read) ? AT91_TWI_ACR_DIR : 0));
- use_alt_cmd = true;
+ dev->use_alt_cmd = true;
} else {
at91_twi_write(dev, AT91_TWI_CR, AT91_TWI_ACMDIS);
}
@@ -784,7 +788,7 @@ static int at91_twi_xfer(struct i2c_adapter *adap, struct i2c_msg *msg, int num)
at91_twi_write(dev, AT91_TWI_MMR,
(m_start->addr << 16) |
int_addr_flag |
- ((!use_alt_cmd && is_read) ? AT91_TWI_MREAD : 0));
+ ((!dev->use_alt_cmd && is_read) ? AT91_TWI_MREAD : 0));
dev->buf_len = m_start->len;
dev->buf = m_start->buf;
diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c
index 19c843828fe2..95f7cac76f89 100644
--- a/drivers/i2c/busses/i2c-bcm-iproc.c
+++ b/drivers/i2c/busses/i2c-bcm-iproc.c
@@ -158,7 +158,7 @@ static irqreturn_t bcm_iproc_i2c_isr(int irq, void *data)
if (status & BIT(IS_M_START_BUSY_SHIFT)) {
iproc_i2c->xfer_is_done = 1;
- complete_all(&iproc_i2c->done);
+ complete(&iproc_i2c->done);
}
writel(status, iproc_i2c->base + IS_OFFSET);
diff --git a/drivers/i2c/busses/i2c-bcm-kona.c b/drivers/i2c/busses/i2c-bcm-kona.c
index ac9f47679c3a..f98743277e3c 100644
--- a/drivers/i2c/busses/i2c-bcm-kona.c
+++ b/drivers/i2c/busses/i2c-bcm-kona.c
@@ -229,7 +229,7 @@ static irqreturn_t bcm_kona_i2c_isr(int irq, void *devid)
dev->base + TXFCR_OFFSET);
writel(status & ~ISR_RESERVED_MASK, dev->base + ISR_OFFSET);
- complete_all(&dev->done);
+ complete(&dev->done);
return IRQ_HANDLED;
}
diff --git a/drivers/i2c/busses/i2c-brcmstb.c b/drivers/i2c/busses/i2c-brcmstb.c
index 3f5a4d71d3bf..385b57bfcb38 100644
--- a/drivers/i2c/busses/i2c-brcmstb.c
+++ b/drivers/i2c/busses/i2c-brcmstb.c
@@ -228,7 +228,7 @@ static irqreturn_t brcmstb_i2c_isr(int irq, void *devid)
return IRQ_NONE;
brcmstb_i2c_enable_disable_irq(dev, INT_DISABLE);
- complete_all(&dev->done);
+ complete(&dev->done);
dev_dbg(dev->device, "isr handled");
return IRQ_HANDLED;
diff --git a/drivers/i2c/busses/i2c-cros-ec-tunnel.c b/drivers/i2c/busses/i2c-cros-ec-tunnel.c
index a0d95ff682ae..2d5ff86398d0 100644
--- a/drivers/i2c/busses/i2c-cros-ec-tunnel.c
+++ b/drivers/i2c/busses/i2c-cros-ec-tunnel.c
@@ -215,7 +215,7 @@ static int ec_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg i2c_msgs[],
msg->outsize = request_len;
msg->insize = response_len;
- result = cros_ec_cmd_xfer(bus->ec, msg);
+ result = cros_ec_cmd_xfer_status(bus->ec, msg);
if (result < 0) {
dev_err(dev, "Error transferring EC i2c message %d\n", result);
goto exit;
diff --git a/drivers/i2c/busses/i2c-meson.c b/drivers/i2c/busses/i2c-meson.c
index 71d3929adf54..76e28980904f 100644
--- a/drivers/i2c/busses/i2c-meson.c
+++ b/drivers/i2c/busses/i2c-meson.c
@@ -211,7 +211,7 @@ static void meson_i2c_stop(struct meson_i2c *i2c)
meson_i2c_add_token(i2c, TOKEN_STOP);
} else {
i2c->state = STATE_IDLE;
- complete_all(&i2c->done);
+ complete(&i2c->done);
}
}
@@ -238,7 +238,7 @@ static irqreturn_t meson_i2c_irq(int irqno, void *dev_id)
dev_dbg(i2c->dev, "error bit set\n");
i2c->error = -ENXIO;
i2c->state = STATE_IDLE;
- complete_all(&i2c->done);
+ complete(&i2c->done);
goto out;
}
@@ -269,7 +269,7 @@ static irqreturn_t meson_i2c_irq(int irqno, void *dev_id)
break;
case STATE_STOP:
i2c->state = STATE_IDLE;
- complete_all(&i2c->done);
+ complete(&i2c->done);
break;
case STATE_IDLE:
break;
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index dfa7a4b4a91d..ac88a524143e 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -379,6 +379,7 @@ static int ocores_i2c_of_probe(struct platform_device *pdev,
if (!clock_frequency_present) {
dev_err(&pdev->dev,
"Missing required parameter 'opencores,ip-clock-frequency'\n");
+ clk_disable_unprepare(i2c->clk);
return -ENODEV;
}
i2c->ip_clock_khz = clock_frequency / 1000;
@@ -467,20 +468,21 @@ static int ocores_i2c_probe(struct platform_device *pdev)
default:
dev_err(&pdev->dev, "Unsupported I/O width (%d)\n",
i2c->reg_io_width);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err_clk;
}
}
ret = ocores_init(&pdev->dev, i2c);
if (ret)
- return ret;
+ goto err_clk;
init_waitqueue_head(&i2c->wait);
ret = devm_request_irq(&pdev->dev, irq, ocores_isr, 0,
pdev->name, i2c);
if (ret) {
dev_err(&pdev->dev, "Cannot claim IRQ\n");
- return ret;
+ goto err_clk;
}
/* hook up driver to tree */
@@ -494,7 +496,7 @@ static int ocores_i2c_probe(struct platform_device *pdev)
ret = i2c_add_adapter(&i2c->adap);
if (ret) {
dev_err(&pdev->dev, "Failed to add adapter\n");
- return ret;
+ goto err_clk;
}
/* add in known devices to the bus */
@@ -504,6 +506,10 @@ static int ocores_i2c_probe(struct platform_device *pdev)
}
return 0;
+
+err_clk:
+ clk_disable_unprepare(i2c->clk);
+ return ret;
}
static int ocores_i2c_remove(struct platform_device *pdev)
diff --git a/drivers/i2c/muxes/i2c-demux-pinctrl.c b/drivers/i2c/muxes/i2c-demux-pinctrl.c
index 8de073aed001..215ac87f606d 100644
--- a/drivers/i2c/muxes/i2c-demux-pinctrl.c
+++ b/drivers/i2c/muxes/i2c-demux-pinctrl.c
@@ -68,7 +68,7 @@ static int i2c_demux_activate_master(struct i2c_demux_pinctrl_priv *priv, u32 ne
adap = of_find_i2c_adapter_by_node(priv->chan[new_chan].parent_np);
if (!adap) {
ret = -ENODEV;
- goto err;
+ goto err_with_revert;
}
p = devm_pinctrl_get_select(adap->dev.parent, priv->bus_name);
@@ -103,6 +103,8 @@ static int i2c_demux_activate_master(struct i2c_demux_pinctrl_priv *priv, u32 ne
err_with_put:
i2c_put_adapter(adap);
+ err_with_revert:
+ of_changeset_revert(&priv->chan[new_chan].chgset);
err:
dev_err(priv->dev, "failed to setup demux-adapter %d (%d)\n", new_chan, ret);
return ret;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4e9784b4e0ac..eedba67b0e3e 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -181,7 +181,7 @@ struct crypt_config {
u8 key[0];
};
-#define MIN_IOS 16
+#define MIN_IOS 64
static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1b9795d75ef8..8abde6b8cedc 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -191,7 +191,6 @@ struct raid_dev {
#define RT_FLAG_RS_BITMAP_LOADED 2
#define RT_FLAG_UPDATE_SBS 3
#define RT_FLAG_RESHAPE_RS 4
-#define RT_FLAG_KEEP_RS_FROZEN 5
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -861,6 +860,9 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
{
unsigned long min_region_size = rs->ti->len / (1 << 21);
+ if (rs_is_raid0(rs))
+ return 0;
+
if (!region_size) {
/*
* Choose a reasonable default. All figures in sectors.
@@ -930,6 +932,8 @@ static int validate_raid_redundancy(struct raid_set *rs)
rebuild_cnt++;
switch (rs->raid_type->level) {
+ case 0:
+ break;
case 1:
if (rebuild_cnt >= rs->md.raid_disks)
goto too_many;
@@ -2335,6 +2339,13 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
case 0:
break;
default:
+ /*
+ * We have to keep any raid0 data/metadata device pairs or
+ * the MD raid0 personality will fail to start the array.
+ */
+ if (rs_is_raid0(rs))
+ continue;
+
dev = container_of(rdev, struct raid_dev, rdev);
if (dev->meta_dev)
dm_put_device(ti, dev->meta_dev);
@@ -2579,7 +2590,6 @@ static int rs_prepare_reshape(struct raid_set *rs)
} else {
/* Process raid1 without delta_disks */
mddev->raid_disks = rs->raid_disks;
- set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
reshape = false;
}
} else {
@@ -2590,7 +2600,6 @@ static int rs_prepare_reshape(struct raid_set *rs)
if (reshape) {
set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags);
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
- set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
} else if (mddev->raid_disks < rs->raid_disks)
/* Create new superblocks and bitmaps, if any new disks */
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
@@ -2902,7 +2911,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
- set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags);
/* Takeover ain't recovery, so disable recovery */
rs_setup_recovery(rs, MaxSector);
rs_set_new(rs);
@@ -3386,21 +3394,28 @@ static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
- if (test_and_clear_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
- if (!rs->md.suspended)
- mddev_suspend(&rs->md);
- rs->md.ro = 1;
- }
+ if (!rs->md.suspended)
+ mddev_suspend(&rs->md);
+
+ rs->md.ro = 1;
}
static void attempt_restore_of_faulty_devices(struct raid_set *rs)
{
int i;
- uint64_t failed_devices, cleared_failed_devices = 0;
+ uint64_t cleared_failed_devices[DISKS_ARRAY_ELEMS];
unsigned long flags;
+ bool cleared = false;
struct dm_raid_superblock *sb;
+ struct mddev *mddev = &rs->md;
struct md_rdev *r;
+ /* RAID personalities have to provide hot add/remove methods or we need to bail out. */
+ if (!mddev->pers || !mddev->pers->hot_add_disk || !mddev->pers->hot_remove_disk)
+ return;
+
+ memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
+
for (i = 0; i < rs->md.raid_disks; i++) {
r = &rs->dev[i].rdev;
if (test_bit(Faulty, &r->flags) && r->sb_page &&
@@ -3420,7 +3435,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
* ourselves.
*/
if ((r->raid_disk >= 0) &&
- (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
+ (mddev->pers->hot_remove_disk(mddev, r) != 0))
/* Failed to revive this device, try next */
continue;
@@ -3430,22 +3445,30 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
clear_bit(Faulty, &r->flags);
clear_bit(WriteErrorSeen, &r->flags);
clear_bit(In_sync, &r->flags);
- if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
+ if (mddev->pers->hot_add_disk(mddev, r)) {
r->raid_disk = -1;
r->saved_raid_disk = -1;
r->flags = flags;
} else {
r->recovery_offset = 0;
- cleared_failed_devices |= 1 << i;
+ set_bit(i, (void *) cleared_failed_devices);
+ cleared = true;
}
}
}
- if (cleared_failed_devices) {
+
+ /* If any failed devices could be cleared, update all sbs failed_devices bits */
+ if (cleared) {
+ uint64_t failed_devices[DISKS_ARRAY_ELEMS];
+
rdev_for_each(r, &rs->md) {
sb = page_address(r->sb_page);
- failed_devices = le64_to_cpu(sb->failed_devices);
- failed_devices &= ~cleared_failed_devices;
- sb->failed_devices = cpu_to_le64(failed_devices);
+ sb_retrieve_failed_devices(sb, failed_devices);
+
+ for (i = 0; i < DISKS_ARRAY_ELEMS; i++)
+ failed_devices[i] &= ~cleared_failed_devices[i];
+
+ sb_update_failed_devices(sb, failed_devices);
}
}
}
@@ -3610,26 +3633,15 @@ static void raid_resume(struct dm_target *ti)
* devices are reachable again.
*/
attempt_restore_of_faulty_devices(rs);
- } else {
- mddev->ro = 0;
- mddev->in_sync = 0;
+ }
- /*
- * When passing in flags to the ctr, we expect userspace
- * to reset them because they made it to the superblocks
- * and reload the mapping anyway.
- *
- * -> only unfreeze recovery in case of a table reload or
- * we'll have a bogus recovery/reshape position
- * retrieved from the superblock by the ctr because
- * the ongoing recovery/reshape will change it after read.
- */
- if (!test_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags))
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ mddev->ro = 0;
+ mddev->in_sync = 0;
- if (mddev->suspended)
- mddev_resume(mddev);
- }
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
+ if (mddev->suspended)
+ mddev_resume(mddev);
}
static struct target_type raid_target = {
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 4ace1da17db8..6c25213ab38c 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -210,14 +210,17 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
struct path_info *pi = NULL;
struct dm_path *current_path = NULL;
+ local_irq_save(flags);
current_path = *this_cpu_ptr(s->current_path);
if (current_path) {
percpu_counter_dec(&s->repeat_count);
- if (percpu_counter_read_positive(&s->repeat_count) > 0)
+ if (percpu_counter_read_positive(&s->repeat_count) > 0) {
+ local_irq_restore(flags);
return current_path;
+ }
}
- spin_lock_irqsave(&s->lock, flags);
+ spin_lock(&s->lock);
if (!list_empty(&s->valid_paths)) {
pi = list_entry(s->valid_paths.next, struct path_info, list);
list_move_tail(&pi->list, &s->valid_paths);
diff --git a/drivers/of/base.c b/drivers/of/base.c
index 7792266db259..3ce69536a7b3 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1631,8 +1631,7 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
*/
err:
- if (it.node)
- of_node_put(it.node);
+ of_node_put(it.node);
return rc;
}
@@ -2343,20 +2342,13 @@ struct device_node *of_graph_get_endpoint_by_regs(
const struct device_node *parent, int port_reg, int reg)
{
struct of_endpoint endpoint;
- struct device_node *node, *prev_node = NULL;
-
- while (1) {
- node = of_graph_get_next_endpoint(parent, prev_node);
- of_node_put(prev_node);
- if (!node)
- break;
+ struct device_node *node = NULL;
+ for_each_endpoint_of_node(parent, node) {
of_graph_parse_endpoint(node, &endpoint);
if (((port_reg == -1) || (endpoint.port == port_reg)) &&
((reg == -1) || (endpoint.id == reg)))
return node;
-
- prev_node = node;
}
return NULL;
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 55f1b8391149..085c6389afd1 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -517,7 +517,7 @@ static void *__unflatten_device_tree(const void *blob,
pr_warning("End of tree marker overwritten: %08x\n",
be32_to_cpup(mem + size));
- if (detached) {
+ if (detached && mynodes) {
of_node_set_flag(*mynodes, OF_DETACHED);
pr_debug("unflattened tree is detached\n");
}
diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index 89a71c6074fc..a2e68f740eda 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -544,12 +544,15 @@ void __init of_irq_init(const struct of_device_id *matches)
list_del(&desc->list);
+ of_node_set_flag(desc->dev, OF_POPULATED);
+
pr_debug("of_irq_init: init %s (%p), parent %p\n",
desc->dev->full_name,
desc->dev, desc->interrupt_parent);
ret = desc->irq_init_cb(desc->dev,
desc->interrupt_parent);
if (ret) {
+ of_node_clear_flag(desc->dev, OF_POPULATED);
kfree(desc);
continue;
}
@@ -559,8 +562,6 @@ void __init of_irq_init(const struct of_device_id *matches)
* its children can get processed in a subsequent pass.
*/
list_add_tail(&desc->list, &intc_parent_list);
-
- of_node_set_flag(desc->dev, OF_POPULATED);
}
/* Get the next pending parent that might have children */
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 8aa197691074..f39ccd5aa701 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -497,6 +497,7 @@ int of_platform_default_populate(struct device_node *root,
}
EXPORT_SYMBOL_GPL(of_platform_default_populate);
+#ifndef CONFIG_PPC
static int __init of_platform_default_populate_init(void)
{
struct device_node *node;
@@ -521,6 +522,7 @@ static int __init of_platform_default_populate_init(void)
return 0;
}
arch_initcall_sync(of_platform_default_populate_init);
+#endif
static int of_platform_device_destroy(struct device *dev, void *data)
{
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index b381b3718a98..5648b715fed9 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -63,7 +63,7 @@ static int ioctl_send_fib(struct aac_dev * dev, void __user *arg)
struct fib *fibptr;
struct hw_fib * hw_fib = (struct hw_fib *)0;
dma_addr_t hw_fib_pa = (dma_addr_t)0LL;
- unsigned size;
+ unsigned int size, osize;
int retval;
if (dev->in_reset) {
@@ -87,7 +87,8 @@ static int ioctl_send_fib(struct aac_dev * dev, void __user *arg)
* will not overrun the buffer when we copy the memory. Return
* an error if we would.
*/
- size = le16_to_cpu(kfib->header.Size) + sizeof(struct aac_fibhdr);
+ osize = size = le16_to_cpu(kfib->header.Size) +
+ sizeof(struct aac_fibhdr);
if (size < le16_to_cpu(kfib->header.SenderSize))
size = le16_to_cpu(kfib->header.SenderSize);
if (size > dev->max_fib_size) {
@@ -118,6 +119,14 @@ static int ioctl_send_fib(struct aac_dev * dev, void __user *arg)
goto cleanup;
}
+ /* Sanity check the second copy */
+ if ((osize != le16_to_cpu(kfib->header.Size) +
+ sizeof(struct aac_fibhdr))
+ || (size < le16_to_cpu(kfib->header.SenderSize))) {
+ retval = -EINVAL;
+ goto cleanup;
+ }
+
if (kfib->header.Command == cpu_to_le16(TakeABreakPt)) {
aac_adapter_interrupt(dev);
/*
diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
index a569c65f22b1..dcf36537a767 100644
--- a/drivers/scsi/fcoe/fcoe_ctlr.c
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
@@ -2923,7 +2923,7 @@ static int fcoe_ctlr_vlan_recv(struct fcoe_ctlr *fip, struct sk_buff *skb)
mutex_unlock(&fip->ctlr_mutex);
drop:
- kfree(skb);
+ kfree_skb(skb);
return rc;
}
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 2dab3dc2aa69..c1ed25adb17e 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -5037,7 +5037,7 @@ static int megasas_init_fw(struct megasas_instance *instance)
/* Find first memory bar */
bar_list = pci_select_bars(instance->pdev, IORESOURCE_MEM);
instance->bar = find_first_bit(&bar_list, sizeof(unsigned long));
- if (pci_request_selected_regions(instance->pdev, instance->bar,
+ if (pci_request_selected_regions(instance->pdev, 1<<instance->bar,
"megasas: LSI")) {
dev_printk(KERN_DEBUG, &instance->pdev->dev, "IO memory region busy!\n");
return -EBUSY;
@@ -5339,7 +5339,7 @@ fail_ready_state:
iounmap(instance->reg_set);
fail_ioremap:
- pci_release_selected_regions(instance->pdev, instance->bar);
+ pci_release_selected_regions(instance->pdev, 1<<instance->bar);
return -EINVAL;
}
@@ -5360,7 +5360,7 @@ static void megasas_release_mfi(struct megasas_instance *instance)
iounmap(instance->reg_set);
- pci_release_selected_regions(instance->pdev, instance->bar);
+ pci_release_selected_regions(instance->pdev, 1<<instance->bar);
}
/**
diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index ec837544f784..52d8bbf7feb5 100644
--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
+++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
@@ -2603,7 +2603,7 @@ megasas_release_fusion(struct megasas_instance *instance)
iounmap(instance->reg_set);
- pci_release_selected_regions(instance->pdev, instance->bar);
+ pci_release_selected_regions(instance->pdev, 1<<instance->bar);
}
/**
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 751f13edece0..750f82c339d4 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -2188,6 +2188,17 @@ mpt3sas_base_map_resources(struct MPT3SAS_ADAPTER *ioc)
} else
ioc->msix96_vector = 0;
+ if (ioc->is_warpdrive) {
+ ioc->reply_post_host_index[0] = (resource_size_t __iomem *)
+ &ioc->chip->ReplyPostHostIndex;
+
+ for (i = 1; i < ioc->cpu_msix_table_sz; i++)
+ ioc->reply_post_host_index[i] =
+ (resource_size_t __iomem *)
+ ((u8 __iomem *)&ioc->chip->Doorbell + (0x4000 + ((i - 1)
+ * 4)));
+ }
+
list_for_each_entry(reply_q, &ioc->reply_queue_list, list)
pr_info(MPT3SAS_FMT "%s: IRQ %d\n",
reply_q->name, ((ioc->msix_enable) ? "PCI-MSI-X enabled" :
@@ -5280,17 +5291,6 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
if (r)
goto out_free_resources;
- if (ioc->is_warpdrive) {
- ioc->reply_post_host_index[0] = (resource_size_t __iomem *)
- &ioc->chip->ReplyPostHostIndex;
-
- for (i = 1; i < ioc->cpu_msix_table_sz; i++)
- ioc->reply_post_host_index[i] =
- (resource_size_t __iomem *)
- ((u8 __iomem *)&ioc->chip->Doorbell + (0x4000 + ((i - 1)
- * 4)));
- }
-
pci_set_drvdata(ioc->pdev, ioc->shost);
r = _base_get_ioc_facts(ioc, CAN_SLEEP);
if (r)
diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c
index 53ef1cb6418e..0e8601aa877a 100644
--- a/drivers/scsi/ses.c
+++ b/drivers/scsi/ses.c
@@ -778,6 +778,8 @@ static void ses_intf_remove_enclosure(struct scsi_device *sdev)
if (!edev)
return;
+ enclosure_unregister(edev);
+
ses_dev = edev->scratch;
edev->scratch = NULL;
@@ -789,7 +791,6 @@ static void ses_intf_remove_enclosure(struct scsi_device *sdev)
kfree(edev->component[0].scratch);
put_device(&edev->edev);
- enclosure_unregister(edev);
}
static void ses_intf_remove(struct device *cdev,
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 71912301ef7f..0f3f62e81e5b 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1354,7 +1354,6 @@ made_compressed_probe:
spin_lock_init(&acm->write_lock);
spin_lock_init(&acm->read_lock);
mutex_init(&acm->mutex);
- acm->rx_endpoint = usb_rcvbulkpipe(usb_dev, epread->bEndpointAddress);
acm->is_int_ep = usb_endpoint_xfer_int(epread);
if (acm->is_int_ep)
acm->bInterval = epread->bInterval;
@@ -1394,14 +1393,14 @@ made_compressed_probe:
urb->transfer_dma = rb->dma;
if (acm->is_int_ep) {
usb_fill_int_urb(urb, acm->dev,
- acm->rx_endpoint,
+ usb_rcvintpipe(usb_dev, epread->bEndpointAddress),
rb->base,
acm->readsize,
acm_read_bulk_callback, rb,
acm->bInterval);
} else {
usb_fill_bulk_urb(urb, acm->dev,
- acm->rx_endpoint,
+ usb_rcvbulkpipe(usb_dev, epread->bEndpointAddress),
rb->base,
acm->readsize,
acm_read_bulk_callback, rb);
diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h
index 05ce308d5d2a..1f1eabfd8462 100644
--- a/drivers/usb/class/cdc-acm.h
+++ b/drivers/usb/class/cdc-acm.h
@@ -96,7 +96,6 @@ struct acm {
struct acm_rb read_buffers[ACM_NR];
struct acm_wb *putbuffer; /* for acm_tty_put_char() */
int rx_buflimit;
- int rx_endpoint;
spinlock_t read_lock;
int write_used; /* number of non-empty write buffers */
int transmitting;
diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index 31ccdccd7a04..051163189810 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -171,6 +171,31 @@ static void usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno,
ep, buffer, size);
}
+static const unsigned short low_speed_maxpacket_maxes[4] = {
+ [USB_ENDPOINT_XFER_CONTROL] = 8,
+ [USB_ENDPOINT_XFER_ISOC] = 0,
+ [USB_ENDPOINT_XFER_BULK] = 0,
+ [USB_ENDPOINT_XFER_INT] = 8,
+};
+static const unsigned short full_speed_maxpacket_maxes[4] = {
+ [USB_ENDPOINT_XFER_CONTROL] = 64,
+ [USB_ENDPOINT_XFER_ISOC] = 1023,
+ [USB_ENDPOINT_XFER_BULK] = 64,
+ [USB_ENDPOINT_XFER_INT] = 64,
+};
+static const unsigned short high_speed_maxpacket_maxes[4] = {
+ [USB_ENDPOINT_XFER_CONTROL] = 64,
+ [USB_ENDPOINT_XFER_ISOC] = 1024,
+ [USB_ENDPOINT_XFER_BULK] = 512,
+ [USB_ENDPOINT_XFER_INT] = 1023,
+};
+static const unsigned short super_speed_maxpacket_maxes[4] = {
+ [USB_ENDPOINT_XFER_CONTROL] = 512,
+ [USB_ENDPOINT_XFER_ISOC] = 1024,
+ [USB_ENDPOINT_XFER_BULK] = 1024,
+ [USB_ENDPOINT_XFER_INT] = 1024,
+};
+
static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum,
int asnum, struct usb_host_interface *ifp, int num_ep,
unsigned char *buffer, int size)
@@ -179,6 +204,8 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum,
struct usb_endpoint_descriptor *d;
struct usb_host_endpoint *endpoint;
int n, i, j, retval;
+ unsigned int maxp;
+ const unsigned short *maxpacket_maxes;
d = (struct usb_endpoint_descriptor *) buffer;
buffer += d->bLength;
@@ -286,6 +313,42 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum,
endpoint->desc.wMaxPacketSize = cpu_to_le16(8);
}
+ /* Validate the wMaxPacketSize field */
+ maxp = usb_endpoint_maxp(&endpoint->desc);
+
+ /* Find the highest legal maxpacket size for this endpoint */
+ i = 0; /* additional transactions per microframe */
+ switch (to_usb_device(ddev)->speed) {
+ case USB_SPEED_LOW:
+ maxpacket_maxes = low_speed_maxpacket_maxes;
+ break;
+ case USB_SPEED_FULL:
+ maxpacket_maxes = full_speed_maxpacket_maxes;
+ break;
+ case USB_SPEED_HIGH:
+ /* Bits 12..11 are allowed only for HS periodic endpoints */
+ if (usb_endpoint_xfer_int(d) || usb_endpoint_xfer_isoc(d)) {
+ i = maxp & (BIT(12) | BIT(11));
+ maxp &= ~i;
+ }
+ /* fallthrough */
+ default:
+ maxpacket_maxes = high_speed_maxpacket_maxes;
+ break;
+ case USB_SPEED_SUPER:
+ case USB_SPEED_SUPER_PLUS:
+ maxpacket_maxes = super_speed_maxpacket_maxes;
+ break;
+ }
+ j = maxpacket_maxes[usb_endpoint_type(&endpoint->desc)];
+
+ if (maxp > j) {
+ dev_warn(ddev, "config %d interface %d altsetting %d endpoint 0x%X has invalid maxpacket %d, setting to %d\n",
+ cfgno, inum, asnum, d->bEndpointAddress, maxp, j);
+ maxp = j;
+ endpoint->desc.wMaxPacketSize = cpu_to_le16(i | maxp);
+ }
+
/*
* Some buggy high speed devices have bulk endpoints using
* maxpacket sizes other than 512. High speed HCDs may not
@@ -293,9 +356,6 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, int inum,
*/
if (to_usb_device(ddev)->speed == USB_SPEED_HIGH
&& usb_endpoint_xfer_bulk(d)) {
- unsigned maxp;
-
- maxp = usb_endpoint_maxp(&endpoint->desc) & 0x07ff;
if (maxp != 512)
dev_warn(ddev, "config %d interface %d altsetting %d "
"bulk endpoint 0x%X has invalid maxpacket %d\n",
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index e9f5043a2167..e6a6d67c8705 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -241,7 +241,8 @@ static int usbdev_mmap(struct file *file, struct vm_area_struct *vma)
goto error_decrease_mem;
}
- mem = usb_alloc_coherent(ps->dev, size, GFP_USER, &dma_handle);
+ mem = usb_alloc_coherent(ps->dev, size, GFP_USER | __GFP_NOWARN,
+ &dma_handle);
if (!mem) {
ret = -ENOMEM;
goto error_free_usbm;
@@ -2582,7 +2583,9 @@ static unsigned int usbdev_poll(struct file *file,
if (file->f_mode & FMODE_WRITE && !list_empty(&ps->async_completed))
mask |= POLLOUT | POLLWRNORM;
if (!connected(ps))
- mask |= POLLERR | POLLHUP;
+ mask |= POLLHUP;
+ if (list_empty(&ps->list))
+ mask |= POLLERR;
return mask;
}
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index bee13517676f..1d5fc32d06d0 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -1052,14 +1052,11 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type)
/* Continue a partial initialization */
if (type == HUB_INIT2 || type == HUB_INIT3) {
- device_lock(hub->intfdev);
+ device_lock(&hdev->dev);
/* Was the hub disconnected while we were waiting? */
- if (hub->disconnected) {
- device_unlock(hub->intfdev);
- kref_put(&hub->kref, hub_release);
- return;
- }
+ if (hub->disconnected)
+ goto disconnected;
if (type == HUB_INIT2)
goto init2;
goto init3;
@@ -1262,7 +1259,7 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type)
queue_delayed_work(system_power_efficient_wq,
&hub->init_work,
msecs_to_jiffies(delay));
- device_unlock(hub->intfdev);
+ device_unlock(&hdev->dev);
return; /* Continues at init3: below */
} else {
msleep(delay);
@@ -1281,12 +1278,12 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type)
/* Scan all ports that need attention */
kick_hub_wq(hub);
- /* Allow autosuspend if it was suppressed */
- if (type <= HUB_INIT3)
+ if (type == HUB_INIT2 || type == HUB_INIT3) {
+ /* Allow autosuspend if it was suppressed */
+ disconnected:
usb_autopm_put_interface_async(to_usb_interface(hub->intfdev));
-
- if (type == HUB_INIT2 || type == HUB_INIT3)
- device_unlock(hub->intfdev);
+ device_unlock(&hdev->dev);
+ }
kref_put(&hub->kref, hub_release);
}
@@ -1315,8 +1312,6 @@ static void hub_quiesce(struct usb_hub *hub, enum hub_quiescing_type type)
struct usb_device *hdev = hub->hdev;
int i;
- cancel_delayed_work_sync(&hub->init_work);
-
/* hub_wq and related activity won't re-trigger */
hub->quiescing = 1;
diff --git a/drivers/usb/dwc3/dwc3-of-simple.c b/drivers/usb/dwc3/dwc3-of-simple.c
index 974335377d9f..e56d59b19a0e 100644
--- a/drivers/usb/dwc3/dwc3-of-simple.c
+++ b/drivers/usb/dwc3/dwc3-of-simple.c
@@ -61,6 +61,7 @@ static int dwc3_of_simple_probe(struct platform_device *pdev)
if (!simple->clks)
return -ENOMEM;
+ platform_set_drvdata(pdev, simple);
simple->dev = dev;
for (i = 0; i < simple->num_clocks; i++) {
diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c
index 45f5a232d9fb..2eb84d6c24a6 100644
--- a/drivers/usb/dwc3/dwc3-pci.c
+++ b/drivers/usb/dwc3/dwc3-pci.c
@@ -37,6 +37,7 @@
#define PCI_DEVICE_ID_INTEL_BXT 0x0aaa
#define PCI_DEVICE_ID_INTEL_BXT_M 0x1aaa
#define PCI_DEVICE_ID_INTEL_APL 0x5aaa
+#define PCI_DEVICE_ID_INTEL_KBP 0xa2b0
static const struct acpi_gpio_params reset_gpios = { 0, 0, false };
static const struct acpi_gpio_params cs_gpios = { 1, 0, false };
@@ -227,6 +228,7 @@ static const struct pci_device_id dwc3_pci_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BXT), },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BXT_M), },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_APL), },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBP), },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_NL_USB), },
{ } /* Terminating Entry */
};
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 8f8c2157910e..1f5597ef945d 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -829,7 +829,7 @@ static void dwc3_prepare_one_trb(struct dwc3_ep *dep,
if (!req->request.no_interrupt && !chain)
trb->ctrl |= DWC3_TRB_CTRL_IOC | DWC3_TRB_CTRL_ISP_IMI;
- if (last)
+ if (last && !usb_endpoint_xfer_isoc(dep->endpoint.desc))
trb->ctrl |= DWC3_TRB_CTRL_LST;
if (chain)
@@ -1955,7 +1955,8 @@ static void dwc3_gadget_free_endpoints(struct dwc3 *dwc)
static int __dwc3_cleanup_done_trbs(struct dwc3 *dwc, struct dwc3_ep *dep,
struct dwc3_request *req, struct dwc3_trb *trb,
- const struct dwc3_event_depevt *event, int status)
+ const struct dwc3_event_depevt *event, int status,
+ int chain)
{
unsigned int count;
unsigned int s_pkt = 0;
@@ -1964,17 +1965,22 @@ static int __dwc3_cleanup_done_trbs(struct dwc3 *dwc, struct dwc3_ep *dep,
dep->queued_requests--;
trace_dwc3_complete_trb(dep, trb);
+ /*
+ * If we're in the middle of series of chained TRBs and we
+ * receive a short transfer along the way, DWC3 will skip
+ * through all TRBs including the last TRB in the chain (the
+ * where CHN bit is zero. DWC3 will also avoid clearing HWO
+ * bit and SW has to do it manually.
+ *
+ * We're going to do that here to avoid problems of HW trying
+ * to use bogus TRBs for transfers.
+ */
+ if (chain && (trb->ctrl & DWC3_TRB_CTRL_HWO))
+ trb->ctrl &= ~DWC3_TRB_CTRL_HWO;
+
if ((trb->ctrl & DWC3_TRB_CTRL_HWO) && status != -ESHUTDOWN)
- /*
- * We continue despite the error. There is not much we
- * can do. If we don't clean it up we loop forever. If
- * we skip the TRB then it gets overwritten after a
- * while since we use them in a ring buffer. A BUG()
- * would help. Lets hope that if this occurs, someone
- * fixes the root cause instead of looking away :)
- */
- dev_err(dwc->dev, "%s's TRB (%p) still owned by HW\n",
- dep->name, trb);
+ return 1;
+
count = trb->size & DWC3_TRB_SIZE_MASK;
if (dep->direction) {
@@ -2013,15 +2019,7 @@ static int __dwc3_cleanup_done_trbs(struct dwc3 *dwc, struct dwc3_ep *dep,
s_pkt = 1;
}
- /*
- * We assume here we will always receive the entire data block
- * which we should receive. Meaning, if we program RX to
- * receive 4K but we receive only 2K, we assume that's all we
- * should receive and we simply bounce the request back to the
- * gadget driver for further processing.
- */
- req->request.actual += req->request.length - count;
- if (s_pkt)
+ if (s_pkt && !chain)
return 1;
if ((event->status & DEPEVT_STATUS_LST) &&
(trb->ctrl & (DWC3_TRB_CTRL_LST |
@@ -2040,13 +2038,17 @@ static int dwc3_cleanup_done_reqs(struct dwc3 *dwc, struct dwc3_ep *dep,
struct dwc3_trb *trb;
unsigned int slot;
unsigned int i;
+ int count = 0;
int ret;
do {
+ int chain;
+
req = next_request(&dep->started_list);
if (WARN_ON_ONCE(!req))
return 1;
+ chain = req->request.num_mapped_sgs > 0;
i = 0;
do {
slot = req->first_trb_index + i;
@@ -2054,13 +2056,22 @@ static int dwc3_cleanup_done_reqs(struct dwc3 *dwc, struct dwc3_ep *dep,
slot++;
slot %= DWC3_TRB_NUM;
trb = &dep->trb_pool[slot];
+ count += trb->size & DWC3_TRB_SIZE_MASK;
ret = __dwc3_cleanup_done_trbs(dwc, dep, req, trb,
- event, status);
+ event, status, chain);
if (ret)
break;
} while (++i < req->request.num_mapped_sgs);
+ /*
+ * We assume here we will always receive the entire data block
+ * which we should receive. Meaning, if we program RX to
+ * receive 4K but we receive only 2K, we assume that's all we
+ * should receive and we simply bounce the request back to the
+ * gadget driver for further processing.
+ */
+ req->request.actual += req->request.length - count;
dwc3_gadget_giveback(dep, req, status);
if (ret)
diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index eb648485a58c..5ebe6af7976e 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -1913,6 +1913,8 @@ unknown:
break;
case USB_RECIP_ENDPOINT:
+ if (!cdev->config)
+ break;
endp = ((w_index & 0x80) >> 3) | (w_index & 0x0f);
list_for_each_entry(f, &cdev->config->functions, list) {
if (test_bit(endp, f->endpoints))
@@ -2124,14 +2126,14 @@ int composite_os_desc_req_prepare(struct usb_composite_dev *cdev,
cdev->os_desc_req = usb_ep_alloc_request(ep0, GFP_KERNEL);
if (!cdev->os_desc_req) {
- ret = PTR_ERR(cdev->os_desc_req);
+ ret = -ENOMEM;
goto end;
}
/* OS feature descriptor length <= 4kB */
cdev->os_desc_req->buf = kmalloc(4096, GFP_KERNEL);
if (!cdev->os_desc_req->buf) {
- ret = PTR_ERR(cdev->os_desc_req->buf);
+ ret = -ENOMEM;
kfree(cdev->os_desc_req);
goto end;
}
diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c
index 70cf3477f951..f9237fe2be05 100644
--- a/drivers/usb/gadget/configfs.c
+++ b/drivers/usb/gadget/configfs.c
@@ -1490,7 +1490,9 @@ void unregister_gadget_item(struct config_item *item)
{
struct gadget_info *gi = to_gadget_info(item);
+ mutex_lock(&gi->lock);
unregister_gadget(gi);
+ mutex_unlock(&gi->lock);
}
EXPORT_SYMBOL_GPL(unregister_gadget_item);
diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c
index 943c21aafd3b..ab6ac1b74ac0 100644
--- a/drivers/usb/gadget/function/rndis.c
+++ b/drivers/usb/gadget/function/rndis.c
@@ -680,6 +680,12 @@ static int rndis_reset_response(struct rndis_params *params,
{
rndis_reset_cmplt_type *resp;
rndis_resp_t *r;
+ u8 *xbuf;
+ u32 length;
+
+ /* drain the response queue */
+ while ((xbuf = rndis_get_next_response(params, &length)))
+ rndis_free_response(params, xbuf);
r = rndis_add_response(params, sizeof(rndis_reset_cmplt_type));
if (!r)
diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c
index a3f7e7c55ebb..5f562c1ec795 100644
--- a/drivers/usb/gadget/function/u_ether.c
+++ b/drivers/usb/gadget/function/u_ether.c
@@ -556,7 +556,8 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb,
/* Multi frame CDC protocols may store the frame for
* later which is not a dropped frame.
*/
- if (dev->port_usb->supports_multi_frame)
+ if (dev->port_usb &&
+ dev->port_usb->supports_multi_frame)
goto multiframe;
goto drop;
}
diff --git a/drivers/usb/gadget/function/uvc_configfs.c b/drivers/usb/gadget/function/uvc_configfs.c
index 66753ba7a42e..31125a4a2658 100644
--- a/drivers/usb/gadget/function/uvc_configfs.c
+++ b/drivers/usb/gadget/function/uvc_configfs.c
@@ -2023,7 +2023,7 @@ static int uvcg_streaming_class_allow_link(struct config_item *src,
if (!data) {
kfree(*class_array);
*class_array = NULL;
- ret = PTR_ERR(data);
+ ret = -ENOMEM;
goto unlock;
}
cl_arr = *class_array;
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index aa3707bdebb4..16104b5ebdcb 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -542,7 +542,7 @@ static ssize_t ep_aio(struct kiocb *iocb,
*/
spin_lock_irq(&epdata->dev->lock);
value = -ENODEV;
- if (unlikely(epdata->ep))
+ if (unlikely(epdata->ep == NULL))
goto fail;
req = usb_ep_alloc_request(epdata->ep, GFP_ATOMIC);
@@ -606,7 +606,7 @@ ep_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
if (is_sync_kiocb(iocb)) {
value = ep_io(epdata, buf, len);
- if (value >= 0 && copy_to_iter(buf, value, to))
+ if (value >= 0 && (copy_to_iter(buf, value, to) != value))
value = -EFAULT;
} else {
struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL);
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index ff8685ea7219..934f83881c30 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1145,7 +1145,7 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget,
if (ret != -EPROBE_DEFER)
list_del(&driver->pending);
if (ret)
- goto err4;
+ goto err5;
break;
}
}
@@ -1154,6 +1154,9 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget,
return 0;
+err5:
+ device_del(&udc->dev);
+
err4:
list_del(&udc->list);
mutex_unlock(&udc_lock);
diff --git a/drivers/usb/gadget/udc/fsl_qe_udc.c b/drivers/usb/gadget/udc/fsl_qe_udc.c
index 93d28cb00b76..cf8819a5c5b2 100644
--- a/drivers/usb/gadget/udc/fsl_qe_udc.c
+++ b/drivers/usb/gadget/udc/fsl_qe_udc.c
@@ -2053,7 +2053,7 @@ static void setup_received_handle(struct qe_udc *udc,
struct qe_ep *ep;
if (wValue != 0 || wLength != 0
- || pipe > USB_MAX_ENDPOINTS)
+ || pipe >= USB_MAX_ENDPOINTS)
break;
ep = &udc->eps[pipe];
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index a962b89b65a6..1e5f529d51a2 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -332,11 +332,11 @@ static void ehci_turn_off_all_ports(struct ehci_hcd *ehci)
int port = HCS_N_PORTS(ehci->hcs_params);
while (port--) {
- ehci_writel(ehci, PORT_RWC_BITS,
- &ehci->regs->port_status[port]);
spin_unlock_irq(&ehci->lock);
ehci_port_power(ehci, port, false);
spin_lock_irq(&ehci->lock);
+ ehci_writel(ehci, PORT_RWC_BITS,
+ &ehci->regs->port_status[port]);
}
}
diff --git a/drivers/usb/host/max3421-hcd.c b/drivers/usb/host/max3421-hcd.c
index c369c29e496d..2f7690092a7f 100644
--- a/drivers/usb/host/max3421-hcd.c
+++ b/drivers/usb/host/max3421-hcd.c
@@ -1675,7 +1675,7 @@ max3421_gpout_set_value(struct usb_hcd *hcd, u8 pin_number, u8 value)
if (pin_number > 7)
return;
- mask = 1u << pin_number;
+ mask = 1u << (pin_number % 4);
idx = pin_number / 4;
if (value)
diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index d61fcc48099e..730b9fd26685 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c
@@ -386,6 +386,9 @@ static int xhci_stop_device(struct xhci_hcd *xhci, int slot_id, int suspend)
ret = 0;
virt_dev = xhci->devs[slot_id];
+ if (!virt_dev)
+ return -ENODEV;
+
cmd = xhci_alloc_command(xhci, false, true, GFP_NOIO);
if (!cmd) {
xhci_dbg(xhci, "Couldn't allocate command structure.\n");
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 4fd041bec332..d7b0f97abbad 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -314,11 +314,12 @@ static void xhci_pci_remove(struct pci_dev *dev)
usb_remove_hcd(xhci->shared_hcd);
usb_put_hcd(xhci->shared_hcd);
}
- usb_hcd_pci_remove(dev);
/* Workaround for spurious wakeups at shutdown with HSW */
if (xhci->quirks & XHCI_SPURIOUS_WAKEUP)
pci_set_power_state(dev, PCI_D3hot);
+
+ usb_hcd_pci_remove(dev);
}
#ifdef CONFIG_PM
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 918e0c739b79..fd9fd12e4861 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -1334,12 +1334,6 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
cmd = list_entry(xhci->cmd_list.next, struct xhci_command, cmd_list);
- if (cmd->command_trb != xhci->cmd_ring->dequeue) {
- xhci_err(xhci,
- "Command completion event does not match command\n");
- return;
- }
-
del_timer(&xhci->cmd_timer);
trace_xhci_cmd_completion(cmd_trb, (struct xhci_generic_trb *) event);
@@ -1351,6 +1345,13 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
xhci_handle_stopped_cmd_ring(xhci, cmd);
return;
}
+
+ if (cmd->command_trb != xhci->cmd_ring->dequeue) {
+ xhci_err(xhci,
+ "Command completion event does not match command\n");
+ return;
+ }
+
/*
* Host aborted the command ring, check if the current command was
* supposed to be aborted, otherwise continue normally.
@@ -3243,7 +3244,8 @@ int xhci_queue_bulk_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
send_addr = addr;
/* Queue the TRBs, even if they are zero-length */
- for (enqd_len = 0; enqd_len < full_len; enqd_len += trb_buff_len) {
+ for (enqd_len = 0; first_trb || enqd_len < full_len;
+ enqd_len += trb_buff_len) {
field = TRB_TYPE(TRB_NORMAL);
/* TRB buffer should not cross 64KB boundaries */
diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c
index 52c27cab78c3..9b5b3b2281ca 100644
--- a/drivers/usb/misc/ftdi-elan.c
+++ b/drivers/usb/misc/ftdi-elan.c
@@ -665,7 +665,7 @@ static ssize_t ftdi_elan_read(struct file *file, char __user *buffer,
{
char data[30 *3 + 4];
char *d = data;
- int m = (sizeof(data) - 1) / 3;
+ int m = (sizeof(data) - 1) / 3 - 1;
int bytes_read = 0;
int retry_on_empty = 10;
int retry_on_timeout = 5;
@@ -1684,7 +1684,7 @@ wait:if (ftdi->disconnected > 0) {
int i = 0;
char data[30 *3 + 4];
char *d = data;
- int m = (sizeof(data) - 1) / 3;
+ int m = (sizeof(data) - 1) / 3 - 1;
int l = 0;
struct u132_target *target = &ftdi->target[ed];
struct u132_command *command = &ftdi->command[
@@ -1876,7 +1876,7 @@ more:{
if (packet_bytes > 2) {
char diag[30 *3 + 4];
char *d = diag;
- int m = (sizeof(diag) - 1) / 3;
+ int m = (sizeof(diag) - 1) / 3 - 1;
char *b = ftdi->bulk_in_buffer;
int bytes_read = 0;
diag[0] = 0;
@@ -2053,7 +2053,7 @@ static int ftdi_elan_synchronize(struct usb_ftdi *ftdi)
if (packet_bytes > 2) {
char diag[30 *3 + 4];
char *d = diag;
- int m = (sizeof(diag) - 1) / 3;
+ int m = (sizeof(diag) - 1) / 3 - 1;
char *b = ftdi->bulk_in_buffer;
int bytes_read = 0;
unsigned char c = 0;
@@ -2155,7 +2155,7 @@ more:{
if (packet_bytes > 2) {
char diag[30 *3 + 4];
char *d = diag;
- int m = (sizeof(diag) - 1) / 3;
+ int m = (sizeof(diag) - 1) / 3 - 1;
char *b = ftdi->bulk_in_buffer;
int bytes_read = 0;
diag[0] = 0;
diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
index 6b978f04b8d7..5c8210dc6fd9 100644
--- a/drivers/usb/misc/usbtest.c
+++ b/drivers/usb/misc/usbtest.c
@@ -585,7 +585,6 @@ static void sg_timeout(unsigned long _req)
{
struct usb_sg_request *req = (struct usb_sg_request *) _req;
- req->status = -ETIMEDOUT;
usb_sg_cancel(req);
}
@@ -616,8 +615,10 @@ static int perform_sglist(
mod_timer(&sg_timer, jiffies +
msecs_to_jiffies(SIMPLE_IO_TIMEOUT));
usb_sg_wait(req);
- del_timer_sync(&sg_timer);
- retval = req->status;
+ if (!del_timer_sync(&sg_timer))
+ retval = -ETIMEDOUT;
+ else
+ retval = req->status;
/* FIXME check resulting data pattern */
@@ -2602,7 +2603,7 @@ usbtest_ioctl(struct usb_interface *intf, unsigned int code, void *buf)
ktime_get_ts64(&start);
retval = usbtest_do_ioctl(intf, param_32);
- if (retval)
+ if (retval < 0)
goto free_mutex;
ktime_get_ts64(&end);
diff --git a/drivers/usb/phy/phy-omap-otg.c b/drivers/usb/phy/phy-omap-otg.c
index 6f6d2a7fd5a0..6523af4f8f93 100644
--- a/drivers/usb/phy/phy-omap-otg.c
+++ b/drivers/usb/phy/phy-omap-otg.c
@@ -140,6 +140,8 @@ static int omap_otg_probe(struct platform_device *pdev)
(rev >> 4) & 0xf, rev & 0xf, config->extcon, otg_dev->id,
otg_dev->vbus);
+ platform_set_drvdata(pdev, otg_dev);
+
return 0;
}
diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 8fbbc2d32371..ac67bab9124c 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -514,7 +514,8 @@ static struct renesas_usbhs_platform_info *usbhs_parse_dt(struct device *dev)
if (gpio > 0)
dparam->enable_gpio = gpio;
- if (dparam->type == USBHS_TYPE_RCAR_GEN2)
+ if (dparam->type == USBHS_TYPE_RCAR_GEN2 ||
+ dparam->type == USBHS_TYPE_RCAR_GEN3)
dparam->has_usb_dmac = 1;
return info;
diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c
index 280ed5ff021b..857e78337324 100644
--- a/drivers/usb/renesas_usbhs/fifo.c
+++ b/drivers/usb/renesas_usbhs/fifo.c
@@ -871,7 +871,7 @@ static int usbhsf_dma_prepare_push(struct usbhs_pkt *pkt, int *is_done)
/* use PIO if packet is less than pio_dma_border or pipe is DCP */
if ((len < usbhs_get_dparam(priv, pio_dma_border)) ||
- usbhs_pipe_is_dcp(pipe))
+ usbhs_pipe_type_is(pipe, USB_ENDPOINT_XFER_ISOC))
goto usbhsf_pio_prepare_push;
/* check data length if this driver don't use USB-DMAC */
@@ -976,7 +976,7 @@ static int usbhsf_dma_prepare_pop_with_usb_dmac(struct usbhs_pkt *pkt,
/* use PIO if packet is less than pio_dma_border or pipe is DCP */
if ((pkt->length < usbhs_get_dparam(priv, pio_dma_border)) ||
- usbhs_pipe_is_dcp(pipe))
+ usbhs_pipe_type_is(pipe, USB_ENDPOINT_XFER_ISOC))
goto usbhsf_pio_prepare_pop;
fifo = usbhsf_get_dma_fifo(priv, pkt);
diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c
index 50f3363cc382..92bc83b92d10 100644
--- a/drivers/usb/renesas_usbhs/mod_gadget.c
+++ b/drivers/usb/renesas_usbhs/mod_gadget.c
@@ -617,10 +617,13 @@ static int usbhsg_ep_enable(struct usb_ep *ep,
* use dmaengine if possible.
* It will use pio handler if impossible.
*/
- if (usb_endpoint_dir_in(desc))
+ if (usb_endpoint_dir_in(desc)) {
pipe->handler = &usbhs_fifo_dma_push_handler;
- else
+ } else {
pipe->handler = &usbhs_fifo_dma_pop_handler;
+ usbhs_xxxsts_clear(priv, BRDYSTS,
+ usbhs_pipe_number(pipe));
+ }
ret = 0;
}
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 00820809139a..b2d767e743fc 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -648,6 +648,8 @@ static const struct usb_device_id id_table_combined[] = {
{ USB_DEVICE(FTDI_VID, FTDI_ELV_TFD128_PID) },
{ USB_DEVICE(FTDI_VID, FTDI_ELV_FM3RX_PID) },
{ USB_DEVICE(FTDI_VID, FTDI_ELV_WS777_PID) },
+ { USB_DEVICE(FTDI_VID, FTDI_PALMSENS_PID) },
+ { USB_DEVICE(FTDI_VID, FTDI_IVIUM_XSTAT_PID) },
{ USB_DEVICE(FTDI_VID, LINX_SDMUSBQSS_PID) },
{ USB_DEVICE(FTDI_VID, LINX_MASTERDEVEL2_PID) },
{ USB_DEVICE(FTDI_VID, LINX_FUTURE_0_PID) },
@@ -1008,6 +1010,7 @@ static const struct usb_device_id id_table_combined[] = {
{ USB_DEVICE(ICPDAS_VID, ICPDAS_I7560U_PID) },
{ USB_DEVICE(ICPDAS_VID, ICPDAS_I7561U_PID) },
{ USB_DEVICE(ICPDAS_VID, ICPDAS_I7563U_PID) },
+ { USB_DEVICE(WICED_VID, WICED_USB20706V2_PID) },
{ } /* Terminating entry */
};
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index c5d6c1e73e8e..f87a938cf005 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -406,6 +406,12 @@
#define FTDI_4N_GALAXY_DE_3_PID 0xF3C2
/*
+ * Ivium Technologies product IDs
+ */
+#define FTDI_PALMSENS_PID 0xf440
+#define FTDI_IVIUM_XSTAT_PID 0xf441
+
+/*
* Linx Technologies product ids
*/
#define LINX_SDMUSBQSS_PID 0xF448 /* Linx SDM-USB-QS-S */
@@ -673,6 +679,12 @@
#define INTREPID_NEOVI_PID 0x0701
/*
+ * WICED USB UART
+ */
+#define WICED_VID 0x0A5C
+#define WICED_USB20706V2_PID 0x6422
+
+/*
* Definitions for ID TECH (www.idt-net.com) devices
*/
#define IDTECH_VID 0x0ACD /* ID TECH Vendor ID */
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 8e07536c233a..bc472584a229 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -274,6 +274,12 @@ static void option_instat_callback(struct urb *urb);
#define TELIT_PRODUCT_LE920 0x1200
#define TELIT_PRODUCT_LE910 0x1201
#define TELIT_PRODUCT_LE910_USBCFG4 0x1206
+#define TELIT_PRODUCT_LE920A4_1207 0x1207
+#define TELIT_PRODUCT_LE920A4_1208 0x1208
+#define TELIT_PRODUCT_LE920A4_1211 0x1211
+#define TELIT_PRODUCT_LE920A4_1212 0x1212
+#define TELIT_PRODUCT_LE920A4_1213 0x1213
+#define TELIT_PRODUCT_LE920A4_1214 0x1214
/* ZTE PRODUCTS */
#define ZTE_VENDOR_ID 0x19d2
@@ -628,6 +634,11 @@ static const struct option_blacklist_info telit_le920_blacklist = {
.reserved = BIT(1) | BIT(5),
};
+static const struct option_blacklist_info telit_le920a4_blacklist_1 = {
+ .sendsetup = BIT(0),
+ .reserved = BIT(1),
+};
+
static const struct option_blacklist_info telit_le922_blacklist_usbcfg0 = {
.sendsetup = BIT(2),
.reserved = BIT(0) | BIT(1) | BIT(3),
@@ -1203,6 +1214,16 @@ static const struct usb_device_id option_ids[] = {
.driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920),
.driver_info = (kernel_ulong_t)&telit_le920_blacklist },
+ { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1207) },
+ { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1208),
+ .driver_info = (kernel_ulong_t)&telit_le920a4_blacklist_1 },
+ { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1211),
+ .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 },
+ { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1212),
+ .driver_info = (kernel_ulong_t)&telit_le920a4_blacklist_1 },
+ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1213, 0xff) },
+ { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1214),
+ .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 },
{ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */
{ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0002, 0xff, 0xff, 0xff),
.driver_info = (kernel_ulong_t)&net_intf1_blacklist },
@@ -1966,6 +1987,7 @@ static const struct usb_device_id option_ids[] = {
.driver_info = (kernel_ulong_t)&net_intf4_blacklist },
{ USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3e01, 0xff, 0xff, 0xff) }, /* D-Link DWM-152/C1 */
{ USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3e02, 0xff, 0xff, 0xff) }, /* D-Link DWM-156/C1 */
+ { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x7e11, 0xff, 0xff, 0xff) }, /* D-Link DWM-156/A3 */
{ USB_DEVICE_INTERFACE_CLASS(0x2020, 0x4000, 0xff) }, /* OLICARD300 - MT6225 */
{ USB_DEVICE(INOVIA_VENDOR_ID, INOVIA_SEW858) },
{ USB_DEVICE(VIATELECOM_VENDOR_ID, VIATELECOM_PRODUCT_CDS7) },
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index b1b9bac44016..d213cf44a7e4 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -1433,7 +1433,7 @@ int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[]
rc = usb_register(udriver);
if (rc)
- return rc;
+ goto failed_usb_register;
for (sd = serial_drivers; *sd; ++sd) {
(*sd)->usb_driver = udriver;
@@ -1451,6 +1451,8 @@ int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[]
while (sd-- > serial_drivers)
usb_serial_deregister(*sd);
usb_deregister(udriver);
+failed_usb_register:
+ kfree(udriver);
return rc;
}
EXPORT_SYMBOL_GPL(usb_serial_register_drivers);
diff --git a/fs/iomap.c b/fs/iomap.c
index 48141b8eff5f..0342254646e3 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -84,8 +84,11 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
* Now the data has been copied, commit the range we've copied. This
* should not fail unless the filesystem has had a fatal error.
*/
- ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
- flags, &iomap);
+ if (ops->iomap_end) {
+ ret = ops->iomap_end(inode, pos, length,
+ written > 0 ? written : 0,
+ flags, &iomap);
+ }
return written ? written : ret;
}
@@ -194,12 +197,9 @@ again:
if (mapping_writably_mapped(inode->i_mapping))
flush_dcache_page(page);
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
- mark_page_accessed(page);
status = iomap_write_end(inode, pos, bytes, copied, page);
if (unlikely(status < 0))
@@ -470,13 +470,18 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
if (ret)
return ret;
- ret = filemap_write_and_wait(inode->i_mapping);
- if (ret)
- return ret;
+ if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ return ret;
+ }
while (len > 0) {
ret = iomap_apply(inode, start, len, 0, ops, &ctx,
iomap_fiemap_actor);
+ /* inode with no (attribute) mapping will give ENOENT */
+ if (ret == -ENOENT)
+ break;
if (ret < 0)
return ret;
if (ret == 0)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 776ae2f325d1..3dd8f1d54498 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1582,6 +1582,7 @@ xfs_alloc_ag_vextent_small(
xfs_extlen_t *flenp, /* result length */
int *stat) /* status: 0-freelist, 1-normal/none */
{
+ struct xfs_owner_info oinfo;
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
@@ -1624,6 +1625,18 @@ xfs_alloc_ag_vextent_small(
error0);
args->wasfromfl = 1;
trace_xfs_alloc_small_freelist(args);
+
+ /*
+ * If we're feeding an AGFL block to something that
+ * doesn't live in the free space, we need to clear
+ * out the OWN_AG rmap.
+ */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ error = xfs_rmap_free(args->tp, args->agbp, args->agno,
+ fbno, 1, &oinfo);
+ if (error)
+ goto error0;
+
*stat = 0;
return 0;
}
@@ -2264,6 +2277,7 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_longest),
offsetof(xfs_agf_t, agf_btreeblks),
offsetof(xfs_agf_t, agf_uuid),
+ offsetof(xfs_agf_t, agf_rmap_blocks),
sizeof(xfs_agf_t)
};
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index f814d42c73b2..e6a8bea0f7ba 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -640,12 +640,15 @@ typedef struct xfs_agf {
__be32 agf_btreeblks; /* # of blocks held in AGF btrees */
uuid_t agf_uuid; /* uuid of filesystem */
+ __be32 agf_rmap_blocks; /* rmapbt blocks used */
+ __be32 agf_padding; /* padding */
+
/*
* reserve some contiguous space for future logged fields before we add
* the unlogged fields. This makes the range logging via flags and
* structure offsets much simpler.
*/
- __be64 agf_spare64[16];
+ __be64 agf_spare64[15];
/* unlogged fields, written during buffer writeback. */
__be64 agf_lsn; /* last write sequence */
@@ -670,7 +673,8 @@ typedef struct xfs_agf {
#define XFS_AGF_LONGEST 0x00000400
#define XFS_AGF_BTREEBLKS 0x00000800
#define XFS_AGF_UUID 0x00001000
-#define XFS_AGF_NUM_BITS 13
+#define XFS_AGF_RMAP_BLOCKS 0x00002000
+#define XFS_AGF_NUM_BITS 14
#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
@@ -686,7 +690,8 @@ typedef struct xfs_agf {
{ XFS_AGF_FREEBLKS, "FREEBLKS" }, \
{ XFS_AGF_LONGEST, "LONGEST" }, \
{ XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
- { XFS_AGF_UUID, "UUID" }
+ { XFS_AGF_UUID, "UUID" }, \
+ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index bc1faebc84ec..17b8eeb34ac8 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -98,6 +98,8 @@ xfs_rmapbt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
int error;
xfs_agblock_t bno;
@@ -124,6 +126,8 @@ xfs_rmapbt_alloc_block(
xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
+ be32_add_cpu(&agf->agf_rmap_blocks, 1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
@@ -143,6 +147,8 @@ xfs_rmapbt_free_block(
bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
bno, 1);
+ be32_add_cpu(&agf->agf_rmap_blocks, -1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
if (error)
return error;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 47a318ce82e0..607cc29bba21 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -115,7 +115,6 @@ xfs_buf_ioacct_dec(
if (!(bp->b_flags & _XBF_IN_FLIGHT))
return;
- ASSERT(bp->b_flags & XBF_ASYNC);
bp->b_flags &= ~_XBF_IN_FLIGHT;
percpu_counter_dec(&bp->b_target->bt_io_count);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ed95e5bb04e6..e612a0233710 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -741,9 +741,20 @@ xfs_file_dax_write(
* page is inserted into the pagecache when we have to serve a write
* fault on a hole. It should never be dirtied and can simply be
* dropped from the pagecache once we get real data for the page.
+ *
+ * XXX: This is racy against mmap, and there's nothing we can do about
+ * it. dax_do_io() should really do this invalidation internally as
+ * it will know if we've allocated over a holei for this specific IO and
+ * if so it needs to update the mapping tree and invalidate existing
+ * PTEs over the newly allocated range. Remove this invalidation when
+ * dax_do_io() is fixed up.
*/
if (mapping->nrpages) {
- ret = invalidate_inode_pages2(mapping);
+ loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
WARN_ON_ONCE(ret);
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0f96847b90e1..0b7f986745c1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -248,6 +248,7 @@ xfs_growfs_data_private(
agf->agf_roots[XFS_BTNUM_RMAPi] =
cpu_to_be32(XFS_RMAP_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_blocks = cpu_to_be32(1);
}
agf->agf_flfirst = cpu_to_be32(1);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2114d53df433..2af0dda1c978 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -715,12 +715,16 @@ xfs_iomap_write_allocate(
* is in the delayed allocation extent on which we sit
* but before our buffer starts.
*/
-
nimaps = 0;
while (nimaps == 0) {
nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+ /*
+ * We have already reserved space for the extent and any
+ * indirect blocks when creating the delalloc extent,
+ * there is no need to reserve space in this transaction
+ * again.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
@@ -1037,20 +1041,14 @@ xfs_file_iomap_begin(
return error;
trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
- } else if (nimaps) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
} else {
+ ASSERT(nimaps);
+
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_HOLE;
- iomap->offset = offset;
- iomap->length = length;
+ trace_xfs_iomap_found(ip, offset, length, 0, &imap);
}
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
return 0;
}
@@ -1112,3 +1110,48 @@ struct iomap_ops xfs_iomap_ops = {
.iomap_begin = xfs_file_iomap_begin,
.iomap_end = xfs_file_iomap_end,
};
+
+static int
+xfs_xattr_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1, error = 0;
+ unsigned lockmode;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+
+ /* if there are no attribute fork or extents, return ENOENT */
+ if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+ error = -ENOENT;
+ goto out_unlock;
+ }
+
+ ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+
+ if (!error) {
+ ASSERT(nimaps);
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ }
+
+ return error;
+}
+
+struct iomap_ops xfs_xattr_iomap_ops = {
+ .iomap_begin = xfs_xattr_iomap_begin,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e066d045e2ff..fb8aca3d69ab 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -35,5 +35,6 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *);
extern struct iomap_ops xfs_iomap_ops;
+extern struct iomap_ops xfs_xattr_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ab820f84ed50..b24c3102fa93 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1009,7 +1009,14 @@ xfs_vn_fiemap(
int error;
xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
- error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
+ error = iomap_fiemap(inode, fieinfo, start, length,
+ &xfs_xattr_iomap_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, length,
+ &xfs_iomap_ops);
+ }
xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 551b7e26980c..7e88bec3f359 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1298,7 +1298,6 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
DEFINE_IOMAP_EVENT(xfs_iomap_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 1bb954842725..436aa4e42221 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -527,13 +527,13 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
* object's lifetime is managed by something other than RCU. That
* "something other" might be reference counting or simple immortality.
*
- * The seemingly unused void * variable is to validate @p is indeed a pointer
- * type. All pointer types silently cast to void *.
+ * The seemingly unused size_t variable is to validate @p is indeed a pointer
+ * type by making sure it can be dereferenced.
*/
#define lockless_dereference(p) \
({ \
typeof(p) _________p1 = READ_ONCE(p); \
- __maybe_unused const void * const _________p2 = _________p1; \
+ size_t __maybe_unused __size_of_ptr = sizeof(*(p)); \
smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
(_________p1); \
})
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9c28b4d4c90b..01c0b9cc3915 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -265,6 +265,7 @@ struct kvm_vcpu {
#endif
bool preempted;
struct kvm_vcpu_arch arch;
+ struct dentry *debugfs_dentry;
};
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -749,6 +750,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
+bool kvm_arch_has_vcpu_debugfs(void);
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu);
+
int kvm_arch_hardware_enable(void);
void kvm_arch_hardware_disable(void);
int kvm_arch_hardware_setup(void);
diff --git a/arch/x86/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 9906505c998a..8d9643767142 100644
--- a/arch/x86/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -29,3 +29,4 @@ CONFIG_NET_9P_VIRTIO=y
CONFIG_SCSI_LOWLEVEL=y
CONFIG_SCSI_VIRTIO=y
CONFIG_VIRTIO_INPUT=y
+CONFIG_DRM_VIRTIO_GPU=y
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1903b8f3a705..5650f5317e0c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
return ret;
}
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
- struct event_function_struct efs = {
- .event = event,
- .func = func,
- .data = data,
- };
-
- int ret = event_function(&efs);
- WARN_ON_ONCE(ret);
-}
-
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
raw_spin_unlock_irq(&ctx->lock);
}
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct task_struct *task = READ_ONCE(ctx->task);
+ struct perf_event_context *task_ctx = NULL;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (task) {
+ if (task == TASK_TOMBSTONE)
+ return;
+
+ task_ctx = ctx;
+ }
+
+ perf_ctx_lock(cpuctx, task_ctx);
+
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
+
+ if (task) {
+ /*
+ * We must be either inactive or active and the right task,
+ * otherwise we're screwed, since we cannot IPI to somewhere
+ * else.
+ */
+ if (ctx->is_active) {
+ if (WARN_ON_ONCE(task != current))
+ goto unlock;
+
+ if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+ goto unlock;
+ }
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ func(event, cpuctx, ctx, data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
+}
+
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
@@ -3513,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group)
.group = group,
.ret = 0,
};
- smp_call_function_single(event->oncpu,
- __perf_event_read, &data, 1);
- ret = data.ret;
+ ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+ /* The event must have been read from an online CPU: */
+ WARN_ON_ONCE(ret);
+ ret = ret ? : data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
@@ -6584,15 +6621,6 @@ got_name:
}
/*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
- return filter->filter && filter->inode;
-}
-
-/*
* Check whether inode and address range match filter criteria.
*/
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6653,6 +6681,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
struct perf_event_context *ctx;
int ctxn;
+ /*
+ * Data tracing isn't supported yet and as such there is no need
+ * to keep track of anything that isn't related to executable code:
+ */
+ if (!(vma->vm_flags & VM_EXEC))
+ return;
+
rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7805,7 +7840,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
list_for_each_entry(filter, &ifh->list, entry) {
event->addr_filters_offs[count] = 0;
- if (perf_addr_filter_needs_mmap(filter))
+ /*
+ * Adjust base offset if the filter is associated to a binary
+ * that needs to be mapped:
+ */
+ if (filter->inode)
event->addr_filters_offs[count] =
perf_addr_filter_apply(filter, mm);
@@ -7936,8 +7975,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
goto fail;
}
- if (token == IF_SRC_FILE) {
- filename = match_strdup(&args[2]);
+ if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+ int fpos = filter->range ? 2 : 1;
+
+ filename = match_strdup(&args[fpos]);
if (!filename) {
ret = -ENOMEM;
goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525ab2083..8c50276b60d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
+ if (!ptep) {
+ mem_cgroup_cancel_charge(kpage, memcg, false);
goto unlock;
+ }
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 9a0178c2ac1d..b02228411d57 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
*/
static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur.node = list_entry(bm->cur.node->list.next,
- struct rtree_node, list);
- if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
bm->cur.node_pfn += BM_BITS_PER_BLOCK;
bm->cur.node_bit = 0;
touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/* No more nodes, goto next zone */
- bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur.zone->list != &bm->zones) {
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9858266fb0b3..a846cf89eb96 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* idle, or potentially user or system time. Due to rounding,
* other time can exceed ticks occasionally.
*/
- other = account_other_time(cputime);
+ other = account_other_time(ULONG_MAX);
if (other >= cputime)
return;
cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
}
cputime = cputime_one_jiffy;
- steal = steal_account_process_time(cputime);
+ steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
return;
@@ -516,7 +521,7 @@ void account_idle_ticks(unsigned long ticks)
}
cputime = jiffies_to_cputime(ticks);
- steal = steal_account_process_time(cputime);
+ steal = steal_account_process_time(ULONG_MAX);
if (steal >= cputime)
return;
@@ -614,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicy code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -649,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
@@ -694,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
unsigned long now = READ_ONCE(jiffies);
cputime_t delta, other;
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, and no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
delta = jiffies_to_cputime(now - tsk->vtime_snap);
other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
diff --git a/security/Kconfig b/security/Kconfig
index df28f2b6f3e1..da10d9b573a4 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -136,6 +136,7 @@ config HAVE_ARCH_HARDENED_USERCOPY
config HARDENED_USERCOPY
bool "Harden memory copies between kernel and userspace"
depends on HAVE_ARCH_HARDENED_USERCOPY
+ depends on HAVE_HARDENED_USERCOPY_ALLOCATOR
select BUG
help
This option checks for obviously wrong memory regions when
diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h
index f209ea151dca..3051f86a9b5f 100644
--- a/tools/arch/arm64/include/uapi/asm/kvm.h
+++ b/tools/arch/arm64/include/uapi/asm/kvm.h
@@ -87,9 +87,11 @@ struct kvm_regs {
/* Supported VGICv3 address types */
#define KVM_VGIC_V3_ADDR_TYPE_DIST 2
#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3
+#define KVM_VGIC_ITS_ADDR_TYPE 4
#define KVM_VGIC_V3_DIST_SIZE SZ_64K
#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K)
#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */
#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h
index 3b8e99ef9d58..a2ffec4139ad 100644
--- a/tools/arch/s390/include/uapi/asm/kvm.h
+++ b/tools/arch/s390/include/uapi/asm/kvm.h
@@ -93,6 +93,47 @@ struct kvm_s390_vm_cpu_machine {
__u64 fac_list[256];
};
+#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2
+#define KVM_S390_VM_CPU_MACHINE_FEAT 3
+
+#define KVM_S390_VM_CPU_FEAT_NR_BITS 1024
+#define KVM_S390_VM_CPU_FEAT_ESOP 0
+#define KVM_S390_VM_CPU_FEAT_SIEF2 1
+#define KVM_S390_VM_CPU_FEAT_64BSCAO 2
+#define KVM_S390_VM_CPU_FEAT_SIIF 3
+#define KVM_S390_VM_CPU_FEAT_GPERE 4
+#define KVM_S390_VM_CPU_FEAT_GSLS 5
+#define KVM_S390_VM_CPU_FEAT_IB 6
+#define KVM_S390_VM_CPU_FEAT_CEI 7
+#define KVM_S390_VM_CPU_FEAT_IBS 8
+#define KVM_S390_VM_CPU_FEAT_SKEY 9
+#define KVM_S390_VM_CPU_FEAT_CMMA 10
+#define KVM_S390_VM_CPU_FEAT_PFMFI 11
+#define KVM_S390_VM_CPU_FEAT_SIGPIF 12
+struct kvm_s390_vm_cpu_feat {
+ __u64 feat[16];
+};
+
+#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC 4
+#define KVM_S390_VM_CPU_MACHINE_SUBFUNC 5
+/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */
+struct kvm_s390_vm_cpu_subfunc {
+ __u8 plo[32]; /* always */
+ __u8 ptff[16]; /* with TOD-clock steering */
+ __u8 kmac[16]; /* with MSA */
+ __u8 kmc[16]; /* with MSA */
+ __u8 km[16]; /* with MSA */
+ __u8 kimd[16]; /* with MSA */
+ __u8 klmd[16]; /* with MSA */
+ __u8 pckmo[16]; /* with MSA3 */
+ __u8 kmctr[16]; /* with MSA4 */
+ __u8 kmf[16]; /* with MSA4 */
+ __u8 kmo[16]; /* with MSA4 */
+ __u8 pcc[16]; /* with MSA4 */
+ __u8 ppno[16]; /* with MSA5 */
+ __u8 reserved[1824];
+};
+
/* kvm attributes for crypto */
#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0
#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1
diff --git a/tools/arch/s390/include/uapi/asm/sie.h b/tools/arch/s390/include/uapi/asm/sie.h
index 8fb5d4a6dd25..3ac634368939 100644
--- a/tools/arch/s390/include/uapi/asm/sie.h
+++ b/tools/arch/s390/include/uapi/asm/sie.h
@@ -140,6 +140,7 @@
exit_code_ipa0(0xB2, 0x4c, "TAR"), \
exit_code_ipa0(0xB2, 0x50, "CSP"), \
exit_code_ipa0(0xB2, 0x54, "MVPG"), \
+ exit_code_ipa0(0xB2, 0x56, "STHYI"), \
exit_code_ipa0(0xB2, 0x58, "BSG"), \
exit_code_ipa0(0xB2, 0x5a, "BSA"), \
exit_code_ipa0(0xB2, 0x5f, "CHSC"), \
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index 8d4dc97d80ba..35745a733100 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -97,6 +97,7 @@ void arch__fix_tev_from_maps(struct perf_probe_event *pev,
}
}
+#ifdef HAVE_LIBELF_SUPPORT
void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
int ntevs)
{
@@ -118,5 +119,6 @@ void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
}
}
}
+#endif /* HAVE_LIBELF_SUPPORT */
#endif
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index fb51457ba338..a2412e9d883b 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -501,7 +501,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
struct intel_pt_recording *ptr =
container_of(itr, struct intel_pt_recording, itr);
struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu;
- bool have_timing_info;
+ bool have_timing_info, need_immediate = false;
struct perf_evsel *evsel, *intel_pt_evsel = NULL;
const struct cpu_map *cpus = evlist->cpus;
bool privileged = geteuid() == 0 || perf_event_paranoid() < 0;
@@ -655,6 +655,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
ptr->have_sched_switch = 3;
} else {
opts->record_switch_events = true;
+ need_immediate = true;
if (cpu_wide)
ptr->have_sched_switch = 3;
else
@@ -700,6 +701,9 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
tracking_evsel->attr.freq = 0;
tracking_evsel->attr.sample_period = 1;
+ if (need_immediate)
+ tracking_evsel->immediate = true;
+
/* In per-cpu case, always need the time of mmap events etc */
if (!cpu_map__empty(cpus)) {
perf_evsel__set_sample_bit(tracking_evsel, TIME);
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index d608a2c9e48c..d1ce29be560e 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -88,6 +88,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
if (mem->operation & MEM_OPERATION_LOAD)
perf_mem_events[PERF_MEM_EVENTS__LOAD].record = true;
+ if (mem->operation & MEM_OPERATION_STORE)
+ perf_mem_events[PERF_MEM_EVENTS__STORE].record = true;
+
if (perf_mem_events[PERF_MEM_EVENTS__LOAD].record)
rec_argv[i++] = "-W";
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 9c640a8081c7..c859e59dfe3e 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -371,14 +371,16 @@ static int perf_session__check_output_opt(struct perf_session *session)
if (!no_callchain) {
bool use_callchain = false;
+ bool not_pipe = false;
evlist__for_each_entry(session->evlist, evsel) {
+ not_pipe = true;
if (evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
use_callchain = true;
break;
}
}
- if (!use_callchain)
+ if (not_pipe && !use_callchain)
symbol_conf.use_callchain = false;
}
@@ -1690,8 +1692,13 @@ static int list_available_scripts(const struct option *opt __maybe_unused,
snprintf(scripts_path, MAXPATHLEN, "%s/scripts", get_argv_exec_path());
scripts_dir = opendir(scripts_path);
- if (!scripts_dir)
- return -1;
+ if (!scripts_dir) {
+ fprintf(stdout,
+ "open(%s) failed.\n"
+ "Check \"PERF_EXEC_PATH\" env to set scripts dir.\n",
+ scripts_path);
+ exit(-1);
+ }
for_each_lang(scripts_path, scripts_dir, lang_dirent) {
snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 9c8f15da86ce..8ff6c6a61291 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -123,8 +123,6 @@ struct intel_pt_decoder {
bool have_calc_cyc_to_tsc;
int exec_mode;
unsigned int insn_bytes;
- uint64_t sign_bit;
- uint64_t sign_bits;
uint64_t period;
enum intel_pt_period_type period_type;
uint64_t tot_insn_cnt;
@@ -191,9 +189,6 @@ struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params)
decoder->data = params->data;
decoder->return_compression = params->return_compression;
- decoder->sign_bit = (uint64_t)1 << 47;
- decoder->sign_bits = ~(((uint64_t)1 << 48) - 1);
-
decoder->period = params->period;
decoder->period_type = params->period_type;
@@ -362,21 +357,30 @@ int intel_pt__strerror(int code, char *buf, size_t buflen)
return 0;
}
-static uint64_t intel_pt_calc_ip(struct intel_pt_decoder *decoder,
- const struct intel_pt_pkt *packet,
+static uint64_t intel_pt_calc_ip(const struct intel_pt_pkt *packet,
uint64_t last_ip)
{
uint64_t ip;
switch (packet->count) {
- case 2:
+ case 1:
ip = (last_ip & (uint64_t)0xffffffffffff0000ULL) |
packet->payload;
break;
- case 4:
+ case 2:
ip = (last_ip & (uint64_t)0xffffffff00000000ULL) |
packet->payload;
break;
+ case 3:
+ ip = packet->payload;
+ /* Sign-extend 6-byte ip */
+ if (ip & (uint64_t)0x800000000000ULL)
+ ip |= (uint64_t)0xffff000000000000ULL;
+ break;
+ case 4:
+ ip = (last_ip & (uint64_t)0xffff000000000000ULL) |
+ packet->payload;
+ break;
case 6:
ip = packet->payload;
break;
@@ -384,16 +388,12 @@ static uint64_t intel_pt_calc_ip(struct intel_pt_decoder *decoder,
return 0;
}
- if (ip & decoder->sign_bit)
- return ip | decoder->sign_bits;
-
return ip;
}
static inline void intel_pt_set_last_ip(struct intel_pt_decoder *decoder)
{
- decoder->last_ip = intel_pt_calc_ip(decoder, &decoder->packet,
- decoder->last_ip);
+ decoder->last_ip = intel_pt_calc_ip(&decoder->packet, decoder->last_ip);
}
static inline void intel_pt_set_ip(struct intel_pt_decoder *decoder)
@@ -1657,6 +1657,12 @@ next:
}
}
+static inline bool intel_pt_have_ip(struct intel_pt_decoder *decoder)
+{
+ return decoder->last_ip || decoder->packet.count == 0 ||
+ decoder->packet.count == 3 || decoder->packet.count == 6;
+}
+
/* Walk PSB+ packets to get in sync. */
static int intel_pt_walk_psb(struct intel_pt_decoder *decoder)
{
@@ -1677,8 +1683,7 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder)
case INTEL_PT_FUP:
decoder->pge = true;
- if (decoder->last_ip || decoder->packet.count == 6 ||
- decoder->packet.count == 0) {
+ if (intel_pt_have_ip(decoder)) {
uint64_t current_ip = decoder->ip;
intel_pt_set_ip(decoder);
@@ -1767,8 +1772,7 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder)
case INTEL_PT_TIP_PGE:
case INTEL_PT_TIP:
decoder->pge = decoder->packet.type != INTEL_PT_TIP_PGD;
- if (decoder->last_ip || decoder->packet.count == 6 ||
- decoder->packet.count == 0)
+ if (intel_pt_have_ip(decoder))
intel_pt_set_ip(decoder);
if (decoder->ip)
return 0;
@@ -1776,9 +1780,7 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder)
case INTEL_PT_FUP:
if (decoder->overflow) {
- if (decoder->last_ip ||
- decoder->packet.count == 6 ||
- decoder->packet.count == 0)
+ if (intel_pt_have_ip(decoder))
intel_pt_set_ip(decoder);
if (decoder->ip)
return 0;
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
index b1257c816310..4f7b32020487 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
@@ -292,36 +292,46 @@ static int intel_pt_get_ip(enum intel_pt_pkt_type type, unsigned int byte,
const unsigned char *buf, size_t len,
struct intel_pt_pkt *packet)
{
- switch (byte >> 5) {
+ int ip_len;
+
+ packet->count = byte >> 5;
+
+ switch (packet->count) {
case 0:
- packet->count = 0;
+ ip_len = 0;
break;
case 1:
if (len < 3)
return INTEL_PT_NEED_MORE_BYTES;
- packet->count = 2;
+ ip_len = 2;
packet->payload = le16_to_cpu(*(uint16_t *)(buf + 1));
break;
case 2:
if (len < 5)
return INTEL_PT_NEED_MORE_BYTES;
- packet->count = 4;
+ ip_len = 4;
packet->payload = le32_to_cpu(*(uint32_t *)(buf + 1));
break;
case 3:
- case 6:
+ case 4:
if (len < 7)
return INTEL_PT_NEED_MORE_BYTES;
- packet->count = 6;
+ ip_len = 6;
memcpy_le64(&packet->payload, buf + 1, 6);
break;
+ case 6:
+ if (len < 9)
+ return INTEL_PT_NEED_MORE_BYTES;
+ ip_len = 8;
+ packet->payload = le64_to_cpu(*(uint64_t *)(buf + 1));
+ break;
default:
return INTEL_PT_BAD_PACKET;
}
packet->type = type;
- return packet->count + 1;
+ return ip_len + 1;
}
static int intel_pt_get_mode(const unsigned char *buf, size_t len,
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index 9f3305f6b6d5..95f0884aae02 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -1,3 +1,4 @@
+#include <sys/sysmacros.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index 9aed9c332da6..9c3b9ed5b3c3 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -133,7 +133,7 @@ int probe_file__open_both(int *kfd, int *ufd, int flag)
/* Get raw string list of current kprobe_events or uprobe_events */
struct strlist *probe_file__get_rawlist(int fd)
{
- int ret, idx;
+ int ret, idx, fddup;
FILE *fp;
char buf[MAX_CMDLEN];
char *p;
@@ -143,8 +143,17 @@ struct strlist *probe_file__get_rawlist(int fd)
return NULL;
sl = strlist__new(NULL, NULL);
+ if (sl == NULL)
+ return NULL;
+
+ fddup = dup(fd);
+ if (fddup < 0)
+ goto out_free_sl;
+
+ fp = fdopen(fddup, "r");
+ if (!fp)
+ goto out_close_fddup;
- fp = fdopen(dup(fd), "r");
while (!feof(fp)) {
p = fgets(buf, MAX_CMDLEN, fp);
if (!p)
@@ -156,13 +165,21 @@ struct strlist *probe_file__get_rawlist(int fd)
ret = strlist__add(sl, buf);
if (ret < 0) {
pr_debug("strlist__add failed (%d)\n", ret);
- strlist__delete(sl);
- return NULL;
+ goto out_close_fp;
}
}
fclose(fp);
return sl;
+
+out_close_fp:
+ fclose(fp);
+ goto out_free_sl;
+out_close_fddup:
+ close(fddup);
+out_free_sl:
+ strlist__delete(sl);
+ return NULL;
}
static struct strlist *__probe_file__get_namelist(int fd, bool include_group)
@@ -447,12 +464,17 @@ static int probe_cache__load(struct probe_cache *pcache)
{
struct probe_cache_entry *entry = NULL;
char buf[MAX_CMDLEN], *p;
- int ret = 0;
+ int ret = 0, fddup;
FILE *fp;
- fp = fdopen(dup(pcache->fd), "r");
- if (!fp)
+ fddup = dup(pcache->fd);
+ if (fddup < 0)
+ return -errno;
+ fp = fdopen(fddup, "r");
+ if (!fp) {
+ close(fddup);
return -EINVAL;
+ }
while (!feof(fp)) {
if (!fgets(buf, MAX_CMDLEN, fp))
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index a34321e9b44d..a811c13a74d6 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -837,7 +837,8 @@ int dso__load_sym(struct dso *dso, struct map *map,
sec = syms_ss->symtab;
shdr = syms_ss->symshdr;
- if (elf_section_by_name(elf, &ehdr, &tshdr, ".text", NULL))
+ if (elf_section_by_name(runtime_ss->elf, &runtime_ss->ehdr, &tshdr,
+ ".text", NULL))
dso->text_offset = tshdr.sh_addr - tshdr.sh_offset;
if (runtime_ss->opdsec)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b3fa12ce1166..81dfc73d3df3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -559,9 +559,11 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm)
debugfs_remove_recursive(kvm->debugfs_dentry);
- for (i = 0; i < kvm_debugfs_num_entries; i++)
- kfree(kvm->debugfs_stat_data[i]);
- kfree(kvm->debugfs_stat_data);
+ if (kvm->debugfs_stat_data) {
+ for (i = 0; i < kvm_debugfs_num_entries; i++)
+ kfree(kvm->debugfs_stat_data[i]);
+ kfree(kvm->debugfs_stat_data);
+ }
}
static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
@@ -2369,6 +2371,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
struct kvm_vcpu *vcpu = filp->private_data;
+ debugfs_remove_recursive(vcpu->debugfs_dentry);
kvm_put_kvm(vcpu->kvm);
return 0;
}
@@ -2391,6 +2394,32 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
}
+static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ char dir_name[ITOA_MAX_LEN * 2];
+ int ret;
+
+ if (!kvm_arch_has_vcpu_debugfs())
+ return 0;
+
+ if (!debugfs_initialized())
+ return 0;
+
+ snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
+ vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
+ vcpu->kvm->debugfs_dentry);
+ if (!vcpu->debugfs_dentry)
+ return -ENOMEM;
+
+ ret = kvm_arch_create_vcpu_debugfs(vcpu);
+ if (ret < 0) {
+ debugfs_remove_recursive(vcpu->debugfs_dentry);
+ return ret;
+ }
+
+ return 0;
+}
+
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
@@ -2423,6 +2452,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
if (r)
goto vcpu_destroy;
+ r = kvm_create_vcpu_debugfs(vcpu);
+ if (r)
+ goto vcpu_destroy;
+
mutex_lock(&kvm->lock);
if (kvm_get_vcpu_by_id(kvm, id)) {
r = -EEXIST;
@@ -2454,6 +2487,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
+ debugfs_remove_recursive(vcpu->debugfs_dentry);
vcpu_destroy:
kvm_arch_vcpu_destroy(vcpu);
vcpu_decrement:
@@ -3619,7 +3653,7 @@ static int vm_stat_get_per_vm(void *data, u64 *val)
{
struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
- *val = *(u32 *)((void *)stat_data->kvm + stat_data->offset);
+ *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
return 0;
}
@@ -3649,7 +3683,7 @@ static int vcpu_stat_get_per_vm(void *data, u64 *val)
*val = 0;
kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
- *val += *(u32 *)((void *)vcpu + stat_data->offset);
+ *val += *(u64 *)((void *)vcpu + stat_data->offset);
return 0;
}