Diffstat (limited to 'arch/powerpc')
-rw-r--r--  arch/powerpc/Kconfig | 19
-rw-r--r--  arch/powerpc/Makefile | 15
-rw-r--r--  arch/powerpc/include/asm/code-patching.h | 21
-rw-r--r--  arch/powerpc/include/asm/eeh.h | 5
-rw-r--r--  arch/powerpc/include/asm/ftrace.h | 5
-rw-r--r--  arch/powerpc/include/asm/hvcall.h | 1
-rw-r--r--  arch/powerpc/include/asm/module.h | 12
-rw-r--r--  arch/powerpc/include/asm/pci-bridge.h | 2
-rw-r--r--  arch/powerpc/include/asm/perf_event_server.h | 10
-rw-r--r--  arch/powerpc/include/asm/sections.h | 12
-rw-r--r--  arch/powerpc/kernel/Makefile | 12
-rw-r--r--  arch/powerpc/kernel/eeh.c | 37
-rw-r--r--  arch/powerpc/kernel/eeh_cache.c | 11
-rw-r--r--  arch/powerpc/kernel/eeh_dev.c | 1
-rw-r--r--  arch/powerpc/kernel/eeh_driver.c | 150
-rw-r--r--  arch/powerpc/kernel/eeh_pe.c | 38
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 166
-rw-r--r--  arch/powerpc/kernel/ftrace.c | 132
-rw-r--r--  arch/powerpc/kernel/module.c | 5
-rw-r--r--  arch/powerpc/kernel/module_32.c | 20
-rw-r--r--  arch/powerpc/kernel/module_64.c | 214
-rw-r--r--  arch/powerpc/kernel/paca.c | 11
-rw-r--r--  arch/powerpc/kernel/pci-hotplug.c | 2
-rw-r--r--  arch/powerpc/kernel/pci_dn.c | 19
-rw-r--r--  arch/powerpc/lib/Makefile | 4
-rw-r--r--  arch/powerpc/perf/core-book3s.c | 2
-rw-r--r--  arch/powerpc/perf/hv-24x7.c | 225
-rw-r--r--  arch/powerpc/perf/hv-24x7.h | 1
-rw-r--r--  arch/powerpc/perf/power7-pmu.c | 18
-rw-r--r--  arch/powerpc/perf/power8-events-list.h | 51
-rw-r--r--  arch/powerpc/perf/power8-pmu.c | 110
-rw-r--r--  arch/powerpc/platforms/powermac/Makefile | 2
-rw-r--r--  arch/powerpc/platforms/powernv/eeh-powernv.c | 238
-rwxr-xr-x  arch/powerpc/scripts/gcc-check-mprofile-kernel.sh | 23
34 files changed, 1259 insertions, 335 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d500772a1d9b..b580ce0c25a8 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -94,6 +94,7 @@ config PPC
select OF_RESERVED_MEM
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_DYNAMIC_FTRACE
+ select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select SYSCTL_EXCEPTION_TRACE
@@ -373,6 +374,24 @@ config PPC_TRANSACTIONAL_MEM
---help---
Support user-mode Transactional Memory on POWERPC.
+config DISABLE_MPROFILE_KERNEL
+ bool "Disable use of mprofile-kernel for kernel tracing"
+ depends on PPC64 && CPU_LITTLE_ENDIAN
+ default y
+ help
+ Selecting this option disables use of the mprofile-kernel ABI for
+ kernel tracing. That will cause options such as live patching
+ (CONFIG_LIVEPATCH) which depend on CONFIG_DYNAMIC_FTRACE_WITH_REGS to
+ be disabled also.
+
+ If you have a toolchain which supports mprofile-kernel, then you can
+ say N here to make use of it. Otherwise, say Y to keep it disabled.
+ If you're not sure, say "N".
+
+config MPROFILE_KERNEL
+ depends on PPC64 && CPU_LITTLE_ENDIAN
+ def_bool !DISABLE_MPROFILE_KERNEL
+
config IOMMU_HELPER
def_bool PPC64
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 1a0ee01b0f46..709a22a3e824 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -133,6 +133,21 @@ else
CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64
endif
+ifdef CONFIG_MPROFILE_KERNEL
+ ifeq ($(shell $(srctree)/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
+ CC_FLAGS_FTRACE := -pg -mprofile-kernel
+ KBUILD_CPPFLAGS += -DCC_USING_MPROFILE_KERNEL
+ else
+ # If the user asked for mprofile-kernel but the toolchain doesn't
+ # support it, emit a warning and deliberately break the build later
+ # with mprofile-kernel-not-supported. We would prefer to make this an
+ # error right here, but then the user would never be able to run
+ # oldconfig to change their configuration.
+ $(warning Compiler does not support mprofile-kernel, set CONFIG_DISABLE_MPROFILE_KERNEL)
+ CC_FLAGS_FTRACE := -mprofile-kernel-not-supported
+ endif
+endif
+
CFLAGS-$(CONFIG_CELL_CPU) += $(call cc-option,-mcpu=cell)
CFLAGS-$(CONFIG_POWER4_CPU) += $(call cc-option,-mcpu=power4)
CFLAGS-$(CONFIG_POWER5_CPU) += $(call cc-option,-mcpu=power5)
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 840a5509b3f1..994c60a857ce 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -99,4 +99,25 @@ static inline unsigned long ppc_global_function_entry(void *func)
#endif
}
+#ifdef CONFIG_PPC64
+/*
+ * Some instruction encodings commonly used in dynamic ftracing
+ * and function live patching.
+ */
+
+/* This must match the definition of STK_GOT in <asm/ppc_asm.h> */
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define R2_STACK_OFFSET 24
+#else
+#define R2_STACK_OFFSET 40
+#endif
+
+#define PPC_INST_LD_TOC (PPC_INST_LD | ___PPC_RT(__REG_R2) | \
+ ___PPC_RA(__REG_R1) | R2_STACK_OFFSET)
+
+/* usually preceded by a mflr r0 */
+#define PPC_INST_STD_LR (PPC_INST_STD | ___PPC_RS(__REG_R0) | \
+ ___PPC_RA(__REG_R1) | PPC_LR_STKOFF)
+#endif /* CONFIG_PPC64 */
+
#endif /* _ASM_POWERPC_CODE_PATCHING_H */
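
For reference, these macros should expand roughly as follows; the values below are derived from the standard opcode definitions and are shown for illustration only, they are not part of the patch:

/*
 * On ELFv2 (_CALL_ELF == 2):
 *
 *   PPC_INST_LD_TOC  == 0xe8410018       ld  r2,24(r1)
 *   PPC_INST_STD_LR  == 0xf8010010       std r0,16(r1)
 *
 * On ELFv1, PPC_INST_LD_TOC becomes 0xe8410028 (ld r2,40(r1)).
 * These are the instruction words the ftrace code later in this series
 * looks for around "bl _mcount" call sites.
 */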
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 867c39b45df6..fb9f376ae27b 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -72,6 +72,7 @@ struct pci_dn;
#define EEH_PE_PHB (1 << 1) /* PHB PE */
#define EEH_PE_DEVICE (1 << 2) /* Device PE */
#define EEH_PE_BUS (1 << 3) /* Bus PE */
+#define EEH_PE_VF (1 << 4) /* VF PE */
#define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */
#define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */
@@ -136,11 +137,15 @@ struct eeh_dev {
int pcix_cap; /* Saved PCIx capability */
int pcie_cap; /* Saved PCIe capability */
int aer_cap; /* Saved AER capability */
+ int af_cap; /* Saved AF capability */
struct eeh_pe *pe; /* Associated PE */
struct list_head list; /* Form link list in the PE */
+ struct list_head rmv_list; /* Record the removed edevs */
struct pci_controller *phb; /* Associated PHB */
struct pci_dn *pdn; /* Associated PCI device node */
struct pci_dev *pdev; /* Associated PCI device */
+ bool in_error; /* Error flag for edev */
+ struct pci_dev *physfn; /* Associated SRIOV PF */
struct pci_bus *bus; /* PCI bus for partial hotplug */
};
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index ef89b1465573..50ca7585abe2 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -46,6 +46,8 @@
extern void _mcount(void);
#ifdef CONFIG_DYNAMIC_FTRACE
+# define FTRACE_ADDR ((unsigned long)ftrace_caller)
+# define FTRACE_REGS_ADDR FTRACE_ADDR
static inline unsigned long ftrace_call_adjust(unsigned long addr)
{
/* relocation of mcount call site is the same as the address */
@@ -58,6 +60,9 @@ struct dyn_arch_ftrace {
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* __ASSEMBLY__ */
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#define ARCH_SUPPORTS_FTRACE_OPS 1
+#endif
#endif
#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64) && !defined(__ASSEMBLY__)
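
With ARCH_SUPPORTS_FTRACE_OPS and DYNAMIC_FTRACE_WITH_REGS available, a tracer can ask for the register snapshot that ftrace_caller() saves. A minimal consumer sketch for kernels of this vintage (names are illustrative, not part of this patch):

#include <linux/ftrace.h>

/* Called for every traced function; regs is the pt_regs frame that
 * ftrace_caller() builds in entry_64.S when SAVE_REGS is requested. */
static void my_trace_func(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *op, struct pt_regs *regs)
{
	/* inspect or redirect regs here, e.g. for live patching */
}

static struct ftrace_ops my_ops = {
	.func	= my_trace_func,
	.flags	= FTRACE_OPS_FL_SAVE_REGS,
};

/* registered with register_ftrace_function(&my_ops) */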
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index e3b54dd4f730..0bc9c284aa10 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -94,6 +94,7 @@
#define H_SG_LIST -72
#define H_OP_MODE -73
#define H_COP_HW -74
+#define H_STATE -75
#define H_UNSUPPORTED_FLAG_START -256
#define H_UNSUPPORTED_FLAG_END -511
#define H_MULTI_THREADS_ACTIVE -9005
diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index 5f1526fddccf..cd4ffd86765f 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -78,10 +78,18 @@ struct mod_arch_specific {
# endif /* MODULE */
#endif
-bool is_module_trampoline(u32 *insns);
-int module_trampoline_target(struct module *mod, u32 *trampoline,
+int module_trampoline_target(struct module *mod, unsigned long trampoline,
unsigned long *target);
+#ifdef CONFIG_DYNAMIC_FTRACE
+int module_finalize_ftrace(struct module *mod, const Elf_Shdr *sechdrs);
+#else
+static inline int module_finalize_ftrace(struct module *mod, const Elf_Shdr *sechdrs)
+{
+ return 0;
+}
+#endif
+
struct exception_table_entry;
void sort_ex_table(struct exception_table_entry *start,
struct exception_table_entry *finish);
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index b0b43f5fbc5f..9f165e8a77bf 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -212,6 +212,7 @@ struct pci_dn {
#define IODA_INVALID_PE (-1)
#ifdef CONFIG_PPC_POWERNV
int pe_number;
+ int vf_index; /* VF index in the PF */
#ifdef CONFIG_PCI_IOV
u16 vfs_expanded; /* number of VFs IOV BAR expanded */
u16 num_vfs; /* number of VFs enabled*/
@@ -220,6 +221,7 @@ struct pci_dn {
#define IODA_INVALID_M64 (-1)
int (*m64_map)[PCI_SRIOV_NUM_BARS];
#endif /* CONFIG_PCI_IOV */
+ int mps; /* Maximum Payload Size */
#endif
struct list_head child_list;
struct list_head list;
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 814622146d5a..e157489ee7a1 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -136,16 +136,24 @@ extern ssize_t power_events_sysfs_show(struct device *dev,
* event 'cpu-cycles' can have two entries in sysfs: 'cpu-cycles' and
* 'PM_CYC' where the latter is the name by which the event is known in
* POWER CPU specification.
+ *
+ * Similarly, some hardware and cache events use the same event code. E.g.
+ * on POWER8, both "cache-references" and "L1-dcache-loads" events refer
+ * to the same event, PM_LD_REF_L1. The suffix allows us to have two
+ * sysfs objects for the same event and thus two entries/aliases in sysfs.
*/
#define EVENT_VAR(_id, _suffix) event_attr_##_id##_suffix
#define EVENT_PTR(_id, _suffix) &EVENT_VAR(_id, _suffix).attr.attr
#define EVENT_ATTR(_name, _id, _suffix) \
- PMU_EVENT_ATTR(_name, EVENT_VAR(_id, _suffix), PME_##_id, \
+ PMU_EVENT_ATTR(_name, EVENT_VAR(_id, _suffix), _id, \
power_events_sysfs_show)
#define GENERIC_EVENT_ATTR(_name, _id) EVENT_ATTR(_name, _id, _g)
#define GENERIC_EVENT_PTR(_id) EVENT_PTR(_id, _g)
+#define CACHE_EVENT_ATTR(_name, _id) EVENT_ATTR(_name, _id, _c)
+#define CACHE_EVENT_PTR(_id) EVENT_PTR(_id, _c)
+
#define POWER_EVENT_ATTR(_name, _id) EVENT_ATTR(_name, _id, _p)
#define POWER_EVENT_PTR(_id) EVENT_PTR(_id, _p)
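
As an illustration of what the new suffix enables, a PMU driver such as power8-pmu.c (changed in the diffstat above but not shown here) could expose the same event code under two sysfs names roughly like this sketch:

/* Two sysfs aliases for the same event code (illustrative only) */
GENERIC_EVENT_ATTR(cache-references,	PM_LD_REF_L1);
CACHE_EVENT_ATTR(L1-dcache-loads,	PM_LD_REF_L1);

static struct attribute *power8_events_attr[] = {
	GENERIC_EVENT_PTR(PM_LD_REF_L1),
	CACHE_EVENT_PTR(PM_LD_REF_L1),
	NULL
};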
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index a5e930aca804..abf5866e08c6 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -22,6 +22,18 @@ static inline int in_kernel_text(unsigned long addr)
return 0;
}
+static inline unsigned long kernel_toc_addr(void)
+{
+ /* Defined by the linker, see vmlinux.lds.S */
+ extern unsigned long __toc_start;
+
+ /*
+ * The TOC register (r2) points 32kB into the TOC, so that 64kB of
+ * the TOC can be addressed using a single machine instruction.
+ */
+ return (unsigned long)(&__toc_start) + 0x8000UL;
+}
+
static inline int overlaps_interrupt_vector_text(unsigned long start,
unsigned long end)
{
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 794f22adf99d..2da380fcc34c 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -16,14 +16,14 @@ endif
ifdef CONFIG_FUNCTION_TRACER
# Do not trace early boot code
-CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog
-CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
-CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
-CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
+CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
# do not trace tracer code
-CFLAGS_REMOVE_ftrace.o = -pg -mno-sched-epilog
+CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
# timers used by tracing
-CFLAGS_REMOVE_time.o = -pg -mno-sched-epilog
+CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
endif
obj-y := cputable.o ptrace.o syscalls.o \
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 8c6005cf1583..6544017eb90b 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -268,13 +268,6 @@ static void *eeh_dump_pe_log(void *data, void *flag)
struct eeh_dev *edev, *tmp;
size_t *plen = flag;
- /* If the PE's config space is blocked, 0xFF's will be
- * returned. It's pointless to collect the log in this
- * case.
- */
- if (pe->state & EEH_PE_CFG_BLOCKED)
- return NULL;
-
eeh_pe_for_each_dev(pe, edev, tmp)
*plen += eeh_dump_dev_log(edev, pci_regs_buf + *plen,
EEH_PCI_REGS_LOG_LEN - *plen);
@@ -677,7 +670,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
/* Check if the request is finished successfully */
if (active_flag) {
rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
- if (rc <= 0)
+ if (rc < 0)
return rc;
if (rc & active_flag)
@@ -761,7 +754,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
case pcie_deassert_reset:
eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
eeh_unfreeze_pe(pe, false);
- eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+ if (!(pe->type & EEH_PE_VF))
+ eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev);
eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
break;
@@ -769,14 +763,16 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED);
eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
- eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
+ if (!(pe->type & EEH_PE_VF))
+ eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
eeh_ops->reset(pe, EEH_RESET_HOT);
break;
case pcie_warm_reset:
eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED);
eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
- eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
+ if (!(pe->type & EEH_PE_VF))
+ eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
break;
default:
@@ -1243,6 +1239,14 @@ void eeh_remove_device(struct pci_dev *dev)
* from the parent PE during the BAR restore.
*/
edev->pdev = NULL;
+
+ /*
+ * The flag "in_error" is used to trace EEH devices for VFs
+ * in error state or not. It's set in eeh_report_error(). If
+ * it's not set, eeh_report_{reset,resume}() won't be called
+ * for the VF EEH device.
+ */
+ edev->in_error = false;
dev->dev.archdata.edev = NULL;
if (!(edev->pe->state & EEH_PE_KEEP))
eeh_rmv_from_parent_pe(edev);
@@ -1537,6 +1541,17 @@ int eeh_pe_get_state(struct eeh_pe *pe)
if (!eeh_ops || !eeh_ops->get_state)
return -ENOENT;
+ /*
+ * If the parent PE is owned by the host kernel and is undergoing
+ * error recovery, we should return the PE state as temporarily
+ * unavailable so that the error recovery on the guest is suspended
+ * until the recovery completes on the host.
+ */
+ if (pe->parent &&
+ !(pe->state & EEH_PE_REMOVED) &&
+ (pe->parent->state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING)))
+ return EEH_PE_STATE_UNAVAIL;
+
result = eeh_ops->get_state(pe, NULL);
rst_active = !!(result & EEH_STATE_RESET_ACTIVE);
dma_en = !!(result & EEH_STATE_DMA_ENABLED);
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index a1e86e172e3c..ddbcfab7efdf 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -195,8 +195,11 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
return;
}
- /* Walk resources on this device, poke them into the tree */
- for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ /*
+ * Walk resources on this device, poke the first 7 (6 normal BARs and 1
+ * ROM BAR) into the tree.
+ */
+ for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
resource_size_t start = pci_resource_start(dev,i);
resource_size_t end = pci_resource_end(dev,i);
unsigned long flags = pci_resource_flags(dev,i);
@@ -222,10 +225,6 @@ void eeh_addr_cache_insert_dev(struct pci_dev *dev)
{
unsigned long flags;
- /* Ignore PCI bridges */
- if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
- return;
-
spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
__eeh_addr_cache_insert_dev(dev);
spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index aabba94ff9cb..7815095fe3d8 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -67,6 +67,7 @@ void *eeh_dev_init(struct pci_dn *pdn, void *data)
edev->pdn = pdn;
edev->phb = phb;
INIT_LIST_HEAD(&edev->list);
+ INIT_LIST_HEAD(&edev->rmv_list);
return NULL;
}
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 650cfb31ea3d..fb6207d2c604 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -34,6 +34,11 @@
#include <asm/prom.h>
#include <asm/rtas.h>
+struct eeh_rmv_data {
+ struct list_head edev_list;
+ int removed;
+};
+
/**
* eeh_pcid_name - Retrieve name of PCI device driver
* @pdev: PCI device
@@ -190,7 +195,7 @@ static void *eeh_report_error(void *data, void *userdata)
enum pci_ers_result rc, *res = userdata;
struct pci_driver *driver;
- if (!dev || eeh_dev_removed(edev))
+ if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
return NULL;
dev->error_state = pci_channel_io_frozen;
@@ -211,6 +216,7 @@ static void *eeh_report_error(void *data, void *userdata)
if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+ edev->in_error = true;
eeh_pcid_put(dev);
return NULL;
}
@@ -231,7 +237,7 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
enum pci_ers_result rc, *res = userdata;
struct pci_driver *driver;
- if (!dev || eeh_dev_removed(edev))
+ if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
return NULL;
driver = eeh_pcid_get(dev);
@@ -271,7 +277,7 @@ static void *eeh_report_reset(void *data, void *userdata)
enum pci_ers_result rc, *res = userdata;
struct pci_driver *driver;
- if (!dev || eeh_dev_removed(edev))
+ if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
return NULL;
dev->error_state = pci_channel_io_normal;
@@ -282,7 +288,8 @@ static void *eeh_report_reset(void *data, void *userdata)
if (!driver->err_handler ||
!driver->err_handler->slot_reset ||
- (edev->mode & EEH_DEV_NO_HANDLER)) {
+ (edev->mode & EEH_DEV_NO_HANDLER) ||
+ (!edev->in_error)) {
eeh_pcid_put(dev);
return NULL;
}
@@ -326,20 +333,23 @@ static void *eeh_report_resume(void *data, void *userdata)
{
struct eeh_dev *edev = (struct eeh_dev *)data;
struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ bool was_in_error;
struct pci_driver *driver;
- if (!dev || eeh_dev_removed(edev))
+ if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
return NULL;
dev->error_state = pci_channel_io_normal;
driver = eeh_pcid_get(dev);
if (!driver) return NULL;
+ was_in_error = edev->in_error;
+ edev->in_error = false;
eeh_enable_irq(dev);
if (!driver->err_handler ||
!driver->err_handler->resume ||
- (edev->mode & EEH_DEV_NO_HANDLER)) {
+ (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
edev->mode &= ~EEH_DEV_NO_HANDLER;
eeh_pcid_put(dev);
return NULL;
@@ -365,7 +375,7 @@ static void *eeh_report_failure(void *data, void *userdata)
struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
struct pci_driver *driver;
- if (!dev || eeh_dev_removed(edev))
+ if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
return NULL;
dev->error_state = pci_channel_io_perm_failure;
@@ -386,12 +396,40 @@ static void *eeh_report_failure(void *data, void *userdata)
return NULL;
}
+static void *eeh_add_virt_device(void *data, void *userdata)
+{
+ struct pci_driver *driver;
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+ if (!(edev->physfn)) {
+ pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
+ __func__, edev->phb->global_number, pdn->busno,
+ PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+ return NULL;
+ }
+
+ driver = eeh_pcid_get(dev);
+ if (driver) {
+ eeh_pcid_put(dev);
+ if (driver->err_handler)
+ return NULL;
+ }
+
+#ifdef CONFIG_PPC_POWERNV
+ pci_iov_add_virtfn(edev->physfn, pdn->vf_index, 0);
+#endif
+ return NULL;
+}
+
static void *eeh_rmv_device(void *data, void *userdata)
{
struct pci_driver *driver;
struct eeh_dev *edev = (struct eeh_dev *)data;
struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
- int *removed = (int *)userdata;
+ struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
+ int *removed = rmv_data ? &rmv_data->removed : NULL;
/*
* Actually, we should remove the PCI bridges as well.
@@ -416,7 +454,11 @@ static void *eeh_rmv_device(void *data, void *userdata)
driver = eeh_pcid_get(dev);
if (driver) {
eeh_pcid_put(dev);
- if (driver->err_handler &&
+ if (removed &&
+ eeh_pe_passed(edev->pe))
+ return NULL;
+ if (removed &&
+ driver->err_handler &&
driver->err_handler->error_detected &&
driver->err_handler->slot_reset)
return NULL;
@@ -427,11 +469,29 @@ static void *eeh_rmv_device(void *data, void *userdata)
pci_name(dev));
edev->bus = dev->bus;
edev->mode |= EEH_DEV_DISCONNECTED;
- (*removed)++;
+ if (removed)
+ (*removed)++;
- pci_lock_rescan_remove();
- pci_stop_and_remove_bus_device(dev);
- pci_unlock_rescan_remove();
+ if (edev->physfn) {
+#ifdef CONFIG_PPC_POWERNV
+ struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+ pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0);
+ edev->pdev = NULL;
+
+ /*
+ * We have to set the VF PE number to an invalid one, which is
+ * required to plug the VF back in successfully.
+ */
+ pdn->pe_number = IODA_INVALID_PE;
+#endif
+ if (rmv_data)
+ list_add(&edev->rmv_list, &rmv_data->edev_list);
+ } else {
+ pci_lock_rescan_remove();
+ pci_stop_and_remove_bus_device(dev);
+ pci_unlock_rescan_remove();
+ }
return NULL;
}
@@ -545,11 +605,13 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
* During the reset, udev might be invoked because those affected
* PCI devices will be removed and then added.
*/
-static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
+ struct eeh_rmv_data *rmv_data)
{
struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
struct timeval tstamp;
- int cnt, rc, removed = 0;
+ int cnt, rc;
+ struct eeh_dev *edev;
/* pcibios will clear the counter; save the value */
cnt = pe->freeze_count;
@@ -563,12 +625,16 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
*/
eeh_pe_state_mark(pe, EEH_PE_KEEP);
if (bus) {
- eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
- pci_lock_rescan_remove();
- pcibios_remove_pci_devices(bus);
- pci_unlock_rescan_remove();
+ if (pe->type & EEH_PE_VF) {
+ eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
+ } else {
+ eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+ pci_lock_rescan_remove();
+ pcibios_remove_pci_devices(bus);
+ pci_unlock_rescan_remove();
+ }
} else if (frozen_bus) {
- eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
+ eeh_pe_dev_traverse(pe, eeh_rmv_device, &rmv_data);
}
/*
@@ -610,14 +676,22 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
* PE. We should disconnect it so the binding can be
* rebuilt when adding PCI devices.
*/
+ edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
- pcibios_add_pci_devices(bus);
- } else if (frozen_bus && removed) {
+ if (pe->type & EEH_PE_VF)
+ eeh_add_virt_device(edev, NULL);
+ else
+ pcibios_add_pci_devices(bus);
+ } else if (frozen_bus && rmv_data->removed) {
pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
ssleep(5);
+ edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
- pcibios_add_pci_devices(frozen_bus);
+ if (pe->type & EEH_PE_VF)
+ eeh_add_virt_device(edev, NULL);
+ else
+ pcibios_add_pci_devices(frozen_bus);
}
eeh_pe_state_clear(pe, EEH_PE_KEEP);
@@ -636,8 +710,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
static void eeh_handle_normal_event(struct eeh_pe *pe)
{
struct pci_bus *frozen_bus;
+ struct eeh_dev *edev, *tmp;
int rc = 0;
enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+ struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0};
frozen_bus = eeh_pe_bus_get(pe);
if (!frozen_bus) {
@@ -692,7 +768,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
*/
if (result == PCI_ERS_RESULT_NONE) {
pr_info("EEH: Reset with hotplug activity\n");
- rc = eeh_reset_device(pe, frozen_bus);
+ rc = eeh_reset_device(pe, frozen_bus, NULL);
if (rc) {
pr_warn("%s: Unable to reset, err=%d\n",
__func__, rc);
@@ -744,7 +820,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
/* If any device called out for a reset, then reset the slot */
if (result == PCI_ERS_RESULT_NEED_RESET) {
pr_info("EEH: Reset without hotplug activity\n");
- rc = eeh_reset_device(pe, NULL);
+ rc = eeh_reset_device(pe, NULL, &rmv_data);
if (rc) {
pr_warn("%s: Cannot reset, err=%d\n",
__func__, rc);
@@ -764,6 +840,15 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
goto hard_fail;
}
+ /*
+ * For those hot-removed VFs, we should add them back after the PF gets
+ * recovered properly.
+ */
+ list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) {
+ eeh_add_virt_device(edev, NULL);
+ list_del(&edev->rmv_list);
+ }
+
/* Tell all device drivers that they can resume operations */
pr_info("EEH: Notify device driver to resume\n");
eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
@@ -803,12 +888,17 @@ perm_error:
* their PCI config any more.
*/
if (frozen_bus) {
- eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
- eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+ if (pe->type & EEH_PE_VF) {
+ eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
+ eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+ } else {
+ eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+ eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
- pci_lock_rescan_remove();
- pcibios_remove_pci_devices(frozen_bus);
- pci_unlock_rescan_remove();
+ pci_lock_rescan_remove();
+ pcibios_remove_pci_devices(frozen_bus);
+ pci_unlock_rescan_remove();
+ }
}
}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 98f81800e00c..eea48d8baf49 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
* EEH device already having associated PE, but
* the direct parent EEH device doesn't have yet.
*/
- pdn = pdn ? pdn->parent : NULL;
+ if (edev->physfn)
+ pdn = pci_get_pdn(edev->physfn);
+ else
+ pdn = pdn ? pdn->parent : NULL;
while (pdn) {
/* We're poking out of PCI territory */
parent = pdn_to_eeh_dev(pdn);
@@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
}
/* Create a new EEH PE */
- pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+ if (edev->physfn)
+ pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
+ else
+ pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
if (!pe) {
pr_err("%s: out of memory!\n", __func__);
return -ENOMEM;
@@ -920,25 +926,21 @@ const char *eeh_pe_loc_get(struct eeh_pe *pe)
*/
struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
{
- struct pci_bus *bus = NULL;
struct eeh_dev *edev;
struct pci_dev *pdev;
- if (pe->type & EEH_PE_PHB) {
- bus = pe->phb->bus;
- } else if (pe->type & EEH_PE_BUS ||
- pe->type & EEH_PE_DEVICE) {
- if (pe->state & EEH_PE_PRI_BUS) {
- bus = pe->bus;
- goto out;
- }
+ if (pe->type & EEH_PE_PHB)
+ return pe->phb->bus;
- edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
- pdev = eeh_dev_to_pci_dev(edev);
- if (pdev)
- bus = pdev->bus;
- }
+ /* The primary bus might be cached during probe time */
+ if (pe->state & EEH_PE_PRI_BUS)
+ return pe->bus;
-out:
- return bus;
+ /* Retrieve the parent PCI bus of first (top) PCI device */
+ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list);
+ pdev = eeh_dev_to_pci_dev(edev);
+ if (pdev)
+ return pdev->bus;
+
+ return NULL;
}
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 038e0a1425e7..b9b125327f27 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -1158,8 +1158,12 @@ _GLOBAL(enter_prom)
#ifdef CONFIG_DYNAMIC_FTRACE
_GLOBAL(mcount)
_GLOBAL(_mcount)
- blr
+ mflr r12
+ mtctr r12
+ mtlr r0
+ bctr
+#ifndef CC_USING_MPROFILE_KERNEL
_GLOBAL_TOC(ftrace_caller)
/* Taken from output of objdump from lib64/glibc */
mflr r3
@@ -1181,6 +1185,115 @@ _GLOBAL(ftrace_graph_stub)
ld r0, 128(r1)
mtlr r0
addi r1, r1, 112
+
+#else /* CC_USING_MPROFILE_KERNEL */
+/*
+ *
+ * ftrace_caller() is the function that replaces _mcount() when ftrace is
+ * active.
+ *
+ * We arrive here after a function A calls function B, and we are the trace
+ * function for B. When we enter, r1 points to A's stack frame; B has not
+ * yet had a chance to allocate one.
+ *
+ * Additionally r2 may point either to the TOC for A, or B, depending on
+ * whether B did a TOC setup sequence before calling us.
+ *
+ * On entry the LR points back to the _mcount() call site, and r0 holds the
+ * saved LR as it was on entry to B, ie. the original return address at the
+ * call site in A.
+ *
+ * Our job is to save the register state into a struct pt_regs (on the stack)
+ * and then arrange for the ftrace function to be called.
+ */
+_GLOBAL(ftrace_caller)
+ /* Save the original return address in A's stack frame */
+ std r0,LRSAVE(r1)
+
+ /* Create our stack frame + pt_regs */
+ stdu r1,-SWITCH_FRAME_SIZE(r1)
+
+ /* Save all gprs to pt_regs */
+ SAVE_8GPRS(0,r1)
+ SAVE_8GPRS(8,r1)
+ SAVE_8GPRS(16,r1)
+ SAVE_8GPRS(24,r1)
+
+ /* Load special regs for save below */
+ mfmsr r8
+ mfctr r9
+ mfxer r10
+ mfcr r11
+
+ /* Get the _mcount() call site out of LR */
+ mflr r7
+ /* Save it as pt_regs->nip & pt_regs->link */
+ std r7, _NIP(r1)
+ std r7, _LINK(r1)
+
+ /* Save callee's TOC in the ABI compliant location */
+ std r2, 24(r1)
+ ld r2,PACATOC(r13) /* get kernel TOC in r2 */
+
+ addis r3,r2,function_trace_op@toc@ha
+ addi r3,r3,function_trace_op@toc@l
+ ld r5,0(r3)
+
+ /* Calculate ip from nip-4 into r3 for call below */
+ subi r3, r7, MCOUNT_INSN_SIZE
+
+ /* Put the original return address in r4 as parent_ip */
+ mr r4, r0
+
+ /* Save special regs */
+ std r8, _MSR(r1)
+ std r9, _CTR(r1)
+ std r10, _XER(r1)
+ std r11, _CCR(r1)
+
+ /* Load &pt_regs in r6 for call below */
+ addi r6, r1, STACK_FRAME_OVERHEAD
+
+ /* ftrace_call(r3, r4, r5, r6) */
+.globl ftrace_call
+ftrace_call:
+ bl ftrace_stub
+ nop
+
+ /* Load ctr with the possibly modified NIP */
+ ld r3, _NIP(r1)
+ mtctr r3
+
+ /* Restore gprs */
+ REST_8GPRS(0,r1)
+ REST_8GPRS(8,r1)
+ REST_8GPRS(16,r1)
+ REST_8GPRS(24,r1)
+
+ /* Restore callee's TOC */
+ ld r2, 24(r1)
+
+ /* Pop our stack frame */
+ addi r1, r1, SWITCH_FRAME_SIZE
+
+ /* Restore original LR for return to B */
+ ld r0, LRSAVE(r1)
+ mtlr r0
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ stdu r1, -112(r1)
+.globl ftrace_graph_call
+ftrace_graph_call:
+ b ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+ addi r1, r1, 112
+#endif
+
+ ld r0,LRSAVE(r1) /* restore callee's lr at _mcount site */
+ mtlr r0
+ bctr /* jump after _mcount site */
+#endif /* CC_USING_MPROFILE_KERNEL */
+
_GLOBAL(ftrace_stub)
blr
#else
@@ -1213,6 +1326,7 @@ _GLOBAL(ftrace_stub)
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#ifndef CC_USING_MPROFILE_KERNEL
_GLOBAL(ftrace_graph_caller)
/* load r4 with local address */
ld r4, 128(r1)
@@ -1237,6 +1351,56 @@ _GLOBAL(ftrace_graph_caller)
addi r1, r1, 112
blr
+#else /* CC_USING_MPROFILE_KERNEL */
+_GLOBAL(ftrace_graph_caller)
+ /* with -mprofile-kernel, parameter regs are still alive at _mcount */
+ std r10, 104(r1)
+ std r9, 96(r1)
+ std r8, 88(r1)
+ std r7, 80(r1)
+ std r6, 72(r1)
+ std r5, 64(r1)
+ std r4, 56(r1)
+ std r3, 48(r1)
+
+ /* Save callee's TOC in the ABI compliant location */
+ std r2, 24(r1)
+ ld r2, PACATOC(r13) /* get kernel TOC in r2 */
+
+ mfctr r4 /* ftrace_caller has moved local addr here */
+ std r4, 40(r1)
+ mflr r3 /* ftrace_caller has restored LR from stack */
+ subi r4, r4, MCOUNT_INSN_SIZE
+
+ bl prepare_ftrace_return
+ nop
+
+ /*
+ * prepare_ftrace_return gives us the address we divert to.
+ * Change the LR to this.
+ */
+ mtlr r3
+
+ ld r0, 40(r1)
+ mtctr r0
+ ld r10, 104(r1)
+ ld r9, 96(r1)
+ ld r8, 88(r1)
+ ld r7, 80(r1)
+ ld r6, 72(r1)
+ ld r5, 64(r1)
+ ld r4, 56(r1)
+ ld r3, 48(r1)
+
+ /* Restore callee's TOC */
+ ld r2, 24(r1)
+
+ addi r1, r1, 112
+ mflr r0
+ std r0, LRSAVE(r1)
+ bctr
+#endif /* CC_USING_MPROFILE_KERNEL */
+
_GLOBAL(return_to_handler)
/* need to save return values */
std r4, -32(r1)
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index 44d4d8eb3c85..9dac18dabd03 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -61,8 +61,11 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
return -EFAULT;
/* Make sure it is what we expect it to be */
- if (replaced != old)
+ if (replaced != old) {
+ pr_err("%p: replaced (%#x) != old (%#x)",
+ (void *)ip, replaced, old);
return -EINVAL;
+ }
/* replace the text with the new text */
if (patch_instruction((unsigned int *)ip, new))
@@ -106,14 +109,15 @@ static int
__ftrace_make_nop(struct module *mod,
struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned int op;
- unsigned long entry, ptr;
+ unsigned long entry, ptr, tramp;
unsigned long ip = rec->ip;
- void *tramp;
+ unsigned int op, pop;
/* read where this goes */
- if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
+ if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
+ pr_err("Fetching opcode failed.\n");
return -EFAULT;
+ }
/* Make sure that that this is still a 24bit jump */
if (!is_bl_op(op)) {
@@ -122,14 +126,9 @@ __ftrace_make_nop(struct module *mod,
}
/* lets find where the pointer goes */
- tramp = (void *)find_bl_target(ip, op);
-
- pr_devel("ip:%lx jumps to %p", ip, tramp);
+ tramp = find_bl_target(ip, op);
- if (!is_module_trampoline(tramp)) {
- pr_err("Not a trampoline\n");
- return -EINVAL;
- }
+ pr_devel("ip:%lx jumps to %lx", ip, tramp);
if (module_trampoline_target(mod, tramp, &ptr)) {
pr_err("Failed to get trampoline target\n");
@@ -158,10 +157,42 @@ __ftrace_make_nop(struct module *mod,
*
* Use a b +8 to jump over the load.
*/
- op = 0x48000008; /* b +8 */
- if (patch_instruction((unsigned int *)ip, op))
+ pop = PPC_INST_BRANCH | 8; /* b +8 */
+
+ /*
+ * Check what is in the next instruction. We can see ld r2,40(r1), but
+ * on first pass after boot we will see mflr r0.
+ */
+ if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) {
+ pr_err("Fetching op failed.\n");
+ return -EFAULT;
+ }
+
+ if (op != PPC_INST_LD_TOC) {
+ unsigned int inst;
+
+ if (probe_kernel_read(&inst, (void *)(ip - 4), 4)) {
+ pr_err("Fetching instruction at %lx failed.\n", ip - 4);
+ return -EFAULT;
+ }
+
+ /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */
+ if (inst != PPC_INST_MFLR && inst != PPC_INST_STD_LR) {
+ pr_err("Unexpected instructions around bl _mcount\n"
+ "when enabling dynamic ftrace!\t"
+ "(%08x,bl,%08x)\n", inst, op);
+ return -EINVAL;
+ }
+
+ /* When using -mprofile-kernel there is no load to jump over */
+ pop = PPC_INST_NOP;
+ }
+
+ if (patch_instruction((unsigned int *)ip, pop)) {
+ pr_err("Patching NOP failed.\n");
return -EPERM;
+ }
return 0;
}
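
To make the two cases concrete, the call-site shapes this function now distinguishes look roughly as follows; this is a sketch based on the checks above, not text taken from the patch:

/*
 *  -pg style (module call site):        -mprofile-kernel style:
 *
 *      bl   <ftrace trampoline>             mflr r0
 *      ld   r2,STK_GOT(r1)                  [std r0,LRSAVE(r1)]  (newer gcc)
 *                                           bl   <ftrace trampoline>
 *
 * The first form is NOP-ed out with "b +8" so the TOC restore after the
 * bl is skipped as well; the second has no TOC restore and gets a plain
 * PPC_INST_NOP. The "mflr r0" / "std r0,LRSAVE(r1)" check at ip - 4 is
 * what tells the two apart.
 */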
@@ -287,16 +318,15 @@ int ftrace_make_nop(struct module *mod,
#ifdef CONFIG_MODULES
#ifdef CONFIG_PPC64
+/*
+ * Examine the existing instructions for __ftrace_make_call.
+ * They should effectively be a NOP, and follow formal constraints,
+ * depending on the ABI. Return false if they don't.
+ */
+#ifndef CC_USING_MPROFILE_KERNEL
static int
-__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
{
- unsigned int op[2];
- void *ip = (void *)rec->ip;
-
- /* read where this goes */
- if (probe_kernel_read(op, ip, sizeof(op)))
- return -EFAULT;
-
/*
* We expect to see:
*
@@ -306,8 +336,34 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
* The load offset is different depending on the ABI. For simplicity
* just mask it out when doing the compare.
*/
- if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) {
- pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]);
+ if ((op0 != 0x48000008) || ((op1 & 0xffff0000) != 0xe8410000))
+ return 0;
+ return 1;
+}
+#else
+static int
+expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
+{
+ /* look for patched "NOP" on ppc64 with -mprofile-kernel */
+ if (op0 != PPC_INST_NOP)
+ return 0;
+ return 1;
+}
+#endif
+
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned int op[2];
+ void *ip = (void *)rec->ip;
+
+ /* read where this goes */
+ if (probe_kernel_read(op, ip, sizeof(op)))
+ return -EFAULT;
+
+ if (!expected_nop_sequence(ip, op[0], op[1])) {
+ pr_err("Unexpected call sequence at %p: %x %x\n",
+ ip, op[0], op[1]);
return -EINVAL;
}
@@ -330,7 +386,16 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
return 0;
}
-#else
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
+{
+ return ftrace_make_call(rec, addr);
+}
+#endif
+
+#else /* !CONFIG_PPC64: */
static int
__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
@@ -455,20 +520,13 @@ void ftrace_replace_code(int enable)
}
}
+/*
+ * Use the default ftrace_modify_all_code, but without
+ * stop_machine().
+ */
void arch_ftrace_update_code(int command)
{
- if (command & FTRACE_UPDATE_CALLS)
- ftrace_replace_code(1);
- else if (command & FTRACE_DISABLE_CALLS)
- ftrace_replace_code(0);
-
- if (command & FTRACE_UPDATE_TRACE_FUNC)
- ftrace_update_ftrace_func(ftrace_trace_function);
-
- if (command & FTRACE_START_FUNC_RET)
- ftrace_enable_ftrace_graph_caller();
- else if (command & FTRACE_STOP_FUNC_RET)
- ftrace_disable_ftrace_graph_caller();
+ ftrace_modify_all_code(command);
}
int __init ftrace_dyn_arch_init(void)
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 9547381b631a..d1f1b35bf0c7 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -47,6 +47,11 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs, struct module *me)
{
const Elf_Shdr *sect;
+ int rc;
+
+ rc = module_finalize_ftrace(me, sechdrs);
+ if (rc)
+ return rc;
/* Apply feature fixups */
sect = find_section(hdr, sechdrs, "__ftr_fixup");
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 2c01665eb410..5a7a78f12562 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -181,7 +181,7 @@ static inline int entry_matches(struct ppc_plt_entry *entry, Elf32_Addr val)
/* Set up a trampoline in the PLT to bounce us to the distant function */
static uint32_t do_plt_call(void *location,
Elf32_Addr val,
- Elf32_Shdr *sechdrs,
+ const Elf32_Shdr *sechdrs,
struct module *mod)
{
struct ppc_plt_entry *entry;
@@ -294,11 +294,19 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
return -ENOEXEC;
}
}
+
+ return 0;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE
- module->arch.tramp =
- do_plt_call(module->core_layout.base,
- (unsigned long)ftrace_caller,
- sechdrs, module);
-#endif
+int module_finalize_ftrace(struct module *module, const Elf_Shdr *sechdrs)
+{
+ module->arch.tramp = do_plt_call(module->core_layout.base,
+ (unsigned long)ftrace_caller,
+ sechdrs, module);
+ if (!module->arch.tramp)
+ return -ENOENT;
+
return 0;
}
+#endif
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 08b7a40de5f8..9ce9a25f58b5 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -31,6 +31,7 @@
#include <asm/code-patching.h>
#include <linux/sort.h>
#include <asm/setup.h>
+#include <asm/sections.h>
/* FIXME: We don't do .init separately. To do this, we'd need to have
a separate r2 value in the init and core section, and stub between
@@ -41,7 +42,6 @@
--RR. */
#if defined(_CALL_ELF) && _CALL_ELF == 2
-#define R2_STACK_OFFSET 24
/* An address is simply the address of the function. */
typedef unsigned long func_desc_t;
@@ -73,7 +73,6 @@ static unsigned int local_entry_offset(const Elf64_Sym *sym)
return PPC64_LOCAL_ENTRY_OFFSET(sym->st_other);
}
#else
-#define R2_STACK_OFFSET 40
/* An address is address of the OPD entry, which contains address of fn. */
typedef struct ppc64_opd_entry func_desc_t;
@@ -96,6 +95,8 @@ static unsigned int local_entry_offset(const Elf64_Sym *sym)
}
#endif
+#define STUB_MAGIC 0x73747562 /* stub */
+
/* Like PPC32, we need little trampolines to do > 24-bit jumps (into
the kernel itself). But on PPC64, these need to be used for every
jump, actually, to reset r2 (TOC+0x8000). */
@@ -105,7 +106,8 @@ struct ppc64_stub_entry
* need 6 instructions on ABIv2 but we always allocate 7 so
* we don't have to modify the trampoline load instruction. */
u32 jump[7];
- u32 unused;
+ /* Used by ftrace to identify stubs */
+ u32 magic;
/* Data for the above code */
func_desc_t funcdata;
};
@@ -139,70 +141,39 @@ static u32 ppc64_stub_insns[] = {
};
#ifdef CONFIG_DYNAMIC_FTRACE
-
-static u32 ppc64_stub_mask[] = {
- 0xffff0000,
- 0xffff0000,
- 0xffffffff,
- 0xffffffff,
-#if !defined(_CALL_ELF) || _CALL_ELF != 2
- 0xffffffff,
-#endif
- 0xffffffff,
- 0xffffffff
-};
-
-bool is_module_trampoline(u32 *p)
+int module_trampoline_target(struct module *mod, unsigned long addr,
+ unsigned long *target)
{
- unsigned int i;
- u32 insns[ARRAY_SIZE(ppc64_stub_insns)];
-
- BUILD_BUG_ON(sizeof(ppc64_stub_insns) != sizeof(ppc64_stub_mask));
+ struct ppc64_stub_entry *stub;
+ func_desc_t funcdata;
+ u32 magic;
- if (probe_kernel_read(insns, p, sizeof(insns)))
+ if (!within_module_core(addr, mod)) {
+ pr_err("%s: stub %lx not in module %s\n", __func__, addr, mod->name);
return -EFAULT;
-
- for (i = 0; i < ARRAY_SIZE(ppc64_stub_insns); i++) {
- u32 insna = insns[i];
- u32 insnb = ppc64_stub_insns[i];
- u32 mask = ppc64_stub_mask[i];
-
- if ((insna & mask) != (insnb & mask))
- return false;
}
- return true;
-}
-
-int module_trampoline_target(struct module *mod, u32 *trampoline,
- unsigned long *target)
-{
- u32 buf[2];
- u16 upper, lower;
- long offset;
- void *toc_entry;
+ stub = (struct ppc64_stub_entry *)addr;
- if (probe_kernel_read(buf, trampoline, sizeof(buf)))
+ if (probe_kernel_read(&magic, &stub->magic, sizeof(magic))) {
+ pr_err("%s: fault reading magic for stub %lx for %s\n", __func__, addr, mod->name);
return -EFAULT;
+ }
- upper = buf[0] & 0xffff;
- lower = buf[1] & 0xffff;
-
- /* perform the addis/addi, both signed */
- offset = ((short)upper << 16) + (short)lower;
+ if (magic != STUB_MAGIC) {
+ pr_err("%s: bad magic for stub %lx for %s\n", __func__, addr, mod->name);
+ return -EFAULT;
+ }
- /*
- * Now get the address this trampoline jumps to. This
- * is always 32 bytes into our trampoline stub.
- */
- toc_entry = (void *)mod->arch.toc + offset + 32;
+ if (probe_kernel_read(&funcdata, &stub->funcdata, sizeof(funcdata))) {
+ pr_err("%s: fault reading funcdata for stub %lx for %s\n", __func__, addr, mod->name);
+ return -EFAULT;
+ }
- if (probe_kernel_read(target, toc_entry, sizeof(*target)))
- return -EFAULT;
+ *target = stub_func_addr(funcdata);
return 0;
}
-
#endif
/* Count how many different 24-bit relocations (different symbol,
@@ -413,7 +384,7 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr,
/* r2 is the TOC pointer: it actually points 0x8000 into the TOC (this
gives the value maximum span in an instruction which uses a signed
offset) */
-static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me)
+static inline unsigned long my_r2(const Elf64_Shdr *sechdrs, struct module *me)
{
return sechdrs[me->arch.toc_section].sh_addr + 0x8000;
}
@@ -426,7 +397,7 @@ static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me)
#define PPC_HA(v) PPC_HI ((v) + 0x8000)
/* Patch stub to reference function and correct r2 value. */
-static inline int create_stub(Elf64_Shdr *sechdrs,
+static inline int create_stub(const Elf64_Shdr *sechdrs,
struct ppc64_stub_entry *entry,
unsigned long addr,
struct module *me)
@@ -447,12 +418,14 @@ static inline int create_stub(Elf64_Shdr *sechdrs,
entry->jump[0] |= PPC_HA(reladdr);
entry->jump[1] |= PPC_LO(reladdr);
entry->funcdata = func_desc(addr);
+ entry->magic = STUB_MAGIC;
+
return 1;
}
/* Create stub to jump to function described in this OPD/ptr: we need the
stub to set up the TOC ptr (r2) for the function. */
-static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
+static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs,
unsigned long addr,
struct module *me)
{
@@ -476,17 +449,60 @@ static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
return (unsigned long)&stubs[i];
}
+#ifdef CC_USING_MPROFILE_KERNEL
+static bool is_early_mcount_callsite(u32 *instruction)
+{
+ /*
+ * Check if this is one of the -mprofile-kernel sequences.
+ */
+ if (instruction[-1] == PPC_INST_STD_LR &&
+ instruction[-2] == PPC_INST_MFLR)
+ return true;
+
+ if (instruction[-1] == PPC_INST_MFLR)
+ return true;
+
+ return false;
+}
+
+/*
+ * In case of _mcount calls, do not save the current callee's TOC (in r2) into
+ * the original caller's stack frame. If we did we would clobber the saved TOC
+ * value of the original caller.
+ */
+static void squash_toc_save_inst(const char *name, unsigned long addr)
+{
+ struct ppc64_stub_entry *stub = (struct ppc64_stub_entry *)addr;
+
+ /* Only for calls to _mcount */
+ if (strcmp("_mcount", name) != 0)
+ return;
+
+ stub->jump[2] = PPC_INST_NOP;
+}
+#else
+static void squash_toc_save_inst(const char *name, unsigned long addr) { }
+
+/* without -mprofile-kernel, mcount calls are never early */
+static bool is_early_mcount_callsite(u32 *instruction)
+{
+ return false;
+}
+#endif
+
/* We expect a noop next: if it is, replace it with instruction to
restore r2. */
static int restore_r2(u32 *instruction, struct module *me)
{
if (*instruction != PPC_INST_NOP) {
+ if (is_early_mcount_callsite(instruction - 1))
+ return 1;
pr_err("%s: Expect noop after relocate, got %08x\n",
me->name, *instruction);
return 0;
}
/* ld r2,R2_STACK_OFFSET(r1) */
- *instruction = 0xe8410000 | R2_STACK_OFFSET;
+ *instruction = PPC_INST_LD_TOC;
return 1;
}
@@ -611,6 +627,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
return -ENOENT;
if (!restore_r2((u32 *)location + 1, me))
return -ENOEXEC;
+
+ squash_toc_save_inst(strtab + sym->st_name, value);
} else
value += local_entry_offset(sym);
@@ -693,12 +711,84 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
}
}
+ return 0;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE
- me->arch.toc = my_r2(sechdrs, me);
- me->arch.tramp = stub_for_addr(sechdrs,
- (unsigned long)ftrace_caller,
- me);
+
+#ifdef CC_USING_MPROFILE_KERNEL
+
+#define PACATOC offsetof(struct paca_struct, kernel_toc)
+
+/*
+ * For mprofile-kernel we use a special stub for ftrace_caller() because we
+ * can't rely on r2 containing this module's TOC when we enter the stub.
+ *
+ * That can happen if the function calling us didn't need to use the TOC. In
+ * that case it won't have set up r2, and the r2 value will be either the
+ * kernel's TOC, or possibly another module's TOC.
+ *
+ * To deal with that this stub uses the kernel toc, which is always accessible
+ * via the paca (in r13). The target (ftrace_caller()) is responsible for
+ * saving and restoring the toc before returning.
+ */
+static unsigned long create_ftrace_stub(const Elf64_Shdr *sechdrs, struct module *me)
+{
+ struct ppc64_stub_entry *entry;
+ unsigned int i, num_stubs;
+ static u32 stub_insns[] = {
+ 0xe98d0000 | PACATOC, /* ld r12,PACATOC(r13) */
+ 0x3d8c0000, /* addis r12,r12,<high> */
+ 0x398c0000, /* addi r12,r12,<low> */
+ 0x7d8903a6, /* mtctr r12 */
+ 0x4e800420, /* bctr */
+ };
+ long reladdr;
+
+ num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*entry);
+
+ /* Find the next available stub entry */
+ entry = (void *)sechdrs[me->arch.stubs_section].sh_addr;
+ for (i = 0; i < num_stubs && stub_func_addr(entry->funcdata); i++, entry++);
+
+ if (i >= num_stubs) {
+ pr_err("%s: Unable to find a free slot for ftrace stub.\n", me->name);
+ return 0;
+ }
+
+ memcpy(entry->jump, stub_insns, sizeof(stub_insns));
+
+ /* Stub uses address relative to kernel toc (from the paca) */
+ reladdr = (unsigned long)ftrace_caller - kernel_toc_addr();
+ if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) {
+ pr_err("%s: Address of ftrace_caller out of range of kernel_toc.\n", me->name);
+ return 0;
+ }
+
+ entry->jump[1] |= PPC_HA(reladdr);
+ entry->jump[2] |= PPC_LO(reladdr);
+
+ /* Even though we don't use funcdata in the stub, it's needed elsewhere. */
+ entry->funcdata = func_desc((unsigned long)ftrace_caller);
+ entry->magic = STUB_MAGIC;
+
+ return (unsigned long)entry;
+}
+#else
+static unsigned long create_ftrace_stub(const Elf64_Shdr *sechdrs, struct module *me)
+{
+ return stub_for_addr(sechdrs, (unsigned long)ftrace_caller, me);
+}
#endif
+int module_finalize_ftrace(struct module *mod, const Elf_Shdr *sechdrs)
+{
+ mod->arch.toc = my_r2(sechdrs, mod);
+ mod->arch.tramp = create_ftrace_stub(sechdrs, mod);
+
+ if (!mod->arch.tramp)
+ return -ENOENT;
+
return 0;
}
+#endif
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 01ea0edf0579..93dae296b6be 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -17,10 +17,6 @@
#include <asm/pgtable.h>
#include <asm/kexec.h>
-/* This symbol is provided by the linker - let it fill in the paca
- * field correctly */
-extern unsigned long __toc_start;
-
#ifdef CONFIG_PPC_BOOK3S
/*
@@ -149,11 +145,6 @@ EXPORT_SYMBOL(paca);
void __init initialise_paca(struct paca_struct *new_paca, int cpu)
{
- /* The TOC register (GPR2) points 32kB into the TOC, so that 64kB
- * of the TOC can be addressed using a single machine instruction.
- */
- unsigned long kernel_toc = (unsigned long)(&__toc_start) + 0x8000UL;
-
#ifdef CONFIG_PPC_BOOK3S
new_paca->lppaca_ptr = new_lppaca(cpu);
#else
@@ -161,7 +152,7 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
#endif
new_paca->lock_token = 0x8000;
new_paca->paca_index = cpu;
- new_paca->kernel_toc = kernel_toc;
+ new_paca->kernel_toc = kernel_toc_addr();
new_paca->kernelbase = (unsigned long) _stext;
/* Only set MSR:IR/DR when MMU is initialized */
new_paca->kernel_msr = MSR_KERNEL & ~(MSR_IR | MSR_DR);
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 7f9ed0c1f6b9..59c436189f46 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -55,7 +55,7 @@ void pcibios_remove_pci_devices(struct pci_bus *bus)
pr_debug("PCI: Removing devices on bus %04x:%02x\n",
pci_domain_nr(bus), bus->number);
- list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
+ list_for_each_entry_safe_reverse(dev, tmp, &bus->devices, bus_list) {
pr_debug(" Removing %s...\n", pci_name(dev));
pci_stop_and_remove_bus_device(dev);
}
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index b3b4df91b792..38102cb9baa9 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -139,6 +139,7 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
#ifdef CONFIG_PCI_IOV
static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
struct pci_dev *pdev,
+ int vf_index,
int busno, int devfn)
{
struct pci_dn *pdn;
@@ -158,6 +159,7 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
pdn->busno = busno;
pdn->devfn = devfn;
#ifdef CONFIG_PPC_POWERNV
+ pdn->vf_index = vf_index;
pdn->pe_number = IODA_INVALID_PE;
#endif
INIT_LIST_HEAD(&pdn->child_list);
@@ -179,6 +181,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
{
#ifdef CONFIG_PCI_IOV
struct pci_dn *parent, *pdn;
+ struct eeh_dev *edev;
int i;
/* Only support IOV for now */
@@ -196,7 +199,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
return NULL;
for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
- pdn = add_one_dev_pci_data(parent, NULL,
+ pdn = add_one_dev_pci_data(parent, NULL, i,
pci_iov_virtfn_bus(pdev, i),
pci_iov_virtfn_devfn(pdev, i));
if (!pdn) {
@@ -204,6 +207,12 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
__func__, i);
return NULL;
}
+
+ /* Create the EEH device for the VF */
+ eeh_dev_init(pdn, pci_bus_to_host(pdev->bus));
+ edev = pdn_to_eeh_dev(pdn);
+ BUG_ON(!edev);
+ edev->physfn = pdev;
}
#endif /* CONFIG_PCI_IOV */
@@ -215,6 +224,7 @@ void remove_dev_pci_data(struct pci_dev *pdev)
#ifdef CONFIG_PCI_IOV
struct pci_dn *parent;
struct pci_dn *pdn, *tmp;
+ struct eeh_dev *edev;
int i;
/*
@@ -256,6 +266,13 @@ void remove_dev_pci_data(struct pci_dev *pdev)
pdn->devfn != pci_iov_virtfn_devfn(pdev, i))
continue;
+ /* Release EEH device for the VF */
+ edev = pdn_to_eeh_dev(pdn);
+ if (edev) {
+ pdn->edev = NULL;
+ kfree(edev);
+ }
+
if (!list_empty(&pdn->list))
list_del(&pdn->list);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index e46b06822bea..ba21be15310f 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -6,8 +6,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-CFLAGS_REMOVE_code-patching.o = -pg
-CFLAGS_REMOVE_feature-fixups.o = -pg
+CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
obj-y += string.o alloc.o crtsavres.o ppc_ksyms.o code-patching.o \
feature-fixups.o
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index d1e65ce545b3..97a1d40d8696 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -651,7 +651,7 @@ static void pmao_restore_workaround(bool ebb)
/*
* We are already soft-disabled in power_pmu_enable(). We need to hard
- * enable to actually prevent the PMU exception from firing.
+ * disable to actually prevent the PMU exception from firing.
*/
hard_irq_disable();
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9f9dfda9ed2c..59012e750abe 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -27,20 +27,6 @@
#include "hv-24x7-catalog.h"
#include "hv-common.h"
-static const char *event_domain_suffix(unsigned domain)
-{
- switch (domain) {
-#define DOMAIN(n, v, x, c) \
- case HV_PERF_DOMAIN_##n: \
- return "__" #n;
-#include "hv-24x7-domains.h"
-#undef DOMAIN
- default:
- WARN(1, "unknown domain %d\n", domain);
- return "__UNKNOWN_DOMAIN_SUFFIX";
- }
-}
-
static bool domain_is_valid(unsigned domain)
{
switch (domain) {
@@ -68,6 +54,24 @@ static bool is_physical_domain(unsigned domain)
}
}
+static const char *domain_name(unsigned domain)
+{
+ if (!domain_is_valid(domain))
+ return NULL;
+
+ switch (domain) {
+ case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip";
+ case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core";
+ case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core";
+ case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip";
+ case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node";
+ case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node";
+ }
+
+ WARN_ON_ONCE(domain);
+ return NULL;
+}
+
static bool catalog_entry_domain_is_valid(unsigned domain)
{
return is_physical_domain(domain);
@@ -101,6 +105,7 @@ static bool catalog_entry_domain_is_valid(unsigned domain)
EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
/* u16 */
EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
+EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
/* u32, see "data_offset" */
EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
@@ -115,6 +120,7 @@ static struct attribute *format_attrs[] = {
&format_attr_domain.attr,
&format_attr_offset.attr,
&format_attr_core.attr,
+ &format_attr_chip.attr,
&format_attr_vcpu.attr,
&format_attr_lpar.attr,
NULL,
@@ -274,32 +280,70 @@ static unsigned long h_get_24x7_catalog_page(char page[],
version, index);
}
-static unsigned core_domains[] = {
- HV_PERF_DOMAIN_PHYS_CORE,
- HV_PERF_DOMAIN_VCPU_HOME_CORE,
- HV_PERF_DOMAIN_VCPU_HOME_CHIP,
- HV_PERF_DOMAIN_VCPU_HOME_NODE,
- HV_PERF_DOMAIN_VCPU_REMOTE_NODE,
-};
-/* chip event data always yeilds a single event, core yeilds multiple */
-#define MAX_EVENTS_PER_EVENT_DATA ARRAY_SIZE(core_domains)
-
+/*
+ * Each event we find in the catalog, will have a sysfs entry. Format the
+ * data for this sysfs entry based on the event's domain.
+ *
+ * Events belonging to the Chip domain can only be monitored in that domain.
+ * i.e. the domain for these events is a fixed/known value.
+ *
+ * Events belonging to the Core domain can be monitored either in the physical
+ * core or in one of the virtual CPU domains. So the domain value for these
+ * events must be specified by the user (i.e is a required parameter). Format
+ * the Core events with 'domain=?' so the perf-tool can error check required
+ * parameters.
+ *
+ * NOTE: For the Core domain events, rather than making domain a required
+ * parameter we could default it to PHYS_CORE and allow users to
+ * override the domain to one of the VCPU domains.
+ *
+ * However, this can make the interface a little inconsistent.
+ *
+ * If we set domain=2 (PHYS_CORE) and allow the user to override this field,
+ * the user may be tempted to also modify the "offset=x" field, which
+ * can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and
+ * HPM_INST (offset=0x20) events. With:
+ *
+ * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
+ *
+ * we end up monitoring HPM_INST, while the command line has HPM_PCYC.
+ *
+ * By not assigning a default value to the domain for the Core events,
+ * we can have simple guidelines:
+ *
+ * - Specifying values for parameters with "=?" is required.
+ *
+ * - Specifying (i.e. overriding) values for other parameters
+ * is undefined.
+ */
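To make the domain=? convention concrete, a sketch of two hypothetical invocations (the event names and the chip/core/domain values are illustrative, not taken from a real catalog):

	# Chip event: the domain is fixed in the event string, only chip= is open
	perf stat -e 'hv_24x7/HPM_PCYC,chip=0/' sleep 1

	# Core event: domain=? forces the user to pick a domain (e.g. 2 for
	# Physical Core, per hv-24x7-domains.h) and to supply the core index
	perf stat -e 'hv_24x7/SOME_CORE_EVENT,domain=2,core=0/' sleep 1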
static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain)
{
const char *sindex;
const char *lpar;
+ const char *domain_str;
+ char buf[8];
- if (is_physical_domain(domain)) {
+ switch (domain) {
+ case HV_PERF_DOMAIN_PHYS_CHIP:
+ snprintf(buf, sizeof(buf), "%d", domain);
+ domain_str = buf;
+ lpar = "0x0";
+ sindex = "chip";
+ break;
+ case HV_PERF_DOMAIN_PHYS_CORE:
+ domain_str = "?";
lpar = "0x0";
sindex = "core";
- } else {
+ break;
+ default:
+ domain_str = "?";
lpar = "?";
sindex = "vcpu";
}
return kasprintf(GFP_KERNEL,
- "domain=0x%x,offset=0x%x,%s=?,lpar=%s",
- domain,
+ "domain=%s,offset=0x%x,%s=?,lpar=%s",
+ domain_str,
be16_to_cpu(event->event_counter_offs) +
be16_to_cpu(event->event_group_record_offs),
sindex,
@@ -339,6 +383,15 @@ static struct attribute *device_str_attr_create_(char *name, char *str)
return &attr->attr.attr;
}
+/*
+ * Allocate and initialize strings representing event attributes.
+ *
+ * NOTE: The strings allocated here are never destroyed and continue to
+ * exist until shutdown. This is to allow us to create as many events
+ * from the catalog as possible, even if we encounter errors with some.
+ * In case of changes to error paths in future, these may need to be
+ * freed by the caller.
+ */
static struct attribute *device_str_attr_create(char *name, int name_max,
int name_nonce,
char *str, size_t str_max)
@@ -370,16 +423,6 @@ out_s:
return NULL;
}
-static void device_str_attr_destroy(struct attribute *attr)
-{
- struct dev_ext_attribute *d;
-
- d = container_of(attr, struct dev_ext_attribute, attr.attr);
- kfree(d->var);
- kfree(d->attr.attr.name);
- kfree(d);
-}
-
static struct attribute *event_to_attr(unsigned ix,
struct hv_24x7_event_data *event,
unsigned domain,
@@ -387,7 +430,6 @@ static struct attribute *event_to_attr(unsigned ix,
{
int event_name_len;
char *ev_name, *a_ev_name, *val;
- const char *ev_suffix;
struct attribute *attr;
if (!domain_is_valid(domain)) {
@@ -400,14 +442,13 @@ static struct attribute *event_to_attr(unsigned ix,
if (!val)
return NULL;
- ev_suffix = event_domain_suffix(domain);
ev_name = event_name(event, &event_name_len);
if (!nonce)
- a_ev_name = kasprintf(GFP_KERNEL, "%.*s%s",
- (int)event_name_len, ev_name, ev_suffix);
+ a_ev_name = kasprintf(GFP_KERNEL, "%.*s",
+ (int)event_name_len, ev_name);
else
- a_ev_name = kasprintf(GFP_KERNEL, "%.*s%s__%d",
- (int)event_name_len, ev_name, ev_suffix, nonce);
+ a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d",
+ (int)event_name_len, ev_name, nonce);
if (!a_ev_name)
goto out_val;
@@ -452,45 +493,14 @@ event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
return device_str_attr_create(name, nl, nonce, desc, dl);
}
-static ssize_t event_data_to_attrs(unsigned ix, struct attribute **attrs,
+static int event_data_to_attrs(unsigned ix, struct attribute **attrs,
struct hv_24x7_event_data *event, int nonce)
{
- unsigned i;
-
- switch (event->domain) {
- case HV_PERF_DOMAIN_PHYS_CHIP:
- *attrs = event_to_attr(ix, event, event->domain, nonce);
- return 1;
- case HV_PERF_DOMAIN_PHYS_CORE:
- for (i = 0; i < ARRAY_SIZE(core_domains); i++) {
- attrs[i] = event_to_attr(ix, event, core_domains[i],
- nonce);
- if (!attrs[i]) {
- pr_warn("catalog event %u: individual attr %u "
- "creation failure\n", ix, i);
- for (; i; i--)
- device_str_attr_destroy(attrs[i - 1]);
- return -1;
- }
- }
- return i;
- default:
- pr_warn("catalog event %u: domain %u is not allowed in the "
- "catalog\n", ix, event->domain);
+ *attrs = event_to_attr(ix, event, event->domain, nonce);
+ if (!*attrs)
return -1;
- }
-}
-static size_t event_to_attr_ct(struct hv_24x7_event_data *event)
-{
- switch (event->domain) {
- case HV_PERF_DOMAIN_PHYS_CHIP:
- return 1;
- case HV_PERF_DOMAIN_PHYS_CORE:
- return ARRAY_SIZE(core_domains);
- default:
- return 0;
- }
+ return 0;
}
static unsigned long vmalloc_to_phys(void *v)
@@ -726,9 +736,8 @@ static int create_events_from_catalog(struct attribute ***events_,
goto e_free;
}
- if (SIZE_MAX / MAX_EVENTS_PER_EVENT_DATA - 1 < event_entry_count) {
- pr_err("event_entry_count %zu is invalid\n",
- event_entry_count);
+ if (SIZE_MAX - 1 < event_entry_count) {
+ pr_err("event_entry_count %zu is invalid\n", event_entry_count);
ret = -EIO;
goto e_free;
}
@@ -801,7 +810,7 @@ static int create_events_from_catalog(struct attribute ***events_,
continue;
}
- attr_max += event_to_attr_ct(event);
+ attr_max++;
}
event_idx_last = event_idx;
@@ -851,12 +860,12 @@ static int create_events_from_catalog(struct attribute ***events_,
nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
ct = event_data_to_attrs(event_idx, events + event_attr_ct,
event, nonce);
- if (ct <= 0) {
+ if (ct < 0) {
pr_warn("event %zu (%.*s) creation failure, skipping\n",
event_idx, nl, name);
junk_events++;
} else {
- event_attr_ct += ct;
+ event_attr_ct++;
event_descs[desc_ct] = event_to_desc_attr(event, nonce);
if (event_descs[desc_ct])
desc_ct++;
@@ -961,6 +970,27 @@ e_free:
return ret;
}
+static ssize_t domains_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ int d, n, count = 0;
+ const char *str;
+
+ for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) {
+ str = domain_name(d);
+ if (!str)
+ continue;
+
+ n = sprintf(page, "%d: %s\n", d, str);
+ if (n < 0)
+ break;
+
+ count += n;
+ page += n;
+ }
+ return count;
+}
+
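For reference, reading the new attribute emits one "%d: %s" line per valid domain; a sketch of the expected output, assuming the interface group's usual sysfs location and the numbering from hv-24x7-domains.h:

	$ cat /sys/bus/event_source/devices/hv_24x7/interface/domains
	1: Physical Chip
	2: Physical Core
	3: VCPU Home Core
	4: VCPU Home Chip
	5: VCPU Home Node
	6: VCPU Remote Node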
#define PAGE_0_ATTR(_name, _fmt, _expr) \
static ssize_t _name##_show(struct device *dev, \
struct device_attribute *dev_attr, \
@@ -989,6 +1019,7 @@ PAGE_0_ATTR(catalog_version, "%lld\n",
PAGE_0_ATTR(catalog_len, "%lld\n",
(unsigned long long)be32_to_cpu(page_0->length) * 4096);
static BIN_ATTR_RO(catalog, 0/* real length varies */);
+static DEVICE_ATTR_RO(domains);
static struct bin_attribute *if_bin_attrs[] = {
&bin_attr_catalog,
@@ -998,6 +1029,7 @@ static struct bin_attribute *if_bin_attrs[] = {
static struct attribute *if_attrs[] = {
&dev_attr_catalog_len.attr,
&dev_attr_catalog_version.attr,
+ &dev_attr_domains.attr,
NULL,
};
@@ -1089,10 +1121,16 @@ static int add_event_to_24x7_request(struct perf_event *event,
return -EINVAL;
}
- if (is_physical_domain(event_get_domain(event)))
+ switch (event_get_domain(event)) {
+ case HV_PERF_DOMAIN_PHYS_CHIP:
+ idx = event_get_chip(event);
+ break;
+ case HV_PERF_DOMAIN_PHYS_CORE:
idx = event_get_core(event);
- else
+ break;
+ default:
idx = event_get_vcpu(event);
+ }
i = request_buffer->num_requests++;
req = &request_buffer->requests[i];
@@ -1208,11 +1246,12 @@ static int h_24x7_event_init(struct perf_event *event)
return -EACCES;
}
- /* see if the event complains */
+ /* Get the initial value of the counter for this event */
if (single_24x7_request(event, &ct)) {
pr_devel("test hcall failed\n");
return -EIO;
}
+ (void)local64_xchg(&event->hw.prev_count, ct);
return 0;
}
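Priming hw.prev_count with the value just read means later updates can report the change in the 24x7 counter since the event was initialised, rather than its raw 64-bit value; the transaction read path below relies on the same delta model.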
@@ -1275,6 +1314,16 @@ static void h_24x7_event_read(struct perf_event *event)
h24x7hw = &get_cpu_var(hv_24x7_hw);
h24x7hw->events[i] = event;
put_cpu_var(h24x7hw);
+ /*
+ * Clear the event count so we can compute the _change_
+ * in the 24x7 raw counter value at the end of the txn.
+ *
+ * Note that we could alternatively read the 24x7 value
+ * now and save its value in event->hw.prev_count. But
+ * that would require issuing a hcall, which would then
+ * defeat the purpose of using the txn interface.
+ */
+ local64_set(&event->count, 0);
}
put_cpu_var(hv_24x7_reqb);
diff --git a/arch/powerpc/perf/hv-24x7.h b/arch/powerpc/perf/hv-24x7.h
index c57d67dc9f3f..791455e7f5cf 100644
--- a/arch/powerpc/perf/hv-24x7.h
+++ b/arch/powerpc/perf/hv-24x7.h
@@ -7,6 +7,7 @@ enum hv_perf_domains {
#define DOMAIN(n, v, x, c) HV_PERF_DOMAIN_##n = v,
#include "hv-24x7-domains.h"
#undef DOMAIN
+ HV_PERF_DOMAIN_MAX,
};
struct hv_24x7_request {
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index 5b62f2389290..a383c23a9070 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -54,7 +54,7 @@
* Power7 event codes.
*/
#define EVENT(_name, _code) \
- PME_##_name = _code,
+ _name = _code,
enum {
#include "power7-events-list.h"
@@ -318,14 +318,14 @@ static void power7_disable_pmc(unsigned int pmc, unsigned long mmcr[])
}
static int power7_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = PME_PM_CYC,
- [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PME_PM_GCT_NOSLOT_CYC,
- [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PME_PM_CMPLU_STALL,
- [PERF_COUNT_HW_INSTRUCTIONS] = PME_PM_INST_CMPL,
- [PERF_COUNT_HW_CACHE_REFERENCES] = PME_PM_LD_REF_L1,
- [PERF_COUNT_HW_CACHE_MISSES] = PME_PM_LD_MISS_L1,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PME_PM_BRU_FIN,
- [PERF_COUNT_HW_BRANCH_MISSES] = PME_PM_BR_MPRED,
+ [PERF_COUNT_HW_CPU_CYCLES] = PM_CYC,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_GCT_NOSLOT_CYC,
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PM_CMPLU_STALL,
+ [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_CMPL,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = PM_LD_REF_L1,
+ [PERF_COUNT_HW_CACHE_MISSES] = PM_LD_MISS_L1,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BRU_FIN,
+ [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED,
};
#define C(x) PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h
new file mode 100644
index 000000000000..741b77edd03e
--- /dev/null
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -0,0 +1,51 @@
+/*
+ * Performance counter support for POWER8 processors.
+ *
+ * Copyright 2014 Sukadev Bhattiprolu, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Power8 event codes.
+ */
+EVENT(PM_CYC, 0x0001e)
+EVENT(PM_GCT_NOSLOT_CYC, 0x100f8)
+EVENT(PM_CMPLU_STALL, 0x4000a)
+EVENT(PM_INST_CMPL, 0x00002)
+EVENT(PM_BRU_FIN, 0x10068)
+EVENT(PM_BR_MPRED_CMPL, 0x400f6)
+
+/* All L1 D cache load references counted at finish, gated by reject */
+EVENT(PM_LD_REF_L1, 0x100ee)
+/* Load Missed L1 */
+EVENT(PM_LD_MISS_L1, 0x3e054)
+/* Store Missed L1 */
+EVENT(PM_ST_MISS_L1, 0x300f0)
+/* L1 cache data prefetches */
+EVENT(PM_L1_PREF, 0x0d8b8)
+/* Instruction fetches from L1 */
+EVENT(PM_INST_FROM_L1, 0x04080)
+/* Demand iCache Miss */
+EVENT(PM_L1_ICACHE_MISS, 0x200fd)
+/* Instruction Demand sectors written into IL1 */
+EVENT(PM_L1_DEMAND_WRITE, 0x0408c)
+/* Instruction prefetch written into IL1 */
+EVENT(PM_IC_PREF_WRITE, 0x0408e)
+/* The data cache was reloaded from local core's L3 due to a demand load */
+EVENT(PM_DATA_FROM_L3, 0x4c042)
+/* Demand LD - L3 Miss (not L2 hit and not L3 hit) */
+EVENT(PM_DATA_FROM_L3MISS, 0x300fe)
+/* All successful D-side store dispatches for this thread */
+EVENT(PM_L2_ST, 0x17080)
+/* All successful D-side store dispatches for this thread that were L2 Miss */
+EVENT(PM_L2_ST_MISS, 0x17082)
+/* Total HW L3 prefetches(Load+store) */
+EVENT(PM_L3_PREF_ALL, 0x4e052)
+/* Data PTEG reload */
+EVENT(PM_DTLB_MISS, 0x300fc)
+/* ITLB Reloaded */
+EVENT(PM_ITLB_MISS, 0x400fc)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index be9b7aec216f..690d9186a855 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -17,48 +17,16 @@
#include <asm/firmware.h>
#include <asm/cputable.h>
-
/*
* Some power8 event codes.
*/
-#define PM_CYC 0x0001e
-#define PM_GCT_NOSLOT_CYC 0x100f8
-#define PM_CMPLU_STALL 0x4000a
-#define PM_INST_CMPL 0x00002
-#define PM_BRU_FIN 0x10068
-#define PM_BR_MPRED_CMPL 0x400f6
-
-/* All L1 D cache load references counted at finish, gated by reject */
-#define PM_LD_REF_L1 0x100ee
-/* Load Missed L1 */
-#define PM_LD_MISS_L1 0x3e054
-/* Store Missed L1 */
-#define PM_ST_MISS_L1 0x300f0
-/* L1 cache data prefetches */
-#define PM_L1_PREF 0x0d8b8
-/* Instruction fetches from L1 */
-#define PM_INST_FROM_L1 0x04080
-/* Demand iCache Miss */
-#define PM_L1_ICACHE_MISS 0x200fd
-/* Instruction Demand sectors wriittent into IL1 */
-#define PM_L1_DEMAND_WRITE 0x0408c
-/* Instruction prefetch written into IL1 */
-#define PM_IC_PREF_WRITE 0x0408e
-/* The data cache was reloaded from local core's L3 due to a demand load */
-#define PM_DATA_FROM_L3 0x4c042
-/* Demand LD - L3 Miss (not L2 hit and not L3 hit) */
-#define PM_DATA_FROM_L3MISS 0x300fe
-/* All successful D-side store dispatches for this thread */
-#define PM_L2_ST 0x17080
-/* All successful D-side store dispatches for this thread that were L2 Miss */
-#define PM_L2_ST_MISS 0x17082
-/* Total HW L3 prefetches(Load+store) */
-#define PM_L3_PREF_ALL 0x4e052
-/* Data PTEG reload */
-#define PM_DTLB_MISS 0x300fc
-/* ITLB Reloaded */
-#define PM_ITLB_MISS 0x400fc
+#define EVENT(_name, _code) _name = _code,
+
+enum {
+#include "power8-events-list.h"
+};
+#undef EVENT
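The EVENT()/#include pair above is an X-macro: one list header is expanded under different EVENT() definitions so the event names and codes stay in a single place. A minimal, self-contained sketch of the pattern (the list, names and values below are made up for illustration, not taken from the POWER8 list):

	#include <stdio.h>

	/* Imagine this block living in its own example-events-list.h */
	#define EXAMPLE_EVENT_LIST \
		EVENT(EX_CYCLES,       0x0001e) \
		EVENT(EX_INSTRUCTIONS, 0x00002)

	/* First expansion: an enum of event codes, as the PMU driver does */
	#define EVENT(_name, _code)	_name = _code,
	enum { EXAMPLE_EVENT_LIST };
	#undef EVENT

	/* Second expansion: the same list reused to build a name table */
	#define EVENT(_name, _code)	{ #_name, _code },
	static const struct { const char *name; unsigned int code; } events[] = {
		EXAMPLE_EVENT_LIST
	};
	#undef EVENT

	int main(void)
	{
		unsigned int i;

		for (i = 0; i < sizeof(events) / sizeof(events[0]); i++)
			printf("%s = 0x%05x\n", events[i].name, events[i].code);
		return 0;
	}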
/*
* Raw event encoding for POWER8:
@@ -604,6 +572,71 @@ static void power8_disable_pmc(unsigned int pmc, unsigned long mmcr[])
mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1));
}
+GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC);
+GENERIC_EVENT_ATTR(stalled-cycles-frontend, PM_GCT_NOSLOT_CYC);
+GENERIC_EVENT_ATTR(stalled-cycles-backend, PM_CMPLU_STALL);
+GENERIC_EVENT_ATTR(instructions, PM_INST_CMPL);
+GENERIC_EVENT_ATTR(branch-instructions, PM_BRU_FIN);
+GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL);
+GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1);
+GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1);
+
+CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1);
+CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1);
+
+CACHE_EVENT_ATTR(L1-dcache-prefetches, PM_L1_PREF);
+CACHE_EVENT_ATTR(L1-dcache-store-misses, PM_ST_MISS_L1);
+CACHE_EVENT_ATTR(L1-icache-load-misses, PM_L1_ICACHE_MISS);
+CACHE_EVENT_ATTR(L1-icache-loads, PM_INST_FROM_L1);
+CACHE_EVENT_ATTR(L1-icache-prefetches, PM_IC_PREF_WRITE);
+
+CACHE_EVENT_ATTR(LLC-load-misses, PM_DATA_FROM_L3MISS);
+CACHE_EVENT_ATTR(LLC-loads, PM_DATA_FROM_L3);
+CACHE_EVENT_ATTR(LLC-prefetches, PM_L3_PREF_ALL);
+CACHE_EVENT_ATTR(LLC-store-misses, PM_L2_ST_MISS);
+CACHE_EVENT_ATTR(LLC-stores, PM_L2_ST);
+
+CACHE_EVENT_ATTR(branch-load-misses, PM_BR_MPRED_CMPL);
+CACHE_EVENT_ATTR(branch-loads, PM_BRU_FIN);
+CACHE_EVENT_ATTR(dTLB-load-misses, PM_DTLB_MISS);
+CACHE_EVENT_ATTR(iTLB-load-misses, PM_ITLB_MISS);
+
+static struct attribute *power8_events_attr[] = {
+ GENERIC_EVENT_PTR(PM_CYC),
+ GENERIC_EVENT_PTR(PM_GCT_NOSLOT_CYC),
+ GENERIC_EVENT_PTR(PM_CMPLU_STALL),
+ GENERIC_EVENT_PTR(PM_INST_CMPL),
+ GENERIC_EVENT_PTR(PM_BRU_FIN),
+ GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
+ GENERIC_EVENT_PTR(PM_LD_REF_L1),
+ GENERIC_EVENT_PTR(PM_LD_MISS_L1),
+
+ CACHE_EVENT_PTR(PM_LD_MISS_L1),
+ CACHE_EVENT_PTR(PM_LD_REF_L1),
+ CACHE_EVENT_PTR(PM_L1_PREF),
+ CACHE_EVENT_PTR(PM_ST_MISS_L1),
+ CACHE_EVENT_PTR(PM_L1_ICACHE_MISS),
+ CACHE_EVENT_PTR(PM_INST_FROM_L1),
+ CACHE_EVENT_PTR(PM_IC_PREF_WRITE),
+ CACHE_EVENT_PTR(PM_DATA_FROM_L3MISS),
+ CACHE_EVENT_PTR(PM_DATA_FROM_L3),
+ CACHE_EVENT_PTR(PM_L3_PREF_ALL),
+ CACHE_EVENT_PTR(PM_L2_ST_MISS),
+ CACHE_EVENT_PTR(PM_L2_ST),
+
+ CACHE_EVENT_PTR(PM_BR_MPRED_CMPL),
+ CACHE_EVENT_PTR(PM_BRU_FIN),
+
+ CACHE_EVENT_PTR(PM_DTLB_MISS),
+ CACHE_EVENT_PTR(PM_ITLB_MISS),
+ NULL
+};
+
+static struct attribute_group power8_pmu_events_group = {
+ .name = "events",
+ .attrs = power8_events_attr,
+};
+
PMU_FORMAT_ATTR(event, "config:0-49");
PMU_FORMAT_ATTR(pmcxsel, "config:0-7");
PMU_FORMAT_ATTR(mark, "config:8");
@@ -640,6 +673,7 @@ struct attribute_group power8_pmu_format_group = {
static const struct attribute_group *power8_pmu_attr_groups[] = {
&power8_pmu_format_group,
+ &power8_pmu_events_group,
NULL,
};
diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile
index 52c6ce1cc985..1eb7b45e017d 100644
--- a/arch/powerpc/platforms/powermac/Makefile
+++ b/arch/powerpc/platforms/powermac/Makefile
@@ -2,7 +2,7 @@ CFLAGS_bootx_init.o += -fPIC
ifdef CONFIG_FUNCTION_TRACER
# Do not trace early boot code
-CFLAGS_REMOVE_bootx_init.o = -pg -mno-sched-epilog
+CFLAGS_REMOVE_bootx_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
endif
obj-y += pic.o setup.o time.o feature.o pci.o \
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 811917219bf1..950b3e539057 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -371,6 +371,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
edev->mode &= 0xFFFFFF00;
edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
+ edev->af_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF);
edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
@@ -879,6 +880,120 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
}
}
+static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type,
+ int pos, u16 mask)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ int i, status = 0;
+
+ /* Wait for Transaction Pending bit to be cleared */
+ for (i = 0; i < 4; i++) {
+ eeh_ops->read_config(pdn, pos, 2, &status);
+ if (!(status & mask))
+ return;
+
+ msleep((1 << i) * 100);
+ }
+
+ pr_warn("%s: Pending transaction while issuing %sFLR to %04x:%02x:%02x.%01x\n",
+ __func__, type,
+ edev->phb->global_number, pdn->busno,
+ PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+}
+
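For reference, the wait loop above polls the status register up to four times with an exponential backoff, sleeping 100, 200, 400 and 800 ms after each failed check, so a device gets roughly 1.5 seconds in total to retire pending transactions before the warning is printed.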
+static int pnv_eeh_do_flr(struct pci_dn *pdn, int option)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ u32 reg = 0;
+
+ if (WARN_ON(!edev->pcie_cap))
+ return -ENOTTY;
+
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg);
+ if (!(reg & PCI_EXP_DEVCAP_FLR))
+ return -ENOTTY;
+
+ switch (option) {
+ case EEH_RESET_HOT:
+ case EEH_RESET_FUNDAMENTAL:
+ pnv_eeh_wait_for_pending(pdn, "",
+ edev->pcie_cap + PCI_EXP_DEVSTA,
+ PCI_EXP_DEVSTA_TRPND);
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 4, &reg);
+ reg |= PCI_EXP_DEVCTL_BCR_FLR;
+ eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 4, reg);
+ msleep(EEH_PE_RST_HOLD_TIME);
+ break;
+ case EEH_RESET_DEACTIVATE:
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 4, &reg);
+ reg &= ~PCI_EXP_DEVCTL_BCR_FLR;
+ eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 4, reg);
+ msleep(EEH_PE_RST_SETTLE_TIME);
+ break;
+ }
+
+ return 0;
+}
+
+static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ u32 cap = 0;
+
+ if (WARN_ON(!edev->af_cap))
+ return -ENOTTY;
+
+ eeh_ops->read_config(pdn, edev->af_cap + PCI_AF_CAP, 1, &cap);
+ if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR))
+ return -ENOTTY;
+
+ switch (option) {
+ case EEH_RESET_HOT:
+ case EEH_RESET_FUNDAMENTAL:
+ /*
+ * Wait for Transaction Pending bit to clear. A word-aligned
+ * test is used, so we use the control offset rather than status
+ * and shift the test bit to match.
+ */
+ pnv_eeh_wait_for_pending(pdn, "AF",
+ edev->af_cap + PCI_AF_CTRL,
+ PCI_AF_STATUS_TP << 8);
+ eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL,
+ 1, PCI_AF_CTRL_FLR);
+ msleep(EEH_PE_RST_HOLD_TIME);
+ break;
+ case EEH_RESET_DEACTIVATE:
+ eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, 1, 0);
+ msleep(EEH_PE_RST_SETTLE_TIME);
+ break;
+ }
+
+ return 0;
+}
+
+static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option)
+{
+ struct eeh_dev *edev;
+ struct pci_dn *pdn;
+ int ret;
+
+ /* The VF PE should have only one child device */
+ edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list);
+ pdn = eeh_dev_to_pdn(edev);
+ if (!pdn)
+ return -ENXIO;
+
+ ret = pnv_eeh_do_flr(pdn, option);
+ if (!ret)
+ return ret;
+
+ return pnv_eeh_do_af_flr(pdn, option);
+}
+
/**
* pnv_eeh_reset - Reset the specified PE
* @pe: EEH PE
@@ -940,7 +1055,9 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
}
bus = eeh_pe_bus_get(pe);
- if (pci_is_root_bus(bus) ||
+ if (pe->type & EEH_PE_VF)
+ ret = pnv_eeh_reset_vf_pe(pe, option);
+ else if (pci_is_root_bus(bus) ||
pci_is_root_bus(bus->parent))
ret = pnv_eeh_root_reset(hose, option);
else
@@ -1079,6 +1196,14 @@ static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn)
if (!edev || !edev->pe)
return false;
+ /*
+ * We will issue an FLR or AF FLR to all VFs, which are contained
+ * in the VF PE. Those resets rely on the EEH PCI config accessors,
+ * so we can't block config access during that window.
+ */
+ if (edev->physfn && (edev->pe->state & EEH_PE_RESET))
+ return false;
+
if (edev->pe->state & EEH_PE_CFG_BLOCKED)
return true;
@@ -1463,6 +1588,65 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
return ret;
}
+static int pnv_eeh_restore_vf_config(struct pci_dn *pdn)
+{
+ struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ u32 devctl, cmd, cap2, aer_capctl;
+ int old_mps;
+
+ if (edev->pcie_cap) {
+ /* Restore MPS */
+ old_mps = (ffs(pdn->mps) - 8) << 5;
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 2, &devctl);
+ devctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
+ devctl |= old_mps;
+ eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 2, devctl);
+
+ /* Disable Completion Timeout */
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2,
+ 4, &cap2);
+ if (cap2 & 0x10) {
+ eeh_ops->read_config(pdn,
+ edev->pcie_cap + PCI_EXP_DEVCTL2,
+ 4, &cap2);
+ cap2 |= 0x10;
+ eeh_ops->write_config(pdn,
+ edev->pcie_cap + PCI_EXP_DEVCTL2,
+ 4, cap2);
+ }
+ }
+
+ /* Enable SERR and parity checking */
+ eeh_ops->read_config(pdn, PCI_COMMAND, 2, &cmd);
+ cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
+ eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd);
+
+ /* Enable reporting of various errors */
+ if (edev->pcie_cap) {
+ eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 2, &devctl);
+ devctl &= ~PCI_EXP_DEVCTL_CERE;
+ devctl |= (PCI_EXP_DEVCTL_NFERE |
+ PCI_EXP_DEVCTL_FERE |
+ PCI_EXP_DEVCTL_URRE);
+ eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ 2, devctl);
+ }
+
+ /* Enable ECRC generation and check */
+ if (edev->pcie_cap && edev->aer_cap) {
+ eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP,
+ 4, &aer_capctl);
+ aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+ eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP,
+ 4, aer_capctl);
+ }
+
+ return 0;
+}
+
static int pnv_eeh_restore_config(struct pci_dn *pdn)
{
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
@@ -1472,9 +1656,21 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
if (!edev)
return -EEXIST;
- phb = edev->phb->private_data;
- ret = opal_pci_reinit(phb->opal_id,
- OPAL_REINIT_PCI_DEV, edev->config_addr);
+ /*
+ * We have to restore the PCI config space after reset since the
+ * firmware can't see SRIOV VFs.
+ *
+ * FIXME: The MPS, error routing rules and timeout settings are worth
+ * exporting from firmware in an extensible way.
+ */
+ if (edev->physfn) {
+ ret = pnv_eeh_restore_vf_config(pdn);
+ } else {
+ phb = edev->phb->private_data;
+ ret = opal_pci_reinit(phb->opal_id,
+ OPAL_REINIT_PCI_DEV, edev->config_addr);
+ }
+
if (ret) {
pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
__func__, edev->config_addr, ret);
@@ -1503,6 +1699,40 @@ static struct eeh_ops pnv_eeh_ops = {
.restore_config = pnv_eeh_restore_config
};
+void pcibios_bus_add_device(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+
+ if (!pdev->is_virtfn)
+ return;
+
+ /*
+ * The following operations will fail if the VF's sysfs files
+ * aren't created or its resources aren't finalized.
+ */
+ eeh_add_device_early(pdn);
+ eeh_add_device_late(pdev);
+ eeh_sysfs_add_device(pdev);
+}
+
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ int parent_mps;
+
+ if (!pdev->is_virtfn)
+ return;
+
+ /* Synchronize MPS for VF and PF */
+ parent_mps = pcie_get_mps(pdev->physfn);
+ if ((128 << pdev->pcie_mpss) >= parent_mps)
+ pcie_set_mps(pdev, parent_mps);
+ pdn->mps = pcie_get_mps(pdev);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
+#endif /* CONFIG_PCI_IOV */
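As a worked example (values are illustrative): pcie_mpss encodes the supported Maximum Payload Size as 128 << n bytes, so a VF advertising pcie_mpss = 2 can handle up to 512 bytes. If its parent PF is currently running with a 256-byte MPS, the condition above holds and the VF is set to 256 bytes as well, keeping payload sizes consistent across the PF and its VFs; the result is recorded in pdn->mps so pnv_eeh_restore_vf_config() can restore it after a reset.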
+
/**
* eeh_powernv_init - Register platform dependent EEH operations
*
diff --git a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh b/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh
new file mode 100755
index 000000000000..c658d8cf760b
--- /dev/null
+++ b/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+# To debug, uncomment the following line
+# set -x
+
+# Test whether the compile option -mprofile-kernel exists and generates
+# profiling code (ie. a call to _mcount()).
+echo "int func() { return 0; }" | \
+ $* -S -x c -O2 -p -mprofile-kernel - -o - 2> /dev/null | \
+ grep -q "_mcount"
+
+# Test whether the notrace attribute correctly suppresses calls to _mcount().
+
+echo -e "#include <linux/compiler.h>\nnotrace int func() { return 0; }" | \
+ $* -S -x c -O2 -p -mprofile-kernel - -o - 2> /dev/null | \
+ grep -q "_mcount" && \
+ exit 1
+
+echo "OK"
+exit 0
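On a toolchain that supports it, -mprofile-kernel is expected to emit the profiling call very early in the function, typically a "bl _mcount" right after the link register is saved, which is what the first grep matches; the second pipeline then requires the notrace attribute to suppress that call. Any other outcome makes the script exit non-zero without printing "OK".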