From 0237199186e7a4aa5310741f0a6498a20c820fd7 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Tue, 20 Nov 2018 11:00:18 +0800 Subject: x86/CPU/AMD: Set the CPB bit unconditionally on F17h Some F17h models do not have CPB set in CPUID even though the CPU supports it. Set the feature bit unconditionally on all F17h. [ bp: Rewrite commit message and patch. ] Signed-off-by: Jiaxun Yang Signed-off-by: Borislav Petkov Acked-by: Tom Lendacky Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Sherry Hurwitz Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20181120030018.5185-1-jiaxun.yang@flygoat.com --- arch/x86/kernel/cpu/amd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 69f6bbb41be0..01004bfb1a1b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -819,11 +819,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); - /* - * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects - * all up to and including B1. - */ - if (c->x86_model <= 1 && c->x86_stepping <= 1) + + /* Fix erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) set_cpu_cap(c, X86_FEATURE_CPB); } -- cgit v1.2.3 From 2ff40250691eaf28866eab449148843b39f65d7b Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Thu, 10 Jan 2019 13:53:32 +0000 Subject: perf/core, arch/x86: Use PERF_PMU_CAP_NO_EXCLUDE for exclusion incapable PMUs For drivers that do not support context exclusion let's advertise the PERF_PMU_CAP_NOEXCLUDE capability. This ensures that perf will prevent us from handling events where any exclusion flags are set. Let's also remove the now unnecessary check for exclusion flags. PMU drivers that support at least one exclude flag won't have the PERF_PMU_CAP_NOEXCLUDE capability set - these PMU drivers should still check and fail on unsupported exclude flags. These missing tests are not added in this patch. 
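For illustration (not part of the diff below; the helper name and its placement in kernel/events/core.c are assumed): with PERF_PMU_CAP_NO_EXCLUDE set, the perf core can reject any event that requests context exclusion before the driver ever sees it, roughly along these lines:

static bool event_has_any_exclude_flag(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;

	return attr->exclude_idle || attr->exclude_user ||
	       attr->exclude_kernel || attr->exclude_hv ||
	       attr->exclude_guest || attr->exclude_host;
}

	/* sketch of the core-side check at event init time: */
	if ((pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE) &&
	    event_has_any_exclude_flag(event))
		return -EINVAL;

This is why the per-driver exclude checks removed below become redundant for PMUs that advertise the capability.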
Signed-off-by: Andrew Murray Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Mark Rutland Cc: Matt Turner Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Richard Henderson Cc: Russell King Cc: Sascha Hauer Cc: Shawn Guo Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: robin.murphy@arm.com Cc: suzuki.poulose@arm.com Link: https://lkml.kernel.org/r/1547128414-50693-11-git-send-email-andrew.murray@arm.com Signed-off-by: Ingo Molnar --- arch/x86/events/amd/ibs.c | 13 +------------ arch/x86/events/amd/power.c | 10 ++-------- arch/x86/events/intel/cstate.c | 12 +++--------- arch/x86/events/intel/rapl.c | 9 ++------- arch/x86/events/intel/uncore_snb.c | 9 ++------- arch/x86/events/msr.c | 10 ++-------- 6 files changed, 12 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index d50bb4dc0650..62f317c9113a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -253,15 +253,6 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config) return -EOPNOTSUPP; } -static const struct perf_event_attr ibs_notsupp = { - .exclude_user = 1, - .exclude_kernel = 1, - .exclude_hv = 1, - .exclude_idle = 1, - .exclude_host = 1, - .exclude_guest = 1, -}; - static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -282,9 +273,6 @@ static int perf_ibs_init(struct perf_event *event) if (event->pmu != &perf_ibs->pmu) return -ENOENT; - if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) - return -EINVAL; - if (config & ~perf_ibs->config_mask) return -EINVAL; @@ -537,6 +525,7 @@ static struct perf_ibs perf_ibs_fetch = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSFETCHCTL, .config_mask = IBS_FETCH_CONFIG_MASK, diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 2aefacf5c5b2..c5ff084551c6 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -136,14 +136,7 @@ static int pmu_event_init(struct perf_event *event) return -ENOENT; /* Unsupported modes and filters. 
*/ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - /* no sampling */ - event->attr.sample_period) + if (event->attr.sample_period) return -EINVAL; if (cfg != AMD_POWER_EVENTSEL_PKG) @@ -226,6 +219,7 @@ static struct pmu pmu_class = { .start = pmu_event_start, .stop = pmu_event_stop, .read = pmu_event_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; static int power_cpu_exit(unsigned int cpu) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index d2e780705c5a..94a4b7fc75d0 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -280,13 +280,7 @@ static int cstate_pmu_event_init(struct perf_event *event) return -ENOENT; /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ + if (event->attr.sample_period) /* no sampling */ return -EINVAL; if (event->cpu < 0) @@ -437,7 +431,7 @@ static struct pmu cstate_core_pmu = { .start = cstate_pmu_event_start, .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, .module = THIS_MODULE, }; @@ -451,7 +445,7 @@ static struct pmu cstate_pkg_pmu = { .start = cstate_pmu_event_start, .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, .module = THIS_MODULE, }; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 91039ffed633..94dc564146ca 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -397,13 +397,7 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ + if (event->attr.sample_period) /* no sampling */ return -EINVAL; /* must be done before validate_group */ @@ -699,6 +693,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; return 0; } diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 2593b0d7aeee..b12517fae77a 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -397,13 +397,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event) return -EINVAL; /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ + if (event->attr.sample_period) /* no sampling */ return -EINVAL; /* @@ -497,6 +491,7 @@ static struct pmu snb_uncore_imc_pmu = { .start = uncore_pmu_event_start, .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; static struct intel_uncore_ops 
snb_uncore_imc_ops = { diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 1b9f85abf9bc..a878e6286e4a 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -160,13 +160,7 @@ static int msr_event_init(struct perf_event *event) return -ENOENT; /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ + if (event->attr.sample_period) /* no sampling */ return -EINVAL; if (cfg >= PERF_MSR_EVENT_MAX) @@ -256,7 +250,7 @@ static struct pmu pmu_msr = { .start = msr_event_start, .stop = msr_event_stop, .read = msr_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, }; static int __init msr_init(void) -- cgit v1.2.3 From 88dbe3c94e2773cbe200bf58dd88abacf27053e7 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Thu, 10 Jan 2019 13:53:33 +0000 Subject: perf/core, arch/x86: Strengthen exclusion checks with PERF_PMU_CAP_NO_EXCLUDE For x86 PMUs that do not support context exclusion let's advertise the PERF_PMU_CAP_NO_EXCLUDE capability. This ensures that perf will prevent us from handling events where any exclusion flags are set. Let's also remove the now unnecessary check for exclusion flags. This change means that amd/iommu and amd/uncore will now also indicate that they do not support exclude_{hv|idle} and intel/uncore that it does not support exclude_{guest|host}. Signed-off-by: Andrew Murray Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Mark Rutland Cc: Matt Turner Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Richard Henderson Cc: Russell King Cc: Sascha Hauer Cc: Shawn Guo Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: robin.murphy@arm.com Cc: suzuki.poulose@arm.com Link: https://lkml.kernel.org/r/1547128414-50693-12-git-send-email-andrew.murray@arm.com Signed-off-by: Ingo Molnar --- arch/x86/events/amd/iommu.c | 6 +----- arch/x86/events/amd/uncore.c | 7 ++----- arch/x86/events/intel/uncore.c | 9 +-------- 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 3210fee27e7f..7635c23f7d82 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -223,11 +223,6 @@ static int perf_iommu_event_init(struct perf_event *event) if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - /* IOMMU counters do not have usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_host || event->attr.exclude_guest) - return -EINVAL; - if (event->cpu < 0) return -EINVAL; @@ -414,6 +409,7 @@ static const struct pmu iommu_pmu __initconst = { .read = perf_iommu_read, .task_ctx_nr = perf_invalid_context, .attr_groups = amd_iommu_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; static __init int init_one_iommu(unsigned int idx) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 398df6eaa109..79cfd3b30ceb 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -201,11 +201,6 @@ static int amd_uncore_event_init(struct perf_event *event) if (is_sampling_event(event) 
|| event->attach_state & PERF_ATTACH_TASK) return -EINVAL; - /* NB and Last level cache counters do not have usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_host || event->attr.exclude_guest) - return -EINVAL; - /* and we do not enable counter overflow interrupts */ hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; hwc->idx = -1; @@ -307,6 +302,7 @@ static struct pmu amd_nb_pmu = { .start = amd_uncore_start, .stop = amd_uncore_stop, .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; static struct pmu amd_llc_pmu = { @@ -317,6 +313,7 @@ static struct pmu amd_llc_pmu = { .start = amd_uncore_start, .stop = amd_uncore_stop, .read = amd_uncore_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 27a461414b30..d516161c00c4 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -695,14 +695,6 @@ static int uncore_pmu_event_init(struct perf_event *event) if (pmu->func_id < 0) return -ENOENT; - /* - * Uncore PMU does measure at all privilege level all the time. - * So it doesn't make sense to specify any exclude bits. - */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_hv || event->attr.exclude_idle) - return -EINVAL; - /* Sampling not supported yet */ if (hwc->sample_period) return -EINVAL; @@ -800,6 +792,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, .module = THIS_MODULE, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; } else { pmu->pmu = *pmu->type->pmu; -- cgit v1.2.3 From 840018668ce2d96783356204ff282d6c9b0e5f66 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Thu, 31 Jan 2019 11:47:08 -0700 Subject: perf/aux: Make perf_event accessible to setup_aux() When pmu::setup_aux() is called the coresight PMU needs to know which sink to use for the session by looking up the information in the event's attr::config2 field. As such simply replace the cpu information by the complete perf_event structure and change all affected customers. Signed-off-by: Mathieu Poirier Reviewed-by: Suzuki Poulouse Acked-by: Peter Zijlstra Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Greg Kroah-Hartman Cc: H. 
Peter Anvin Cc: Heiko Carstens Cc: Jiri Olsa Cc: Mark Rutland Cc: Martin Schwidefsky Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/20190131184714.20388-2-mathieu.poirier@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/s390/kernel/perf_cpum_sf.c | 6 +++--- arch/x86/events/intel/bts.c | 4 +++- arch/x86/events/intel/pt.c | 5 +++-- drivers/hwtracing/coresight/coresight-etm-perf.c | 6 +++--- drivers/perf/arm_spe_pmu.c | 6 +++--- include/linux/perf_event.h | 2 +- kernel/events/ring_buffer.c | 2 +- 7 files changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index bfabeb1889cc..1266194afb02 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1600,7 +1600,7 @@ static void aux_sdb_init(unsigned long sdb) /* * aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling - * @cpu: On which to allocate, -1 means current + * @event: Event the buffer is setup for, event->cpu == -1 means current * @pages: Array of pointers to buffer pages passed from perf core * @nr_pages: Total pages * @snapshot: Flag for snapshot mode @@ -1612,8 +1612,8 @@ static void aux_sdb_init(unsigned long sdb) * * Return the private AUX buffer structure if success or NULL if fails. */ -static void *aux_buffer_setup(int cpu, void **pages, int nr_pages, - bool snapshot) +static void *aux_buffer_setup(struct perf_event *event, void **pages, + int nr_pages, bool snapshot) { struct sf_buffer *sfb; struct aux_buffer *aux; diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index a01ef1b0f883..7cdd7b13bbda 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -77,10 +77,12 @@ static size_t buf_size(struct page *page) } static void * -bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) +bts_buffer_setup_aux(struct perf_event *event, void **pages, + int nr_pages, bool overwrite) { struct bts_buffer *buf; struct page *page; + int cpu = event->cpu; int node = (cpu == -1) ? cpu : cpu_to_node(cpu); unsigned long offset; size_t size = nr_pages << PAGE_SHIFT; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 9494ca68fd9d..c0e86ff21f81 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1114,10 +1114,11 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, * Return: Our private PT buffer structure. 
*/ static void * -pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) +pt_buffer_setup_aux(struct perf_event *event, void **pages, + int nr_pages, bool snapshot) { struct pt_buffer *buf; - int node, ret; + int node, ret, cpu = event->cpu; if (!nr_pages) return NULL; diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index abe8249b893b..f21eb28b6782 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -177,15 +177,15 @@ static void etm_free_aux(void *data) schedule_work(&event_data->work); } -static void *etm_setup_aux(int event_cpu, void **pages, +static void *etm_setup_aux(struct perf_event *event, void **pages, int nr_pages, bool overwrite) { - int cpu; + int cpu = event->cpu; cpumask_t *mask; struct coresight_device *sink; struct etm_event_data *event_data = NULL; - event_data = alloc_event_data(event_cpu); + event_data = alloc_event_data(cpu); if (!event_data) return NULL; INIT_WORK(&event_data->work, free_event_data); diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 8e46a9dad2fa..7cb766dafe85 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -824,10 +824,10 @@ static void arm_spe_pmu_read(struct perf_event *event) { } -static void *arm_spe_pmu_setup_aux(int cpu, void **pages, int nr_pages, - bool snapshot) +static void *arm_spe_pmu_setup_aux(struct perf_event *event, void **pages, + int nr_pages, bool snapshot) { - int i; + int i, cpu = event->cpu; struct page **pglist; struct arm_spe_pmu_buf *buf; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6cb5d483ab34..d9c3610e0e25 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -410,7 +410,7 @@ struct pmu { /* * Set up pmu-private data structures for an AUX area */ - void *(*setup_aux) (int cpu, void **pages, + void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 805f0423ee0b..70ae2422cbaf 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -657,7 +657,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, goto out; } - rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; -- cgit v1.2.3 From 0f42b790c9ba5ec2f25b7da8b0b6d361082d67b0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 4 Feb 2019 14:23:29 -0800 Subject: x86/cpufeature: Add facility to check for min microcode revisions For bug workarounds or checks, it is useful to check for specific microcode revisions. Add a new generic function to match the CPU with stepping. Add the other function to check the min microcode revisions for the matched CPU. A new table format is introduced to facilitate the quirk to fill the related information. This does not change the existing x86_cpu_id because it's an ABI shared with modules, and also has quite different requirements, as in no wildcards, but everything has to be matched exactly. 
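For illustration, a hypothetical quirk (the table entries, revisions and workaround below are made up) would use the new facility like this:

static const struct x86_cpu_desc my_quirk_ucodes[] = {
	INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X,       4, 0x02000050),
	INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 9, 0x000000b4),
	{}
};

static void my_check_quirk(void)
{
	/*
	 * Returns true only if the boot CPU matches an entry on
	 * vendor/family/model/stepping and its microcode revision is at
	 * least the one listed for that stepping.
	 */
	if (!x86_cpu_has_min_microcode_rev(my_quirk_ucodes))
		apply_workaround();	/* hypothetical */
}

Since only the boot CPU is checked, mixed-stepping systems need one table entry per valid stepping, with any finer-grained handling done in the quirk itself.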
Originally-by: Andi Kleen Suggested-by: Thomas Gleixner Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: eranian@google.com Link: https://lkml.kernel.org/r/1549319013-4522-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpu_device_id.h | 28 ++++++++++++++++++++++++++++ arch/x86/kernel/cpu/match.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index baeba0567126..3417110574c1 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -11,4 +11,32 @@ extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); +/* + * Match specific microcode revisions. + * + * vendor/family/model/stepping must be all set. + * + * Only checks against the boot CPU. When mixed-stepping configs are + * valid for a CPU model, add a quirk for every valid stepping and + * do the fine-tuning in the quirk handler. + */ + +struct x86_cpu_desc { + __u8 x86_family; + __u8 x86_vendor; + __u8 x86_model; + __u8 x86_stepping; + __u32 x86_microcode_rev; +}; + +#define INTEL_CPU_DESC(mod, step, rev) { \ + .x86_family = 6, \ + .x86_vendor = X86_VENDOR_INTEL, \ + .x86_model = mod, \ + .x86_stepping = step, \ + .x86_microcode_rev = rev, \ +} + +extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table); + #endif diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 3fed38812eea..6dd78d8235e4 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -48,3 +48,34 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) return NULL; } EXPORT_SYMBOL(x86_match_cpu); + +static const struct x86_cpu_desc * +x86_match_cpu_with_stepping(const struct x86_cpu_desc *match) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + const struct x86_cpu_desc *m; + + for (m = match; m->x86_family | m->x86_model; m++) { + if (c->x86_vendor != m->x86_vendor) + continue; + if (c->x86 != m->x86_family) + continue; + if (c->x86_model != m->x86_model) + continue; + if (c->x86_stepping != m->x86_stepping) + continue; + return m; + } + return NULL; +} + +bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) +{ + const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table); + + if (!res || res->x86_microcode_rev > boot_cpu_data.microcode) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); -- cgit v1.2.3 From 9b545c04abd4f7246a3bde040efde587abebb23c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2019 14:23:30 -0800 Subject: perf/x86/kvm: Avoid unnecessary work in guest filtering KVM added a workaround for PEBS events leaking into guests with commit: 26a4f3c08de4 ("perf/x86: disable PEBS on a guest entry.") This uses the VT entry/exit list to add an extra disable of the PEBS_ENABLE MSR. Intel also added a fix for this issue to microcode updates on Haswell/Broadwell/Skylake. It turns out using the MSR entry/exit list makes VM exits significantly slower. The list is only needed for disabling PEBS, because the GLOBAL_CTRL change gets optimized by KVM into changing the VMCS. Check for the microcode updates that have the microcode fix for leaking PEBS, and disable the extra entry/exit list entry for PEBS_ENABLE. 
In addition we always clear the GLOBAL_CTRL for the PEBS counter while running in the guest, which is enough to make them never fire at the wrong side of the host/guest transition. The overhead for VM exits with the filtering active with the patch is reduced from 8% to 4%. The microcode patch has already been merged into future platforms. This patch is one-off thing. The quirks is used here. For other old platforms which doesn't have microcode patch and quirks, extra disable of the PEBS_ENABLE MSR is still required. Signed-off-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Link: https://lkml.kernel.org/r/1549319013-4522-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 74 ++++++++++++++++++++++++++++++++++++++------ arch/x86/events/intel/ds.c | 2 ++ arch/x86/events/perf_event.h | 15 ++++----- 3 files changed, 75 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index daafb893449b..8fe2afa9c818 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "../perf_event.h" @@ -3206,16 +3207,27 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; - /* - * If PMU counter has PEBS enabled it is not enough to disable counter - * on a guest entry since PEBS memory write can overshoot guest entry - * and corrupt guest memory. Disabling PEBS solves the problem. - */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; + if (x86_pmu.flags & PMU_FL_PEBS_ALL) + arr[0].guest &= ~cpuc->pebs_enabled; + else + arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + *nr = 1; + + if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { + /* + * If PMU counter has PEBS enabled it is not enough to + * disable counter on a guest entry since PEBS memory + * write can overshoot guest entry and corrupt guest + * memory. Disabling PEBS solves the problem. + * + * Don't do this if the CPU already enforces it. 
+ */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; + *nr = 2; + } - *nr = 2; return arr; } @@ -3739,6 +3751,47 @@ static __init void intel_clovertown_quirk(void) x86_pmu.pebs_constraints = NULL; } +static const struct x86_cpu_desc isolation_ucodes[] = { + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_CORE, 3, 0x0000001f), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_ULT, 1, 0x0000001e), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_GT3E, 1, 0x00000015), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 2, 0x00000037), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 4, 0x0000000a), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_CORE, 4, 0x00000023), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_GT3E, 1, 0x00000014), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 2, 0x00000010), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 3, 0x07000009), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 4, 0x0f000009), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 5, 0x0e000002), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_MOBILE, 3, 0x0000007c), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_DESKTOP, 3, 0x0000007c), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 9, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 9, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 10, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 11, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 12, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 10, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 11, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 12, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 13, 0x0000004e), + {} +}; + +static void intel_check_pebs_isolation(void) +{ + x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes); +} + +static __init void intel_pebs_isolation_quirk(void) +{ + WARN_ON_ONCE(x86_pmu.check_microcode); + x86_pmu.check_microcode = intel_check_pebs_isolation; + intel_check_pebs_isolation(); +} + static int intel_snb_pebs_broken(int cpu) { u32 rev = UINT_MAX; /* default to broken for unknown models */ @@ -4431,6 +4484,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_HASWELL_ULT: case INTEL_FAM6_HASWELL_GT3E: x86_add_quirk(intel_ht_bug); + x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -4462,6 +4516,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_BROADWELL_XEON_D: case INTEL_FAM6_BROADWELL_GT3E: case INTEL_FAM6_BROADWELL_X: + x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -4524,6 +4579,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_KABYLAKE_MOBILE: case INTEL_FAM6_KABYLAKE_DESKTOP: + x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index e9acf1d2e7b2..10c99ce1fead 100644 --- a/arch/x86/events/intel/ds.c +++ 
b/arch/x86/events/intel/ds.c @@ -1628,6 +1628,8 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; + if (x86_pmu.version <= 4) + x86_pmu.pebs_no_isolation = 1; if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; int format = x86_pmu.intel_cap.pebs_format; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 78d7b7031bfc..dea716e1f713 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -601,13 +601,14 @@ struct x86_pmu { /* * Intel DebugStore bits */ - unsigned int bts :1, - bts_active :1, - pebs :1, - pebs_active :1, - pebs_broken :1, - pebs_prec_dist :1, - pebs_no_tlb :1; + unsigned int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1, + pebs_prec_dist :1, + pebs_no_tlb :1, + pebs_no_isolation :1; int pebs_record_size; int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); -- cgit v1.2.3 From a96fff8df28ddd2f6710e5af454a45014c73183c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 4 Feb 2019 14:23:31 -0800 Subject: perf/x86/intel: Clean up SNB PEBS quirk Clean up SNB PEBS quirk to use the new facility to check for min microcode revisions. Only check the boot CPU, assuming models and features are consistent over all CPUs. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Link: https://lkml.kernel.org/r/1549319013-4522-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8fe2afa9c818..87f4ed203c07 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3792,36 +3792,21 @@ static __init void intel_pebs_isolation_quirk(void) intel_check_pebs_isolation(); } -static int intel_snb_pebs_broken(int cpu) -{ - u32 rev = UINT_MAX; /* default to broken for unknown models */ - - switch (cpu_data(cpu).x86_model) { - case INTEL_FAM6_SANDYBRIDGE: - rev = 0x28; - break; - - case INTEL_FAM6_SANDYBRIDGE_X: - switch (cpu_data(cpu).x86_stepping) { - case 6: rev = 0x618; break; - case 7: rev = 0x70c; break; - } - } +static const struct x86_cpu_desc pebs_ucodes[] = { + INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE, 7, 0x00000028), + INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 6, 0x00000618), + INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 7, 0x0000070c), + {} +}; - return (cpu_data(cpu).microcode < rev); +static bool intel_snb_pebs_broken(void) +{ + return !x86_cpu_has_min_microcode_rev(pebs_ucodes); } static void intel_snb_check_microcode(void) { - int pebs_broken = 0; - int cpu; - - for_each_online_cpu(cpu) { - if ((pebs_broken = intel_snb_pebs_broken(cpu))) - break; - } - - if (pebs_broken == x86_pmu.pebs_broken) + if (intel_snb_pebs_broken() == x86_pmu.pebs_broken) return; /* -- cgit v1.2.3 From bef9f2714f8cf036af3af52a15f534d2cc40c230 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 4 Feb 2019 14:23:32 -0800 Subject: perf/x86/intel: Clean up counter freezing quirk Clean up counter freezing quirk to use the new facility to check for min microcode revisions. 
Rename the counter freezing quirk related functions. Because other platforms, e.g. Goldmont, also needs to call the quirk. Only check the boot CPU, assuming models and features are consistent over all CPUs. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Link: https://lkml.kernel.org/r/1549319013-4522-4-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 87f4ed203c07..c79c0165d838 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3923,23 +3923,18 @@ static __init void intel_nehalem_quirk(void) } } -static bool intel_glp_counter_freezing_broken(int cpu) -{ - u32 rev = UINT_MAX; /* default to broken for unknown stepping */ - - switch (cpu_data(cpu).x86_stepping) { - case 1: - rev = 0x28; - break; - case 8: - rev = 0x6; - break; - } +static const struct x86_cpu_desc counter_freezing_ucodes[] = { + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), + {} +}; - return (cpu_data(cpu).microcode < rev); +static bool intel_counter_freezing_broken(void) +{ + return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes); } -static __init void intel_glp_counter_freezing_quirk(void) +static __init void intel_counter_freezing_quirk(void) { /* Check if it's already disabled */ if (disable_counter_freezing) @@ -3949,7 +3944,7 @@ static __init void intel_glp_counter_freezing_quirk(void) * If the system starts with the wrong ucode, leave the * counter-freezing feature permanently disabled. */ - if (intel_glp_counter_freezing_broken(raw_smp_processor_id())) { + if (intel_counter_freezing_broken()) { pr_info("PMU counter freezing disabled due to CPU errata," "please upgrade microcode\n"); x86_pmu.counter_freezing = false; @@ -4326,7 +4321,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - x86_add_quirk(intel_glp_counter_freezing_quirk); + x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, -- cgit v1.2.3 From af63147c1edacfb75a28885a7cd7e6f44e626164 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 4 Feb 2019 14:23:33 -0800 Subject: perf/x86/intel: Add counter freezing quirk for Goldmont A microcode patch is also needed for Goldmont while counter freezing feature is enabled. Otherwise, there will be some issues, e.g. PMI lost. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Link: https://lkml.kernel.org/r/1549319013-4522-5-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c79c0165d838..024a515d9102 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3924,8 +3924,12 @@ static __init void intel_nehalem_quirk(void) } static const struct x86_cpu_desc counter_freezing_ucodes[] = { - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_X, 1, 0x00000028), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), {} }; @@ -4295,6 +4299,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_ATOM_GOLDMONT_X: + x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, -- cgit v1.2.3 From 0192e6535ebe9af68614198ced4fd6d37b778ebf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:11:19 +0900 Subject: x86/kprobes: Prohibit probing on optprobe template code Prohibit probing on optprobe template code, since it is not a code but a template instruction sequence. If we modify this template, copied template must be broken. Signed-off-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 9326638cbee2 ("kprobes, x86: Use NOKPROBE_SYMBOL() instead of __kprobes annotation") Link: http://lkml.kernel.org/r/154998787911.31052.15274376330136234452.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/opt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 6adf6e6c2933..544bd41a514c 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -141,6 +141,11 @@ asm ( void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); +NOKPROBE_SYMBOL(optprobe_template_func); +NOKPROBE_SYMBOL(optprobe_template_entry); +NOKPROBE_SYMBOL(optprobe_template_val); +NOKPROBE_SYMBOL(optprobe_template_call); +NOKPROBE_SYMBOL(optprobe_template_end); #define TMPL_MOVE_IDX \ ((long)optprobe_template_val - (long)optprobe_template_entry) -- cgit v1.2.3 From 877b145f0f4723133f934be402b8dfc769eb971f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:11:47 +0900 Subject: x86/kprobes: Move trampoline code into RODATA Move optprobe trampoline code into RODATA since it is not executed, but copied and modified to be used on a trampoline buffer. 
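A rough sketch of the pattern this relies on (hypothetical helper; the real preparation is done in arch_prepare_optimized_kprobe()): the template bytes are only ever a copy source, so keeping them in RODATA is sufficient:

static void copy_template_to_slot(void *slot, const void *tmpl_start,
				  const void *tmpl_end)
{
	/* The .rodata template is never executed in place... */
	memcpy(slot, tmpl_start, tmpl_end - tmpl_start);
	/*
	 * ...all fixups (e.g. at TMPL_MOVE_IDX, TMPL_CALL_IDX) are applied
	 * to the copy in the executable trampoline slot.
	 */
}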
Signed-off-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998790744.31052.3016106262944915510.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/opt.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 544bd41a514c..f14262952015 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -97,6 +97,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) } asm ( + ".pushsection .rodata\n" "optprobe_template_func:\n" ".global optprobe_template_entry\n" "optprobe_template_entry:\n" @@ -136,16 +137,10 @@ asm ( #endif ".global optprobe_template_end\n" "optprobe_template_end:\n" - ".type optprobe_template_func, @function\n" - ".size optprobe_template_func, .-optprobe_template_func\n"); + ".popsection\n"); void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); -NOKPROBE_SYMBOL(optprobe_template_func); -NOKPROBE_SYMBOL(optprobe_template_entry); -NOKPROBE_SYMBOL(optprobe_template_val); -NOKPROBE_SYMBOL(optprobe_template_call); -NOKPROBE_SYMBOL(optprobe_template_end); #define TMPL_MOVE_IDX \ ((long)optprobe_template_val - (long)optprobe_template_entry) -- cgit v1.2.3 From c13324a505c7790fe91a9df35be2e0462abccdb0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:12:15 +0900 Subject: x86/kprobes: Prohibit probing on functions before kprobe_int3_handler() Prohibit probing on the functions called before kprobe_int3_handler() in do_int3(). More specifically, ftrace_int3_handler(), poke_int3_handler(), and ist_enter(). And since rcu_nmi_enter() is called by ist_enter(), it also should be marked as NOKPROBE_SYMBOL. Since those are handled before kprobe_int3_handler(), probing those functions can cause a breakpoint recursion and crash the kernel. 
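A minimal test-module sketch (hypothetical, for illustration only) of the intended effect: once these functions are on the kprobe blacklist via NOKPROBE_SYMBOL(), registering a probe on them fails cleanly instead of arming a recursive breakpoint:

#include <linux/module.h>
#include <linux/kprobes.h>

static struct kprobe kp = {
	.symbol_name = "poke_int3_handler",
};

static int __init probe_blacklist_test_init(void)
{
	int ret = register_kprobe(&kp);	/* expected: -EINVAL once blacklisted */

	pr_info("register_kprobe(%s) = %d\n", kp.symbol_name, ret);
	if (!ret)
		unregister_kprobe(&kp);
	return 0;
}
module_init(probe_blacklist_test_init);
MODULE_LICENSE("GPL");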
Signed-off-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998793571.31052.11301258949601150994.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 3 ++- arch/x86/kernel/ftrace.c | 3 ++- arch/x86/kernel/traps.c | 1 + kernel/rcu/tree.c | 2 ++ 4 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ebeac487a20c..e8b628b1b279 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -764,8 +765,8 @@ int poke_int3_handler(struct pt_regs *regs) regs->ip = (unsigned long) bp_int3_handler; return 1; - } +NOKPROBE_SYMBOL(poke_int3_handler); /** * text_poke_bp() -- update instructions on live kernel on SMP diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8257a59704ae..3e3789c8f8e1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -269,7 +269,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } -static int is_ftrace_caller(unsigned long ip) +static nokprobe_inline int is_ftrace_caller(unsigned long ip) { if (ip == ftrace_update_func) return 1; @@ -299,6 +299,7 @@ int ftrace_int3_handler(struct pt_regs *regs) return 1; } +NOKPROBE_SYMBOL(ftrace_int3_handler); static int ftrace_write(unsigned long ip, const char *val, int size) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9b7c4ca8f0a7..e289ce1332ab 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -111,6 +111,7 @@ void ist_enter(struct pt_regs *regs) /* This code is a bit fragile. Test it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); } +NOKPROBE_SYMBOL(ist_enter); void ist_exit(struct pt_regs *regs) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9180158756d2..74db52a0a466 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "tree.h" #include "rcu.h" @@ -872,6 +873,7 @@ void rcu_nmi_enter(void) { rcu_nmi_enter_common(false); } +NOKPROBE_SYMBOL(rcu_nmi_enter); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle -- cgit v1.2.3 From 0eae81dc9f026d899c70f3931bf3bca6d7aa6938 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 13 Feb 2019 01:12:44 +0900 Subject: x86/kprobes: Prohibit probing on IRQ handlers directly Prohibit probing on IRQ handlers in irqentry_text because if it interrupts user mode, at that point we haven't changed to kernel space yet and which eventually leads a double fault. E.g. # echo p apic_timer_interrupt > kprobe_events # echo 1 > events/kprobes/enable PANIC: double fault, error_code: 0x0 CPU: 1 PID: 814 Comm: less Not tainted 4.20.0-rc3+ #30 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:error_entry+0x12/0xf0 [snip] Call Trace: ? native_iret+0x7/0x7 ? async_page_fault+0x8/0x30 ? trace_hardirqs_on_thunk+0x1c/0x1c ? error_entry+0x7c/0xf0 ? async_page_fault+0x8/0x30 ? native_iret+0x7/0x7 ? int3+0xa/0x20 ? trace_hardirqs_on_thunk+0x1c/0x1c ? error_entry+0x7c/0xf0 ? int3+0xa/0x20 ? apic_timer_interrupt+0x1/0x20 Kernel panic - not syncing: Machine halted. Kernel Offset: disabled ---[ end Kernel panic - not syncing: Machine halted. 
]--- Signed-off-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Andrea Righi Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/154998796400.31052.8406236614820687840.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4ba75afba527..a034cb808e7e 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1028,6 +1028,13 @@ NOKPROBE_SYMBOL(kprobe_fault_handler); int __init arch_populate_kprobe_blacklist(void) { + int ret; + + ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); + if (ret) + return ret; + return kprobe_add_area_blacklist((unsigned long)__entry_text_start, (unsigned long)__entry_text_end); } -- cgit v1.2.3 From c60f83b813e5b25ccd5de7e8c8925c31b3aebcc1 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 15 Feb 2019 13:56:55 +0200 Subject: perf, pt, coresight: Fix address filters for vmas with non-zero offset Currently, the address range calculation for file-based filters works as long as the vma that maps the matching part of the object file starts from offset zero into the file (vm_pgoff==0). Otherwise, the resulting filter range would be off by vm_pgoff pages. Another related problem is that in case of a partially matching vma, that is, a vma that matches part of a filter region, the filter range size wouldn't be adjusted. Fix the arithmetics around address filter range calculations, taking into account vma offset, so that the entire calculation is done before the filter configuration is passed to the PMU drivers instead of having those drivers do the final bit of arithmetics. Based on the patch by Adrian Hunter . 
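As a concrete illustration of the new arithmetic in perf_addr_filter_vma_adjust() (all numbers below are made up for the example): take a filter of size 0x1000 at offset 0x12000 into an object that a task maps at [0x400000, 0x500000) with a 0x10000-byte file offset (vm_pgoff << PAGE_SHIFT):

	off       = vma->vm_pgoff << PAGE_SHIFT                = 0x10000
	fr->start = vma->vm_start + filter->offset - off       = 0x402000
	fr->size  = min(vma->vm_end - fr->start, filter->size) = 0x1000

Previously only vma->vm_start (0x400000) was recorded and the PT driver added filter->offset on top, programming 0x412000, i.e. off by the 0x10000 mapping offset; with vm_pgoff == 0 the two computations coincide, which is why the bug only shows up for mappings with a non-zero offset.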
Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Tested-by: Mathieu Poirier Acked-by: Peter Zijlstra Cc: Jiri Olsa Fixes: 375637bc5249 ("perf/core: Introduce address range filtering") Link: http://lkml.kernel.org/r/20190215115655.63469-3-alexander.shishkin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/events/intel/pt.c | 9 +-- drivers/hwtracing/coresight/coresight-etm-perf.c | 7 +- include/linux/perf_event.h | 7 +- kernel/events/core.c | 81 ++++++++++++++---------- 4 files changed, 62 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c0e86ff21f81..fb3a2f13fc70 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1223,7 +1223,8 @@ static int pt_event_addr_filters_validate(struct list_head *filters) static void pt_event_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *head = perf_event_addr_filters(event); - unsigned long msr_a, msr_b, *offs = event->addr_filters_offs; + unsigned long msr_a, msr_b; + struct perf_addr_filter_range *fr = event->addr_filter_ranges; struct pt_filters *filters = event->hw.addr_filters; struct perf_addr_filter *filter; int range = 0; @@ -1232,12 +1233,12 @@ static void pt_event_addr_filters_sync(struct perf_event *event) return; list_for_each_entry(filter, &head->list, entry) { - if (filter->path.dentry && !offs[range]) { + if (filter->path.dentry && !fr[range].start) { msr_a = msr_b = 0; } else { /* apply the offset */ - msr_a = filter->offset + offs[range]; - msr_b = filter->size + msr_a - 1; + msr_a = fr[range].start; + msr_b = msr_a + fr[range].size - 1; } filters->filter[range].msr_a = msr_a; diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index 8c88bf0a1e5f..4d5a2b9f9d6a 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -433,15 +433,16 @@ static int etm_addr_filters_validate(struct list_head *filters) static void etm_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *head = perf_event_addr_filters(event); - unsigned long start, stop, *offs = event->addr_filters_offs; + unsigned long start, stop; + struct perf_addr_filter_range *fr = event->addr_filter_ranges; struct etm_filters *filters = event->hw.addr_filters; struct etm_filter *etm_filter; struct perf_addr_filter *filter; int i = 0; list_for_each_entry(filter, &head->list, entry) { - start = filter->offset + offs[i]; - stop = start + filter->size; + start = fr[i].start; + stop = start + fr[i].size; etm_filter = &filters->etm_filter[i]; switch (filter->action) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d9c3610e0e25..6ebc72f65017 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -490,6 +490,11 @@ struct perf_addr_filters_head { unsigned int nr_file_filters; }; +struct perf_addr_filter_range { + unsigned long start; + unsigned long size; +}; + /** * enum perf_event_state - the states of an event: */ @@ -666,7 +671,7 @@ struct perf_event { /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ - unsigned long *addr_filters_offs; + struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; void (*destroy)(struct perf_event *); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d89efc0a3e0..16609f6737da 100644 --- 
a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2799,7 +2799,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in - * event::addr_filters_offs array and bump the event::addr_filters_gen; + * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. @@ -4446,7 +4446,7 @@ static void _free_event(struct perf_event *event) perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); @@ -6687,7 +6687,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { - event->addr_filters_offs[count] = 0; + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; restart++; } @@ -7367,28 +7368,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, return true; } +static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, + struct vm_area_struct *vma, + struct perf_addr_filter_range *fr) +{ + unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct file *file = vma->vm_file; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + return false; + + if (filter->offset < off) { + fr->start = vma->vm_start; + fr->size = min(vma_size, filter->size - (off - filter->offset)); + } else { + fr->start = vma->vm_start + filter->offset - off; + fr->size = min(vma->vm_end - fr->start, filter->size); + } + + return true; +} + static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; - struct file *file = vma->vm_file; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; + unsigned long flags; if (!has_addr_filter(event)) return; - if (!file) + if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (perf_addr_filter_match(filter, file, off, - vma->vm_end - vma->vm_start)) { - event->addr_filters_offs[count] = vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, + &event->addr_filter_ranges[count])) restart++; - } count++; } @@ -8978,26 +8998,19 @@ static void perf_addr_filters_splice(struct perf_event *event, * @filter; if so, adjust filter's address range. * Called with mm::mmap_sem down for reading. 
*/ -static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, - struct mm_struct *mm) +static void perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm, + struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; for (vma = mm->mmap; vma; vma = vma->vm_next) { - struct file *file = vma->vm_file; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - unsigned long vma_size = vma->vm_end - vma->vm_start; - - if (!file) + if (!vma->vm_file) continue; - if (!perf_addr_filter_match(filter, file, off, vma_size)) - continue; - - return vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, fr)) + return; } - - return 0; } /* @@ -9031,15 +9044,15 @@ static void perf_event_addr_filters_apply(struct perf_event *event) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - event->addr_filters_offs[count] = 0; + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; /* * Adjust base offset if the filter is associated to a binary * that needs to be mapped: */ if (filter->path.dentry) - event->addr_filters_offs[count] = - perf_addr_filter_apply(filter, mm); + perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); count++; } @@ -10305,10 +10318,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_pmu; if (has_addr_filter(event)) { - event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, - sizeof(unsigned long), - GFP_KERNEL); - if (!event->addr_filters_offs) { + event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, + sizeof(struct perf_addr_filter_range), + GFP_KERNEL); + if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } @@ -10321,9 +10334,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); raw_spin_lock_irq(&ifh->lock); - memcpy(event->addr_filters_offs, - event->parent->addr_filters_offs, - pmu->nr_addr_filters * sizeof(unsigned long)); + memcpy(event->addr_filter_ranges, + event->parent->addr_filter_ranges, + pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); raw_spin_unlock_irq(&ifh->lock); } @@ -10345,7 +10358,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; err_addr_filters: - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); -- cgit v1.2.3