From 6af7e4f77259ee946103387372cb159f2e99a6d4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 1 Sep 2016 08:52:29 -0500 Subject: PCI: Mark Haswell Power Control Unit as having non-compliant BARs The Haswell Power Control Unit has a non-PCI register (CONFIG_TDP_NOMINAL) where BAR 0 is supposed to be. This is erratum HSE43 in the spec update referenced below: The PCIe* Base Specification indicates that Configuration Space Headers have a base address register at offset 0x10. Due to this erratum, the Power Control Unit's CONFIG_TDP_NOMINAL CSR (Bus 1; Device 30; Function 3; Offset 0x10) is located where a base register is expected. Mark the PCU as having non-compliant BARs so we don't try to probe any of them. There are no other BARs on this device. Rename the quirk so it's not Broadwell-specific. Link: http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html Link: http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-datasheet-vol-2.html (section 5.4, Device 30 Function 3) Link: https://bugzilla.kernel.org/show_bug.cgi?id=153881 Reported-by: Paul Menzel Tested-by: Prarit Bhargava Signed-off-by: Bjorn Helgaas Acked-by: Myron Stowe --- arch/x86/pci/fixup.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 837ea36a837d..6d52b94f4bb9 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -553,15 +553,21 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); /* - * Broadwell EP Home Agent BARs erroneously return non-zero values when read. + * Device [8086:2fc0] + * Erratum HSE43 + * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset + * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html * - * See http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html - * entry BDF2. + * Devices [8086:6f60,6fa0,6fc0] + * Erratum BDF2 + * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration + * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html */ -static void pci_bdwep_bar(struct pci_dev *dev) +static void pci_invalid_bar(struct pci_dev *dev) { dev->non_compliant_bars = 1; } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_bdwep_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); -- cgit v1.2.3 From cc2187a6e037bc64404f63c6d650ff263c2200c0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 4 Sep 2016 11:37:36 +0200 Subject: x86/microcode/AMD: Fix load of builtin microcode with randomized memory We do not need to add the randomization offset when the microcode is built in. 
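A minimal sketch of the rule this fix implements, assuming illustrative names (adjust_container, from_initrd, kaslr_slide are not kernel symbols); the real patch below records the builtin/initrd decision in ucode_builtin and only then applies the CONFIG_RANDOMIZE_MEMORY offset:

static unsigned char *adjust_container(unsigned char *cont, int from_initrd,
                                       unsigned long kaslr_slide)
{
        /* Builtin microcode is linked into the kernel image and is not
         * shifted by memory randomization, so no fixup is needed. */
        if (!from_initrd)
                return cont;

        /* Initrd data was located before the direct map was randomized,
         * so the pointer must be moved by the randomization offset. */
        return cont + kaslr_slide;
}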
Reported-and-tested-by: Emanuel Czirai Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20160904093736.GA11939@pd.tnic Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/microcode/amd.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index b816971f5da4..620ab06bcf45 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -54,6 +54,7 @@ static LIST_HEAD(pcache); */ static u8 *container; static size_t container_size; +static bool ucode_builtin; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; @@ -281,18 +282,22 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { struct cpio_data cp; + bool *builtin; void **data; size_t *size; #ifdef CONFIG_X86_32 data = (void **)__pa_nodebug(&ucode_cpio.data); size = (size_t *)__pa_nodebug(&ucode_cpio.size); + builtin = (bool *)__pa_nodebug(&ucode_builtin); #else data = &ucode_cpio.data; size = &ucode_cpio.size; + builtin = &ucode_builtin; #endif - if (!load_builtin_amd_microcode(&cp, family)) + *builtin = load_builtin_amd_microcode(&cp, family); + if (!*builtin) cp = find_ucode_in_initrd(); if (!(cp.data && cp.size)) @@ -373,7 +378,8 @@ void load_ucode_amd_ap(void) return; /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + if (!ucode_builtin) + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; eax = cpuid_eax(0x00000001); eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); @@ -439,7 +445,8 @@ int __init save_microcode_in_initrd_amd(void) container = cont_va; /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + if (!ucode_builtin) + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -- cgit v1.2.3 From dadb57abc37499f565b23933dbf49b435c3ba8af Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Mon, 29 Aug 2016 14:38:51 -0600 Subject: efi/libstub: Allocate headspace in efi_get_memory_map() efi_get_memory_map() allocates a buffer to store the memory map that it retrieves. This buffer may need to be reused by the client after ExitBootServices() is called, at which point allocations are not longer permitted. To support this usecase, provide the allocated buffer size back to the client, and allocate some additional headroom to account for any reasonable growth in the map that is likely to happen between the call to efi_get_memory_map() and the client reusing the buffer. 
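A caller-side sketch of the new interface introduced below (mirroring the callers updated in this patch; sys_table is assumed to be in scope): every output, including the actual allocation size, is passed by reference through struct efi_boot_memmap, so the client can later reuse the over-allocated buffer after ExitBootServices().

unsigned long map_size, desc_size, buff_size, key;
efi_memory_desc_t *mem_map;
u32 desc_ver;
struct efi_boot_memmap map;
efi_status_t status;

map.map       = &mem_map;
map.map_size  = &map_size;
map.desc_size = &desc_size;
map.desc_ver  = &desc_ver;
map.key_ptr   = &key;
map.buff_size = &buff_size;   /* size actually allocated, including headroom */

status = efi_get_memory_map(sys_table, &map);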
Signed-off-by: Jeffrey Hugo Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Leif Lindholm Cc: Ingo Molnar Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 20 ++++-- drivers/firmware/efi/libstub/efi-stub-helper.c | 96 ++++++++++++++++++-------- drivers/firmware/efi/libstub/fdt.c | 17 +++-- drivers/firmware/efi/libstub/random.c | 12 +++- include/linux/efi.h | 15 ++-- 5 files changed, 111 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ff574dad95cc..c5b7c7b4f0d7 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1008,7 +1008,7 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle, bool is64) { struct efi_info *efi = &boot_params->efi_info; - unsigned long map_sz, key, desc_size; + unsigned long map_sz, key, desc_size, buff_size; efi_memory_desc_t *mem_map; struct setup_data *e820ext; const char *signature; @@ -1019,14 +1019,20 @@ static efi_status_t exit_boot(struct boot_params *boot_params, bool called_exit = false; u8 nr_entries; int i; - - nr_desc = 0; - e820ext = NULL; - e820ext_size = 0; + struct efi_boot_memmap map; + + nr_desc = 0; + e820ext = NULL; + e820ext_size = 0; + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; get_map: - status = efi_get_memory_map(sys_table, &mem_map, &map_sz, &desc_size, - &desc_version, &key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) return status; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 3bd127f95315..29368ac69221 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -41,6 +41,8 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; #define EFI_ALLOC_ALIGN EFI_PAGE_SIZE #endif +#define EFI_MMAP_NR_SLACK_SLOTS 8 + struct file_info { efi_file_handle_t *handle; u64 size; @@ -63,49 +65,62 @@ void efi_printk(efi_system_table_t *sys_table_arg, char *str) } } +static inline bool mmap_has_headroom(unsigned long buff_size, + unsigned long map_size, + unsigned long desc_size) +{ + unsigned long slack = buff_size - map_size; + + return slack / desc_size >= EFI_MMAP_NR_SLACK_SLOTS; +} + efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg, - efi_memory_desc_t **map, - unsigned long *map_size, - unsigned long *desc_size, - u32 *desc_ver, - unsigned long *key_ptr) + struct efi_boot_memmap *map) { efi_memory_desc_t *m = NULL; efi_status_t status; unsigned long key; u32 desc_version; - *map_size = sizeof(*m) * 32; + *map->desc_size = sizeof(*m); + *map->map_size = *map->desc_size * 32; + *map->buff_size = *map->map_size; again: - /* - * Add an additional efi_memory_desc_t because we're doing an - * allocation which may be in a new descriptor region. 
- */ - *map_size += sizeof(*m); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - *map_size, (void **)&m); + *map->map_size, (void **)&m); if (status != EFI_SUCCESS) goto fail; - *desc_size = 0; + *map->desc_size = 0; key = 0; - status = efi_call_early(get_memory_map, map_size, m, - &key, desc_size, &desc_version); - if (status == EFI_BUFFER_TOO_SMALL) { + status = efi_call_early(get_memory_map, map->map_size, m, + &key, map->desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL || + !mmap_has_headroom(*map->buff_size, *map->map_size, + *map->desc_size)) { efi_call_early(free_pool, m); + /* + * Make sure there is some entries of headroom so that the + * buffer can be reused for a new map after allocations are + * no longer permitted. Its unlikely that the map will grow to + * exceed this headroom once we are ready to trigger + * ExitBootServices() + */ + *map->map_size += *map->desc_size * EFI_MMAP_NR_SLACK_SLOTS; + *map->buff_size = *map->map_size; goto again; } if (status != EFI_SUCCESS) efi_call_early(free_pool, m); - if (key_ptr && status == EFI_SUCCESS) - *key_ptr = key; - if (desc_ver && status == EFI_SUCCESS) - *desc_ver = desc_version; + if (map->key_ptr && status == EFI_SUCCESS) + *map->key_ptr = key; + if (map->desc_ver && status == EFI_SUCCESS) + *map->desc_ver = desc_version; fail: - *map = m; + *map->map = m; return status; } @@ -113,13 +128,20 @@ fail: unsigned long get_dram_base(efi_system_table_t *sys_table_arg) { efi_status_t status; - unsigned long map_size; + unsigned long map_size, buff_size; unsigned long membase = EFI_ERROR; struct efi_memory_map map; efi_memory_desc_t *md; + struct efi_boot_memmap boot_map; - status = efi_get_memory_map(sys_table_arg, (efi_memory_desc_t **)&map.map, - &map_size, &map.desc_size, NULL, NULL); + boot_map.map = (efi_memory_desc_t **)&map.map; + boot_map.map_size = &map_size; + boot_map.desc_size = &map.desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; + + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) return membase; @@ -144,15 +166,22 @@ efi_status_t efi_high_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, unsigned long *addr, unsigned long max) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; efi_memory_desc_t *map; efi_status_t status; unsigned long nr_pages; u64 max_addr = 0; int i; + struct efi_boot_memmap boot_map; + + boot_map.map = ↦ + boot_map.map_size = &map_size; + boot_map.desc_size = &desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; - status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size, - NULL, NULL); + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) goto fail; @@ -230,14 +259,21 @@ efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, unsigned long *addr) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; efi_memory_desc_t *map; efi_status_t status; unsigned long nr_pages; int i; + struct efi_boot_memmap boot_map; + + boot_map.map = ↦ + boot_map.map_size = &map_size; + boot_map.desc_size = &desc_size; + boot_map.desc_ver = NULL; + boot_map.key_ptr = NULL; + boot_map.buff_size = &buff_size; - status = efi_get_memory_map(sys_table_arg, &map, &map_size, &desc_size, - NULL, NULL); + status = efi_get_memory_map(sys_table_arg, &boot_map); if (status != EFI_SUCCESS) goto 
fail; diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index e58abfa953cc..bec0fa8d8746 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -175,13 +175,21 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, unsigned long fdt_addr, unsigned long fdt_size) { - unsigned long map_size, desc_size; + unsigned long map_size, desc_size, buff_size; u32 desc_ver; unsigned long mmap_key; efi_memory_desc_t *memory_map, *runtime_map; unsigned long new_fdt_size; efi_status_t status; int runtime_entry_count = 0; + struct efi_boot_memmap map; + + map.map = &runtime_map; + map.map_size = &map_size; + map.desc_size = &desc_size; + map.desc_ver = &desc_ver; + map.key_ptr = &mmap_key; + map.buff_size = &buff_size; /* * Get a copy of the current memory map that we will use to prepare @@ -189,8 +197,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, * subsequent allocations adding entries, since they could not affect * the number of EFI_MEMORY_RUNTIME regions. */ - status = efi_get_memory_map(sys_table, &runtime_map, &map_size, - &desc_size, &desc_ver, &mmap_key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) { pr_efi_err(sys_table, "Unable to retrieve UEFI memory map.\n"); return status; @@ -199,6 +206,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, pr_efi(sys_table, "Exiting boot services and installing virtual address map...\n"); + map.map = &memory_map; /* * Estimate size of new FDT, and allocate memory for it. We * will allocate a bigger buffer if this ends up being too @@ -218,8 +226,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table, * we can get the memory map key needed for * exit_boot_services(). 
*/ - status = efi_get_memory_map(sys_table, &memory_map, &map_size, - &desc_size, &desc_ver, &mmap_key); + status = efi_get_memory_map(sys_table, &map); if (status != EFI_SUCCESS) goto fail_free_new_fdt; diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index 53f6d3fe6d86..0c9f58c5ba50 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -73,12 +73,20 @@ efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, unsigned long random_seed) { unsigned long map_size, desc_size, total_slots = 0, target_slot; + unsigned long buff_size; efi_status_t status; efi_memory_desc_t *memory_map; int map_offset; + struct efi_boot_memmap map; - status = efi_get_memory_map(sys_table_arg, &memory_map, &map_size, - &desc_size, NULL, NULL); + map.map = &memory_map; + map.map_size = &map_size; + map.desc_size = &desc_size; + map.desc_ver = NULL; + map.key_ptr = NULL; + map.buff_size = &buff_size; + + status = efi_get_memory_map(sys_table_arg, &map); if (status != EFI_SUCCESS) return status; diff --git a/include/linux/efi.h b/include/linux/efi.h index 23cd3ced8c1a..943fee524176 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -118,6 +118,15 @@ typedef struct { u32 imagesize; } efi_capsule_header_t; +struct efi_boot_memmap { + efi_memory_desc_t **map; + unsigned long *map_size; + unsigned long *desc_size; + u32 *desc_ver; + unsigned long *key_ptr; + unsigned long *buff_size; +}; + /* * EFI capsule flags */ @@ -1371,11 +1380,7 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg, efi_loaded_image_t *image, int *cmd_line_len); efi_status_t efi_get_memory_map(efi_system_table_t *sys_table_arg, - efi_memory_desc_t **map, - unsigned long *map_size, - unsigned long *desc_size, - u32 *desc_ver, - unsigned long *key_ptr); + struct efi_boot_memmap *map); efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg, unsigned long size, unsigned long align, -- cgit v1.2.3 From d64934019f6cc39202e2f78063709f61ca5cb364 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Mon, 29 Aug 2016 14:38:54 -0600 Subject: x86/efi: Use efi_exit_boot_services() The eboot code directly calls ExitBootServices. This is inadvisable as the UEFI spec details a complex set of errors, race conditions, and API interactions that the caller of ExitBootServices must get correct. The eboot code attempts allocations after calling ExitBootSerives which is not permitted per the spec. Call the efi_exit_boot_services() helper intead, which handles the allocation scenario properly. 
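A sketch of the helper's calling convention as used in the patch below (the callback name my_exit_callback is illustrative): the callback runs with an up-to-date memory map immediately before ExitBootServices(), and the helper itself handles the retry when the map changes underneath it.

static efi_status_t my_exit_callback(efi_system_table_t *sys_table_arg,
                                     struct efi_boot_memmap *map, void *priv)
{
        /* Record *map->map, *map->map_size, *map->desc_ver, etc. into priv;
         * no allocations are permitted here. */
        return EFI_SUCCESS;
}

status = efi_exit_boot_services(sys_table, handle, &map, &priv,
                                my_exit_callback);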
Signed-off-by: Jeffrey Hugo Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Leif Lindholm Cc: Ingo Molnar Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 136 +++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c5b7c7b4f0d7..94dd4a31f5b3 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1004,85 +1004,87 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, return status; } +struct exit_boot_struct { + struct boot_params *boot_params; + struct efi_info *efi; + struct setup_data *e820ext; + __u32 e820ext_size; + bool is64; +}; + +static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, + struct efi_boot_memmap *map, + void *priv) +{ + static bool first = true; + const char *signature; + __u32 nr_desc; + efi_status_t status; + struct exit_boot_struct *p = priv; + + if (first) { + nr_desc = *map->buff_size / *map->desc_size; + if (nr_desc > ARRAY_SIZE(p->boot_params->e820_map)) { + u32 nr_e820ext = nr_desc - + ARRAY_SIZE(p->boot_params->e820_map); + + status = alloc_e820ext(nr_e820ext, &p->e820ext, + &p->e820ext_size); + if (status != EFI_SUCCESS) + return status; + } + first = false; + } + + signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; + memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); + + p->efi->efi_systab = (unsigned long)sys_table_arg; + p->efi->efi_memdesc_size = *map->desc_size; + p->efi->efi_memdesc_version = *map->desc_ver; + p->efi->efi_memmap = (unsigned long)*map->map; + p->efi->efi_memmap_size = *map->map_size; + +#ifdef CONFIG_X86_64 + p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; + p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; +#endif + + return EFI_SUCCESS; +} + static efi_status_t exit_boot(struct boot_params *boot_params, void *handle, bool is64) { - struct efi_info *efi = &boot_params->efi_info; unsigned long map_sz, key, desc_size, buff_size; efi_memory_desc_t *mem_map; struct setup_data *e820ext; - const char *signature; __u32 e820ext_size; - __u32 nr_desc, prev_nr_desc; efi_status_t status; __u32 desc_version; - bool called_exit = false; - u8 nr_entries; - int i; struct efi_boot_memmap map; + struct exit_boot_struct priv; + + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; + priv.boot_params = boot_params; + priv.efi = &boot_params->efi_info; + priv.e820ext = NULL; + priv.e820ext_size = 0; + priv.is64 = is64; - nr_desc = 0; - e820ext = NULL; - e820ext_size = 0; - map.map = &mem_map; - map.map_size = &map_sz; - map.desc_size = &desc_size; - map.desc_ver = &desc_version; - map.key_ptr = &key; - map.buff_size = &buff_size; - -get_map: - status = efi_get_memory_map(sys_table, &map); - + /* Might as well exit boot services now */ + status = efi_exit_boot_services(sys_table, handle, &map, &priv, + exit_boot_func); if (status != EFI_SUCCESS) return status; - prev_nr_desc = nr_desc; - nr_desc = map_sz / desc_size; - if (nr_desc > prev_nr_desc && - nr_desc > ARRAY_SIZE(boot_params->e820_map)) { - u32 nr_e820ext = nr_desc - ARRAY_SIZE(boot_params->e820_map); - - status = alloc_e820ext(nr_e820ext, &e820ext, &e820ext_size); - if (status != EFI_SUCCESS) - goto free_mem_map; - - efi_call_early(free_pool, mem_map); - goto get_map; /* Allocated memory, get map again */ - } - - 
signature = is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; - memcpy(&efi->efi_loader_signature, signature, sizeof(__u32)); - - efi->efi_systab = (unsigned long)sys_table; - efi->efi_memdesc_size = desc_size; - efi->efi_memdesc_version = desc_version; - efi->efi_memmap = (unsigned long)mem_map; - efi->efi_memmap_size = map_sz; - -#ifdef CONFIG_X86_64 - efi->efi_systab_hi = (unsigned long)sys_table >> 32; - efi->efi_memmap_hi = (unsigned long)mem_map >> 32; -#endif - - /* Might as well exit boot services now */ - status = efi_call_early(exit_boot_services, handle, key); - if (status != EFI_SUCCESS) { - /* - * ExitBootServices() will fail if any of the event - * handlers change the memory map. In which case, we - * must be prepared to retry, but only once so that - * we're guaranteed to exit on repeated failures instead - * of spinning forever. - */ - if (called_exit) - goto free_mem_map; - - called_exit = true; - efi_call_early(free_pool, mem_map); - goto get_map; - } - + e820ext = priv.e820ext; + e820ext_size = priv.e820ext_size; /* Historic? */ boot_params->alt_mem_k = 32 * 1024; @@ -1091,10 +1093,6 @@ get_map: return status; return EFI_SUCCESS; - -free_mem_map: - efi_call_early(free_pool, mem_map); - return status; } /* -- cgit v1.2.3 From e12c8f36f3f7a60d55938c5aed5999278fa92bcb Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 30 Aug 2016 16:14:00 +0800 Subject: KVM: lapic: adjust preemption timer correctly when goes TSC backward MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TSC_OFFSET will be adjusted if discovers TSC backward during vCPU load. The preemption timer, which relies on the guest tsc to reprogram its preemption timer value, is also reprogrammed if vCPU is scheded in to a different pCPU. However, the current implementation reprogram preemption timer before TSC_OFFSET is adjusted to the right value, resulting in the preemption timer firing prematurely. This patch fix it by adjusting TSC_OFFSET before reprogramming preemption timer if TSC backward. 
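An ordering sketch of the fix below (the unstable-TSC path of kvm_arch_vcpu_load(); names taken from the diff, not a literal copy): the preemption timer derives its value from the guest TSC, so TSC_OFFSET must be corrected before the timer is reprogrammed.

if (check_tsc_unstable()) {
        /* 1) First correct TSC_OFFSET for the backwards TSC ... */
        u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc);

        kvm_x86_ops->write_tsc_offset(vcpu, offset);
        vcpu->arch.tsc_catchup = 1;
}
/* 2) ... only then reprogram the preemption timer, which reads the
 *    (now corrected) guest TSC to compute its deadline. */
if (kvm_lapic_hv_timer_in_use(vcpu) &&
    kvm_x86_ops->set_hv_timer(vcpu, kvm_get_lapic_tscdeadline_msr(vcpu)))
        kvm_lapic_switch_to_sw_timer(vcpu);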
Cc: Paolo Bonzini Cc: Radim Krċmář Cc: Yunhong Jiang Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19f9f9e05c2a..699f8726539a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2743,16 +2743,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } + if (kvm_lapic_hv_timer_in_use(vcpu) && + kvm_x86_ops->set_hv_timer(vcpu, + kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration -- cgit v1.2.3 From 79d102cbfd2e9d94257fcc7c82807ef1cdf80322 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 5 Sep 2016 17:30:07 +0200 Subject: perf/x86/intel/cqm: Check cqm/mbm enabled state in event init Yanqiu Zhang reported kernel panic when using mbm event on system where CQM is detected but without mbm event support, like with perf: # perf stat -e 'intel_cqm/event=3/' -a BUG: unable to handle kernel NULL pointer dereference at 0000000000000020 IP: [] update_sample+0xbc/0xe0 ... [] __intel_mbm_event_init+0x18/0x20 [] flush_smp_call_function_queue+0x7b/0x160 [] generic_smp_call_function_single_interrupt+0x13/0x60 [] smp_call_function_interrupt+0x27/0x40 [] call_function_interrupt+0x8c/0xa0 ... The reason is that we currently allow to init mbm event even if mbm support is not detected. Adding checks for both cqm and mbm events and support into cqm's event_init. 
Fixes: 33c3cc7acfd9 ("perf/x86/mbm: Add Intel Memory B/W Monitoring enumeration and init") Reported-by: Yanqiu Zhang Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Vikas Shivappa Cc: Tony Luck Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1473089407-21857-1-git-send-email-jolsa@kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/cqm.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 783c49ddef29..8f82b02934fa 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -458,6 +458,11 @@ static void __intel_cqm_event_count(void *info); static void init_mbm_sample(u32 rmid, u32 evt_type); static void __intel_mbm_event_count(void *info); +static bool is_cqm_event(int e) +{ + return (e == QOS_L3_OCCUP_EVENT_ID); +} + static bool is_mbm_event(int e) { return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); @@ -1366,6 +1371,10 @@ static int intel_cqm_event_init(struct perf_event *event) (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) return -EINVAL; + if ((is_cqm_event(event->attr.config) && !cqm_enabled) || + (is_mbm_event(event->attr.config) && !mbm_enabled)) + return -EINVAL; + /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || -- cgit v1.2.3 From e6971009a95a74f28c58bbae415c40effad1226c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 Sep 2016 11:56:01 -0700 Subject: x86/uaccess: force copy_*_user() to be inlined As already done with __copy_*_user(), mark copy_*_user() as __always_inline. Without this, the checks for things like __builtin_const_p() won't work consistently in either hardened usercopy nor the recent adjustments for detecting usercopy overflows at compile time. The change in kernel text size is detectable, but very small: text data bss dec hex filename 12118735 5768608 14229504 32116847 1ea106f vmlinux.before 12120207 5768608 14229504 32118319 1ea162f vmlinux.after Signed-off-by: Kees Cook --- arch/x86/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c3f291195294..e3af86f58eaf 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -705,7 +705,7 @@ static inline void copy_user_overflow(int size, unsigned long count) WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -725,7 +725,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); -- cgit v1.2.3 From ce29856a5e1aabe52e18b2c60db1490769a6ab55 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Mon, 1 Aug 2016 23:01:56 +0200 Subject: um/ptrace: Fix the syscall number update after a ptrace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the syscall number after each PTRACE_SETREGS on ORIG_*AX. This is needed to get the potentially altered syscall number in the seccomp filters after RET_TRACE. 
This fix four seccomp_bpf tests: > [ RUN ] TRACE_syscall.skip_after_RET_TRACE > seccomp_bpf.c:1560:TRACE_syscall.skip_after_RET_TRACE:Expected -1 (18446744073709551615) == syscall(39) (26) > seccomp_bpf.c:1561:TRACE_syscall.skip_after_RET_TRACE:Expected 1 (1) == (*__errno_location ()) (22) > [ FAIL ] TRACE_syscall.skip_after_RET_TRACE > [ RUN ] TRACE_syscall.kill_after_RET_TRACE > TRACE_syscall.kill_after_RET_TRACE: Test exited normally instead of by signal (code: 1) > [ FAIL ] TRACE_syscall.kill_after_RET_TRACE > [ RUN ] TRACE_syscall.skip_after_ptrace > seccomp_bpf.c:1622:TRACE_syscall.skip_after_ptrace:Expected -1 (18446744073709551615) == syscall(39) (26) > seccomp_bpf.c:1623:TRACE_syscall.skip_after_ptrace:Expected 1 (1) == (*__errno_location ()) (22) > [ FAIL ] TRACE_syscall.skip_after_ptrace > [ RUN ] TRACE_syscall.kill_after_ptrace > TRACE_syscall.kill_after_ptrace: Test exited normally instead of by signal (code: 1) > [ FAIL ] TRACE_syscall.kill_after_ptrace Fixes: 26703c636c1f ("um/ptrace: run seccomp after ptrace") Signed-off-by: Mickaël Salaün Acked-by: Kees Cook Cc: Jeff Dike Cc: Richard Weinberger Cc: James Morris Cc: user-mode-linux-devel@lists.sourceforge.net Signed-off-by: James Morris Signed-off-by: Kees Cook --- arch/um/kernel/skas/syscall.c | 5 ----- arch/x86/um/ptrace_32.c | 3 +++ arch/x86/um/ptrace_64.c | 4 ++++ 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 0728fee94398..b783ac87d98a 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -27,12 +27,7 @@ void handle_syscall(struct uml_pt_regs *r) if (secure_computing(NULL) == -1) goto out; - /* Update the syscall number after orig_ax has potentially been updated - * with ptrace. - */ - UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); syscall = UPT_SYSCALL_NR(r); - if (syscall >= 0 && syscall <= __NR_syscall_max) PT_REGS_SET_SYSCALL_RETURN(regs, EXECUTE_SYSCALL(syscall, regs)); diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index ebd4dd6ef73b..a7ef7b131e25 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -84,7 +84,10 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case EAX: case EIP: case UESP: + break; case ORIG_EAX: + /* Update the syscall number. */ + UPT_SYSCALL_NR(&child->thread.regs.regs) = value; break; case FS: if (value && (value & 3) != 3) diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index faab418876ce..0b5c184dd5b3 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -78,7 +78,11 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case RSI: case RDI: case RBP: + break; + case ORIG_RAX: + /* Update the syscall number. */ + UPT_SYSCALL_NR(&child->thread.regs.regs) = value; break; case FS: -- cgit v1.2.3 From c291b015158577be533dd5a959dfc09bab119eed Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 7 Sep 2016 10:21:33 +0800 Subject: x86/apic: Fix num_processors value in case of failure If the topology package map check of the APIC ID and the CPU is a failure, we don't generate the processor info for that APIC ID yet we increase disabled_cpus by one - which is buggy. Only increase num_processors once we are sure we don't fail. Signed-off-by: Dou Liyang Acked-by: David Rientjes Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1473214893-16481-1-git-send-email-douly.fnst@cn.fujitsu.com [ Rewrote the changelog. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 50c95af0f017..f3e9b2df4b16 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2093,7 +2093,6 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - num_processors++; if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed @@ -2116,10 +2115,13 @@ int generic_processor_info(int apicid, int version) pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); + disabled_cpus++; return -ENOSPC; } + num_processors++; + /* * Validate version */ -- cgit v1.2.3 From a4497a86fb9b855c5ac8503fdc959393b00bb643 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 8 Sep 2016 10:15:28 -0400 Subject: x86, clock: Fix kvm guest tsc initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When booting a kvm guest on AMD with the latest kernel the following messages are displayed in the boot log: tsc: Unable to calibrate against PIT tsc: HPET/PMTIMER calibration failed aa297292d708 ("x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID") introduced a change to account for a difference in cpu and tsc frequencies for Intel SKL processors. Before this change the native tsc set x86_platform.calibrate_tsc to native_calibrate_tsc() which is a hardware calibration of the tsc, and in tsc_init() executed tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; The kvm code changed x86_platform.calibrate_tsc to kvm_get_tsc_khz() and executed the same tsc_init() function. This meant that KVM guests did not execute the native hardware calibration function. After aa297292d708, there are separate native calibrations for cpu_khz and tsc_khz. The code sets x86_platform.calibrate_tsc to native_calibrate_tsc() which is now an Intel specific calibration function, and x86_platform.calibrate_cpu to native_calibrate_cpu() which is the "old" native_calibrate_tsc() function (ie, the native hardware calibration function). tsc_init() now does cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; The kvm code should not call the hardware initialization in native_calibrate_cpu(), as it isn't applicable for kvm and it didn't do that prior to aa297292d708. This patch resolves this issue by setting x86_platform.calibrate_cpu to kvm_get_tsc_khz(). v2: I had originally set x86_platform.calibrate_cpu to cpu_khz_from_cpuid(), however, pbonzini pointed out that the CPUID leaf in that function is not available in KVM. I have changed the function pointer to kvm_get_tsc_khz(). Fixes: aa297292d708 ("x86/tsc: Enumerate SKL cpu_khz and tsc_khz via CPUID") Signed-off-by: Prarit Bhargava Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Len Brown Cc: "Peter Zijlstra (Intel)" Cc: Borislav Petkov Cc: Adrian Hunter Cc: "Christopher S. 
Hall" Cc: David Woodhouse Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfbd26bb..3692249a70f1 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -289,6 +289,7 @@ void __init kvmclock_init(void) put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3 From 7d762e49c2117d3829eb3355f2617aea080ed3a7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 9 Sep 2016 18:08:23 +0200 Subject: perf/x86/amd/uncore: Prevent use after free The resent conversion of the cpu hotplug support in the uncore driver introduced a regression due to the way the callbacks are invoked at initialization time. The old code called the prepare/starting/online function on each online cpu as a block. The new code registers the hotplug callbacks in the core for each state. The core invokes the callbacks at each registration on all online cpus. The code implicitely relied on the prepare/starting/online callbacks being called as combo on a particular cpu, which was not obvious and completely undocumented. The resulting subtle wreckage happens due to the way how the uncore code manages shared data structures for cpus which share an uncore resource in hardware. The sharing is determined in the cpu starting callback, but the prepare callback allocates per cpu data for the upcoming cpu because potential sharing is unknown at this point. If the starting callback finds a online cpu which shares the hardware resource it takes a refcount on the percpu data of that cpu and puts the own data structure into a 'free_at_online' pointer of that shared data structure. The online callback frees that. With the old model this worked because in a starting callback only one non unused structure (the one of the starting cpu) was available. The new code allocates the data structures for all cpus when the prepare callback is registered. Now the starting function iterates through all online cpus and looks for a data structure (skipping its own) which has a matching hardware id. The id member of the data structure is initialized to 0, but the hardware id can be 0 as well. The resulting wreckage is: CPU0 finds a matching id on CPU1, takes a refcount on CPU1 data and puts its own data structure into CPU1s data structure to be freed. CPU1 skips CPU0 because the data structure is its allegedly unsued own. It finds a matching id on CPU2, takes a refcount on CPU1 data and puts its own data structure into CPU2s data structure to be freed. .... Now the online callbacks are invoked. CPU0 has a pointer to CPU1s data and frees the original CPU0 data. So far so good. CPU1 has a pointer to CPU2s data and frees the original CPU1 data, which is still referenced by CPU0 ---> Booom So there are two issues to be solved here: 1) The id field must be initialized at allocation time to a value which cannot be a valid hardware id, i.e. -1 This prevents the above scenario, but now CPU1 and CPU2 both stick their own data structure into the free_at_online pointer of CPU0. So we leak CPU1s data structure. 
2) Fix the memory leak described in #1 Instead of having a single pointer, use a hlist to enqueue the superflous data structures which are then freed by the first cpu invoking the online callback. Ideally we should know the sharing _before_ invoking the prepare callback, but that's way beyond the scope of this bug fix. [ tglx: Rewrote changelog ] Fixes: 96b2bd3866a0 ("perf/x86/amd/uncore: Convert to hotplug state machine") Reported-and-tested-by: Eric Sandeen Signed-off-by: Sebastian Andrzej Siewior Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20160909160822.lowgmkdwms2dheyv@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/events/amd/uncore.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index e6131d4454e6..65577f081d07 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -29,6 +29,8 @@ #define COUNTER_SHIFT 16 +static HLIST_HEAD(uncore_unused_list); + struct amd_uncore { int id; int refcnt; @@ -39,7 +41,7 @@ struct amd_uncore { cpumask_t *active_mask; struct pmu *pmu; struct perf_event *events[MAX_COUNTERS]; - struct amd_uncore *free_when_cpu_online; + struct hlist_node node; }; static struct amd_uncore * __percpu *amd_uncore_nb; @@ -306,6 +308,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; uncore_nb->pmu = &amd_nb_pmu; + uncore_nb->id = -1; *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } @@ -319,6 +322,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; uncore_l2->active_mask = &amd_l2_active_mask; uncore_l2->pmu = &amd_l2_pmu; + uncore_l2->id = -1; *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; } @@ -348,7 +352,7 @@ amd_uncore_find_online_sibling(struct amd_uncore *this, continue; if (this->id == that->id) { - that->free_when_cpu_online = this; + hlist_add_head(&this->node, &uncore_unused_list); this = that; break; } @@ -388,13 +392,23 @@ static int amd_uncore_cpu_starting(unsigned int cpu) return 0; } +static void uncore_clean_online(void) +{ + struct amd_uncore *uncore; + struct hlist_node *n; + + hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) { + hlist_del(&uncore->node); + kfree(uncore); + } +} + static void uncore_online(unsigned int cpu, struct amd_uncore * __percpu *uncores) { struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - kfree(uncore->free_when_cpu_online); - uncore->free_when_cpu_online = NULL; + uncore_clean_online(); if (cpu == uncore->cpu) cpumask_set_cpu(cpu, uncore->active_mask); -- cgit v1.2.3 From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Sep 2016 08:51:21 -0700 Subject: mm: fix cache mode of dax pmd mappings track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as uncacheable rendering them impractical for application usage. DAX-pte mappings are cached and the goal of establishing DAX-pmd mappings is to attain more performance, not dramatically less (3 orders of magnitude). track_pfn_insert() relies on a previous call to reserve_memtype() to establish the expected page_cache_mode for the range. While memremap() arranges for reserve_memtype() to be called, devm_memremap_pages() does not. 
So, teach track_pfn_insert() and untrack_pfn() how to handle tracking without a vma, and arrange for devm_memremap_pages() to establish the write-back-cache reservation in the memtype tree. Cc: Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Nilesh Choudhury Cc: Kirill A. Shutemov Reported-by: Toshi Kani Reported-by: Kai Zhang Acked-by: Andrew Morton Signed-off-by: Dan Williams --- arch/x86/mm/pat.c | 17 ++++++++++------- kernel/memremap.c | 9 +++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ecb1b69c1651..170cc4ff057b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma) } /* - * prot is passed in as a parameter for the new mapping. If the vma has a - * linear pfn mapping for the entire range reserve the entire vma range with - * single reserve_pfn_range call. + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) @@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ - if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); - if (!ret) + if (ret == 0 && vma) vma->vm_flags |= VM_PAT; return ret; } @@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!(vma->vm_flags & VM_PAT)) + if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - vma->vm_flags &= ~VM_PAT; + if (vma) + vma->vm_flags &= ~VM_PAT; } /* diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); 
err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); -- cgit v1.2.3 From a9a94401c2b5805c71e39427b1af1bf1b9f67cd0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:51 +0300 Subject: perf/x86/intel/bts: Fix confused ordering of PMU callbacks The intel_bts driver is using a CPU-local 'started' variable to order callbacks and PMIs and make sure that AUX transactions don't get messed up. However, the ordering rules in regard to this variable is a complete mess, which recently resulted in perf_fuzzer-triggered warnings and panics. The general ordering rule that is patch is enforcing is that this cpu-local variable be set only when the cpu-local AUX transaction is active; consequently, this variable is to be checked before the AUX related bits can be touched. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-4-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 104 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 0a6e393a2e62..61e1d713b114 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -31,7 +31,17 @@ struct bts_ctx { struct perf_output_handle handle; struct debug_store ds_back; - int started; + int state; +}; + +/* BTS context states: */ +enum { + /* no ongoing AUX transactions */ + BTS_STATE_STOPPED = 0, + /* AUX transaction is on, BTS tracing is disabled */ + BTS_STATE_INACTIVE, + /* AUX transaction is on, BTS tracing is running */ + BTS_STATE_ACTIVE, }; static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); @@ -204,6 +214,15 @@ static void bts_update(struct bts_ctx *bts) static int bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); +/* + * Ordering PMU callbacks wrt themselves and the PMI is done by means + * of bts::state, which: + * - is set when bts::handle::event is valid, that is, between + * perf_aux_output_begin() and perf_aux_output_end(); + * - is zero otherwise; + * - is ordered against bts::handle::event with a compiler barrier. 
+ */ + static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); @@ -221,10 +240,13 @@ static void __bts_event_start(struct perf_event *event) /* * local barrier to make sure that ds configuration made it - * before we enable BTS + * before we enable BTS and bts::state goes ACTIVE */ wmb(); + /* INACTIVE/STOPPED -> ACTIVE */ + WRITE_ONCE(bts->state, BTS_STATE_ACTIVE); + intel_pmu_enable_bts(config); } @@ -251,9 +273,6 @@ static void bts_event_start(struct perf_event *event, int flags) __bts_event_start(event); - /* PMI handler: this counter is running and likely generating PMIs */ - ACCESS_ONCE(bts->started) = 1; - return; fail_end_stop: @@ -263,30 +282,34 @@ fail_stop: event->hw.state = PERF_HES_STOPPED; } -static void __bts_event_stop(struct perf_event *event) +static void __bts_event_stop(struct perf_event *event, int state) { + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */ + WRITE_ONCE(bts->state, state); + /* * No extra synchronization is mandated by the documentation to have * BTS data stores globally visible. */ intel_pmu_disable_bts(); - - if (event->hw.state & PERF_HES_STOPPED) - return; - - ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; } static void bts_event_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); + struct bts_buffer *buf = NULL; + int state = READ_ONCE(bts->state); + + if (state == BTS_STATE_ACTIVE) + __bts_event_stop(event, BTS_STATE_STOPPED); - /* PMI handler: don't restart this counter */ - ACCESS_ONCE(bts->started) = 0; + if (state != BTS_STATE_STOPPED) + buf = perf_get_aux(&bts->handle); - __bts_event_stop(event); + event->hw.state |= PERF_HES_STOPPED; if (flags & PERF_EF_UPDATE) { bts_update(bts); @@ -296,6 +319,7 @@ static void bts_event_stop(struct perf_event *event, int flags) bts->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), !!local_xchg(&buf->lost, 0)); } @@ -310,8 +334,20 @@ static void bts_event_stop(struct perf_event *event, int flags) void intel_bts_enable_local(void) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + int state = READ_ONCE(bts->state); + + /* + * Here we transition from INACTIVE to ACTIVE; + * if we instead are STOPPED from the interrupt handler, + * stay that way. Can't be ACTIVE here though. + */ + if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE)) + return; + + if (state == BTS_STATE_STOPPED) + return; - if (bts->handle.event && bts->started) + if (bts->handle.event) __bts_event_start(bts->handle.event); } @@ -319,8 +355,15 @@ void intel_bts_disable_local(void) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + /* + * Here we transition from ACTIVE to INACTIVE; + * do nothing for STOPPED or INACTIVE. 
+ */ + if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE) + return; + if (bts->handle.event) - __bts_event_stop(bts->handle.event); + __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE); } static int @@ -407,9 +450,13 @@ int intel_bts_interrupt(void) struct perf_event *event = bts->handle.event; struct bts_buffer *buf; s64 old_head; - int err; + int err = -ENOSPC; - if (!event || !bts->started) + /* + * this is wrapped in intel_bts_enable_local/intel_bts_disable_local, + * so we can only be INACTIVE or STOPPED + */ + if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) return 0; buf = perf_get_aux(&bts->handle); @@ -432,12 +479,21 @@ int intel_bts_interrupt(void) !!local_xchg(&buf->lost, 0)); buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return 1; + if (buf) + err = bts_buffer_reset(buf, &bts->handle); - err = bts_buffer_reset(buf, &bts->handle); - if (err) - perf_aux_output_end(&bts->handle, 0, false); + if (err) { + WRITE_ONCE(bts->state, BTS_STATE_STOPPED); + + if (buf) { + /* + * BTS_STATE_STOPPED should be visible before + * cleared handle::event + */ + barrier(); + perf_aux_output_end(&bts->handle, 0, false); + } + } return 1; } -- cgit v1.2.3 From 4d4c474124649198d9b0a065c06f9362cf18e14e Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:52 +0300 Subject: perf/x86/intel/bts: Fix BTS PMI detection Since BTS doesn't have a dedicated PMI status bit, the driver needs to take extra care to check for the condition that triggers it to avoid spurious NMI warnings. Regardless of the local BTS context state, the only way of knowing that the NMI is ours is to compare the write pointer against the interrupt threshold. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-5-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 61e1d713b114..9233edf993e1 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -446,26 +446,37 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) int intel_bts_interrupt(void) { + struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct perf_event *event = bts->handle.event; struct bts_buffer *buf; s64 old_head; - int err = -ENOSPC; + int err = -ENOSPC, handled = 0; + + /* + * The only surefire way of knowing if this NMI is ours is by checking + * the write ptr against the PMI threshold. 
+ */ + if (ds->bts_index >= ds->bts_interrupt_threshold) + handled = 1; /* * this is wrapped in intel_bts_enable_local/intel_bts_disable_local, * so we can only be INACTIVE or STOPPED */ if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) - return 0; + return handled; buf = perf_get_aux(&bts->handle); + if (!buf) + return handled; + /* * Skip snapshot counters: they don't use the interrupt, but * there's no other way of telling, because the pointer will * keep moving */ - if (!buf || buf->snapshot) + if (buf->snapshot) return 0; old_head = local_read(&buf->head); @@ -473,7 +484,7 @@ int intel_bts_interrupt(void) /* no new data */ if (old_head == local_read(&buf->head)) - return 0; + return handled; perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), !!local_xchg(&buf->lost, 0)); -- cgit v1.2.3 From ef9ef3befa0d76008e988a9ed9fe439e803351b9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:53 +0300 Subject: perf/x86/intel/bts: Kill a silly warning At the moment, intel_bts will WARN() out if there is more than one event writing to the same ring buffer, via SET_OUTPUT, and will only send data from one event to a buffer. There is no reason to have this warning in, so kill it. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-6-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 9233edf993e1..bdcd6510992c 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -378,8 +378,6 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) return 0; head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); - if (WARN_ON_ONCE(head != local_read(&buf->head))) - return -EINVAL; phys = &buf->buf[buf->cur_buf]; space = phys->offset + phys->displacement + phys->size - head; -- cgit v1.2.3 From 8ef9b8455a2a3049efa9e46e8a6402b972a3eb41 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Sep 2016 14:42:55 +0200 Subject: perf/x86/intel: Fix PEBSv3 record drain Alexander hit the WARN_ON_ONCE(!event) on his Skylake while running the perf fuzzer. This means the PEBSv3 record included a status bit for an inactive event, something that _should_ not happen. Move the code that filters the status bits against our known PEBS events up a spot to guarantee we only deal with events we know about. Further add "continue" statements to the WARN_ON_ONCE()s such that we'll not die nor generate silly events in case we ever do hit them again. 
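A sketch of the ordering the patch below enforces in the PEBS drain path (identifiers taken from the diff): mask the record's status against the events we actually enabled before walking the set bits, and skip rather than process anything unexpected.

pebs_status = p->status & cpuc->pebs_enabled;
pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;

for_each_set_bit(bit, (unsigned long *)&pebs_status, x86_pmu.max_pebs_events) {
        struct perf_event *event = cpuc->events[bit];

        if (WARN_ON_ONCE(!event))
                continue;       /* stale status bit: don't dereference NULL */

        /* ... handle the PEBS record for 'event' ... */
}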
Reported-by: Alexander Shishkin Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org Fixes: a3d86542de88 ("perf/x86/intel/pebs: Add PEBSv3 decoding") Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ce9f3f669e6..9b983a474253 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1274,18 +1274,18 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) struct pebs_record_nhm *p = at; u64 pebs_status; - /* PEBS v3 has accurate status bits */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&p->status, - MAX_PEBS_EVENTS) + for_each_set_bit(bit, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) counts[bit]++; continue; } - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; - /* * On some CPUs the PEBS status can be zero when PEBS is * racing with clearing of GLOBAL_STATUS. @@ -1333,8 +1333,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; event = cpuc->events[bit]; - WARN_ON_ONCE(!event); - WARN_ON_ONCE(!event->attr.precise_ip); + if (WARN_ON_ONCE(!event)) + continue; + + if (WARN_ON_ONCE(!event->attr.precise_ip)) + continue; /* log dropped samples number */ if (error[bit]) -- cgit v1.2.3 From cecf62352aee2b4fe114aafd1b8c5f265a4243ce Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 15 Sep 2016 11:22:33 +0300 Subject: perf/x86/intel: Don't disable "intel_bts" around "intel" event batching At the moment, intel_bts events get disabled from intel PMU's disable callback, which includes event scheduling transactions of said PMU, which have nothing to do with intel_bts events. We do want to keep intel_bts events off inside the PMI handler to avoid filling up their buffer too soon. This patch moves intel_bts enabling/disabling directly to the PMI handler. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160915082233.11065-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2cbde2f449aa..4c9a79b9cd69 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1730,9 +1730,11 @@ static __initconst const u64 knl_hw_cache_extra_regs * disabled state if called consecutively. * * During consecutive calls, the same disable value will be written to related - * registers, so the PMU state remains unchanged. hw.state in - * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive - * calls. + * registers, so the PMU state remains unchanged. 
+ * + * intel_bts events don't coexist with intel PMU's BTS events because of + * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them + * disabled around intel PMU's event batching etc, only inside the PMI handler. */ static void __intel_pmu_disable_all(void) { @@ -1742,8 +1744,6 @@ static void __intel_pmu_disable_all(void) if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); - else - intel_bts_disable_local(); intel_pmu_pebs_disable_all(); } @@ -1771,8 +1771,7 @@ static void __intel_pmu_enable_all(int added, bool pmi) return; intel_pmu_enable_bts(event->hw.config); - } else - intel_bts_enable_local(); + } } static void intel_pmu_enable_all(int added) @@ -2073,6 +2072,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) */ if (!x86_pmu.late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_bts_disable_local(); __intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); @@ -2172,6 +2172,7 @@ done: /* Only restore PMU state when it's active. See x86_pmu_disable(). */ if (cpuc->enabled) __intel_pmu_enable_all(0, true); + intel_bts_enable_local(); /* * Only unmask the NMI after the overflow counters -- cgit v1.2.3 From b0eaf4506f5f95d15d6731d72c0ddf4a2179eefa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Sep 2016 23:39:12 +0200 Subject: kvm: x86: correctly reset dest_map->vector when restoring LAPIC state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When userspace sends KVM_SET_LAPIC, KVM schedules a check between the vCPU's IRR and ISR and the IOAPIC redirection table, in order to re-establish the IOAPIC's dest_map (the list of CPUs servicing the real-time clock interrupt with the corresponding vectors). However, __rtc_irq_eoi_tracking_restore_one was forgetting to set dest_map->vectors. Because of this, the IOAPIC did not process the real-time clock interrupt EOI, ioapic->rtc_status.pending_eoi got stuck at a non-zero value, and further RTC interrupts were reported to userspace as coalesced. 
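A user-space sketch of the bookkeeping the fix restores; the struct and helper below are simplified stand-ins for the kernel's rtc_status dest_map, not its actual layout:

    #include <stdio.h>
    #include <string.h>

    #define NR_VCPUS 4

    struct dest_map {
        unsigned long map;               /* one bit per vCPU */
        unsigned char vectors[NR_VCPUS]; /* vector each vCPU will EOI */
    };

    /* Keep the bitmap and the per-vCPU vector array in sync: the later
     * EOI lookup matches on the recorded vector, so setting only the bit
     * (as the pre-fix code did) leaves pending_eoi stuck. */
    static void track(struct dest_map *dm, int vcpu, unsigned char vector)
    {
        dm->map |= 1UL << vcpu;
        dm->vectors[vcpu] = vector;
    }

    int main(void)
    {
        struct dest_map dm;

        memset(&dm, 0, sizeof(dm));
        track(&dm, 2, 0x28);
        printf("vcpu 2: tracked=%d vector=0x%x\n",
               !!(dm.map & (1UL << 2)), dm.vectors[2]);
        return 0;
    }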
Fixes: 9e4aabe2bb3454c83dac8139cf9974503ee044db Fixes: 4d99ba898dd0c521ca6cdfdde55c9b58aea3cb3d Cc: stable@vger.kernel.org Cc: Joerg Roedel Cc: David Gilbert Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 5f42d038fcb4..c7220ba94aa7 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -109,6 +109,7 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) { bool new_val, old_val; struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; union kvm_ioapic_redirect_entry *e; e = &ioapic->redirtbl[RTC_GSI]; @@ -117,16 +118,17 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); - old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + old_val = test_bit(vcpu->vcpu_id, dest_map->map); if (new_val == old_val) return; if (new_val) { - __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + __set_bit(vcpu->vcpu_id, dest_map->map); + dest_map->vectors[vcpu->vcpu_id] = e->fields.vector; ioapic->rtc_status.pending_eoi++; } else { - __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + __clear_bit(vcpu->vcpu_id, dest_map->map); ioapic->rtc_status.pending_eoi--; rtc_status_pending_eoi_check_valid(ioapic); } -- cgit v1.2.3 From 1c109fabbd51863475cd12ac206bdd249aee35af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Sep 2016 02:35:29 +0100 Subject: fix minor infoleak in get_user_ex() get_user_ex(x, ptr) should zero x on failure. It's not a lot of a leak (at most we are leaking uninitialized 64bit value off the kernel stack, and in a fairly constrained situation, at that), but the fix is trivial, so... Cc: stable@vger.kernel.org Signed-off-by: Al Viro [ This sat in different branch from the uaccess fixes since mid-August ] Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index e3af86f58eaf..2131c4ce7d8a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -433,7 +433,11 @@ do { \ #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ asm volatile("1: mov"itype" %1,%"rtype"0\n" \ "2:\n" \ - _ASM_EXTABLE_EX(1b, 2b) \ + ".section .fixup,\"ax\"\n" \ + "3:xor"itype" %"rtype"0,%"rtype"0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_EX(1b, 3b) \ : ltype(x) : "m" (__m(addr))) #define __put_user_nocheck(x, ptr, size) \ -- cgit v1.2.3 From 95f60084acbcee6c466256cf26eb52191fad9edc Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 15 Sep 2016 18:13:50 +0300 Subject: perf/x86/intel/pt: Fix an off-by-one in address filter configuration PT address filter configuration requires that a range is specified by its first and last address, but at the moment we're obtaining the end of the range by adding user specified size to its start, which is off by one from what it actually needs to be. Fix this and make sure that zero-sized filters don't pass the filter validation. 
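A minimal illustration of the off-by-one, with made-up addresses; the filter registers take the last byte of the range, not one past it:

    #include <stdio.h>

    int main(void)
    {
        unsigned long start = 0x400000, size = 0x1000;  /* made-up range */

        /* The end of the range is inclusive, so it must be
         * start + size - 1, not start + size. */
        unsigned long inclusive_end = start + size - 1;

        printf("range [%#lx, %#lx], %lu bytes\n",
               start, inclusive_end, inclusive_end - start + 1);
        return 0;
    }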
Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org # v4.7 Cc: stable@vger.kernel.org#v4.7 Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160915151352.21306-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 04bb5fb5a8d7..5ec0100e3fc6 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1081,7 +1081,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters) list_for_each_entry(filter, filters, entry) { /* PT doesn't support single address triggers */ - if (!filter->range) + if (!filter->range || !filter->size) return -EOPNOTSUPP; if (!filter->inode && !kernel_ip(filter->offset)) @@ -1111,7 +1111,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event) } else { /* apply the offset */ msr_a = filter->offset + offs[range]; - msr_b = filter->size + msr_a; + msr_b = filter->size + msr_a - 1; } filters->filter[range].msr_a = msr_a; -- cgit v1.2.3 From ddfdad991e55b65c1cc4ee29502f6dceee04455a Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 15 Sep 2016 18:13:51 +0300 Subject: perf/x86/intel/pt: Fix kernel address filter's offset validation The kernel_ip() filter is used mostly by the DS/LBR code to look at the branch addresses, but Intel PT also uses it to validate the address filter offsets for kernel addresses, for which it is not sufficient: supplying something in bits 64:48 that's not a sign extension of the lower address bits (like 0xf00d000000000000) throws a #GP. This patch adds address validation for the user supplied kernel filters. 
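A rough sketch of what "canonical" means for 48-bit virtual addresses; this only shows the shape of the check (the kernel relies on virt_addr_valid() and kernel_ip(), and the cast/shift trick assumes the usual two's-complement arithmetic shift):

    #include <stdio.h>
    #include <stdint.h>

    /* Bits 63:47 must be a sign extension of bit 47. */
    static int is_canonical48(uint64_t addr)
    {
        return (uint64_t)((int64_t)(addr << 16) >> 16) == addr;
    }

    int main(void)
    {
        printf("%d\n", is_canonical48(0xffff888000000000ULL));  /* 1 */
        printf("%d\n", is_canonical48(0xf00d000000000000ULL));  /* 0 */
        return 0;
    }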
Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org # v4.7 Cc: stable@vger.kernel.org#v4.7 Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160915151352.21306-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 5ec0100e3fc6..1f94963a283b 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1074,6 +1074,11 @@ static void pt_addr_filters_fini(struct perf_event *event) event->hw.addr_filters = NULL; } +static inline bool valid_kernel_ip(unsigned long ip) +{ + return virt_addr_valid(ip) && kernel_ip(ip); +} + static int pt_event_addr_filters_validate(struct list_head *filters) { struct perf_addr_filter *filter; @@ -1084,7 +1089,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters) if (!filter->range || !filter->size) return -EOPNOTSUPP; - if (!filter->inode && !kernel_ip(filter->offset)) + if (!filter->inode && !valid_kernel_ip(filter->offset)) return -EINVAL; if (++range > pt_cap_get(PT_CAP_num_address_ranges)) -- cgit v1.2.3 From 1155bafcb79208abc6ae234c6e135ac70607755c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 15 Sep 2016 18:13:52 +0300 Subject: perf/x86/intel/pt: Do validate the size of a kernel address filter Right now, the kernel address filters in PT are prone to integer overflow that may happen in adding filter's size to its offset to obtain the end of the range. Such an overflow would also throw a #GP in the PT event configuration path. Fix this by explicitly validating the result of this calculation. 
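A small demonstration of the wrap-around on an LP64 machine, with made-up filter values:

    #include <stdio.h>

    int main(void)
    {
        unsigned long offset = 0xffffffffffff0000UL;  /* made-up start */
        unsigned long size   = 0x20000UL;

        /* offset + size wraps past zero, so a naive "end of range" looks
         * tiny and can slip past later checks; validating the sum as an
         * address, as the patch does, rejects it. */
        unsigned long end = offset + size;

        printf("end = %#lx, wrapped: %s\n", end, end < offset ? "yes" : "no");
        return 0;
    }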
Reported-by: Adrian Hunter Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org # v4.7 Cc: stable@vger.kernel.org#v4.7 Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160915151352.21306-4-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/pt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1f94963a283b..861a7d9cb60f 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1089,8 +1089,13 @@ static int pt_event_addr_filters_validate(struct list_head *filters) if (!filter->range || !filter->size) return -EOPNOTSUPP; - if (!filter->inode && !valid_kernel_ip(filter->offset)) - return -EINVAL; + if (!filter->inode) { + if (!valid_kernel_ip(filter->offset)) + return -EINVAL; + + if (!valid_kernel_ip(filter->offset + filter->size)) + return -EINVAL; + } if (++range > pt_cap_get(PT_CAP_num_address_ranges)) return -EOPNOTSUPP; -- cgit v1.2.3 From 080fe0b790ad438fc1b61621dac37c1964ce7f35 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 24 Aug 2016 14:12:08 +0100 Subject: perf/x86/amd: Make HW_CACHE_REFERENCES and HW_CACHE_MISSES measure L2 While the Intel PMU monitors the LLC when perf enables the HW_CACHE_REFERENCES and HW_CACHE_MISSES events, these events monitor L1 instruction cache fetches (0x0080) and instruction cache misses (0x0081) on the AMD PMU. This is extremely confusing when monitoring the same workload across Intel and AMD machines, since parameters like, $ perf stat -e cache-references,cache-misses measure completely different things. Instead, make the AMD PMU measure instruction/data cache and TLB fill requests to the L2 and instruction/data cache and TLB misses in the L2 when HW_CACHE_REFERENCES and HW_CACHE_MISSES are enabled, respectively. That way the events measure unified caches on both platforms. 
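To read the new map values, recall that the event-select encoding packs the unit mask into bits 15:8 and the event code into bits 7:0; a small stand-alone decoding sketch (not kernel code):

    #include <stdio.h>

    /* 0x077d / 0x077e select events 0x7d / 0x7e with unit mask 0x07,
     * i.e. the instruction/data cache and TLB fill sources described
     * in the commit message. */
    int main(void)
    {
        unsigned int refs = 0x077d, misses = 0x077e;

        printf("refs:   event=%#04x umask=%#04x\n",
               refs & 0xff, (refs >> 8) & 0xff);
        printf("misses: event=%#04x umask=%#04x\n",
               misses & 0xff, (misses >> 8) & 0xff);
        return 0;
    }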
Signed-off-by: Matt Fleming Acked-by: Peter Zijlstra Cc: Cc: Borislav Petkov Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1472044328-21302-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/events/amd/core.c | 4 ++-- arch/x86/kvm/pmu_amd.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e07a22bb9308..f5f4b3fbbbc2 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -119,8 +119,8 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 39b91127ef07..cd944435dfbd 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -23,8 +23,8 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES }, + [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, -- cgit v1.2.3 From e535ec0899d1fe52ec3a84c9bc03457ac67ad6f7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 20 Sep 2016 14:26:21 +0100 Subject: x86/mm/pat: Prevent hang during boot when mapping pages There's a mixture of signed 32-bit and unsigned 32-bit and 64-bit data types used for keeping track of how many pages have been mapped. This leads to hangs during boot when mapping large numbers of pages (multiple terabytes, as reported by Waiman) because those values are interpreted as being negative. commit 742563777e8d ("x86/mm/pat: Avoid truncation when converting cpa->numpages to address") fixed one of those bugs, but there is another lurking in __change_page_attr_set_clr(). Additionally, the return value type for the populate_*() functions can return negative values when a large number of pages have been mapped, triggering the error paths even though no error occurred. Consistently use 64-bit types on 64-bit platforms when counting pages. Even in the signed case this gives us room for regions 8PiB (pebibytes) in size whilst still allowing the usual negative value error checking idiom. 
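A toy demonstration of why the counter type matters (assumes LP64 and the usual two's-complement conversion):

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
        /* 8 TiB of 4 KiB pages is 2^31 pages, one more than INT_MAX, so
         * a signed 32-bit counter goes negative and the error paths
         * misread a successful mapping as a failure; a 64-bit long keeps
         * the count positive. */
        unsigned long pages = (8UL << 40) >> 12;
        int as_int = (int)pages;

        printf("pages = %lu, as int = %d (INT_MAX = %d)\n",
               pages, as_int, INT_MAX);
        return 0;
    }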
Reported-by: Waiman Long Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds CC: Theodore Ts'o Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Scott J Norton Cc: Douglas Hatch Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 849dc09fa4f0..e3353c97d086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -917,11 +917,11 @@ static void populate_pte(struct cpa_data *cpa, } } -static int populate_pmd(struct cpa_data *cpa, - unsigned long start, unsigned long end, - unsigned num_pages, pud_t *pud, pgprot_t pgprot) +static long populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) { - unsigned int cur_pages = 0; + long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; @@ -991,12 +991,12 @@ static int populate_pmd(struct cpa_data *cpa, return num_pages; } -static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, + pgprot_t pgprot) { pud_t *pud; unsigned long end; - int cur_pages = 0; + long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); @@ -1052,7 +1052,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* Map trailing leftover */ if (start < end) { - int tmp; + long tmp; pud = pud_offset(pgd, start); if (pud_none(*pud)) @@ -1078,7 +1078,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ pgd_t *pgd_entry; - int ret; + long ret; pgd_entry = cpa->pgd + pgd_index(addr); @@ -1327,7 +1327,8 @@ static int cpa_process_alias(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { - int ret, numpages = cpa->numpages; + unsigned long numpages = cpa->numpages; + int ret; while (numpages) { /* -- cgit v1.2.3 From 1297667083d5442aafe3e337b9413bf02b114edb Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 19 Sep 2016 13:09:09 +0100 Subject: x86/efi: Only map RAM into EFI page tables if in mixed-mode Waiman reported that booting with CONFIG_EFI_MIXED enabled on his multi-terabyte HP machine results in boot crashes, because the EFI region mapping functions loop forever while trying to map those regions describing RAM. While this patch doesn't fix the underlying hang, there's really no reason to map EFI_CONVENTIONAL_MEMORY regions into the EFI page tables when mixed-mode is not in use at runtime. Reported-by: Waiman Long Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds CC: Theodore Ts'o Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Scott J Norton Cc: Douglas Hatch Cc: # v4.6+ Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 677e29e29473..8dd3784eb075 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -245,7 +245,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * text and allocate a new stack because we can't rely on the * stack pointer being < 4GB. 
*/ - if (!IS_ENABLED(CONFIG_EFI_MIXED)) + if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native()) return 0; /* -- cgit v1.2.3 From f1e1c9e5e357c05253affb13be29285c5cb56bf0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 20 Sep 2016 15:12:21 +0200 Subject: perf/x86/intel/bts: Make sure debug store is valid Since commit 4d4c47412464 ("perf/x86/intel/bts: Fix BTS PMI detection") my box goes boom on boot: | .... node #0, CPUs: #1 #2 #3 #4 #5 #6 #7 | BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 | IP: [] intel_bts_interrupt+0x43/0x130 | Call Trace: | d [] intel_pmu_handle_irq+0x51/0x4b0 | [] perf_event_nmi_handler+0x27/0x40 This happens because the code introduced in this commit dereferences the debug store pointer unconditionally. The debug store is not guaranteed to be available, so a NULL pointer check as on other places is required. Fixes: 4d4c47412464 ("perf/x86/intel/bts: Fix BTS PMI detection") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: vince@deater.net Cc: eranian@google.com Link: http://lkml.kernel.org/r/20160920131220.xg5pbdjtznszuyzb@breakpoint.cc Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/bts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index bdcd6510992c..6ff66efa0feb 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -455,7 +455,7 @@ int intel_bts_interrupt(void) * The only surefire way of knowing if this NMI is ours is by checking * the write ptr against the PMI threshold. */ - if (ds->bts_index >= ds->bts_interrupt_threshold) + if (ds && (ds->bts_index >= ds->bts_interrupt_threshold)) handled = 1; /* -- cgit v1.2.3 From 08b90f0655258411a1b41d856331e20e7ec8d55c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 20 Sep 2016 18:48:10 +0300 Subject: perf/x86/intel/bts: Make it an exclusive PMU Just like intel_pt, intel_bts can only handle one event at a time, which is the reason we introduced PERF_PMU_CAP_EXCLUSIVE in the first place. However, at the moment one can have as many intel_bts events within the same context at the same time as one pleases. Only one of them, however, will get scheduled and receive the actual trace data. Fix this by making intel_bts an "exclusive" PMU. Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160920154811.3255-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6ff66efa0feb..982c9e31daca 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -584,7 +584,8 @@ static __init int bts_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) return -ENODEV; - bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | + PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; bts_pmu.event_init = bts_event_init; bts_pmu.add = bts_event_add; -- cgit v1.2.3
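As a closing illustration of the debug-store fix above, the shape of the guarded check in stand-alone form (struct layout simplified, values made up):

    #include <stdio.h>
    #include <stddef.h>

    struct debug_store {
        unsigned long bts_index;
        unsigned long bts_interrupt_threshold;
    };

    /* Only consult the threshold when the per-CPU debug store exists. */
    static int nmi_is_ours(const struct debug_store *ds)
    {
        return ds && ds->bts_index >= ds->bts_interrupt_threshold;
    }

    int main(void)
    {
        struct debug_store ds = {
            .bts_index = 8,
            .bts_interrupt_threshold = 4,
        };

        printf("%d %d\n", nmi_is_ours(NULL), nmi_is_ours(&ds));  /* 0 1 */
        return 0;
    }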