| author | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2012-09-05 10:22:45 -0400 | 
|---|---|---|
| committer | Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | 2012-09-05 10:22:45 -0400 | 
| commit | 593d0a3e9f813db910dc50574532914db21d09ff (patch) | |
| tree | 12d8413ee57b4383ca8c906996ffe02be6d377a5 /arch/x86 | |
| parent | 50e900417b8096939d12a46848f965e27a905e36 (diff) | |
| parent | 4cb38750d49010ae72e718d46605ac9ba5a851b4 (diff) | |
Merge commit '4cb38750d49010ae72e718d46605ac9ba5a851b4' into stable/for-linus-3.6
* commit '4cb38750d49010ae72e718d46605ac9ba5a851b4': (6849 commits)
  bcma: fix invalid PMU chip control masks
  [libata] pata_cmd64x: whitespace cleanup
  libata-acpi: fix up for acpi_pm_device_sleep_state API
  sata_dwc_460ex: device tree may specify dma_channel
  ahci, trivial: fixed coding style issues related to braces
  ahci_platform: add hibernation callbacks
  libata-eh.c: local functions should not be exposed globally
  libata-transport.c: local functions should not be exposed globally
  sata_dwc_460ex: support hardreset
  ata: use module_pci_driver
  drivers/ata/pata_pcmcia.c: adjust suspicious bit operation
  pata_imx: Convert to clk_prepare_enable/clk_disable_unprepare
  ahci: Enable SB600 64bit DMA on MSI K9AGM2 (MS-7327) v2
  [libata] Prevent interface errors with Seagate FreeAgent GoFlex
  drivers/acpi/glue: revert accidental license-related 6b66d95895c bits
  libata-acpi: add missing inlines in libata.h
  i2c-omap: Add support for I2C_M_STOP message flag
  i2c: Fall back to emulated SMBus if the operation isn't supported natively
  i2c: Add SCCB support
  i2c-tiny-usb: Add support for the Robofuzz OSIF USB/I2C converter
  ...
Diffstat (limited to 'arch/x86')
187 files changed, 9698 insertions, 3832 deletions
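Among the changes pulled in below, the first hunk adds a DEBUG_TLBFLUSH Kconfig entry whose help text describes the heuristic as `flush_entries <= active_tlb_entries / 2^tlb_flushall_shift`, with a `tlb_flushall_shift` of -1 meaning an unconditional full TLB flush. As a rough standalone illustration of that decision only (not the kernel implementation; the function name here is made up):

```c
#include <stdbool.h>

/*
 * Illustrative sketch -- not kernel code.  Mirrors the decision described
 * in the DEBUG_TLBFLUSH help text below: a shift of -1 means "always flush
 * the whole TLB"; otherwise flush entries one-by-one only while their count
 * stays within active_tlb_entries / 2^tlb_flushall_shift.
 */
static bool flush_one_by_one(unsigned long flush_entries,
			     unsigned long active_tlb_entries,
			     int tlb_flushall_shift)
{
	if (tlb_flushall_shift < 0)
		return false;	/* unconditional full TLB flush */

	return flush_entries <= (active_tlb_entries >> tlb_flushall_shift);
}
```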
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index e46c2147397f..b322f124ee3c 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -129,6 +129,25 @@ config DOUBLEFAULT  	  option saves about 4k and might cause you much additional grey  	  hair. +config DEBUG_TLBFLUSH +	bool "Set upper limit of TLB entries to flush one-by-one" +	depends on DEBUG_KERNEL && (X86_64 || X86_INVLPG) +	---help--- + +	X86-only for now. + +	This option allows the user to tune the amount of TLB entries the +	kernel flushes one-by-one instead of doing a full TLB flush. In +	certain situations, the former is cheaper. This is controlled by the +	tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it +	to -1, the code flushes the whole TLB unconditionally. Otherwise, +	for positive values of it, the kernel will use single TLB entry +	invalidating instructions according to the following formula: + +	flush_entries <= active_tlb_entries / 2^tlb_flushall_shift + +	If in doubt, say "N". +  config IOMMU_DEBUG  	bool "Enable IOMMU debugging"  	depends on GART_IOMMU && DEBUG_KERNEL diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1f2521434554..b0c5276861ec 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -49,6 +49,9 @@ else          KBUILD_AFLAGS += -m64          KBUILD_CFLAGS += -m64 +	# Use -mpreferred-stack-boundary=3 if supported. +	KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3) +          # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)          cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)          cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index cb62f786990d..10f6b1178c68 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,5 +1,7 @@  #include "misc.h" +#ifdef CONFIG_EARLY_PRINTK +  static unsigned long fs;  static inline void set_fs(unsigned long seg)  { @@ -19,3 +21,5 @@ int cmdline_find_option_bool(const char *option)  {  	return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);  } + +#endif diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c index 261e81fb9582..d3d003cb5481 100644 --- a/arch/x86/boot/compressed/early_serial_console.c +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -1,5 +1,9 @@  #include "misc.h" +#ifdef CONFIG_EARLY_PRINTK +  int early_serial_base;  #include "../early_serial_console.c" + +#endif diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 4e85f5f85837..b3e0227df2c9 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -729,32 +729,68 @@ fail:   * need to create one ourselves (usually the bootloader would create   * one for us).   
*/ -static efi_status_t make_boot_params(struct boot_params *boot_params, -				     efi_loaded_image_t *image, -				     void *handle) +struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)  { -	struct efi_info *efi = &boot_params->efi_info; -	struct apm_bios_info *bi = &boot_params->apm_bios_info; -	struct sys_desc_table *sdt = &boot_params->sys_desc_table; -	struct e820entry *e820_map = &boot_params->e820_map[0]; -	struct e820entry *prev = NULL; -	struct setup_header *hdr = &boot_params->hdr; -	unsigned long size, key, desc_size, _size; -	efi_memory_desc_t *mem_map; -	void *options = image->load_options; -	u32 load_options_size = image->load_options_size / 2; /* ASCII */ +	struct boot_params *boot_params; +	struct sys_desc_table *sdt; +	struct apm_bios_info *bi; +	struct setup_header *hdr; +	struct efi_info *efi; +	efi_loaded_image_t *image; +	void *options; +	u32 load_options_size; +	efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;  	int options_size = 0;  	efi_status_t status; -	__u32 desc_version;  	unsigned long cmdline; -	u8 nr_entries;  	u16 *s2;  	u8 *s1;  	int i; +	sys_table = _table; + +	/* Check if we were booted by the EFI firmware */ +	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) +		return NULL; + +	status = efi_call_phys3(sys_table->boottime->handle_protocol, +				handle, &proto, (void *)&image); +	if (status != EFI_SUCCESS) { +		efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); +		return NULL; +	} + +	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); +	if (status != EFI_SUCCESS) { +		efi_printk("Failed to alloc lowmem for boot params\n"); +		return NULL; +	} + +	memset(boot_params, 0x0, 0x4000); + +	hdr = &boot_params->hdr; +	efi = &boot_params->efi_info; +	bi = &boot_params->apm_bios_info; +	sdt = &boot_params->sys_desc_table; + +	/* Copy the second sector to boot_params */ +	memcpy(&hdr->jump, image->image_base + 512, 512); + +	/* +	 * Fill out some of the header fields ourselves because the +	 * EFI firmware loader doesn't load the first sector. 
+	 */ +	hdr->root_flags = 1; +	hdr->vid_mode = 0xffff; +	hdr->boot_flag = 0xAA55; + +	hdr->code32_start = (__u64)(unsigned long)image->image_base; +  	hdr->type_of_loader = 0x21;  	/* Convert unicode cmdline to ascii */ +	options = image->load_options; +	load_options_size = image->load_options_size / 2; /* ASCII */  	cmdline = 0;  	s2 = (u16 *)options; @@ -791,18 +827,36 @@ static efi_status_t make_boot_params(struct boot_params *boot_params,  	hdr->ramdisk_image = 0;  	hdr->ramdisk_size = 0; -	status = handle_ramdisks(image, hdr); -	if (status != EFI_SUCCESS) -		goto free_cmdline; - -	setup_graphics(boot_params); -  	/* Clear APM BIOS info */  	memset(bi, 0, sizeof(*bi));  	memset(sdt, 0, sizeof(*sdt)); -	memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); +	status = handle_ramdisks(image, hdr); +	if (status != EFI_SUCCESS) +		goto fail2; + +	return boot_params; +fail2: +	if (options_size) +		low_free(options_size, hdr->cmd_line_ptr); +fail: +	low_free(0x4000, (unsigned long)boot_params); +	return NULL; +} + +static efi_status_t exit_boot(struct boot_params *boot_params, +			      void *handle) +{ +	struct efi_info *efi = &boot_params->efi_info; +	struct e820entry *e820_map = &boot_params->e820_map[0]; +	struct e820entry *prev = NULL; +	unsigned long size, key, desc_size, _size; +	efi_memory_desc_t *mem_map; +	efi_status_t status; +	__u32 desc_version; +	u8 nr_entries; +	int i;  	size = sizeof(*mem_map) * 32; @@ -811,7 +865,7 @@ again:  	_size = size;  	status = low_alloc(size, 1, (unsigned long *)&mem_map);  	if (status != EFI_SUCCESS) -		goto free_cmdline; +		return status;  	status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,  				mem_map, &key, &desc_size, &desc_version); @@ -823,6 +877,7 @@ again:  	if (status != EFI_SUCCESS)  		goto free_mem_map; +	memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));  	efi->efi_systab = (unsigned long)sys_table;  	efi->efi_memdesc_size = desc_size;  	efi->efi_memdesc_version = desc_version; @@ -906,61 +961,13 @@ again:  free_mem_map:  	low_free(_size, (unsigned long)mem_map); -free_cmdline: -	if (options_size) -		low_free(options_size, hdr->cmd_line_ptr); -fail:  	return status;  } -/* - * On success we return a pointer to a boot_params structure, and NULL - * on failure. - */ -struct boot_params *efi_main(void *handle, efi_system_table_t *_table) +static efi_status_t relocate_kernel(struct setup_header *hdr)  { -	struct boot_params *boot_params;  	unsigned long start, nr_pages; -	struct desc_ptr *gdt, *idt; -	efi_loaded_image_t *image; -	struct setup_header *hdr;  	efi_status_t status; -	efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; -	struct desc_struct *desc; - -	sys_table = _table; - -	/* Check if we were booted by the EFI firmware */ -	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) -		goto fail; - -	status = efi_call_phys3(sys_table->boottime->handle_protocol, -				handle, &proto, (void *)&image); -	if (status != EFI_SUCCESS) { -		efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); -		goto fail; -	} - -	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); -	if (status != EFI_SUCCESS) { -		efi_printk("Failed to alloc lowmem for boot params\n"); -		goto fail; -	} - -	memset(boot_params, 0x0, 0x4000); - -	hdr = &boot_params->hdr; - -	/* Copy the second sector to boot_params */ -	memcpy(&hdr->jump, image->image_base + 512, 512); - -	/* -	 * Fill out some of the header fields ourselves because the -	 * EFI firmware loader doesn't load the first sector. 
-	 */ -	hdr->root_flags = 1; -	hdr->vid_mode = 0xffff; -	hdr->boot_flag = 0xAA55;  	/*  	 * The EFI firmware loader could have placed the kernel image @@ -978,16 +985,40 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)  	if (status != EFI_SUCCESS) {  		status = low_alloc(hdr->init_size, hdr->kernel_alignment,  				   &start); -		if (status != EFI_SUCCESS) { +		if (status != EFI_SUCCESS)  			efi_printk("Failed to alloc mem for kernel\n"); -			goto fail; -		}  	} +	if (status == EFI_SUCCESS) +		memcpy((void *)start, (void *)(unsigned long)hdr->code32_start, +		       hdr->init_size); + +	hdr->pref_address = hdr->code32_start;  	hdr->code32_start = (__u32)start; -	hdr->pref_address = (__u64)(unsigned long)image->image_base; -	memcpy((void *)start, image->image_base, image->image_size); +	return status; +} + +/* + * On success we return a pointer to a boot_params structure, and NULL + * on failure. + */ +struct boot_params *efi_main(void *handle, efi_system_table_t *_table, +			     struct boot_params *boot_params) +{ +	struct desc_ptr *gdt, *idt; +	efi_loaded_image_t *image; +	struct setup_header *hdr = &boot_params->hdr; +	efi_status_t status; +	struct desc_struct *desc; + +	sys_table = _table; + +	/* Check if we were booted by the EFI firmware */ +	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) +		goto fail; + +	setup_graphics(boot_params);  	status = efi_call_phys3(sys_table->boottime->allocate_pool,  				EFI_LOADER_DATA, sizeof(*gdt), @@ -1015,7 +1046,18 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table)  	idt->size = 0;  	idt->address = 0; -	status = make_boot_params(boot_params, image, handle); +	/* +	 * If the kernel isn't already loaded at the preferred load +	 * address, relocate it. 
+	 */ +	if (hdr->pref_address != hdr->code32_start) { +		status = relocate_kernel(hdr); + +		if (status != EFI_SUCCESS) +			goto fail; +	} + +	status = exit_boot(boot_params, handle);  	if (status != EFI_SUCCESS)  		goto fail; diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index c85e3ac99bba..aa4aaf1b2380 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -42,6 +42,16 @@ ENTRY(startup_32)  	 */  	add	$0x4, %esp +	call	make_boot_params +	cmpl	$0, %eax +	je	1f +	movl	0x4(%esp), %esi +	movl	(%esp), %ecx +	pushl	%eax +	pushl	%esi +	pushl	%ecx + +	.org 0x30,0x90  	call	efi_main  	cmpl	$0, %eax  	movl	%eax, %esi diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 87e03a13d8e3..2c4b171eec33 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -209,6 +209,16 @@ ENTRY(startup_64)  	.org 0x210  	mov	%rcx, %rdi  	mov	%rdx, %rsi +	pushq	%rdi +	pushq	%rsi +	call	make_boot_params +	cmpq	$0,%rax +	je	1f +	mov	%rax, %rdx +	popq	%rsi +	popq	%rdi + +	.org 0x230,0x90  	call	efi_main  	movq	%rax,%rsi  	cmpq	$0,%rax diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 7116dcba0c9e..88f7ff6da404 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -108,8 +108,6 @@ static void error(char *m);   * This is set up by the setup-routine at boot-time   */  struct boot_params *real_mode;		/* Pointer to real-mode data */ -static int quiet; -static int debug;  void *memset(void *s, int c, size_t n);  void *memcpy(void *dest, const void *src, size_t n); @@ -170,15 +168,11 @@ static void serial_putchar(int ch)  	outb(ch, early_serial_base + TXR);  } -void __putstr(int error, const char *s) +void __putstr(const char *s)  {  	int x, y, pos;  	char c; -#ifndef CONFIG_X86_VERBOSE_BOOTUP -	if (!error) -		return; -#endif  	if (early_serial_base) {  		const char *str = s;  		while (*str) { @@ -265,9 +259,9 @@ void *memcpy(void *dest, const void *src, size_t n)  static void error(char *x)  { -	__putstr(1, "\n\n"); -	__putstr(1, x); -	__putstr(1, "\n\n -- System halted"); +	error_putstr("\n\n"); +	error_putstr(x); +	error_putstr("\n\n -- System halted");  	while (1)  		asm("hlt"); @@ -294,8 +288,7 @@ static void parse_elf(void *output)  		return;  	} -	if (!quiet) -		putstr("Parsing ELF... "); +	debug_putstr("Parsing ELF... ");  	phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);  	if (!phdrs) @@ -332,11 +325,6 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,  {  	real_mode = rmode; -	if (cmdline_find_option_bool("quiet")) -		quiet = 1; -	if (cmdline_find_option_bool("debug")) -		debug = 1; -  	if (real_mode->screen_info.orig_video_mode == 7) {  		vidmem = (char *) 0xb0000;  		vidport = 0x3b4; @@ -349,8 +337,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,  	cols = real_mode->screen_info.orig_video_cols;  	console_init(); -	if (debug) -		putstr("early console in decompress_kernel\n"); +	debug_putstr("early console in decompress_kernel\n");  	free_mem_ptr     = heap;	/* Heap */  	free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -369,11 +356,9 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,  		error("Wrong destination address");  #endif -	if (!quiet) -		putstr("\nDecompressing Linux... "); +	debug_putstr("\nDecompressing Linux... 
");  	decompress(input_data, input_len, NULL, NULL, output, NULL, error);  	parse_elf(output); -	if (!quiet) -		putstr("done.\nBooting the kernel.\n"); +	debug_putstr("done.\nBooting the kernel.\n");  	return;  } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3f19c81a6203..0e6dc0ee0eea 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -24,9 +24,21 @@  /* misc.c */  extern struct boot_params *real_mode;		/* Pointer to real-mode data */ -void __putstr(int error, const char *s); -#define putstr(__x)  __putstr(0, __x) -#define puts(__x)  __putstr(0, __x) +void __putstr(const char *s); +#define error_putstr(__x)  __putstr(__x) + +#ifdef CONFIG_X86_VERBOSE_BOOTUP + +#define debug_putstr(__x)  __putstr(__x) + +#else + +static inline void debug_putstr(const char *s) +{ } + +#endif + +#ifdef CONFIG_EARLY_PRINTK  /* cmdline.c */  int cmdline_find_option(const char *option, char *buffer, int bufsize); @@ -36,4 +48,13 @@ int cmdline_find_option_bool(const char *option);  extern int early_serial_base;  void console_init(void); +#else + +/* early_serial_console.c */ +static const int early_serial_base; +static inline void console_init(void) +{ } + +#endif +  #endif diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index efe5acfc79c3..b4e15dd6786a 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -283,7 +283,7 @@ _start:  	# Part 2 of the header, from the old setup.S  		.ascii	"HdrS"		# header signature -		.word	0x020a		# header version number (>= 0x0105) +		.word	0x020b		# header version number (>= 0x0105)  					# or else old loadlin-1.5 will fail)  		.globl realmode_swtch  realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG @@ -401,18 +401,13 @@ pref_address:		.quad LOAD_PHYSICAL_ADDR	# preferred load addr  #define INIT_SIZE VO_INIT_SIZE  #endif  init_size:		.long INIT_SIZE		# kernel initialization size +handover_offset:	.long 0x30		# offset to the handover +						# protocol entry point  # End of setup header #####################################################  	.section ".entrytext", "ax"  start_of_setup: -#ifdef SAFE_RESET_DISK_CONTROLLER -# Reset the disk controller. -	movw	$0x0000, %ax		# Reset disk controller -	movb	$0x80, %dl		# All disks -	int	$0x13 -#endif -  # Force %es = %ds  	movw	%ds, %ax  	movw	%ax, %es diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index e191ac048b59..e908e5de82d3 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,6 +2,9 @@  # Arch-specific CryptoAPI modules.  
# +obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o +obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o +  obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o  obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o  obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o @@ -12,8 +15,10 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o  obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o  obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o  obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o +obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o  obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o  obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o +obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o  obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o  obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -30,16 +35,11 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o  blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o  twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o  twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o +twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o  salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o  serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o +serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o  aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o -  ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o - -# enable AVX support only when $(AS) can actually assemble the instructions -ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes) -AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT -CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT -endif  sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o diff --git a/arch/x86/crypto/ablk_helper.c b/arch/x86/crypto/ablk_helper.c new file mode 100644 index 000000000000..43282fe04a8b --- /dev/null +++ b/arch/x86/crypto/ablk_helper.c @@ -0,0 +1,149 @@ +/* + * Shared async block cipher helpers + * + * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * Based on aesni-intel_glue.c by: + *  Copyright (C) 2008, Intel Corp. + *    Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 + * USA + * + */ + +#include <linux/kernel.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <crypto/algapi.h> +#include <crypto/cryptd.h> +#include <asm/i387.h> +#include <asm/crypto/ablk_helper.h> + +int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, +		 unsigned int key_len) +{ +	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); +	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; +	int err; + +	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); +	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) +				    & CRYPTO_TFM_REQ_MASK); +	err = crypto_ablkcipher_setkey(child, key, key_len); +	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) +				    & CRYPTO_TFM_RES_MASK); +	return err; +} +EXPORT_SYMBOL_GPL(ablk_set_key); + +int __ablk_encrypt(struct ablkcipher_request *req) +{ +	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); +	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); +	struct blkcipher_desc desc; + +	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); +	desc.info = req->info; +	desc.flags = 0; + +	return crypto_blkcipher_crt(desc.tfm)->encrypt( +		&desc, req->dst, req->src, req->nbytes); +} +EXPORT_SYMBOL_GPL(__ablk_encrypt); + +int ablk_encrypt(struct ablkcipher_request *req) +{ +	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); +	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + +	if (!irq_fpu_usable()) { +		struct ablkcipher_request *cryptd_req = +			ablkcipher_request_ctx(req); + +		memcpy(cryptd_req, req, sizeof(*req)); +		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + +		return crypto_ablkcipher_encrypt(cryptd_req); +	} else { +		return __ablk_encrypt(req); +	} +} +EXPORT_SYMBOL_GPL(ablk_encrypt); + +int ablk_decrypt(struct ablkcipher_request *req) +{ +	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); +	struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + +	if (!irq_fpu_usable()) { +		struct ablkcipher_request *cryptd_req = +			ablkcipher_request_ctx(req); + +		memcpy(cryptd_req, req, sizeof(*req)); +		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + +		return crypto_ablkcipher_decrypt(cryptd_req); +	} else { +		struct blkcipher_desc desc; + +		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); +		desc.info = req->info; +		desc.flags = 0; + +		return crypto_blkcipher_crt(desc.tfm)->decrypt( +			&desc, req->dst, req->src, req->nbytes); +	} +} +EXPORT_SYMBOL_GPL(ablk_decrypt); + +void ablk_exit(struct crypto_tfm *tfm) +{ +	struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); + +	cryptd_free_ablkcipher(ctx->cryptd_tfm); +} +EXPORT_SYMBOL_GPL(ablk_exit); + +int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) +{ +	struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); +	struct cryptd_ablkcipher *cryptd_tfm; + +	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); +	if (IS_ERR(cryptd_tfm)) +		return PTR_ERR(cryptd_tfm); + +	ctx->cryptd_tfm = cryptd_tfm; +	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + +		crypto_ablkcipher_reqsize(&cryptd_tfm->base); + +	return 0; +} +EXPORT_SYMBOL_GPL(ablk_init_common); + +int ablk_init(struct crypto_tfm *tfm) +{ +	char drv_name[CRYPTO_MAX_ALG_NAME]; + +	snprintf(drv_name, sizeof(drv_name), 
"__driver-%s", +					crypto_tfm_alg_driver_name(tfm)); + +	return ablk_init_common(tfm, drv_name); +} +EXPORT_SYMBOL_GPL(ablk_init); + +MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 8efcf42a9d7e..59b37deb8c8d 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -5,7 +5,7 @@  #include <linux/module.h>  #include <crypto/aes.h> -#include <asm/aes.h> +#include <asm/crypto/aes.h>  asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);  asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ac7f5cd019e8..34fdcff4d2c8 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -30,7 +30,8 @@  #include <crypto/ctr.h>  #include <asm/cpu_device_id.h>  #include <asm/i387.h> -#include <asm/aes.h> +#include <asm/crypto/aes.h> +#include <asm/crypto/ablk_helper.h>  #include <crypto/scatterwalk.h>  #include <crypto/internal/aead.h>  #include <linux/workqueue.h> @@ -52,10 +53,6 @@  #define HAS_XTS  #endif -struct async_aes_ctx { -	struct cryptd_ablkcipher *cryptd_tfm; -}; -  /* This data is stored at the end of the crypto_tfm struct.   * It's a type of per "session" data storage location.   * This needs to be 16 byte aligned. @@ -377,87 +374,6 @@ static int ctr_crypt(struct blkcipher_desc *desc,  }  #endif -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, -			unsigned int key_len) -{ -	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); -	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; -	int err; - -	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); -	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) -				    & CRYPTO_TFM_REQ_MASK); -	err = crypto_ablkcipher_setkey(child, key, key_len); -	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) -				    & CRYPTO_TFM_RES_MASK); -	return err; -} - -static int ablk_encrypt(struct ablkcipher_request *req) -{ -	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); -	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - -	if (!irq_fpu_usable()) { -		struct ablkcipher_request *cryptd_req = -			ablkcipher_request_ctx(req); -		memcpy(cryptd_req, req, sizeof(*req)); -		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); -		return crypto_ablkcipher_encrypt(cryptd_req); -	} else { -		struct blkcipher_desc desc; -		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); -		desc.info = req->info; -		desc.flags = 0; -		return crypto_blkcipher_crt(desc.tfm)->encrypt( -			&desc, req->dst, req->src, req->nbytes); -	} -} - -static int ablk_decrypt(struct ablkcipher_request *req) -{ -	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); -	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - -	if (!irq_fpu_usable()) { -		struct ablkcipher_request *cryptd_req = -			ablkcipher_request_ctx(req); -		memcpy(cryptd_req, req, sizeof(*req)); -		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); -		return crypto_ablkcipher_decrypt(cryptd_req); -	} else { -		struct blkcipher_desc desc; -		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); -		desc.info = req->info; -		desc.flags = 0; -		return crypto_blkcipher_crt(desc.tfm)->decrypt( -			&desc, req->dst, req->src, req->nbytes); -	} -} - -static void ablk_exit(struct crypto_tfm *tfm) -{ -	struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); - -	
cryptd_free_ablkcipher(ctx->cryptd_tfm); -} - -static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) -{ -	struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); -	struct cryptd_ablkcipher *cryptd_tfm; - -	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); -	if (IS_ERR(cryptd_tfm)) -		return PTR_ERR(cryptd_tfm); - -	ctx->cryptd_tfm = cryptd_tfm; -	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + -		crypto_ablkcipher_reqsize(&cryptd_tfm->base); - -	return 0; -} -  static int ablk_ecb_init(struct crypto_tfm *tfm)  {  	return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); @@ -613,7 +529,7 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,  	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);  	struct aesni_rfc4106_gcm_ctx *child_ctx =                                   aesni_rfc4106_gcm_ctx_get(cryptd_child); -	u8 *new_key_mem = NULL; +	u8 *new_key_align, *new_key_mem = NULL;  	if (key_len < 4) {  		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); @@ -637,9 +553,9 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,  		if (!new_key_mem)  			return -ENOMEM; -		new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); -		memcpy(new_key_mem, key, key_len); -		key = new_key_mem; +		new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN); +		memcpy(new_key_align, key, key_len); +		key = new_key_align;  	}  	if (!irq_fpu_usable()) @@ -968,7 +884,7 @@ static struct crypto_alg aesni_algs[] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= AES_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_aes_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -989,7 +905,7 @@ static struct crypto_alg aesni_algs[] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= AES_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_aes_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -1033,7 +949,7 @@ static struct crypto_alg aesni_algs[] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= 1, -	.cra_ctxsize		= sizeof(struct async_aes_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -1098,7 +1014,7 @@ static struct crypto_alg aesni_algs[] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= 1, -	.cra_ctxsize		= sizeof(struct async_aes_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -1126,7 +1042,7 @@ static struct crypto_alg aesni_algs[] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= AES_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_aes_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -1150,7 +1066,7 @@ static struct crypto_alg aesni_algs[] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= AES_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_aes_ctx), +	
.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -1174,7 +1090,7 @@ static struct crypto_alg aesni_algs[] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= AES_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_aes_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 3306dc0b139e..eeb2b3b743e9 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -5,10 +5,6 @@   *   * Camellia parts based on code by:   *  Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation) - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License as published by @@ -34,9 +30,9 @@  #include <linux/module.h>  #include <linux/types.h>  #include <crypto/algapi.h> -#include <crypto/b128ops.h>  #include <crypto/lrw.h>  #include <crypto/xts.h> +#include <asm/crypto/glue_helper.h>  #define CAMELLIA_MIN_KEY_SIZE	16  #define CAMELLIA_MAX_KEY_SIZE	32 @@ -1312,307 +1308,128 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,  				 &tfm->crt_flags);  } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, -		     void (*fn)(struct camellia_ctx *, u8 *, const u8 *), -		     void (*fn_2way)(struct camellia_ctx *, u8 *, const u8 *)) +static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)  { -	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	unsigned int bsize = CAMELLIA_BLOCK_SIZE; -	unsigned int nbytes; -	int err; - -	err = blkcipher_walk_virt(desc, walk); - -	while ((nbytes = walk->nbytes)) { -		u8 *wsrc = walk->src.virt.addr; -		u8 *wdst = walk->dst.virt.addr; - -		/* Process two block batch */ -		if (nbytes >= bsize * 2) { -			do { -				fn_2way(ctx, wdst, wsrc); - -				wsrc += bsize * 2; -				wdst += bsize * 2; -				nbytes -= bsize * 2; -			} while (nbytes >= bsize * 2); - -			if (nbytes < bsize) -				goto done; -		} - -		/* Handle leftovers */ -		do { -			fn(ctx, wdst, wsrc); - -			wsrc += bsize; -			wdst += bsize; -			nbytes -= bsize; -		} while (nbytes >= bsize); - -done: -		err = blkcipher_walk_done(desc, walk, nbytes); -	} - -	return err; -} - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) -{ -	struct blkcipher_walk walk; - -	blkcipher_walk_init(&walk, dst, src, nbytes); -	return ecb_crypt(desc, &walk, camellia_enc_blk, camellia_enc_blk_2way); -} +	u128 iv = *src; -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) -{ -	struct blkcipher_walk walk; - -	blkcipher_walk_init(&walk, dst, src, nbytes); -	return ecb_crypt(desc, &walk, camellia_dec_blk, camellia_dec_blk_2way); -} +	camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, -				  struct blkcipher_walk *walk) -{ -	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	unsigned 
int bsize = CAMELLIA_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 *iv = (u128 *)walk->iv; - -	do { -		u128_xor(dst, src, iv); -		camellia_enc_blk(ctx, (u8 *)dst, (u8 *)dst); -		iv = dst; - -		src += 1; -		dst += 1; -		nbytes -= bsize; -	} while (nbytes >= bsize); - -	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); -	return nbytes; +	u128_xor(&dst[1], &dst[1], &iv);  } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) +static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)  { -	struct blkcipher_walk walk; -	int err; +	be128 ctrblk; -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt(desc, &walk); +	if (dst != src) +		*dst = *src; -	while ((nbytes = walk.nbytes)) { -		nbytes = __cbc_encrypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); -	} +	u128_to_be128(&ctrblk, iv); +	u128_inc(iv); -	return err; +	camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);  } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, -				  struct blkcipher_walk *walk) +static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, +				    u128 *iv)  { -	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	unsigned int bsize = CAMELLIA_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 ivs[2 - 1]; -	u128 last_iv; +	be128 ctrblks[2]; -	/* Start of the last block. */ -	src += nbytes / bsize - 1; -	dst += nbytes / bsize - 1; - -	last_iv = *src; - -	/* Process two block batch */ -	if (nbytes >= bsize * 2) { -		do { -			nbytes -= bsize * (2 - 1); -			src -= 2 - 1; -			dst -= 2 - 1; - -			ivs[0] = src[0]; - -			camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src); - -			u128_xor(dst + 1, dst + 1, ivs + 0); - -			nbytes -= bsize; -			if (nbytes < bsize) -				goto done; - -			u128_xor(dst, dst, src - 1); -			src -= 1; -			dst -= 1; -		} while (nbytes >= bsize * 2); - -		if (nbytes < bsize) -			goto done; +	if (dst != src) { +		dst[0] = src[0]; +		dst[1] = src[1];  	} -	/* Handle leftovers */ -	for (;;) { -		camellia_dec_blk(ctx, (u8 *)dst, (u8 *)src); - -		nbytes -= bsize; -		if (nbytes < bsize) -			break; +	u128_to_be128(&ctrblks[0], iv); +	u128_inc(iv); +	u128_to_be128(&ctrblks[1], iv); +	u128_inc(iv); -		u128_xor(dst, dst, src - 1); -		src -= 1; -		dst -= 1; -	} - -done: -	u128_xor(dst, dst, (u128 *)walk->iv); -	*(u128 *)walk->iv = last_iv; - -	return nbytes; +	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);  } -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) -{ -	struct blkcipher_walk walk; -	int err; - -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt(desc, &walk); +static const struct common_glue_ctx camellia_enc = { +	.num_funcs = 2, +	.fpu_blocks_limit = -1, + +	.funcs = { { +		.num_blocks = 2, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } +	} } +}; -	while ((nbytes = walk.nbytes)) { -		nbytes = __cbc_decrypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); -	} +static const struct common_glue_ctx camellia_ctr = { +	.num_funcs = 2, +	.fpu_blocks_limit = -1, + +	.funcs = { { +		.num_blocks = 2, +		.fn_u = { .ctr = 
GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } +	} } +}; -	return err; -} +static const struct common_glue_ctx camellia_dec = { +	.num_funcs = 2, +	.fpu_blocks_limit = -1, + +	.funcs = { { +		.num_blocks = 2, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } +	} } +}; -static inline void u128_to_be128(be128 *dst, const u128 *src) -{ -	dst->a = cpu_to_be64(src->a); -	dst->b = cpu_to_be64(src->b); -} +static const struct common_glue_ctx camellia_dec_cbc = { +	.num_funcs = 2, +	.fpu_blocks_limit = -1, + +	.funcs = { { +		.num_blocks = 2, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } +	} } +}; -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	dst->a = be64_to_cpu(src->a); -	dst->b = be64_to_cpu(src->b); +	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);  } -static inline void u128_inc(u128 *i) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	i->b++; -	if (!i->b) -		i->a++; +	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);  } -static void ctr_crypt_final(struct blkcipher_desc *desc, -			    struct blkcipher_walk *walk) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	u8 keystream[CAMELLIA_BLOCK_SIZE]; -	u8 *src = walk->src.virt.addr; -	u8 *dst = walk->dst.virt.addr; -	unsigned int nbytes = walk->nbytes; -	u128 ctrblk; - -	memcpy(keystream, src, nbytes); -	camellia_enc_blk_xor(ctx, keystream, walk->iv); -	memcpy(dst, keystream, nbytes); - -	be128_to_u128(&ctrblk, (be128 *)walk->iv); -	u128_inc(&ctrblk); -	u128_to_be128((be128 *)walk->iv, &ctrblk); +	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, +				       dst, src, nbytes);  } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, -				struct blkcipher_walk *walk) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	struct camellia_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	unsigned int bsize = CAMELLIA_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 ctrblk; -	be128 ctrblocks[2]; - -	be128_to_u128(&ctrblk, (be128 *)walk->iv); - -	/* Process two block batch */ -	if (nbytes >= bsize * 2) { -		do { -			if (dst != src) { -				dst[0] = src[0]; -				dst[1] = src[1]; -			} - -			/* create ctrblks for parallel encrypt */ -			u128_to_be128(&ctrblocks[0], &ctrblk); -			u128_inc(&ctrblk); -			u128_to_be128(&ctrblocks[1], &ctrblk); -			u128_inc(&ctrblk); - -			camellia_enc_blk_xor_2way(ctx, (u8 *)dst, -						 (u8 *)ctrblocks); - -			src += 2; -			dst += 2; -			nbytes -= bsize * 2; -		} while (nbytes >= bsize * 2); - -		if (nbytes < bsize) -			goto done; -	} - -	/* Handle leftovers */ -	do { -		if (dst != src) -			*dst = *src; - -		u128_to_be128(&ctrblocks[0], &ctrblk); -		u128_inc(&ctrblk); - -		camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 
*)ctrblocks); - -		src += 1; -		dst += 1; -		nbytes -= bsize; -	} while (nbytes >= bsize); - -done: -	u128_to_be128((be128 *)walk->iv, &ctrblk); -	return nbytes; +	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, +				       nbytes);  }  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,  		     struct scatterlist *src, unsigned int nbytes)  { -	struct blkcipher_walk walk; -	int err; - -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt_block(desc, &walk, CAMELLIA_BLOCK_SIZE); - -	while ((nbytes = walk.nbytes) >= CAMELLIA_BLOCK_SIZE) { -		nbytes = __ctr_crypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); -	} - -	if (walk.nbytes) { -		ctr_crypt_final(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, 0); -	} - -	return err; +	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);  }  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c new file mode 100644 index 000000000000..4854f0f31e4f --- /dev/null +++ b/arch/x86/crypto/glue_helper.c @@ -0,0 +1,307 @@ +/* + * Shared glue code for 128bit block ciphers + * + * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * CTR part based on code (crypto/ctr.c) by: + *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 + * USA + * + */ + +#include <linux/module.h> +#include <crypto/b128ops.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/crypto/glue_helper.h> +#include <crypto/scatterwalk.h> + +static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, +				   struct blkcipher_desc *desc, +				   struct blkcipher_walk *walk) +{ +	void *ctx = crypto_blkcipher_ctx(desc->tfm); +	const unsigned int bsize = 128 / 8; +	unsigned int nbytes, i, func_bytes; +	bool fpu_enabled = false; +	int err; + +	err = blkcipher_walk_virt(desc, walk); + +	while ((nbytes = walk->nbytes)) { +		u8 *wsrc = walk->src.virt.addr; +		u8 *wdst = walk->dst.virt.addr; + +		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +					     desc, fpu_enabled, nbytes); + +		for (i = 0; i < gctx->num_funcs; i++) { +			func_bytes = bsize * gctx->funcs[i].num_blocks; + +			/* Process multi-block batch */ +			if (nbytes >= func_bytes) { +				do { +					gctx->funcs[i].fn_u.ecb(ctx, wdst, +								wsrc); + +					wsrc += func_bytes; +					wdst += func_bytes; +					nbytes -= func_bytes; +				} while (nbytes >= func_bytes); + +				if (nbytes < bsize) +					goto done; +			} +		} + +done: +		err = blkcipher_walk_done(desc, walk, nbytes); +	} + +	glue_fpu_end(fpu_enabled); +	return err; +} + +int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, +			  struct blkcipher_desc *desc, struct scatterlist *dst, +			  struct scatterlist *src, unsigned int nbytes) +{ +	struct blkcipher_walk walk; + +	blkcipher_walk_init(&walk, dst, src, nbytes); +	return __glue_ecb_crypt_128bit(gctx, desc, &walk); +} +EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit); + +static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn, +					      struct blkcipher_desc *desc, +					      struct blkcipher_walk *walk) +{ +	void *ctx = crypto_blkcipher_ctx(desc->tfm); +	const unsigned int bsize = 128 / 8; +	unsigned int nbytes = walk->nbytes; +	u128 *src = (u128 *)walk->src.virt.addr; +	u128 *dst = (u128 *)walk->dst.virt.addr; +	u128 *iv = (u128 *)walk->iv; + +	do { +		u128_xor(dst, src, iv); +		fn(ctx, (u8 *)dst, (u8 *)dst); +		iv = dst; + +		src += 1; +		dst += 1; +		nbytes -= bsize; +	} while (nbytes >= bsize); + +	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); +	return nbytes; +} + +int glue_cbc_encrypt_128bit(const common_glue_func_t fn, +			    struct blkcipher_desc *desc, +			    struct scatterlist *dst, +			    struct scatterlist *src, unsigned int nbytes) +{ +	struct blkcipher_walk walk; +	int err; + +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt(desc, &walk); + +	while ((nbytes = walk.nbytes)) { +		nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk); +		err = blkcipher_walk_done(desc, &walk, nbytes); +	} + +	return err; +} +EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit); + +static unsigned int +__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, +			  struct blkcipher_desc *desc, +			  struct blkcipher_walk *walk) +{ +	void *ctx = crypto_blkcipher_ctx(desc->tfm); +	const unsigned int bsize = 128 / 8; +	unsigned int nbytes = walk->nbytes; +	u128 *src = (u128 *)walk->src.virt.addr; +	u128 *dst = (u128 *)walk->dst.virt.addr; +	u128 last_iv; +	unsigned int num_blocks, func_bytes; +	unsigned int i; + +	/* Start of the last block. 
*/ +	src += nbytes / bsize - 1; +	dst += nbytes / bsize - 1; + +	last_iv = *src; + +	for (i = 0; i < gctx->num_funcs; i++) { +		num_blocks = gctx->funcs[i].num_blocks; +		func_bytes = bsize * num_blocks; + +		/* Process multi-block batch */ +		if (nbytes >= func_bytes) { +			do { +				nbytes -= func_bytes - bsize; +				src -= num_blocks - 1; +				dst -= num_blocks - 1; + +				gctx->funcs[i].fn_u.cbc(ctx, dst, src); + +				nbytes -= bsize; +				if (nbytes < bsize) +					goto done; + +				u128_xor(dst, dst, src - 1); +				src -= 1; +				dst -= 1; +			} while (nbytes >= func_bytes); + +			if (nbytes < bsize) +				goto done; +		} +	} + +done: +	u128_xor(dst, dst, (u128 *)walk->iv); +	*(u128 *)walk->iv = last_iv; + +	return nbytes; +} + +int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, +			    struct blkcipher_desc *desc, +			    struct scatterlist *dst, +			    struct scatterlist *src, unsigned int nbytes) +{ +	const unsigned int bsize = 128 / 8; +	bool fpu_enabled = false; +	struct blkcipher_walk walk; +	int err; + +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt(desc, &walk); + +	while ((nbytes = walk.nbytes)) { +		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +					     desc, fpu_enabled, nbytes); +		nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); +		err = blkcipher_walk_done(desc, &walk, nbytes); +	} + +	glue_fpu_end(fpu_enabled); +	return err; +} +EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); + +static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, +					struct blkcipher_desc *desc, +					struct blkcipher_walk *walk) +{ +	void *ctx = crypto_blkcipher_ctx(desc->tfm); +	u8 *src = (u8 *)walk->src.virt.addr; +	u8 *dst = (u8 *)walk->dst.virt.addr; +	unsigned int nbytes = walk->nbytes; +	u128 ctrblk; +	u128 tmp; + +	be128_to_u128(&ctrblk, (be128 *)walk->iv); + +	memcpy(&tmp, src, nbytes); +	fn_ctr(ctx, &tmp, &tmp, &ctrblk); +	memcpy(dst, &tmp, nbytes); + +	u128_to_be128((be128 *)walk->iv, &ctrblk); +} +EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); + +static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, +					    struct blkcipher_desc *desc, +					    struct blkcipher_walk *walk) +{ +	const unsigned int bsize = 128 / 8; +	void *ctx = crypto_blkcipher_ctx(desc->tfm); +	unsigned int nbytes = walk->nbytes; +	u128 *src = (u128 *)walk->src.virt.addr; +	u128 *dst = (u128 *)walk->dst.virt.addr; +	u128 ctrblk; +	unsigned int num_blocks, func_bytes; +	unsigned int i; + +	be128_to_u128(&ctrblk, (be128 *)walk->iv); + +	/* Process multi-block batch */ +	for (i = 0; i < gctx->num_funcs; i++) { +		num_blocks = gctx->funcs[i].num_blocks; +		func_bytes = bsize * num_blocks; + +		if (nbytes >= func_bytes) { +			do { +				gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); + +				src += num_blocks; +				dst += num_blocks; +				nbytes -= func_bytes; +			} while (nbytes >= func_bytes); + +			if (nbytes < bsize) +				goto done; +		} +	} + +done: +	u128_to_be128((be128 *)walk->iv, &ctrblk); +	return nbytes; +} + +int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, +			  struct blkcipher_desc *desc, struct scatterlist *dst, +			  struct scatterlist *src, unsigned int nbytes) +{ +	const unsigned int bsize = 128 / 8; +	bool fpu_enabled = false; +	struct blkcipher_walk walk; +	int err; + +	blkcipher_walk_init(&walk, dst, src, nbytes); +	err = blkcipher_walk_virt_block(desc, &walk, bsize); + +	while ((nbytes = walk.nbytes) >= bsize) { +		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +					  
   desc, fpu_enabled, nbytes); +		nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); +		err = blkcipher_walk_done(desc, &walk, nbytes); +	} + +	glue_fpu_end(fpu_enabled); + +	if (walk.nbytes) { +		glue_ctr_crypt_final_128bit( +			gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); +		err = blkcipher_walk_done(desc, &walk, 0); +	} + +	return err; +} +EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); + +MODULE_LICENSE("GPL"); diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S new file mode 100644 index 000000000000..504106bf04a2 --- /dev/null +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -0,0 +1,704 @@ +/* + * Serpent Cipher 8-way parallel algorithm (x86_64/AVX) + * + * Copyright (C) 2012 Johannes Goetzfried + *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by + *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 + * USA + * + */ + +.file "serpent-avx-x86_64-asm_64.S" +.text + +#define CTX %rdi + +/********************************************************************** +  8-way AVX serpent + **********************************************************************/ +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 +#define RE1 %xmm4 + +#define tp  %xmm5 + +#define RA2 %xmm6 +#define RB2 %xmm7 +#define RC2 %xmm8 +#define RD2 %xmm9 +#define RE2 %xmm10 + +#define RNOT %xmm11 + +#define RK0 %xmm12 +#define RK1 %xmm13 +#define RK2 %xmm14 +#define RK3 %xmm15 + + +#define S0_1(x0, x1, x2, x3, x4)      \ +	vpor		x0,   x3, tp; \ +	vpxor		x3,   x0, x0; \ +	vpxor		x2,   x3, x4; \ +	vpxor		RNOT, x4, x4; \ +	vpxor		x1,   tp, x3; \ +	vpand		x0,   x1, x1; \ +	vpxor		x4,   x1, x1; \ +	vpxor		x0,   x2, x2; +#define S0_2(x0, x1, x2, x3, x4)      \ +	vpxor		x3,   x0, x0; \ +	vpor		x0,   x4, x4; \ +	vpxor		x2,   x0, x0; \ +	vpand		x1,   x2, x2; \ +	vpxor		x2,   x3, x3; \ +	vpxor		RNOT, x1, x1; \ +	vpxor		x4,   x2, x2; \ +	vpxor		x2,   x1, x1; + +#define S1_1(x0, x1, x2, x3, x4)      \ +	vpxor		x0,   x1, tp; \ +	vpxor		x3,   x0, x0; \ +	vpxor		RNOT, x3, x3; \ +	vpand		tp,   x1, x4; \ +	vpor		tp,   x0, x0; \ +	vpxor		x2,   x3, x3; \ +	vpxor		x3,   x0, x0; \ +	vpxor		x3,   tp, x1; +#define S1_2(x0, x1, x2, x3, x4)      \ +	vpxor		x4,   x3, x3; \ +	vpor		x4,   x1, x1; \ +	vpxor		x2,   x4, x4; \ +	vpand		x0,   x2, x2; \ +	vpxor		x1,   x2, x2; \ +	vpor		x0,   x1, x1; \ +	vpxor		RNOT, x0, x0; \ +	vpxor		x2,   x0, x0; \ +	vpxor		x1,   x4, x4; + +#define S2_1(x0, x1, x2, x3, x4)      \ +	vpxor		RNOT, x3, x3; \ +	vpxor		x0,   x1, x1; \ +	vpand		x2,   x0, tp; \ +	vpxor		x3,   tp, tp; \ +	vpor		x0,   x3, x3; \ +	vpxor		x1,   x2, x2; \ +	vpxor		x1,   x3, x3; \ +	vpand		tp,   x1, x1; +#define S2_2(x0, x1, x2, x3, x4)      \ +	vpxor		x2,   tp, tp; \ +	
vpand		x3,   x2, x2; \ +	vpor		x1,   x3, x3; \ +	vpxor		RNOT, tp, tp; \ +	vpxor		tp,   x3, x3; \ +	vpxor		tp,   x0, x4; \ +	vpxor		x2,   tp, x0; \ +	vpor		x2,   x1, x1; + +#define S3_1(x0, x1, x2, x3, x4)      \ +	vpxor		x3,   x1, tp; \ +	vpor		x0,   x3, x3; \ +	vpand		x0,   x1, x4; \ +	vpxor		x2,   x0, x0; \ +	vpxor		tp,   x2, x2; \ +	vpand		x3,   tp, x1; \ +	vpxor		x3,   x2, x2; \ +	vpor		x4,   x0, x0; \ +	vpxor		x3,   x4, x4; +#define S3_2(x0, x1, x2, x3, x4)      \ +	vpxor		x0,   x1, x1; \ +	vpand		x3,   x0, x0; \ +	vpand		x4,   x3, x3; \ +	vpxor		x2,   x3, x3; \ +	vpor		x1,   x4, x4; \ +	vpand		x1,   x2, x2; \ +	vpxor		x3,   x4, x4; \ +	vpxor		x3,   x0, x0; \ +	vpxor		x2,   x3, x3; + +#define S4_1(x0, x1, x2, x3, x4)      \ +	vpand		x0,   x3, tp; \ +	vpxor		x3,   x0, x0; \ +	vpxor		x2,   tp, tp; \ +	vpor		x3,   x2, x2; \ +	vpxor		x1,   x0, x0; \ +	vpxor		tp,   x3, x4; \ +	vpor		x0,   x2, x2; \ +	vpxor		x1,   x2, x2; +#define S4_2(x0, x1, x2, x3, x4)      \ +	vpand		x0,   x1, x1; \ +	vpxor		x4,   x1, x1; \ +	vpand		x2,   x4, x4; \ +	vpxor		tp,   x2, x2; \ +	vpxor		x0,   x4, x4; \ +	vpor		x1,   tp, x3; \ +	vpxor		RNOT, x1, x1; \ +	vpxor		x0,   x3, x3; + +#define S5_1(x0, x1, x2, x3, x4)      \ +	vpor		x0,   x1, tp; \ +	vpxor		tp,   x2, x2; \ +	vpxor		RNOT, x3, x3; \ +	vpxor		x0,   x1, x4; \ +	vpxor		x2,   x0, x0; \ +	vpand		x4,   tp, x1; \ +	vpor		x3,   x4, x4; \ +	vpxor		x0,   x4, x4; +#define S5_2(x0, x1, x2, x3, x4)      \ +	vpand		x3,   x0, x0; \ +	vpxor		x3,   x1, x1; \ +	vpxor		x2,   x3, x3; \ +	vpxor		x1,   x0, x0; \ +	vpand		x4,   x2, x2; \ +	vpxor		x2,   x1, x1; \ +	vpand		x0,   x2, x2; \ +	vpxor		x2,   x3, x3; + +#define S6_1(x0, x1, x2, x3, x4)      \ +	vpxor		x0,   x3, x3; \ +	vpxor		x2,   x1, tp; \ +	vpxor		x0,   x2, x2; \ +	vpand		x3,   x0, x0; \ +	vpor		x3,   tp, tp; \ +	vpxor		RNOT, x1, x4; \ +	vpxor		tp,   x0, x0; \ +	vpxor		x2,   tp, x1; +#define S6_2(x0, x1, x2, x3, x4)      \ +	vpxor		x4,   x3, x3; \ +	vpxor		x0,   x4, x4; \ +	vpand		x0,   x2, x2; \ +	vpxor		x1,   x4, x4; \ +	vpxor		x3,   x2, x2; \ +	vpand		x1,   x3, x3; \ +	vpxor		x0,   x3, x3; \ +	vpxor		x2,   x1, x1; + +#define S7_1(x0, x1, x2, x3, x4)      \ +	vpxor		RNOT, x1, tp; \ +	vpxor		RNOT, x0, x0; \ +	vpand		x2,   tp, x1; \ +	vpxor		x3,   x1, x1; \ +	vpor		tp,   x3, x3; \ +	vpxor		x2,   tp, x4; \ +	vpxor		x3,   x2, x2; \ +	vpxor		x0,   x3, x3; \ +	vpor		x1,   x0, x0; +#define S7_2(x0, x1, x2, x3, x4)      \ +	vpand		x0,   x2, x2; \ +	vpxor		x4,   x0, x0; \ +	vpxor		x3,   x4, x4; \ +	vpand		x0,   x3, x3; \ +	vpxor		x1,   x4, x4; \ +	vpxor		x4,   x2, x2; \ +	vpxor		x1,   x3, x3; \ +	vpor		x0,   x4, x4; \ +	vpxor		x1,   x4, x4; + +#define SI0_1(x0, x1, x2, x3, x4)     \ +	vpxor		x0,   x1, x1; \ +	vpor		x1,   x3, tp; \ +	vpxor		x1,   x3, x4; \ +	vpxor		RNOT, x0, x0; \ +	vpxor		tp,   x2, x2; \ +	vpxor		x0,   tp, x3; \ +	vpand		x1,   x0, x0; \ +	vpxor		x2,   x0, x0; +#define SI0_2(x0, x1, x2, x3, x4)     \ +	vpand		x3,   x2, x2; \ +	vpxor		x4,   x3, x3; \ +	vpxor		x3,   x2, x2; \ +	vpxor		x3,   x1, x1; \ +	vpand		x0,   x3, x3; \ +	vpxor		x0,   x1, x1; \ +	vpxor		x2,   x0, x0; \ +	vpxor		x3,   x4, x4; + +#define SI1_1(x0, x1, x2, x3, x4)     \ +	vpxor		x3,   x1, x1; \ +	vpxor		x2,   x0, tp; \ +	vpxor		RNOT, x2, x2; \ +	vpor		x1,   x0, x4; \ +	vpxor		x3,   x4, x4; \ +	vpand		x1,   x3, x3; \ +	vpxor		x2,   x1, x1; \ +	vpand		x4,   x2, x2; +#define SI1_2(x0, x1, x2, x3, x4)     \ +	vpxor		x1,   x4, x4; \ +	vpor		x3,   x1, x1; \ +	vpxor		tp,   x3, x3; \ +	vpxor		tp,   x2, x2; \ +	vpor		x4,   tp, x0; \ +	vpxor		x4,   
x2, x2; \ +	vpxor		x0,   x1, x1; \ +	vpxor		x1,   x4, x4; + +#define SI2_1(x0, x1, x2, x3, x4)     \ +	vpxor		x1,   x2, x2; \ +	vpxor		RNOT, x3, tp; \ +	vpor		x2,   tp, tp; \ +	vpxor		x3,   x2, x2; \ +	vpxor		x0,   x3, x4; \ +	vpxor		x1,   tp, x3; \ +	vpor		x2,   x1, x1; \ +	vpxor		x0,   x2, x2; +#define SI2_2(x0, x1, x2, x3, x4)     \ +	vpxor		x4,   x1, x1; \ +	vpor		x3,   x4, x4; \ +	vpxor		x3,   x2, x2; \ +	vpxor		x2,   x4, x4; \ +	vpand		x1,   x2, x2; \ +	vpxor		x3,   x2, x2; \ +	vpxor		x4,   x3, x3; \ +	vpxor		x0,   x4, x4; + +#define SI3_1(x0, x1, x2, x3, x4)     \ +	vpxor		x1,   x2, x2; \ +	vpand		x2,   x1, tp; \ +	vpxor		x0,   tp, tp; \ +	vpor		x1,   x0, x0; \ +	vpxor		x3,   x1, x4; \ +	vpxor		x3,   x0, x0; \ +	vpor		tp,   x3, x3; \ +	vpxor		x2,   tp, x1; +#define SI3_2(x0, x1, x2, x3, x4)     \ +	vpxor		x3,   x1, x1; \ +	vpxor		x2,   x0, x0; \ +	vpxor		x3,   x2, x2; \ +	vpand		x1,   x3, x3; \ +	vpxor		x0,   x1, x1; \ +	vpand		x2,   x0, x0; \ +	vpxor		x3,   x4, x4; \ +	vpxor		x0,   x3, x3; \ +	vpxor		x1,   x0, x0; + +#define SI4_1(x0, x1, x2, x3, x4)     \ +	vpxor		x3,   x2, x2; \ +	vpand		x1,   x0, tp; \ +	vpxor		x2,   tp, tp; \ +	vpor		x3,   x2, x2; \ +	vpxor		RNOT, x0, x4; \ +	vpxor		tp,   x1, x1; \ +	vpxor		x2,   tp, x0; \ +	vpand		x4,   x2, x2; +#define SI4_2(x0, x1, x2, x3, x4)     \ +	vpxor		x0,   x2, x2; \ +	vpor		x4,   x0, x0; \ +	vpxor		x3,   x0, x0; \ +	vpand		x2,   x3, x3; \ +	vpxor		x3,   x4, x4; \ +	vpxor		x1,   x3, x3; \ +	vpand		x0,   x1, x1; \ +	vpxor		x1,   x4, x4; \ +	vpxor		x3,   x0, x0; + +#define SI5_1(x0, x1, x2, x3, x4)     \ +	vpor		x2,   x1, tp; \ +	vpxor		x1,   x2, x2; \ +	vpxor		x3,   tp, tp; \ +	vpand		x1,   x3, x3; \ +	vpxor		x3,   x2, x2; \ +	vpor		x0,   x3, x3; \ +	vpxor		RNOT, x0, x0; \ +	vpxor		x2,   x3, x3; \ +	vpor		x0,   x2, x2; +#define SI5_2(x0, x1, x2, x3, x4)     \ +	vpxor		tp,   x1, x4; \ +	vpxor		x4,   x2, x2; \ +	vpand		x0,   x4, x4; \ +	vpxor		tp,   x0, x0; \ +	vpxor		x3,   tp, x1; \ +	vpand		x2,   x0, x0; \ +	vpxor		x3,   x2, x2; \ +	vpxor		x2,   x0, x0; \ +	vpxor		x4,   x2, x2; \ +	vpxor		x3,   x4, x4; + +#define SI6_1(x0, x1, x2, x3, x4)     \ +	vpxor		x2,   x0, x0; \ +	vpand		x3,   x0, tp; \ +	vpxor		x3,   x2, x2; \ +	vpxor		x2,   tp, tp; \ +	vpxor		x1,   x3, x3; \ +	vpor		x0,   x2, x2; \ +	vpxor		x3,   x2, x2; \ +	vpand		tp,   x3, x3; +#define SI6_2(x0, x1, x2, x3, x4)     \ +	vpxor		RNOT, tp, tp; \ +	vpxor		x1,   x3, x3; \ +	vpand		x2,   x1, x1; \ +	vpxor		tp,   x0, x4; \ +	vpxor		x4,   x3, x3; \ +	vpxor		x2,   x4, x4; \ +	vpxor		x1,   tp, x0; \ +	vpxor		x0,   x2, x2; + +#define SI7_1(x0, x1, x2, x3, x4)     \ +	vpand		x0,   x3, tp; \ +	vpxor		x2,   x0, x0; \ +	vpor		x3,   x2, x2; \ +	vpxor		x1,   x3, x4; \ +	vpxor		RNOT, x0, x0; \ +	vpor		tp,   x1, x1; \ +	vpxor		x0,   x4, x4; \ +	vpand		x2,   x0, x0; \ +	vpxor		x1,   x0, x0; +#define SI7_2(x0, x1, x2, x3, x4)     \ +	vpand		x2,   x1, x1; \ +	vpxor		x2,   tp, x3; \ +	vpxor		x3,   x4, x4; \ +	vpand		x3,   x2, x2; \ +	vpor		x0,   x3, x3; \ +	vpxor		x4,   x1, x1; \ +	vpxor		x4,   x3, x3; \ +	vpand		x0,   x4, x4; \ +	vpxor		x2,   x4, x4; + +#define get_key(i, j, t) \ +	vbroadcastss (4*(i)+(j))*4(CTX), t; + +#define K2(x0, x1, x2, x3, x4, i) \ +	get_key(i, 0, RK0); \ +	get_key(i, 1, RK1); \ +	get_key(i, 2, RK2); \ +	get_key(i, 3, RK3); \ +	vpxor RK0,	x0 ## 1, x0 ## 1; \ +	vpxor RK1,	x1 ## 1, x1 ## 1; \ +	vpxor RK2,	x2 ## 1, x2 ## 1; \ +	vpxor RK3,	x3 ## 1, x3 ## 1; \ +		vpxor RK0,	x0 ## 2, x0 ## 2; \ +		vpxor RK1,	x1 ## 2, x1 ## 2; \ +		vpxor RK2,	x2 ## 2, x2 ## 2; \ +		vpxor RK3,	x3 
## 2, x3 ## 2; + +#define LK2(x0, x1, x2, x3, x4, i) \ +	vpslld $13,		x0 ## 1, x4 ## 1;          \ +	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \ +	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \ +	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \ +	vpslld $3,		x2 ## 1, x4 ## 1;          \ +	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \ +	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \ +	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \ +		vpslld $13,		x0 ## 2, x4 ## 2;          \ +		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \ +		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \ +		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \ +		vpslld $3,		x2 ## 2, x4 ## 2;          \ +		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \ +		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \ +		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \ +	vpslld $1,		x1 ## 1, x4 ## 1;          \ +	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \ +	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \ +	vpslld $3,		x0 ## 1, x4 ## 1;          \ +	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \ +	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \ +	get_key(i, 1, RK1); \ +		vpslld $1,		x1 ## 2, x4 ## 2;          \ +		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \ +		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \ +		vpslld $3,		x0 ## 2, x4 ## 2;          \ +		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \ +		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \ +		get_key(i, 3, RK3); \ +	vpslld $7,		x3 ## 1, x4 ## 1;          \ +	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \ +	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \ +	vpslld $7,		x1 ## 1, x4 ## 1;          \ +	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \ +	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \ +	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \ +	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \ +	get_key(i, 0, RK0); \ +		vpslld $7,		x3 ## 2, x4 ## 2;          \ +		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \ +		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \ +		vpslld $7,		x1 ## 2, x4 ## 2;          \ +		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \ +		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \ +		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \ +		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \ +		get_key(i, 2, RK2); \ +	vpxor			RK1, x1 ## 1, x1 ## 1;     \ +	vpxor			RK3, x3 ## 1, x3 ## 1;     \ +	vpslld $5,		x0 ## 1, x4 ## 1;          \ +	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \ +	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \ +	vpslld $22,		x2 ## 1, x4 ## 1;          \ +	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \ +	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \ +	vpxor			RK0, x0 ## 1, x0 ## 1;     \ +	vpxor			RK2, x2 ## 1, x2 ## 1;     \ +		vpxor			RK1, x1 ## 2, x1 ## 2;     \ +		vpxor			RK3, x3 ## 2, x3 ## 2;     \ +		vpslld $5,		x0 ## 2, x4 ## 2;          \ +		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \ +		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \ +		vpslld $22,		x2 ## 2, x4 ## 2;          \ +		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \ +		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \ +		vpxor			RK0, x0 ## 2, x0 ## 2;     \ +		vpxor			RK2, x2 ## 2, x2 ## 2; + +#define KL2(x0, x1, x2, x3, x4, i) \ +	vpxor			RK0, x0 ## 1, x0 ## 1;     \ +	vpxor			RK2, x2 ## 1, x2 ## 1;     \ +	vpsrld $5,		x0 ## 1, x4 ## 1;          \ +	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \ +	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \ +	vpxor			RK3, x3 ## 1, x3 ## 1;     \ +	vpxor			RK1, x1 ## 1, x1 ## 1;     \ +	vpsrld $22,		x2 ## 1, x4 ## 1;          \ +	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \ +	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \ +	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \ +		vpxor			RK0, x0 ## 2, x0 ## 2;     \ +		vpxor			RK2, x2 ## 2, x2 ## 2;     \ +		vpsrld $5,		x0 ## 2, x4 ## 2;          \ +		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \ +		vpor			x4 ## 2, x0 ## 2, 
x0 ## 2; \ +		vpxor			RK3, x3 ## 2, x3 ## 2;     \ +		vpxor			RK1, x1 ## 2, x1 ## 2;     \ +		vpsrld $22,		x2 ## 2, x4 ## 2;          \ +		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \ +		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \ +		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \ +	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \ +	vpslld $7,		x1 ## 1, x4 ## 1;          \ +	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \ +	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \ +	vpsrld $1,		x1 ## 1, x4 ## 1;          \ +	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \ +	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \ +		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \ +		vpslld $7,		x1 ## 2, x4 ## 2;          \ +		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \ +		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \ +		vpsrld $1,		x1 ## 2, x4 ## 2;          \ +		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \ +		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \ +	vpsrld $7,		x3 ## 1, x4 ## 1;          \ +	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \ +	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \ +	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \ +	vpslld $3,		x0 ## 1, x4 ## 1;          \ +	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \ +		vpsrld $7,		x3 ## 2, x4 ## 2;          \ +		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \ +		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \ +		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \ +		vpslld $3,		x0 ## 2, x4 ## 2;          \ +		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \ +	vpsrld $13,		x0 ## 1, x4 ## 1;          \ +	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \ +	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \ +	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \ +	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \ +	vpsrld $3,		x2 ## 1, x4 ## 1;          \ +	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \ +	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \ +		vpsrld $13,		x0 ## 2, x4 ## 2;          \ +		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \ +		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \ +		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \ +		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \ +		vpsrld $3,		x2 ## 2, x4 ## 2;          \ +		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \ +		vpor			x4 ## 2, x2 ## 2, x2 ## 2; + +#define S(SBOX, x0, x1, x2, x3, x4) \ +	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ +	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ +	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ +	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); + +#define SP(SBOX, x0, x1, x2, x3, x4, i) \ +	get_key(i, 0, RK0); \ +	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ +	get_key(i, 2, RK2); \ +	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ +	get_key(i, 3, RK3); \ +	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ +	get_key(i, 1, RK1); \ +	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ +	vpunpckldq		x1, x0, t0; \ +	vpunpckhdq		x1, x0, t2; \ +	vpunpckldq		x3, x2, t1; \ +	vpunpckhdq		x3, x2, x3; \ +	\ +	vpunpcklqdq		t1, t0, x0; \ +	vpunpckhqdq		t1, t0, x1; \ +	vpunpcklqdq		x3, t2, x2; \ +	vpunpckhqdq		x3, t2, x3; + +#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ +	vmovdqu (0*4*4)(in),	x0; \ +	vmovdqu (1*4*4)(in),	x1; \ +	vmovdqu (2*4*4)(in),	x2; \ +	vmovdqu (3*4*4)(in),	x3; \ +	\ +	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ +	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ +	\ +	vmovdqu x0,		(0*4*4)(out); \ +	vmovdqu x1,		(1*4*4)(out); \ +	vmovdqu x2,		(2*4*4)(out); \ +	vmovdqu x3,		(3*4*4)(out); + +#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ +	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 
+	\ +	vpxor (0*4*4)(out),	x0, x0;       \ +	vmovdqu x0,		(0*4*4)(out); \ +	vpxor (1*4*4)(out),	x1, x1;       \ +	vmovdqu x1,		(1*4*4)(out); \ +	vpxor (2*4*4)(out),	x2, x2;       \ +	vmovdqu x2,		(2*4*4)(out); \ +	vpxor (3*4*4)(out),	x3, x3;       \ +	vmovdqu x3,		(3*4*4)(out); + +.align 8 +.global __serpent_enc_blk_8way_avx +.type   __serpent_enc_blk_8way_avx,@function; + +__serpent_enc_blk_8way_avx: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 *	%rcx: bool, if true: xor output +	 */ + +	vpcmpeqd RNOT, RNOT, RNOT; + +	leaq (4*4*4)(%rdx), %rax; +	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); +	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + +						 K2(RA, RB, RC, RD, RE, 0); +	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1); +	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2); +	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3); +	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4); +	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5); +	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6); +	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7); +	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8); +	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9); +	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10); +	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11); +	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12); +	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13); +	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14); +	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15); +	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16); +	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17); +	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18); +	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19); +	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20); +	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21); +	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22); +	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23); +	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24); +	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25); +	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26); +	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27); +	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28); +	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29); +	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30); +	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31); +	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32); + +	leaq (4*4*4)(%rsi), %rax; + +	testb %cl, %cl; +	jnz __enc_xor8; + +	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); +	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + +	ret; + +__enc_xor8: +	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); +	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + +	ret; + +.align 8 +.global serpent_dec_blk_8way_avx +.type   serpent_dec_blk_8way_avx,@function; + +serpent_dec_blk_8way_avx: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	vpcmpeqd RNOT, RNOT, RNOT; + +	leaq (4*4*4)(%rdx), %rax; +	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); +	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + +						 K2(RA, RB, RC, RD, RE, 32); +	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31); +	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30); +	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29); +	SP(SI4, RC, RD, RA, RE, RB, 28);	
KL2(RC, RA, RB, RE, RD, 28); +	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27); +	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26); +	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25); +	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24); +	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23); +	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22); +	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21); +	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20); +	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19); +	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18); +	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17); +	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16); +	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15); +	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14); +	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13); +	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12); +	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11); +	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10); +	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9); +	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8); +	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7); +	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6); +	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5); +	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4); +	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3); +	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2); +	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1); +	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0); + +	leaq (4*4*4)(%rsi), %rax; +	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); +	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); + +	ret; diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c new file mode 100644 index 000000000000..b36bdac237eb --- /dev/null +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -0,0 +1,636 @@ +/* + * Glue Code for AVX assembler versions of Serpent Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * Glue code based on serpent_sse2_glue.c by: + *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 + * USA + * + */ + +#include <linux/module.h> +#include <linux/hardirq.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/algapi.h> +#include <crypto/serpent.h> +#include <crypto/cryptd.h> +#include <crypto/b128ops.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <asm/crypto/serpent-avx.h> +#include <asm/crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> + +static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) +{ +	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; +	unsigned int j; + +	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) +		ivs[j] = src[j]; + +	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + +	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) +		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); +} + +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) +{ +	be128 ctrblk; + +	u128_to_be128(&ctrblk, iv); +	u128_inc(iv); + +	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); +	u128_xor(dst, src, (u128 *)&ctrblk); +} + +static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, +				   u128 *iv) +{ +	be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; +	unsigned int i; + +	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { +		if (dst != src) +			dst[i] = src[i]; + +		u128_to_be128(&ctrblks[i], iv); +		u128_inc(iv); +	} + +	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); +} + +static const struct common_glue_ctx serpent_enc = { +	.num_funcs = 2, +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = SERPENT_PARALLEL_BLOCKS, +		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } +	} } +}; + +static const struct common_glue_ctx serpent_ctr = { +	.num_funcs = 2, +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = SERPENT_PARALLEL_BLOCKS, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } +	} } +}; + +static const struct common_glue_ctx serpent_dec = { +	.num_funcs = 2, +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = SERPENT_PARALLEL_BLOCKS, +		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } +	} } +}; + +static const struct common_glue_ctx serpent_dec_cbc = { +	.num_funcs = 2, +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = SERPENT_PARALLEL_BLOCKS, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } +	}, { +		.num_blocks = 1, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } +	} } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist 
*dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, +				     dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, +				       nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		     struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes); +} + +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ +	return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS, +			      NULL, fpu_enabled, nbytes); +} + +static inline void serpent_fpu_end(bool fpu_enabled) +{ +	glue_fpu_end(fpu_enabled); +} + +struct crypt_priv { +	struct serpent_ctx *ctx; +	bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ +	const unsigned int bsize = SERPENT_BLOCK_SIZE; +	struct crypt_priv *ctx = priv; +	int i; + +	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + +	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { +		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); +		return; +	} + +	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) +		__serpent_encrypt(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ +	const unsigned int bsize = SERPENT_BLOCK_SIZE; +	struct crypt_priv *ctx = priv; +	int i; + +	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + +	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { +		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); +		return; +	} + +	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) +		__serpent_decrypt(ctx->ctx, srcdst, srcdst); +} + +struct serpent_lrw_ctx { +	struct lrw_table_ctx lrw_table; +	struct serpent_ctx serpent_ctx; +}; + +static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, +			      unsigned int keylen) +{ +	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); +	int err; + +	err = __serpent_setkey(&ctx->serpent_ctx, key, keylen - +							SERPENT_BLOCK_SIZE); +	if (err) +		return err; + +	return lrw_init_table(&ctx->lrw_table, key + keylen - +						SERPENT_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[SERPENT_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->serpent_ctx, +		.fpu_enabled = false, +	}; +	struct lrw_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.table_ctx = &ctx->lrw_table, +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = encrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = lrw_crypt(desc, dst, src, nbytes, &req); +	serpent_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[SERPENT_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->serpent_ctx, +		.fpu_enabled = false, +	}; +	struct lrw_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.table_ctx = &ctx->lrw_table, +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = 
decrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = lrw_crypt(desc, dst, src, nbytes, &req); +	serpent_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ +	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + +	lrw_free_table(&ctx->lrw_table); +} + +struct serpent_xts_ctx { +	struct serpent_ctx tweak_ctx; +	struct serpent_ctx crypt_ctx; +}; + +static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, +			      unsigned int keylen) +{ +	struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); +	u32 *flags = &tfm->crt_flags; +	int err; + +	/* key consists of keys of equal size concatenated, therefore +	 * the length must be even +	 */ +	if (keylen % 2) { +		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; +		return -EINVAL; +	} + +	/* first half of xts-key is for crypt */ +	err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); +	if (err) +		return err; + +	/* second half of xts-key is for tweak */ +	return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[SERPENT_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->crypt_ctx, +		.fpu_enabled = false, +	}; +	struct xts_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.tweak_ctx = &ctx->tweak_ctx, +		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = encrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = xts_crypt(desc, dst, src, nbytes, &req); +	serpent_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[SERPENT_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->crypt_ctx, +		.fpu_enabled = false, +	}; +	struct xts_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.tweak_ctx = &ctx->tweak_ctx, +		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = decrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = xts_crypt(desc, dst, src, nbytes, &req); +	serpent_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static struct crypto_alg serpent_algs[10] = { { +	.cra_name		= "__ecb-serpent-avx", +	.cra_driver_name	= "__driver-ecb-serpent-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= SERPENT_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct serpent_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[0].cra_list), +	.cra_u = { +		.blkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE, +			.max_keysize	= SERPENT_MAX_KEY_SIZE, +			.setkey		= serpent_setkey, +			.encrypt	= ecb_encrypt, +			.decrypt	= ecb_decrypt, +		}, +	}, +}, { +	.cra_name		= "__cbc-serpent-avx", +	.cra_driver_name	= "__driver-cbc-serpent-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= SERPENT_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct serpent_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= 
LIST_HEAD_INIT(serpent_algs[1].cra_list), +	.cra_u = { +		.blkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE, +			.max_keysize	= SERPENT_MAX_KEY_SIZE, +			.setkey		= serpent_setkey, +			.encrypt	= cbc_encrypt, +			.decrypt	= cbc_decrypt, +		}, +	}, +}, { +	.cra_name		= "__ctr-serpent-avx", +	.cra_driver_name	= "__driver-ctr-serpent-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct serpent_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[2].cra_list), +	.cra_u = { +		.blkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE, +			.max_keysize	= SERPENT_MAX_KEY_SIZE, +			.ivsize		= SERPENT_BLOCK_SIZE, +			.setkey		= serpent_setkey, +			.encrypt	= ctr_crypt, +			.decrypt	= ctr_crypt, +		}, +	}, +}, { +	.cra_name		= "__lrw-serpent-avx", +	.cra_driver_name	= "__driver-lrw-serpent-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= SERPENT_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[3].cra_list), +	.cra_exit		= lrw_exit_tfm, +	.cra_u = { +		.blkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE + +					  SERPENT_BLOCK_SIZE, +			.max_keysize	= SERPENT_MAX_KEY_SIZE + +					  SERPENT_BLOCK_SIZE, +			.ivsize		= SERPENT_BLOCK_SIZE, +			.setkey		= lrw_serpent_setkey, +			.encrypt	= lrw_encrypt, +			.decrypt	= lrw_decrypt, +		}, +	}, +}, { +	.cra_name		= "__xts-serpent-avx", +	.cra_driver_name	= "__driver-xts-serpent-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= SERPENT_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct serpent_xts_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[4].cra_list), +	.cra_u = { +		.blkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2, +			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2, +			.ivsize		= SERPENT_BLOCK_SIZE, +			.setkey		= xts_serpent_setkey, +			.encrypt	= xts_encrypt, +			.decrypt	= xts_decrypt, +		}, +	}, +}, { +	.cra_name		= "ecb(serpent)", +	.cra_driver_name	= "ecb-serpent-avx", +	.cra_priority		= 500, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= SERPENT_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[5].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE, +			.max_keysize	= SERPENT_MAX_KEY_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "cbc(serpent)", +	.cra_driver_name	= "cbc-serpent-avx", +	.cra_priority		= 500, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= SERPENT_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[6].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE, +			.max_keysize	= SERPENT_MAX_KEY_SIZE, +			.ivsize		= SERPENT_BLOCK_SIZE, +			.setkey		= 
ablk_set_key, +			.encrypt	= __ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "ctr(serpent)", +	.cra_driver_name	= "ctr-serpent-avx", +	.cra_priority		= 500, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[7].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE, +			.max_keysize	= SERPENT_MAX_KEY_SIZE, +			.ivsize		= SERPENT_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_encrypt, +			.geniv		= "chainiv", +		}, +	}, +}, { +	.cra_name		= "lrw(serpent)", +	.cra_driver_name	= "lrw-serpent-avx", +	.cra_priority		= 500, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= SERPENT_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[8].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE + +					  SERPENT_BLOCK_SIZE, +			.max_keysize	= SERPENT_MAX_KEY_SIZE + +					  SERPENT_BLOCK_SIZE, +			.ivsize		= SERPENT_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "xts(serpent)", +	.cra_driver_name	= "xts-serpent-avx", +	.cra_priority		= 500, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= SERPENT_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(serpent_algs[9].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2, +			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2, +			.ivsize		= SERPENT_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +} }; + +static int __init serpent_init(void) +{ +	u64 xcr0; + +	if (!cpu_has_avx || !cpu_has_osxsave) { +		printk(KERN_INFO "AVX instructions are not detected.\n"); +		return -ENODEV; +	} + +	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); +	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { +		printk(KERN_INFO "AVX detected but unusable.\n"); +		return -ENODEV; +	} + +	return crypto_register_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); +} + +static void __exit serpent_exit(void) +{ +	crypto_unregister_algs(serpent_algs, ARRAY_SIZE(serpent_algs)); +} + +module_init(serpent_init); +module_exit(serpent_exit); + +MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("serpent"); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 4b21be85e0a1..d679c8675f4a 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -41,358 +41,145 @@  #include <crypto/ctr.h>  #include <crypto/lrw.h>  #include <crypto/xts.h> -#include <asm/i387.h> -#include <asm/serpent.h> -#include <crypto/scatterwalk.h> -#include <linux/workqueue.h> -#include <linux/spinlock.h> - -struct async_serpent_ctx { -	struct cryptd_ablkcipher *cryptd_tfm; -}; +#include 
<asm/crypto/serpent-sse2.h> +#include <asm/crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> -static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) -{ -	if (fpu_enabled) -		return true; - -	/* SSE2 is only used when chunk to be processed is large enough, so -	 * do not enable FPU until it is necessary. -	 */ -	if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS) -		return false; - -	kernel_fpu_begin(); -	return true; -} - -static inline void serpent_fpu_end(bool fpu_enabled) +static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)  { -	if (fpu_enabled) -		kernel_fpu_end(); -} - -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, -		     bool enc) -{ -	bool fpu_enabled = false; -	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	const unsigned int bsize = SERPENT_BLOCK_SIZE; -	unsigned int nbytes; -	int err; - -	err = blkcipher_walk_virt(desc, walk); -	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - -	while ((nbytes = walk->nbytes)) { -		u8 *wsrc = walk->src.virt.addr; -		u8 *wdst = walk->dst.virt.addr; - -		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); - -		/* Process multi-block batch */ -		if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { -			do { -				if (enc) -					serpent_enc_blk_xway(ctx, wdst, wsrc); -				else -					serpent_dec_blk_xway(ctx, wdst, wsrc); - -				wsrc += bsize * SERPENT_PARALLEL_BLOCKS; -				wdst += bsize * SERPENT_PARALLEL_BLOCKS; -				nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; -			} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - -			if (nbytes < bsize) -				goto done; -		} - -		/* Handle leftovers */ -		do { -			if (enc) -				__serpent_encrypt(ctx, wdst, wsrc); -			else -				__serpent_decrypt(ctx, wdst, wsrc); - -			wsrc += bsize; -			wdst += bsize; -			nbytes -= bsize; -		} while (nbytes >= bsize); - -done: -		err = blkcipher_walk_done(desc, walk, nbytes); -	} +	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; +	unsigned int j; -	serpent_fpu_end(fpu_enabled); -	return err; -} +	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) +		ivs[j] = src[j]; -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) -{ -	struct blkcipher_walk walk; +	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); -	blkcipher_walk_init(&walk, dst, src, nbytes); -	return ecb_crypt(desc, &walk, true); +	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) +		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);  } -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) +static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)  { -	struct blkcipher_walk walk; +	be128 ctrblk; -	blkcipher_walk_init(&walk, dst, src, nbytes); -	return ecb_crypt(desc, &walk, false); -} +	u128_to_be128(&ctrblk, iv); +	u128_inc(iv); -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, -				  struct blkcipher_walk *walk) -{ -	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	const unsigned int bsize = SERPENT_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 *iv = (u128 *)walk->iv; - -	do { -		u128_xor(dst, src, iv); -		__serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst); -		iv = dst; - -		src += 1; -		dst += 1; -		nbytes -= bsize; -	} while (nbytes >= bsize); - -	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); -	return nbytes; +	__serpent_encrypt(ctx, 
(u8 *)&ctrblk, (u8 *)&ctrblk); +	u128_xor(dst, src, (u128 *)&ctrblk);  } -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) +static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, +				   u128 *iv)  { -	struct blkcipher_walk walk; -	int err; +	be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; +	unsigned int i; -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt(desc, &walk); +	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { +		if (dst != src) +			dst[i] = src[i]; -	while ((nbytes = walk.nbytes)) { -		nbytes = __cbc_encrypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); +		u128_to_be128(&ctrblks[i], iv); +		u128_inc(iv);  	} -	return err; +	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);  } -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, -				  struct blkcipher_walk *walk) -{ -	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	const unsigned int bsize = SERPENT_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; -	u128 last_iv; -	int i; - -	/* Start of the last block. */ -	src += nbytes / bsize - 1; -	dst += nbytes / bsize - 1; - -	last_iv = *src; - -	/* Process multi-block batch */ -	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { -		do { -			nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1); -			src -= SERPENT_PARALLEL_BLOCKS - 1; -			dst -= SERPENT_PARALLEL_BLOCKS - 1; - -			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) -				ivs[i] = src[i]; - -			serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); - -			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) -				u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); - -			nbytes -= bsize; -			if (nbytes < bsize) -				goto done; +static const struct common_glue_ctx serpent_enc = { +	.num_funcs = 2, +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, -			u128_xor(dst, dst, src - 1); -			src -= 1; -			dst -= 1; -		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - -		if (nbytes < bsize) -			goto done; -	} - -	/* Handle leftovers */ -	for (;;) { -		__serpent_decrypt(ctx, (u8 *)dst, (u8 *)src); - -		nbytes -= bsize; -		if (nbytes < bsize) -			break; +	.funcs = { { +		.num_blocks = SERPENT_PARALLEL_BLOCKS, +		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } +	} } +}; -		u128_xor(dst, dst, src - 1); -		src -= 1; -		dst -= 1; -	} +static const struct common_glue_ctx serpent_ctr = { +	.num_funcs = 2, +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = SERPENT_PARALLEL_BLOCKS, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } +	} } +}; -done: -	u128_xor(dst, dst, (u128 *)walk->iv); -	*(u128 *)walk->iv = last_iv; +static const struct common_glue_ctx serpent_dec = { +	.num_funcs = 2, +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = SERPENT_PARALLEL_BLOCKS, +		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } +	} } +}; -	return nbytes; -} +static const struct common_glue_ctx serpent_dec_cbc = { +	.num_funcs = 2, +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = SERPENT_PARALLEL_BLOCKS, +		.fn_u = { .cbc = 
GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } +	}, { +		.num_blocks = 1, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } +	} } +}; -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,  		       struct scatterlist *src, unsigned int nbytes)  { -	bool fpu_enabled = false; -	struct blkcipher_walk walk; -	int err; - -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt(desc, &walk); -	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - -	while ((nbytes = walk.nbytes)) { -		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); -		nbytes = __cbc_decrypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); -	} - -	serpent_fpu_end(fpu_enabled); -	return err; +	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);  } -static inline void u128_to_be128(be128 *dst, const u128 *src) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	dst->a = cpu_to_be64(src->a); -	dst->b = cpu_to_be64(src->b); +	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);  } -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	dst->a = be64_to_cpu(src->a); -	dst->b = be64_to_cpu(src->b); +	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc, +				     dst, src, nbytes);  } -static inline void u128_inc(u128 *i) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	i->b++; -	if (!i->b) -		i->a++; +	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src, +				       nbytes);  } -static void ctr_crypt_final(struct blkcipher_desc *desc, -			    struct blkcipher_walk *walk) +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		     struct scatterlist *src, unsigned int nbytes)  { -	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	u8 *ctrblk = walk->iv; -	u8 keystream[SERPENT_BLOCK_SIZE]; -	u8 *src = walk->src.virt.addr; -	u8 *dst = walk->dst.virt.addr; -	unsigned int nbytes = walk->nbytes; - -	__serpent_encrypt(ctx, keystream, ctrblk); -	crypto_xor(keystream, src, nbytes); -	memcpy(dst, keystream, nbytes); - -	crypto_inc(ctrblk, SERPENT_BLOCK_SIZE); +	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);  } -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, -				struct blkcipher_walk *walk) +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)  { -	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	const unsigned int bsize = SERPENT_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 ctrblk; -	be128 ctrblocks[SERPENT_PARALLEL_BLOCKS]; -	int i; - -	be128_to_u128(&ctrblk, (be128 *)walk->iv); - -	/* Process multi-block batch */ -	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { -		do { -			/* create ctrblks for parallel encrypt */ -			for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { -				if (dst != src) -					dst[i] = src[i]; - -				u128_to_be128(&ctrblocks[i], &ctrblk); -				u128_inc(&ctrblk); -			} - -			serpent_enc_blk_xway_xor(ctx, (u8 *)dst, -						 (u8 *)ctrblocks); - -			src += SERPENT_PARALLEL_BLOCKS; -			dst += 
SERPENT_PARALLEL_BLOCKS; -			nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; -		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); - -		if (nbytes < bsize) -			goto done; -	} - -	/* Handle leftovers */ -	do { -		if (dst != src) -			*dst = *src; - -		u128_to_be128(&ctrblocks[0], &ctrblk); -		u128_inc(&ctrblk); - -		__serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); -		u128_xor(dst, dst, (u128 *)ctrblocks); - -		src += 1; -		dst += 1; -		nbytes -= bsize; -	} while (nbytes >= bsize); - -done: -	u128_to_be128((be128 *)walk->iv, &ctrblk); -	return nbytes; +	return glue_fpu_begin(SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS, +			      NULL, fpu_enabled, nbytes);  } -static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		     struct scatterlist *src, unsigned int nbytes) +static inline void serpent_fpu_end(bool fpu_enabled)  { -	bool fpu_enabled = false; -	struct blkcipher_walk walk; -	int err; - -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE); -	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; - -	while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) { -		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); -		nbytes = __ctr_crypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); -	} - -	serpent_fpu_end(fpu_enabled); - -	if (walk.nbytes) { -		ctr_crypt_final(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, 0); -	} - -	return err; +	glue_fpu_end(fpu_enabled);  }  struct crypt_priv { @@ -596,106 +383,6 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,  	return ret;  } -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, -			unsigned int key_len) -{ -	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); -	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; -	int err; - -	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); -	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) -				    & CRYPTO_TFM_REQ_MASK); -	err = crypto_ablkcipher_setkey(child, key, key_len); -	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) -				    & CRYPTO_TFM_RES_MASK); -	return err; -} - -static int __ablk_encrypt(struct ablkcipher_request *req) -{ -	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); -	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); -	struct blkcipher_desc desc; - -	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); -	desc.info = req->info; -	desc.flags = 0; - -	return crypto_blkcipher_crt(desc.tfm)->encrypt( -		&desc, req->dst, req->src, req->nbytes); -} - -static int ablk_encrypt(struct ablkcipher_request *req) -{ -	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); -	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - -	if (!irq_fpu_usable()) { -		struct ablkcipher_request *cryptd_req = -			ablkcipher_request_ctx(req); - -		memcpy(cryptd_req, req, sizeof(*req)); -		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - -		return crypto_ablkcipher_encrypt(cryptd_req); -	} else { -		return __ablk_encrypt(req); -	} -} - -static int ablk_decrypt(struct ablkcipher_request *req) -{ -	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); -	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); - -	if (!irq_fpu_usable()) { -		struct ablkcipher_request *cryptd_req = -			ablkcipher_request_ctx(req); - -		memcpy(cryptd_req, req, sizeof(*req)); -		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - -		return 
crypto_ablkcipher_decrypt(cryptd_req); -	} else { -		struct blkcipher_desc desc; - -		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); -		desc.info = req->info; -		desc.flags = 0; - -		return crypto_blkcipher_crt(desc.tfm)->decrypt( -			&desc, req->dst, req->src, req->nbytes); -	} -} - -static void ablk_exit(struct crypto_tfm *tfm) -{ -	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); - -	cryptd_free_ablkcipher(ctx->cryptd_tfm); -} - -static int ablk_init(struct crypto_tfm *tfm) -{ -	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); -	struct cryptd_ablkcipher *cryptd_tfm; -	char drv_name[CRYPTO_MAX_ALG_NAME]; - -	snprintf(drv_name, sizeof(drv_name), "__driver-%s", -					crypto_tfm_alg_driver_name(tfm)); - -	cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); -	if (IS_ERR(cryptd_tfm)) -		return PTR_ERR(cryptd_tfm); - -	ctx->cryptd_tfm = cryptd_tfm; -	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + -		crypto_ablkcipher_reqsize(&cryptd_tfm->base); - -	return 0; -} -  static struct crypto_alg serpent_algs[10] = { {  	.cra_name		= "__ecb-serpent-sse2",  	.cra_driver_name	= "__driver-ecb-serpent-sse2", @@ -808,7 +495,7 @@ static struct crypto_alg serpent_algs[10] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= SERPENT_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_serpent_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -830,7 +517,7 @@ static struct crypto_alg serpent_algs[10] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= SERPENT_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_serpent_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -853,7 +540,7 @@ static struct crypto_alg serpent_algs[10] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= 1, -	.cra_ctxsize		= sizeof(struct async_serpent_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -877,7 +564,7 @@ static struct crypto_alg serpent_algs[10] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= SERPENT_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_serpent_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, @@ -902,7 +589,7 @@ static struct crypto_alg serpent_algs[10] = { {  	.cra_priority		= 400,  	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,  	.cra_blocksize		= SERPENT_BLOCK_SIZE, -	.cra_ctxsize		= sizeof(struct async_serpent_ctx), +	.cra_ctxsize		= sizeof(struct async_helper_ctx),  	.cra_alignmask		= 0,  	.cra_type		= &crypto_ablkcipher_type,  	.cra_module		= THIS_MODULE, diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index b2c2f57d70e8..49d6987a73d9 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -468,7 +468,7 @@ W_PRECALC_SSSE3   */  SHA1_VECTOR_ASM     sha1_transform_ssse3 -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX  .macro W_PRECALC_AVX diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 
f916499d0abe..4a11a9d72451 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -35,7 +35,7 @@  asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,  				     unsigned int rounds); -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX  asmlinkage void sha1_transform_avx(u32 *digest, const char *data,  				   unsigned int rounds);  #endif @@ -184,7 +184,7 @@ static struct shash_alg alg = {  	}  }; -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX  static bool __init avx_usable(void)  {  	u64 xcr0; @@ -209,7 +209,7 @@ static int __init sha1_ssse3_mod_init(void)  	if (cpu_has_ssse3)  		sha1_transform_asm = sha1_transform_ssse3; -#ifdef SHA1_ENABLE_AVX_SUPPORT +#ifdef CONFIG_AS_AVX  	/* allow AVX to override SSSE3, it's a little faster */  	if (avx_usable())  		sha1_transform_asm = sha1_transform_avx; diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S new file mode 100644 index 000000000000..35f45574390d --- /dev/null +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -0,0 +1,300 @@ +/* + * Twofish Cipher 8-way parallel algorithm (AVX/x86_64) + * + * Copyright (C) 2012 Johannes Goetzfried + *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 + * USA + * + */ + +.file "twofish-avx-x86_64-asm_64.S" +.text + +/* structure of crypto context */ +#define s0	0 +#define s1	1024 +#define s2	2048 +#define s3	3072 +#define w	4096 +#define k	4128 + +/********************************************************************** +  8-way AVX twofish + **********************************************************************/ +#define CTX %rdi + +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 + +#define RA2 %xmm4 +#define RB2 %xmm5 +#define RC2 %xmm6 +#define RD2 %xmm7 + +#define RX %xmm8 +#define RY %xmm9 + +#define RK1 %xmm10 +#define RK2 %xmm11 + +#define RID1  %rax +#define RID1b %al +#define RID2  %rbx +#define RID2b %bl + +#define RGI1   %rdx +#define RGI1bl %dl +#define RGI1bh %dh +#define RGI2   %rcx +#define RGI2bl %cl +#define RGI2bh %ch + +#define RGS1  %r8 +#define RGS1d %r8d +#define RGS2  %r9 +#define RGS2d %r9d +#define RGS3  %r10 +#define RGS3d %r10d + + +#define lookup_32bit(t0, t1, t2, t3, src, dst) \ +	movb		src ## bl,        RID1b;     \ +	movb		src ## bh,        RID2b;     \ +	movl		t0(CTX, RID1, 4), dst ## d;  \ +	xorl		t1(CTX, RID2, 4), dst ## d;  \ +	shrq $16,	src;                         \ +	movb		src ## bl,        RID1b;     \ +	movb		src ## bh,        RID2b;     \ +	xorl		t2(CTX, RID1, 4), dst ## d;  \ +	xorl		t3(CTX, RID2, 4), dst ## d; + +#define G(a, x, t0, t1, t2, t3) \ +	vmovq		a,    RGI1;               \ +	vpsrldq $8,	a,    x;                  \ +	vmovq		x,    RGI2;               \ +	\ +	lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \ +	shrq $16,	RGI1;                     \ +	lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \ +	shlq $32,	RGS2;                     \ +	orq		RGS1, RGS2;               \ +	\ +	lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \ +	shrq $16,	RGI2;                     \ +	lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \ +	shlq $32,	RGS3;                     \ +	orq		RGS1, RGS3;               \ +	\ +	vmovq		RGS2, x;                  \ +	vpinsrq $1,	RGS3, x, x; + +#define encround(a, b, c, d, x, y) \ +	G(a, x, s0, s1, s2, s3);           \ +	G(b, y, s1, s2, s3, s0);           \ +	vpaddd			x, y,   x; \ +	vpaddd			y, x,   y; \ +	vpaddd			x, RK1, x; \ +	vpaddd			y, RK2, y; \ +	vpxor			x, c,   c; \ +	vpsrld $1,		c, x;      \ +	vpslld $(32 - 1),	c, c;      \ +	vpor			c, x,   c; \ +	vpslld $1,		d, x;      \ +	vpsrld $(32 - 1),	d, d;      \ +	vpor			d, x,   d; \ +	vpxor			d, y,   d; + +#define decround(a, b, c, d, x, y) \ +	G(a, x, s0, s1, s2, s3);           \ +	G(b, y, s1, s2, s3, s0);           \ +	vpaddd			x, y,   x; \ +	vpaddd			y, x,   y; \ +	vpaddd			y, RK2, y; \ +	vpxor			d, y,   d; \ +	vpsrld $1,		d, y;      \ +	vpslld $(32 - 1),	d, d;      \ +	vpor			d, y,   d; \ +	vpslld $1,		c, y;      \ +	vpsrld $(32 - 1),	c, c;      \ +	vpor			c, y,   c; \ +	vpaddd			x, RK1, x; \ +	vpxor			x, c,   c; + +#define encrypt_round(n, a, b, c, d) \ +	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;           \ +	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;           \ +	encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ +	encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); + +#define decrypt_round(n, a, b, c, d) \ +	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;           \ +	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;           \ +	decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ +	decround(a ## 2, b ## 2, c ## 2, d ## 2, 
RX, RY); + +#define encrypt_cycle(n) \ +	encrypt_round((2*n), RA, RB, RC, RD);       \ +	encrypt_round(((2*n) + 1), RC, RD, RA, RB); + +#define decrypt_cycle(n) \ +	decrypt_round(((2*n) + 1), RC, RD, RA, RB); \ +	decrypt_round((2*n), RA, RB, RC, RD); + + +#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ +	vpunpckldq		x1, x0, t0; \ +	vpunpckhdq		x1, x0, t2; \ +	vpunpckldq		x3, x2, t1; \ +	vpunpckhdq		x3, x2, x3; \ +	\ +	vpunpcklqdq		t1, t0, x0; \ +	vpunpckhqdq		t1, t0, x1; \ +	vpunpcklqdq		x3, t2, x2; \ +	vpunpckhqdq		x3, t2, x3; + +#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ +	vpxor (0*4*4)(in),	wkey, x0; \ +	vpxor (1*4*4)(in),	wkey, x1; \ +	vpxor (2*4*4)(in),	wkey, x2; \ +	vpxor (3*4*4)(in),	wkey, x3; \ +	\ +	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ +	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ +	\ +	vpxor		x0, wkey, x0;     \ +	vmovdqu 	x0, (0*4*4)(out); \ +	vpxor		x1, wkey, x1;     \ +	vmovdqu		x1, (1*4*4)(out); \ +	vpxor		x2, wkey, x2;     \ +	vmovdqu		x2, (2*4*4)(out); \ +	vpxor		x3, wkey, x3;     \ +	vmovdqu		x3, (3*4*4)(out); + +#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ +	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ +	\ +	vpxor		x0, wkey, x0;         \ +	vpxor		(0*4*4)(out), x0, x0; \ +	vmovdqu 	x0, (0*4*4)(out);     \ +	vpxor		x1, wkey, x1;         \ +	vpxor		(1*4*4)(out), x1, x1; \ +	vmovdqu	        x1, (1*4*4)(out);     \ +	vpxor		x2, wkey, x2;         \ +	vpxor           (2*4*4)(out), x2, x2; \ +	vmovdqu		x2, (2*4*4)(out);     \ +	vpxor		x3, wkey, x3;         \ +	vpxor           (3*4*4)(out), x3, x3; \ +	vmovdqu		x3, (3*4*4)(out); + +.align 8 +.global __twofish_enc_blk_8way +.type   __twofish_enc_blk_8way,@function; + +__twofish_enc_blk_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 *	%rcx: bool, if true: xor output +	 */ + +	pushq %rbx; +	pushq %rcx; + +	vmovdqu w(CTX), RK1; + +	leaq (4*4*4)(%rdx), %rax; +	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); +	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); + +	xorq RID1, RID1; +	xorq RID2, RID2; + +	encrypt_cycle(0); +	encrypt_cycle(1); +	encrypt_cycle(2); +	encrypt_cycle(3); +	encrypt_cycle(4); +	encrypt_cycle(5); +	encrypt_cycle(6); +	encrypt_cycle(7); + +	vmovdqu (w+4*4)(CTX), RK1; + +	popq %rcx; +	popq %rbx; + +	leaq (4*4*4)(%rsi), %rax; + +	testb %cl, %cl; +	jnz __enc_xor8; + +	outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); +	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + +	ret; + +__enc_xor8: +	outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); +	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + +	ret; + +.align 8 +.global twofish_dec_blk_8way +.type   twofish_dec_blk_8way,@function; + +twofish_dec_blk_8way: +	/* input: +	 *	%rdi: ctx, CTX +	 *	%rsi: dst +	 *	%rdx: src +	 */ + +	pushq %rbx; + +	vmovdqu (w+4*4)(CTX), RK1; + +	leaq (4*4*4)(%rdx), %rax; +	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); +	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); + +	xorq RID1, RID1; +	xorq RID2, RID2; + +	decrypt_cycle(7); +	decrypt_cycle(6); +	decrypt_cycle(5); +	decrypt_cycle(4); +	decrypt_cycle(3); +	decrypt_cycle(2); +	decrypt_cycle(1); +	decrypt_cycle(0); + +	vmovdqu (w)(CTX), RK1; + +	popq %rbx; + +	leaq (4*4*4)(%rsi), %rax; +	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); +	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); + +	ret; diff --git 
a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c new file mode 100644 index 000000000000..782b67ddaf6a --- /dev/null +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -0,0 +1,624 @@ +/* + * Glue Code for AVX assembler version of Twofish Cipher + * + * Copyright (C) 2012 Johannes Goetzfried + *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 + * USA + * + */ + +#include <linux/module.h> +#include <linux/hardirq.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <crypto/algapi.h> +#include <crypto/twofish.h> +#include <crypto/cryptd.h> +#include <crypto/b128ops.h> +#include <crypto/ctr.h> +#include <crypto/lrw.h> +#include <crypto/xts.h> +#include <asm/i387.h> +#include <asm/xcr.h> +#include <asm/xsave.h> +#include <asm/crypto/twofish.h> +#include <asm/crypto/ablk_helper.h> +#include <asm/crypto/glue_helper.h> +#include <crypto/scatterwalk.h> +#include <linux/workqueue.h> +#include <linux/spinlock.h> + +#define TWOFISH_PARALLEL_BLOCKS 8 + +static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, +					const u8 *src) +{ +	__twofish_enc_blk_3way(ctx, dst, src, false); +} + +/* 8-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, +				       const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, +				     const u8 *src); + +static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, +					const u8 *src) +{ +	__twofish_enc_blk_8way(ctx, dst, src, false); +} + +static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst, +					    const u8 *src) +{ +	__twofish_enc_blk_8way(ctx, dst, src, true); +} + +static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, +					const u8 *src) +{ +	twofish_dec_blk_8way(ctx, dst, src); +} + +static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src) +{ +	u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1]; +	unsigned int j; + +	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) +		ivs[j] = src[j]; + +	twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + +	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) +		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); +} + +static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src, +				     u128 *iv) +{ +	be128 ctrblks[TWOFISH_PARALLEL_BLOCKS]; +	unsigned int i; + +	for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) { +		if (dst != src) +			dst[i] = src[i]; + +		u128_to_be128(&ctrblks[i], iv); +		u128_inc(iv); +	} + +	twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); +} + +static const struct common_glue_ctx twofish_enc = { +	.num_funcs = 3, +	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = TWOFISH_PARALLEL_BLOCKS, +		.fn_u = { .ecb = 
GLUE_FUNC_CAST(twofish_enc_blk_xway) } +	}, { +		.num_blocks = 3, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } +	} } +}; + +static const struct common_glue_ctx twofish_ctr = { +	.num_funcs = 3, +	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = TWOFISH_PARALLEL_BLOCKS, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } +	}, { +		.num_blocks = 3, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) } +	} } +}; + +static const struct common_glue_ctx twofish_dec = { +	.num_funcs = 3, +	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = TWOFISH_PARALLEL_BLOCKS, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } +	}, { +		.num_blocks = 3, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } +	} } +}; + +static const struct common_glue_ctx twofish_dec_cbc = { +	.num_funcs = 3, +	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS, + +	.funcs = { { +		.num_blocks = TWOFISH_PARALLEL_BLOCKS, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } +	}, { +		.num_blocks = 3, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } +	} } +}; + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes); +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, +				       dst, src, nbytes); +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, +				       nbytes); +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		     struct scatterlist *src, unsigned int nbytes) +{ +	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes); +} + +static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ +	return glue_fpu_begin(TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS, NULL, +			      fpu_enabled, nbytes); +} + +static inline void twofish_fpu_end(bool fpu_enabled) +{ +	glue_fpu_end(fpu_enabled); +} + +struct crypt_priv { +	struct twofish_ctx *ctx; +	bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ +	const unsigned int bsize = TF_BLOCK_SIZE; +	struct crypt_priv *ctx = priv; +	int i; + +	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); + +	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { +		twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); +		return; +	} + +	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) +		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); + +	nbytes %= bsize * 3; + +	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) +		twofish_enc_blk(ctx->ctx, 
srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ +	const unsigned int bsize = TF_BLOCK_SIZE; +	struct crypt_priv *ctx = priv; +	int i; + +	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); + +	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { +		twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); +		return; +	} + +	for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) +		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); + +	nbytes %= bsize * 3; + +	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) +		twofish_dec_blk(ctx->ctx, srcdst, srcdst); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[TWOFISH_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->twofish_ctx, +		.fpu_enabled = false, +	}; +	struct lrw_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.table_ctx = &ctx->lrw_table, +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = encrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = lrw_crypt(desc, dst, src, nbytes, &req); +	twofish_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[TWOFISH_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->twofish_ctx, +		.fpu_enabled = false, +	}; +	struct lrw_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.table_ctx = &ctx->lrw_table, +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = decrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = lrw_crypt(desc, dst, src, nbytes, &req); +	twofish_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[TWOFISH_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->crypt_ctx, +		.fpu_enabled = false, +	}; +	struct xts_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.tweak_ctx = &ctx->tweak_ctx, +		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = encrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = xts_crypt(desc, dst, src, nbytes, &req); +	twofish_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes) +{ +	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); +	be128 buf[TWOFISH_PARALLEL_BLOCKS]; +	struct crypt_priv crypt_ctx = { +		.ctx = &ctx->crypt_ctx, +		.fpu_enabled = false, +	}; +	struct xts_crypt_req req = { +		.tbuf = buf, +		.tbuflen = sizeof(buf), + +		.tweak_ctx = &ctx->tweak_ctx, +		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), +		.crypt_ctx = &crypt_ctx, +		.crypt_fn = decrypt_callback, +	}; +	int ret; + +	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	ret = xts_crypt(desc, dst, src, nbytes, &req); +	twofish_fpu_end(crypt_ctx.fpu_enabled); + +	return ret; +} + +static struct crypto_alg twofish_algs[10] = { { +	.cra_name		= "__ecb-twofish-avx", +	.cra_driver_name	= 
"__driver-ecb-twofish-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= TF_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct twofish_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[0].cra_list), +	.cra_u = { +		.blkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE, +			.max_keysize	= TF_MAX_KEY_SIZE, +			.setkey		= twofish_setkey, +			.encrypt	= ecb_encrypt, +			.decrypt	= ecb_decrypt, +		}, +	}, +}, { +	.cra_name		= "__cbc-twofish-avx", +	.cra_driver_name	= "__driver-cbc-twofish-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= TF_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct twofish_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[1].cra_list), +	.cra_u = { +		.blkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE, +			.max_keysize	= TF_MAX_KEY_SIZE, +			.setkey		= twofish_setkey, +			.encrypt	= cbc_encrypt, +			.decrypt	= cbc_decrypt, +		}, +	}, +}, { +	.cra_name		= "__ctr-twofish-avx", +	.cra_driver_name	= "__driver-ctr-twofish-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct twofish_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[2].cra_list), +	.cra_u = { +		.blkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE, +			.max_keysize	= TF_MAX_KEY_SIZE, +			.ivsize		= TF_BLOCK_SIZE, +			.setkey		= twofish_setkey, +			.encrypt	= ctr_crypt, +			.decrypt	= ctr_crypt, +		}, +	}, +}, { +	.cra_name		= "__lrw-twofish-avx", +	.cra_driver_name	= "__driver-lrw-twofish-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= TF_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[3].cra_list), +	.cra_exit		= lrw_twofish_exit_tfm, +	.cra_u = { +		.blkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE + +					  TF_BLOCK_SIZE, +			.max_keysize	= TF_MAX_KEY_SIZE + +					  TF_BLOCK_SIZE, +			.ivsize		= TF_BLOCK_SIZE, +			.setkey		= lrw_twofish_setkey, +			.encrypt	= lrw_encrypt, +			.decrypt	= lrw_decrypt, +		}, +	}, +}, { +	.cra_name		= "__xts-twofish-avx", +	.cra_driver_name	= "__driver-xts-twofish-avx", +	.cra_priority		= 0, +	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER, +	.cra_blocksize		= TF_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct twofish_xts_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_blkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[4].cra_list), +	.cra_u = { +		.blkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE * 2, +			.max_keysize	= TF_MAX_KEY_SIZE * 2, +			.ivsize		= TF_BLOCK_SIZE, +			.setkey		= xts_twofish_setkey, +			.encrypt	= xts_encrypt, +			.decrypt	= xts_decrypt, +		}, +	}, +}, { +	.cra_name		= "ecb(twofish)", +	.cra_driver_name	= "ecb-twofish-avx", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= TF_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[5].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			
.min_keysize	= TF_MIN_KEY_SIZE, +			.max_keysize	= TF_MAX_KEY_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "cbc(twofish)", +	.cra_driver_name	= "cbc-twofish-avx", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= TF_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[6].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE, +			.max_keysize	= TF_MAX_KEY_SIZE, +			.ivsize		= TF_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= __ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "ctr(twofish)", +	.cra_driver_name	= "ctr-twofish-avx", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= 1, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[7].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE, +			.max_keysize	= TF_MAX_KEY_SIZE, +			.ivsize		= TF_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_encrypt, +			.geniv		= "chainiv", +		}, +	}, +}, { +	.cra_name		= "lrw(twofish)", +	.cra_driver_name	= "lrw-twofish-avx", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= TF_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[8].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE + +					  TF_BLOCK_SIZE, +			.max_keysize	= TF_MAX_KEY_SIZE + +					  TF_BLOCK_SIZE, +			.ivsize		= TF_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +}, { +	.cra_name		= "xts(twofish)", +	.cra_driver_name	= "xts-twofish-avx", +	.cra_priority		= 400, +	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, +	.cra_blocksize		= TF_BLOCK_SIZE, +	.cra_ctxsize		= sizeof(struct async_helper_ctx), +	.cra_alignmask		= 0, +	.cra_type		= &crypto_ablkcipher_type, +	.cra_module		= THIS_MODULE, +	.cra_list		= LIST_HEAD_INIT(twofish_algs[9].cra_list), +	.cra_init		= ablk_init, +	.cra_exit		= ablk_exit, +	.cra_u = { +		.ablkcipher = { +			.min_keysize	= TF_MIN_KEY_SIZE * 2, +			.max_keysize	= TF_MAX_KEY_SIZE * 2, +			.ivsize		= TF_BLOCK_SIZE, +			.setkey		= ablk_set_key, +			.encrypt	= ablk_encrypt, +			.decrypt	= ablk_decrypt, +		}, +	}, +} }; + +static int __init twofish_init(void) +{ +	u64 xcr0; + +	if (!cpu_has_avx || !cpu_has_osxsave) { +		printk(KERN_INFO "AVX instructions are not detected.\n"); +		return -ENODEV; +	} + +	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); +	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { +		printk(KERN_INFO "AVX detected but unusable.\n"); +		return -ENODEV; +	} + +	return crypto_register_algs(twofish_algs, ARRAY_SIZE(twofish_algs)); +} + +static void __exit twofish_exit(void) +{ +	crypto_unregister_algs(twofish_algs, ARRAY_SIZE(twofish_algs)); +} + 
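/*
 * Editor's note -- illustrative sketch, not part of this patch.  twofish_init()
 * above refuses to register the AVX algorithms unless the CPU advertises AVX
 * and OSXSAVE and the OS has enabled SSE and YMM state in XCR0.  This is the
 * same check done from user space with CPUID/XGETBV; it assumes GCC or clang
 * on x86-64 and a toolchain that knows the xgetbv mnemonic.  avx_usable() is
 * a hypothetical helper name, not a kernel symbol.
 */
#include <stdbool.h>
#include <stdint.h>
#include <cpuid.h>

static bool avx_usable(void)
{
	unsigned int eax, ebx, ecx, edx;
	uint32_t xcr0_lo, xcr0_hi;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	/* CPUID.1:ECX bit 27 = OSXSAVE, bit 28 = AVX */
	if (!(ecx & (1u << 27)) || !(ecx & (1u << 28)))
		return false;

	/* XGETBV with ECX = 0 reads XCR0; bit 1 = SSE state, bit 2 = YMM state */
	__asm__ volatile ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
	(void)xcr0_hi;
	return (xcr0_lo & 0x6) == 0x6;
}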
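/*
 * Editor's note -- illustrative sketch, not part of this patch.  A plain C
 * model of what the lookup_32bit()/G() macros in twofish-avx-x86_64-asm_64.S
 * (added earlier in this patch) compute: each 32-bit input word is split into
 * four bytes that index the four key-dependent 1 KB s-box tables laid out at
 * offsets s0..s3 in the cipher context, and the four looked-up words are XORed
 * together.  The rotated table order (s1,s2,s3,s0) used for the second input
 * word of each round is omitted here; struct and function names are
 * hypothetical.
 */
#include <stdint.h>

struct tf_sboxes {
	uint32_t s0[256], s1[256], s2[256], s3[256];
};

static uint32_t g_lookup(const struct tf_sboxes *s, uint32_t x)
{
	return s->s0[(uint8_t)(x >>  0)] ^
	       s->s1[(uint8_t)(x >>  8)] ^
	       s->s2[(uint8_t)(x >> 16)] ^
	       s->s3[(uint8_t)(x >> 24)];
}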
+module_init(twofish_init); +module_exit(twofish_exit); + +MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("twofish"); diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 922ab24cce31..15f9347316c8 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -3,11 +3,6 @@   *   * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>   * - * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: - *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> - * CTR part based on code (crypto/ctr.c) by: - *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> - *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License as published by   * the Free Software Foundation; either version 2 of the License, or @@ -33,20 +28,13 @@  #include <crypto/algapi.h>  #include <crypto/twofish.h>  #include <crypto/b128ops.h> +#include <asm/crypto/twofish.h> +#include <asm/crypto/glue_helper.h>  #include <crypto/lrw.h>  #include <crypto/xts.h> -/* regular block cipher functions from twofish_x86_64 module */ -asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, -				const u8 *src); -asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, -				const u8 *src); - -/* 3-way parallel cipher functions */ -asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, -				       const u8 *src, bool xor); -asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, -				     const u8 *src); +EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way); +EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);  static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,  					const u8 *src) @@ -60,311 +48,139 @@ static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst,  	__twofish_enc_blk_3way(ctx, dst, src, true);  } -static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, -		     void (*fn)(struct twofish_ctx *, u8 *, const u8 *), -		     void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *)) -{ -	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	unsigned int bsize = TF_BLOCK_SIZE; -	unsigned int nbytes; -	int err; - -	err = blkcipher_walk_virt(desc, walk); - -	while ((nbytes = walk->nbytes)) { -		u8 *wsrc = walk->src.virt.addr; -		u8 *wdst = walk->dst.virt.addr; - -		/* Process three block batch */ -		if (nbytes >= bsize * 3) { -			do { -				fn_3way(ctx, wdst, wsrc); - -				wsrc += bsize * 3; -				wdst += bsize * 3; -				nbytes -= bsize * 3; -			} while (nbytes >= bsize * 3); - -			if (nbytes < bsize) -				goto done; -		} - -		/* Handle leftovers */ -		do { -			fn(ctx, wdst, wsrc); - -			wsrc += bsize; -			wdst += bsize; -			nbytes -= bsize; -		} while (nbytes >= bsize); - -done: -		err = blkcipher_walk_done(desc, walk, nbytes); -	} - -	return err; -} - -static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) +void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)  { -	struct blkcipher_walk walk; +	u128 ivs[2]; -	blkcipher_walk_init(&walk, dst, src, nbytes); -	return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way); -} +	ivs[0] = src[0]; +	ivs[1] = src[1]; -static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) -{ -	struct blkcipher_walk 
walk; +	twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); -	blkcipher_walk_init(&walk, dst, src, nbytes); -	return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); +	u128_xor(&dst[1], &dst[1], &ivs[0]); +	u128_xor(&dst[2], &dst[2], &ivs[1]);  } +EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); -static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, -				  struct blkcipher_walk *walk) -{ -	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	unsigned int bsize = TF_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 *iv = (u128 *)walk->iv; - -	do { -		u128_xor(dst, src, iv); -		twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); -		iv = dst; - -		src += 1; -		dst += 1; -		nbytes -= bsize; -	} while (nbytes >= bsize); - -	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); -	return nbytes; -} - -static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) +void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)  { -	struct blkcipher_walk walk; -	int err; +	be128 ctrblk; -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt(desc, &walk); +	if (dst != src) +		*dst = *src; -	while ((nbytes = walk.nbytes)) { -		nbytes = __cbc_encrypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); -	} +	u128_to_be128(&ctrblk, iv); +	u128_inc(iv); -	return err; +	twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); +	u128_xor(dst, dst, (u128 *)&ctrblk);  } +EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); -static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, -				  struct blkcipher_walk *walk) +void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, +				     u128 *iv)  { -	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	unsigned int bsize = TF_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 ivs[3 - 1]; -	u128 last_iv; - -	/* Start of the last block. 
*/ -	src += nbytes / bsize - 1; -	dst += nbytes / bsize - 1; - -	last_iv = *src; - -	/* Process three block batch */ -	if (nbytes >= bsize * 3) { -		do { -			nbytes -= bsize * (3 - 1); -			src -= 3 - 1; -			dst -= 3 - 1; - -			ivs[0] = src[0]; -			ivs[1] = src[1]; - -			twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); - -			u128_xor(dst + 1, dst + 1, ivs + 0); -			u128_xor(dst + 2, dst + 2, ivs + 1); - -			nbytes -= bsize; -			if (nbytes < bsize) -				goto done; - -			u128_xor(dst, dst, src - 1); -			src -= 1; -			dst -= 1; -		} while (nbytes >= bsize * 3); - -		if (nbytes < bsize) -			goto done; -	} - -	/* Handle leftovers */ -	for (;;) { -		twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src); - -		nbytes -= bsize; -		if (nbytes < bsize) -			break; +	be128 ctrblks[3]; -		u128_xor(dst, dst, src - 1); -		src -= 1; -		dst -= 1; +	if (dst != src) { +		dst[0] = src[0]; +		dst[1] = src[1]; +		dst[2] = src[2];  	} -done: -	u128_xor(dst, dst, (u128 *)walk->iv); -	*(u128 *)walk->iv = last_iv; +	u128_to_be128(&ctrblks[0], iv); +	u128_inc(iv); +	u128_to_be128(&ctrblks[1], iv); +	u128_inc(iv); +	u128_to_be128(&ctrblks[2], iv); +	u128_inc(iv); -	return nbytes; +	twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);  } +EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way); + +static const struct common_glue_ctx twofish_enc = { +	.num_funcs = 2, +	.fpu_blocks_limit = -1, + +	.funcs = { { +		.num_blocks = 3, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) } +	} } +}; -static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, -		       struct scatterlist *src, unsigned int nbytes) -{ -	struct blkcipher_walk walk; -	int err; - -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt(desc, &walk); +static const struct common_glue_ctx twofish_ctr = { +	.num_funcs = 2, +	.fpu_blocks_limit = -1, + +	.funcs = { { +		.num_blocks = 3, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) } +	} } +}; -	while ((nbytes = walk.nbytes)) { -		nbytes = __cbc_decrypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); -	} +static const struct common_glue_ctx twofish_dec = { +	.num_funcs = 2, +	.fpu_blocks_limit = -1, + +	.funcs = { { +		.num_blocks = 3, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) } +	} } +}; -	return err; -} +static const struct common_glue_ctx twofish_dec_cbc = { +	.num_funcs = 2, +	.fpu_blocks_limit = -1, + +	.funcs = { { +		.num_blocks = 3, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } +	}, { +		.num_blocks = 1, +		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) } +	} } +}; -static inline void u128_to_be128(be128 *dst, const u128 *src) +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	dst->a = cpu_to_be64(src->a); -	dst->b = cpu_to_be64(src->b); +	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);  } -static inline void be128_to_u128(u128 *dst, const be128 *src) +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	dst->a = be64_to_cpu(src->a); -	dst->b = be64_to_cpu(src->b); +	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);  } -static inline void u128_inc(u128 
*i) +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	i->b++; -	if (!i->b) -		i->a++; +	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc, +				       dst, src, nbytes);  } -static void ctr_crypt_final(struct blkcipher_desc *desc, -			    struct blkcipher_walk *walk) +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, +		       struct scatterlist *src, unsigned int nbytes)  { -	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	u8 *ctrblk = walk->iv; -	u8 keystream[TF_BLOCK_SIZE]; -	u8 *src = walk->src.virt.addr; -	u8 *dst = walk->dst.virt.addr; -	unsigned int nbytes = walk->nbytes; - -	twofish_enc_blk(ctx, keystream, ctrblk); -	crypto_xor(keystream, src, nbytes); -	memcpy(dst, keystream, nbytes); - -	crypto_inc(ctrblk, TF_BLOCK_SIZE); -} - -static unsigned int __ctr_crypt(struct blkcipher_desc *desc, -				struct blkcipher_walk *walk) -{ -	struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); -	unsigned int bsize = TF_BLOCK_SIZE; -	unsigned int nbytes = walk->nbytes; -	u128 *src = (u128 *)walk->src.virt.addr; -	u128 *dst = (u128 *)walk->dst.virt.addr; -	u128 ctrblk; -	be128 ctrblocks[3]; - -	be128_to_u128(&ctrblk, (be128 *)walk->iv); - -	/* Process three block batch */ -	if (nbytes >= bsize * 3) { -		do { -			if (dst != src) { -				dst[0] = src[0]; -				dst[1] = src[1]; -				dst[2] = src[2]; -			} - -			/* create ctrblks for parallel encrypt */ -			u128_to_be128(&ctrblocks[0], &ctrblk); -			u128_inc(&ctrblk); -			u128_to_be128(&ctrblocks[1], &ctrblk); -			u128_inc(&ctrblk); -			u128_to_be128(&ctrblocks[2], &ctrblk); -			u128_inc(&ctrblk); - -			twofish_enc_blk_xor_3way(ctx, (u8 *)dst, -						 (u8 *)ctrblocks); - -			src += 3; -			dst += 3; -			nbytes -= bsize * 3; -		} while (nbytes >= bsize * 3); - -		if (nbytes < bsize) -			goto done; -	} - -	/* Handle leftovers */ -	do { -		if (dst != src) -			*dst = *src; - -		u128_to_be128(&ctrblocks[0], &ctrblk); -		u128_inc(&ctrblk); - -		twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); -		u128_xor(dst, dst, (u128 *)ctrblocks); - -		src += 1; -		dst += 1; -		nbytes -= bsize; -	} while (nbytes >= bsize); - -done: -	u128_to_be128((be128 *)walk->iv, &ctrblk); -	return nbytes; +	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src, +				       nbytes);  }  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,  		     struct scatterlist *src, unsigned int nbytes)  { -	struct blkcipher_walk walk; -	int err; - -	blkcipher_walk_init(&walk, dst, src, nbytes); -	err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE); - -	while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) { -		nbytes = __ctr_crypt(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, nbytes); -	} - -	if (walk.nbytes) { -		ctr_crypt_final(desc, &walk); -		err = blkcipher_walk_done(desc, &walk, 0); -	} - -	return err; +	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);  }  static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) @@ -397,13 +213,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)  		twofish_dec_blk(ctx, srcdst, srcdst);  } -struct twofish_lrw_ctx { -	struct lrw_table_ctx lrw_table; -	struct twofish_ctx twofish_ctx; -}; - -static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, -			      unsigned int keylen) +int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, +		       unsigned int keylen)  {  	struct 
twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);  	int err; @@ -415,6 +226,7 @@ static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,  	return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);  } +EXPORT_SYMBOL_GPL(lrw_twofish_setkey);  static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,  		       struct scatterlist *src, unsigned int nbytes) @@ -450,20 +262,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,  	return lrw_crypt(desc, dst, src, nbytes, &req);  } -static void lrw_exit_tfm(struct crypto_tfm *tfm) +void lrw_twofish_exit_tfm(struct crypto_tfm *tfm)  {  	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);  	lrw_free_table(&ctx->lrw_table);  } +EXPORT_SYMBOL_GPL(lrw_twofish_exit_tfm); -struct twofish_xts_ctx { -	struct twofish_ctx tweak_ctx; -	struct twofish_ctx crypt_ctx; -}; - -static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, -			      unsigned int keylen) +int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, +		       unsigned int keylen)  {  	struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);  	u32 *flags = &tfm->crt_flags; @@ -486,6 +294,7 @@ static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,  	return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,  				flags);  } +EXPORT_SYMBOL_GPL(xts_twofish_setkey);  static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,  		       struct scatterlist *src, unsigned int nbytes) @@ -596,7 +405,7 @@ static struct crypto_alg tf_algs[5] = { {  	.cra_type		= &crypto_blkcipher_type,  	.cra_module		= THIS_MODULE,  	.cra_list		= LIST_HEAD_INIT(tf_algs[3].cra_list), -	.cra_exit		= lrw_exit_tfm, +	.cra_exit		= lrw_twofish_exit_tfm,  	.cra_u = {  		.blkcipher = {  			.min_keysize	= TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index daeca56211e3..673ac9b63d6b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -38,7 +38,7 @@  int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)  {  	int err = 0; -	bool ia32 = is_ia32_task(); +	bool ia32 = test_thread_flag(TIF_IA32);  	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))  		return -EFAULT; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 49331bedc158..70780689599a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end)  }  #endif	/* CONFIG_SMP */ +#define OLDINSTR(oldinstr)	"661:\n\t" oldinstr "\n662:\n" + +#define b_replacement(number)	"663"#number +#define e_replacement(number)	"664"#number + +#define alt_slen "662b-661b" +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" + +#define ALTINSTR_ENTRY(feature, number)					      \ +	" .long 661b - .\n"				/* label           */ \ +	" .long " b_replacement(number)"f - .\n"	/* new instruction */ \ +	" .word " __stringify(feature) "\n"		/* feature bit     */ \ +	" .byte " alt_slen "\n"				/* source len      */ \ +	" .byte " alt_rlen(number) "\n"			/* replacement len */ + +#define DISCARD_ENTRY(number)				/* rlen <= slen */    \ +	" .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + +#define ALTINSTR_REPLACEMENT(newinstr, feature, number)	/* replacement */     \ +	b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" +  /* alternative assembly primitive: */  #define 
ALTERNATIVE(oldinstr, newinstr, feature)			\ -									\ -      "661:\n\t" oldinstr "\n662:\n"					\ -      ".section .altinstructions,\"a\"\n"				\ -      "	 .long 661b - .\n"			/* label           */	\ -      "	 .long 663f - .\n"			/* new instruction */	\ -      "	 .word " __stringify(feature) "\n"	/* feature bit     */	\ -      "	 .byte 662b-661b\n"			/* sourcelen       */	\ -      "	 .byte 664f-663f\n"			/* replacementlen  */	\ -      ".previous\n"							\ -      ".section .discard,\"aw\",@progbits\n"				\ -      "	 .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */	\ -      ".previous\n"							\ -      ".section .altinstr_replacement, \"ax\"\n"			\ -      "663:\n\t" newinstr "\n664:\n"		/* replacement     */	\ -      ".previous" +	OLDINSTR(oldinstr)						\ +	".section .altinstructions,\"a\"\n"				\ +	ALTINSTR_ENTRY(feature, 1)					\ +	".previous\n"							\ +	".section .discard,\"aw\",@progbits\n"				\ +	DISCARD_ENTRY(1)						\ +	".previous\n"							\ +	".section .altinstr_replacement, \"ax\"\n"			\ +	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\ +	".previous" + +#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ +	OLDINSTR(oldinstr)						\ +	".section .altinstructions,\"a\"\n"				\ +	ALTINSTR_ENTRY(feature1, 1)					\ +	ALTINSTR_ENTRY(feature2, 2)					\ +	".previous\n"							\ +	".section .discard,\"aw\",@progbits\n"				\ +	DISCARD_ENTRY(1)						\ +	DISCARD_ENTRY(2)						\ +	".previous\n"							\ +	".section .altinstr_replacement, \"ax\"\n"			\ +	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\ +	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\ +	".previous"  /*   * This must be included *after* the definition of ALTERNATIVE due to @@ -140,6 +171,19 @@ static inline int alternatives_text_reserved(void *start, void *end)  		: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)  /* + * Like alternative_call, but there are two features and respective functions. + * If CPU has feature2, function2 is used. + * Otherwise, if CPU has feature1, function1 is used. + * Otherwise, old function is used. + */ +#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2,   \ +			   output, input...)				      
\ +	asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ +		"call %P[new2]", feature2)				      \ +		: output : [old] "i" (oldfunc), [new1] "i" (newfunc1),	      \ +		[new2] "i" (newfunc2), ## input) + +/*   * use this macro(s) if you need more than one output parameter   * in alternative_io   */ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 49ad773f4b9f..b3341e9cd8fd 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -26,10 +26,31 @@ struct amd_l3_cache {  	u8	 subcaches[4];  }; +struct threshold_block { +	unsigned int		block; +	unsigned int		bank; +	unsigned int		cpu; +	u32			address; +	u16			interrupt_enable; +	bool			interrupt_capable; +	u16			threshold_limit; +	struct kobject		kobj; +	struct list_head	miscj; +}; + +struct threshold_bank { +	struct kobject		*kobj; +	struct threshold_block	*blocks; + +	/* initialized to the number of CPUs on the node sharing this bank */ +	atomic_t		cpus; +}; +  struct amd_northbridge {  	struct pci_dev *misc;  	struct pci_dev *link;  	struct amd_l3_cache l3_cache; +	struct threshold_bank *bank4;  };  struct amd_northbridge_info { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eaff4790ed96..f34261296ffb 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,8 @@ struct apic {  	unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);  	unsigned long (*check_apicid_present)(int apicid); -	void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); +	void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, +					 const struct cpumask *mask);  	void (*init_apic_ldr)(void);  	void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); @@ -331,9 +332,9 @@ struct apic {  	unsigned long (*set_apic_id)(unsigned int id);  	unsigned long apic_id_mask; -	unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); -	unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, -					       const struct cpumask *andmask); +	int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, +				      const struct cpumask *andmask, +				      unsigned int *apicid);  	/* ipi */  	void (*send_IPI_mask)(const struct cpumask *mask, int vector); @@ -464,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void)  	return apic->safe_wait_icr_idle();  } +extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); +  #else /* CONFIG_X86_LOCAL_APIC */  static inline u32 apic_read(u32 reg) { return 0; } @@ -473,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; }  static inline void apic_icr_write(u32 low, u32 high) { }  static inline void apic_wait_icr_idle(void) { }  static inline u32 safe_apic_wait_icr_idle(void) { return 0; } +static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}  #endif /* CONFIG_X86_LOCAL_APIC */ @@ -537,7 +541,12 @@ static inline const struct cpumask *default_target_cpus(void)  #endif  } -DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +static inline const struct cpumask *online_target_cpus(void) +{ +	return cpu_online_mask; +} + +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);  static inline unsigned int read_apic_id(void) @@ -586,21 +595,50 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)  #endif -static inline unsigned int -default_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +			    
const struct cpumask *andmask, +			    unsigned int *apicid)  { -	return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; +	unsigned long cpu_mask = cpumask_bits(cpumask)[0] & +				 cpumask_bits(andmask)[0] & +				 cpumask_bits(cpu_online_mask)[0] & +				 APIC_ALL_CPUS; + +	if (likely(cpu_mask)) { +		*apicid = (unsigned int)cpu_mask; +		return 0; +	} else { +		return -EINVAL; +	}  } -static inline unsigned int +extern int  default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -			       const struct cpumask *andmask) +			       const struct cpumask *andmask, +			       unsigned int *apicid); + +static inline void +flat_vector_allocation_domain(int cpu, struct cpumask *retmask, +			      const struct cpumask *mask)  { -	unsigned long mask1 = cpumask_bits(cpumask)[0]; -	unsigned long mask2 = cpumask_bits(andmask)[0]; -	unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; +	/* Careful. Some cpus do not strictly honor the set of cpus +	 * specified in the interrupt destination when using lowest +	 * priority interrupt delivery mode. +	 * +	 * In particular there was a hyperthreading cpu observed to +	 * deliver interrupts to the wrong hyperthread when only one +	 * hyperthread was specified in the interrupt desitination. +	 */ +	cpumask_clear(retmask); +	cpumask_bits(retmask)[0] = APIC_ALL_CPUS; +} -	return (unsigned int)(mask1 & mask2 & mask3); +static inline void +default_vector_allocation_domain(int cpu, struct cpumask *retmask, +				 const struct cpumask *mask) +{ +	cpumask_copy(retmask, cpumask_of(cpu));  }  static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a6983b277220..72f5009deb5a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)   * This operation is non-atomic and can be reordered.   * If two examples of this operation race, one can appear to succeed   * but actually fail.  You must protect multiple accesses with a lock. + * + * Note: the operation is performed atomically with respect to + * the local CPU, but not other CPUs. Portable code should not + * rely on this behaviour. 
+ * KVM relies on this behaviour on x86 for modifying memory that is also + * accessed from a hypervisor on the same CPU if running in a VM: don't change + * this without also updating arch/x86/kernel/kvm.c   */  static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)  { diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index eb45aa6b1f27..2ad874cb661c 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -66,6 +66,7 @@ struct setup_header {  	__u64	setup_data;  	__u64	pref_address;  	__u32	init_size; +	__u32	handover_offset;  } __attribute__((packed));  struct sys_desc_table { diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 340ee49961a6..6b7ee5ff6820 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,7 +176,7 @@  #define X86_FEATURE_XSAVEOPT	(7*32+ 4) /* Optimized Xsave */  #define X86_FEATURE_PLN		(7*32+ 5) /* Intel Power Limit Notification */  #define X86_FEATURE_PTS		(7*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_DTS		(7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_DTHERM	(7*32+ 7) /* Digital Thermal Sensor */  #define X86_FEATURE_HW_PSTATE	(7*32+ 8) /* AMD HW-PState */  /* Virtualization flags: Linux defined, word 8 */ @@ -207,6 +207,8 @@  #define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */  #define X86_FEATURE_INVPCID	(9*32+10) /* Invalidate Processor Context ID */  #define X86_FEATURE_RTM		(9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_RDSEED	(9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX		(9*32+19) /* The ADCX and ADOX instructions */  #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/crypto/ablk_helper.h b/arch/x86/include/asm/crypto/ablk_helper.h new file mode 100644 index 000000000000..4f93df50c23e --- /dev/null +++ b/arch/x86/include/asm/crypto/ablk_helper.h @@ -0,0 +1,31 @@ +/* + * Shared async block cipher helpers + */ + +#ifndef _CRYPTO_ABLK_HELPER_H +#define _CRYPTO_ABLK_HELPER_H + +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <crypto/cryptd.h> + +struct async_helper_ctx { +	struct cryptd_ablkcipher *cryptd_tfm; +}; + +extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, +			unsigned int key_len); + +extern int __ablk_encrypt(struct ablkcipher_request *req); + +extern int ablk_encrypt(struct ablkcipher_request *req); + +extern int ablk_decrypt(struct ablkcipher_request *req); + +extern void ablk_exit(struct crypto_tfm *tfm); + +extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name); + +extern int ablk_init(struct crypto_tfm *tfm); + +#endif /* _CRYPTO_ABLK_HELPER_H */ diff --git a/arch/x86/include/asm/aes.h b/arch/x86/include/asm/crypto/aes.h index 80545a1cbe39..80545a1cbe39 100644 --- a/arch/x86/include/asm/aes.h +++ b/arch/x86/include/asm/crypto/aes.h diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h new file mode 100644 index 000000000000..3e408bddc96f --- /dev/null +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -0,0 +1,115 @@ +/* + * Shared glue code for 128bit block ciphers + */ + +#ifndef _CRYPTO_GLUE_HELPER_H +#define _CRYPTO_GLUE_HELPER_H + +#include <linux/kernel.h> +#include <linux/crypto.h> +#include <asm/i387.h> +#include <crypto/b128ops.h> + +typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); +typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, 
const u128 *src); +typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, +				       u128 *iv); + +#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) +#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) +#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) + +struct common_glue_func_entry { +	unsigned int num_blocks; /* number of blocks that @fn will process */ +	union { +		common_glue_func_t ecb; +		common_glue_cbc_func_t cbc; +		common_glue_ctr_func_t ctr; +	} fn_u; +}; + +struct common_glue_ctx { +	unsigned int num_funcs; +	int fpu_blocks_limit; /* -1 means fpu not needed at all */ + +	/* +	 * First funcs entry must have largest num_blocks and last funcs entry +	 * must have num_blocks == 1! +	 */ +	struct common_glue_func_entry funcs[]; +}; + +static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, +				  struct blkcipher_desc *desc, +				  bool fpu_enabled, unsigned int nbytes) +{ +	if (likely(fpu_blocks_limit < 0)) +		return false; + +	if (fpu_enabled) +		return true; + +	/* +	 * Vector-registers are only used when chunk to be processed is large +	 * enough, so do not enable FPU until it is necessary. +	 */ +	if (nbytes < bsize * (unsigned int)fpu_blocks_limit) +		return false; + +	if (desc) { +		/* prevent sleeping if FPU is in use */ +		desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; +	} + +	kernel_fpu_begin(); +	return true; +} + +static inline void glue_fpu_end(bool fpu_enabled) +{ +	if (fpu_enabled) +		kernel_fpu_end(); +} + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ +	dst->a = cpu_to_be64(src->a); +	dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ +	dst->a = be64_to_cpu(src->a); +	dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ +	i->b++; +	if (!i->b) +		i->a++; +} + +extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, +				 struct blkcipher_desc *desc, +				 struct scatterlist *dst, +				 struct scatterlist *src, unsigned int nbytes); + +extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn, +				   struct blkcipher_desc *desc, +				   struct scatterlist *dst, +				   struct scatterlist *src, +				   unsigned int nbytes); + +extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, +				   struct blkcipher_desc *desc, +				   struct scatterlist *dst, +				   struct scatterlist *src, +				   unsigned int nbytes); + +extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, +				 struct blkcipher_desc *desc, +				 struct scatterlist *dst, +				 struct scatterlist *src, unsigned int nbytes); + +#endif /* _CRYPTO_GLUE_HELPER_H */ diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h new file mode 100644 index 000000000000..432deedd2945 --- /dev/null +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -0,0 +1,32 @@ +#ifndef ASM_X86_SERPENT_AVX_H +#define ASM_X86_SERPENT_AVX_H + +#include <linux/crypto.h> +#include <crypto/serpent.h> + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, +					   const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, +					 const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, +				   const u8 *src) +{ +	__serpent_enc_blk_8way_avx(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, +				       const 
u8 *src) +{ +	__serpent_enc_blk_8way_avx(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, +				   const u8 *src) +{ +	serpent_dec_blk_8way_avx(ctx, dst, src); +} + +#endif diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/crypto/serpent-sse2.h index d3ef63fe0c81..e6e77dffbdab 100644 --- a/arch/x86/include/asm/serpent.h +++ b/arch/x86/include/asm/crypto/serpent-sse2.h @@ -1,5 +1,5 @@ -#ifndef ASM_X86_SERPENT_H -#define ASM_X86_SERPENT_H +#ifndef ASM_X86_SERPENT_SSE2_H +#define ASM_X86_SERPENT_SSE2_H  #include <linux/crypto.h>  #include <crypto/serpent.h> diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h new file mode 100644 index 000000000000..9d2c514bd5f9 --- /dev/null +++ b/arch/x86/include/asm/crypto/twofish.h @@ -0,0 +1,46 @@ +#ifndef ASM_X86_TWOFISH_H +#define ASM_X86_TWOFISH_H + +#include <linux/crypto.h> +#include <crypto/twofish.h> +#include <crypto/lrw.h> +#include <crypto/b128ops.h> + +struct twofish_lrw_ctx { +	struct lrw_table_ctx lrw_table; +	struct twofish_ctx twofish_ctx; +}; + +struct twofish_xts_ctx { +	struct twofish_ctx tweak_ctx; +	struct twofish_ctx crypt_ctx; +}; + +/* regular block cipher functions from twofish_x86_64 module */ +asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, +				const u8 *src); +asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, +				const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, +				       const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, +				     const u8 *src); + +/* helpers from twofish_x86_64-3way module */ +extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); +extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, +				u128 *iv); +extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, +				     u128 *iv); + +extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, +			      unsigned int keylen); + +extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm); + +extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, +			      unsigned int keylen); + +#endif /* ASM_X86_TWOFISH_H */ diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index cc70c1c78ca4..75ce3f47d204 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -4,9 +4,7 @@  enum reboot_type {  	BOOT_TRIPLE = 't',  	BOOT_KBD = 'k', -#ifdef CONFIG_X86_32  	BOOT_BIOS = 'b', -#endif  	BOOT_ACPI = 'a',  	BOOT_EFI = 'e',  	BOOT_CF9 = 'p', diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 0baa628e330c..40afa0005c69 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -15,15 +15,6 @@ BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)  BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)  BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)  BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) - -.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ -	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 -.if NUM_INVALIDATE_TLB_VECTORS > \idx -BUILD_INTERRUPT3(invalidate_interrupt\idx, -		 (INVALIDATE_TLB_VECTOR_START)+\idx, -		 smp_invalidate_interrupt) -.endif -.endr  #endif  BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) diff 
--git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index dbe82a5c5eac..d3d74698dce9 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)  		virtual_dma_residue += virtual_dma_count;  		virtual_dma_count = 0;  #ifdef TRACE_FLPY_INT -		printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", +		printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",  		       virtual_dma_count, virtual_dma_residue, calls, bytes,  		       dma_wait);  		calls = 0; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 7a15153c675d..b518c7509933 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper;  extern const struct hypervisor_x86 x86_hyper_vmware;  extern const struct hypervisor_x86 x86_hyper_ms_hyperv;  extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_kvm;  static inline bool hypervisor_x2apic_available(void)  { diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index dffc38ee6255..345c99cef152 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -5,7 +5,6 @@ extern struct dma_map_ops nommu_dma_ops;  extern int force_iommu, no_iommu;  extern int iommu_detected;  extern int iommu_pass_through; -extern int iommu_group_mf;  /* 10 seconds */  #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4b4448761e88..1508e518c7e3 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -119,17 +119,6 @@   */  #define LOCAL_TIMER_VECTOR		0xef -/* up to 32 vectors used for spreading out TLB flushes: */ -#if NR_CPUS <= 32 -# define NUM_INVALIDATE_TLB_VECTORS	(NR_CPUS) -#else -# define NUM_INVALIDATE_TLB_VECTORS	(32) -#endif - -#define INVALIDATE_TLB_VECTOR_END	(0xee) -#define INVALIDATE_TLB_VECTOR_START	\ -	(INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1) -  #define NR_VECTORS			 256  #define FPU_IRQ				  13 diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index e7d1c194d272..246617efd67f 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -12,6 +12,7 @@  /* Select x86 specific features in <linux/kvm.h> */  #define __KVM_HAVE_PIT  #define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_IRQ_LINE  #define __KVM_HAVE_DEVICE_ASSIGNMENT  #define __KVM_HAVE_MSI  #define __KVM_HAVE_USER_NMI diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1ac46c22dd50..c764f43b71c5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -192,8 +192,8 @@ struct x86_emulate_ops {  			 struct x86_instruction_info *info,  			 enum x86_intercept_stage stage); -	bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, -			 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); +	void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, +			  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);  };  typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -280,9 +280,9 @@ struct x86_emulate_ctxt {  	u8 modrm_seg;  	bool rip_relative;  	unsigned long _eip; +	struct operand memop;  	/* Fields above regs are cleared together. 
*/  	unsigned long regs[NR_VCPU_REGS]; -	struct operand memop;  	struct operand *memopp;  	struct fetch_cache fetch;  	struct read_cache io_read; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db7c1f2709a2..09155d64cf7e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,12 +48,13 @@  #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)  #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL  #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS |	\  				  0xFFFFFF0000000000ULL)  #define CR4_RESERVED_BITS                                               \  	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\  			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \ -			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \ +			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \  			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \  			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) @@ -175,6 +176,13 @@ enum {  /* apic attention bits */  #define KVM_APIC_CHECK_VAPIC	0 +/* + * The following bit is set with PV-EOI, unset on EOI. + * We detect PV-EOI changes by guest by comparing + * this bit with PV-EOI in guest memory. + * See the implementation in apic_update_pv_eoi. + */ +#define KVM_APIC_PV_EOI_PENDING	1  /*   * We don't want allocation failures within the mmu code, so we preallocate @@ -313,8 +321,8 @@ struct kvm_pmu {  	u64 counter_bitmask[2];  	u64 global_ctrl_mask;  	u8 version; -	struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; -	struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; +	struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; +	struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];  	struct irq_work irq_work;  	u64 reprogram_pmi;  }; @@ -484,6 +492,11 @@ struct kvm_vcpu_arch {  		u64 length;  		u64 status;  	} osvw; + +	struct { +		u64 msr_val; +		struct gfn_to_hva_cache data; +	} pv_eoi;  };  struct kvm_lpage_info { @@ -661,6 +674,7 @@ struct kvm_x86_ops {  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);  	int (*get_lpage_level)(void);  	bool (*rdtscp_supported)(void); +	bool (*invpcid_supported)(void);  	void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host);  	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,  void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);  bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); -int kvm_pic_set_irq(void *opaque, int irq, int level); +static inline int __kvm_irq_line_state(unsigned long *irq_state, +				       int irq_source_id, int level) +{ +	/* Logical OR for level trig interrupt */ +	if (level) +		__set_bit(irq_source_id, irq_state); +	else +		__clear_bit(irq_source_id, irq_state); + +	return !!(*irq_state); +} + +int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); +void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);  void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 63ab1661d00e..2f7712e08b1e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -22,6 +22,7 @@  #define KVM_FEATURE_CLOCKSOURCE2        3  #define KVM_FEATURE_ASYNC_PF		4  #define KVM_FEATURE_STEAL_TIME		5 +#define KVM_FEATURE_PV_EOI		6  /* The last 8 
bits are used to indicate how to interpret the flags field   * in pvclock structure. If no bits are set, all flags are ignored. @@ -37,6 +38,7 @@  #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01  #define MSR_KVM_ASYNC_PF_EN 0x4b564d02  #define MSR_KVM_STEAL_TIME  0x4b564d03 +#define MSR_KVM_PV_EOI_EN      0x4b564d04  struct kvm_steal_time {  	__u64 steal; @@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data {  	__u32 enabled;  }; +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 +  #ifdef __KERNEL__  #include <asm/processor.h> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 084ef95274cd..813ed103f45e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,  extern unsigned long long native_read_tsc(void); -extern int native_rdmsr_safe_regs(u32 regs[8]); -extern int native_wrmsr_safe_regs(u32 regs[8]); +extern int rdmsr_safe_regs(u32 regs[8]); +extern int wrmsr_safe_regs(u32 regs[8]);  static __always_inline unsigned long long __native_read_tsc(void)  { @@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)  	return err;  } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ -	u32 gprs[8] = { 0 }; -	int err; - -	gprs[1] = msr; -	gprs[7] = 0x9c5a203a; - -	err = native_rdmsr_safe_regs(gprs); - -	*p = gprs[0] | ((u64)gprs[2] << 32); - -	return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ -	u32 gprs[8] = { 0 }; - -	gprs[0] = (u32)val; -	gprs[1] = msr; -	gprs[2] = val >> 32; -	gprs[7] = 0x9c5a203a; - -	return native_wrmsr_safe_regs(gprs); -} - -static inline int rdmsr_safe_regs(u32 regs[8]) -{ -	return native_rdmsr_safe_regs(regs); -} - -static inline int wrmsr_safe_regs(u32 regs[8]) -{ -	return native_wrmsr_safe_regs(regs); -} -  #define rdtscl(low)						\  	((low) = (u32)__native_read_tsc()) @@ -237,6 +200,8 @@ do {							\  	(high) = (u32)(_l >> 32);			\  } while (0) +#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) +  #define rdtscp(low, high, aux)					\  do {                                                            \  	unsigned long long _val = native_read_tscp(&(aux));     \ @@ -248,8 +213,7 @@ do {                                                            \  #endif	/* !CONFIG_PARAVIRT */ - -#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val),		\ +#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val),		\  					     (u32)((val) >> 32))  #define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index dc580c42851c..c0fa356e90de 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -44,28 +44,14 @@ struct nmiaction {  	const char		*name;  }; -#define register_nmi_handler(t, fn, fg, n)		\ +#define register_nmi_handler(t, fn, fg, n, init...)	\  ({							\ -	static struct nmiaction fn##_na = {		\ +	static struct nmiaction init fn##_na = {	\  		.handler = (fn),			\  		.name = (n),				\  		.flags = (fg),				\  	};						\ -	__register_nmi_handler((t), &fn##_na);	\ -}) - -/* - * For special handlers that register/unregister in the - * init section only.  This should be considered rare. 
- */ -#define register_nmi_handler_initonly(t, fn, fg, n)		\ -({							\ -	static struct nmiaction fn##_na __initdata = {		\ -		.handler = (fn),			\ -		.name = (n),				\ -		.flags = (fg),				\ -	};						\ -	__register_nmi_handler((t), &fn##_na);	\ +	__register_nmi_handler((t), &fn##_na);		\  })  int __register_nmi_handler(unsigned int, struct nmiaction *); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6cbbabf52707..a0facf3908d7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err)  	return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);  } -static inline int paravirt_rdmsr_regs(u32 *regs) -{ -	return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); -} -  static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)  {  	return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);  } -static inline int paravirt_wrmsr_regs(u32 *regs) -{ -	return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs); -} -  /* These should all do BUG_ON(_err), but our headers are too tangled. */  #define rdmsr(msr, val1, val2)			\  do {						\ @@ -176,9 +166,6 @@ do {						\  	_err;					\  }) -#define rdmsr_safe_regs(regs)	paravirt_rdmsr_regs(regs) -#define wrmsr_safe_regs(regs)	paravirt_wrmsr_regs(regs) -  static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)  {  	int err; @@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)  	*p = paravirt_read_msr(msr, &err);  	return err;  } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ -	u32 gprs[8] = { 0 }; -	int err; - -	gprs[1] = msr; -	gprs[7] = 0x9c5a203a; - -	err = paravirt_rdmsr_regs(gprs); - -	*p = gprs[0] | ((u64)gprs[2] << 32); - -	return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ -	u32 gprs[8] = { 0 }; - -	gprs[0] = (u32)val; -	gprs[1] = msr; -	gprs[2] = val >> 32; -	gprs[7] = 0x9c5a203a; - -	return paravirt_wrmsr_regs(gprs); -}  static inline u64 paravirt_read_tsc(void)  { @@ -252,6 +213,8 @@ do {						\  	high = _l >> 32;			\  } while (0) +#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) +  static inline unsigned long long paravirt_rdtscp(unsigned int *aux)  {  	return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); @@ -397,9 +360,10 @@ static inline void __flush_tlb_single(unsigned long addr)  static inline void flush_tlb_others(const struct cpumask *cpumask,  				    struct mm_struct *mm, -				    unsigned long va) +				    unsigned long start, +				    unsigned long end)  { -	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va); +	PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);  }  static inline int paravirt_pgd_alloc(struct mm_struct *mm) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8e8b9a4987ee..142236ed83af 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -153,9 +153,7 @@ struct pv_cpu_ops {  	/* MSR, PMC and TSR operations.  	   err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. 
*/  	u64 (*read_msr)(unsigned int msr, int *err); -	int (*rdmsr_regs)(u32 *regs);  	int (*write_msr)(unsigned int msr, unsigned low, unsigned high); -	int (*wrmsr_regs)(u32 *regs);  	u64 (*read_tsc)(void);  	u64 (*read_pmc)(int counter); @@ -250,7 +248,8 @@ struct pv_mmu_ops {  	void (*flush_tlb_single)(unsigned long addr);  	void (*flush_tlb_others)(const struct cpumask *cpus,  				 struct mm_struct *mm, -				 unsigned long va); +				 unsigned long start, +				 unsigned long end);  	/* Hooks for allocating and freeing a pagetable top-level */  	int  (*pgd_alloc)(struct mm_struct *mm); diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b3a531746026..73e8eeff22ee 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -7,9 +7,13 @@  #undef DEBUG  #ifdef DEBUG -#define DBG(x...) printk(x) +#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__)  #else -#define DBG(x...) +#define DBG(fmt, ...)				\ +do {						\ +	if (0)					\ +		printk(fmt, ##__VA_ARGS__);	\ +} while (0)  #endif  #define PCI_PROBE_BIOS		0x0001 @@ -100,6 +104,7 @@ struct pci_raw_ops {  extern const struct pci_raw_ops *raw_pci_ops;  extern const struct pci_raw_ops *raw_pci_ext_ops; +extern const struct pci_raw_ops pci_mmcfg;  extern const struct pci_raw_ops pci_direct_conf1;  extern bool port_cf9_safe; @@ -135,6 +140,12 @@ struct pci_mmcfg_region {  extern int __init pci_mmcfg_arch_init(void);  extern void __init pci_mmcfg_arch_free(void); +extern int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg); +extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg); +extern int __devinit pci_mmconfig_insert(struct device *dev, +					 u16 seg, u8 start, +					 u8 end, phys_addr_t addr); +extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);  extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);  extern struct list_head pci_mmcfg_list; diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d9b8e3f7f42a..1104afaba52b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -551,6 +551,12 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);  				{ [0 ... NR_CPUS-1] = _initvalue };	\  	__typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map +#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue)	\ +	DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue;		\ +	__typeof__(_type) _name##_early_map[NR_CPUS] __initdata =	\ +				{ [0 ... 
NR_CPUS-1] = _initvalue };	\ +	__typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map +  #define EXPORT_EARLY_PER_CPU_SYMBOL(_name)			\  	EXPORT_PER_CPU_SYMBOL(_name) @@ -559,6 +565,11 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);  	extern __typeof__(_type) *_name##_early_ptr;		\  	extern __typeof__(_type)  _name##_early_map[] +#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name)		\ +	DECLARE_PER_CPU_READ_MOSTLY(_type, _name);		\ +	extern __typeof__(_type) *_name##_early_ptr;		\ +	extern __typeof__(_type)  _name##_early_map[] +  #define	early_per_cpu_ptr(_name) (_name##_early_ptr)  #define	early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])  #define	early_per_cpu(_name, _cpu) 				\ @@ -570,12 +581,18 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);  #define	DEFINE_EARLY_PER_CPU(_type, _name, _initvalue)		\  	DEFINE_PER_CPU(_type, _name) = _initvalue +#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue)	\ +	DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue +  #define EXPORT_EARLY_PER_CPU_SYMBOL(_name)			\  	EXPORT_PER_CPU_SYMBOL(_name)  #define DECLARE_EARLY_PER_CPU(_type, _name)			\  	DECLARE_PER_CPU(_type, _name) +#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name)		\ +	DECLARE_PER_CPU_READ_MOSTLY(_type, _name) +  #define	early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)  #define	early_per_cpu_ptr(_name) NULL  /* no early_per_cpu_map() */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 588f52ea810e..c78f14a0df00 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -5,11 +5,10 @@   * Performance event hw details:   */ -#define X86_PMC_MAX_GENERIC				       32 -#define X86_PMC_MAX_FIXED					3 +#define INTEL_PMC_MAX_GENERIC				       32 +#define INTEL_PMC_MAX_FIXED					3 +#define INTEL_PMC_IDX_FIXED				       32 -#define X86_PMC_IDX_GENERIC				        0 -#define X86_PMC_IDX_FIXED				       32  #define X86_PMC_IDX_MAX					       64  #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1 @@ -48,8 +47,7 @@  	(X86_RAW_EVENT_MASK          |  \  	 AMD64_EVENTSEL_EVENT)  #define AMD64_NUM_COUNTERS				4 -#define AMD64_NUM_COUNTERS_F15H				6 -#define AMD64_NUM_COUNTERS_MAX				AMD64_NUM_COUNTERS_F15H +#define AMD64_NUM_COUNTERS_CORE				6  #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		0x3c  #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8) @@ -121,16 +119,16 @@ struct x86_pmu_capability {  /* Instr_Retired.Any: */  #define MSR_ARCH_PERFMON_FIXED_CTR0	0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS	(X86_PMC_IDX_FIXED + 0) +#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS	(INTEL_PMC_IDX_FIXED + 0)  /* CPU_CLK_Unhalted.Core: */  #define MSR_ARCH_PERFMON_FIXED_CTR1	0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES	(X86_PMC_IDX_FIXED + 1) +#define INTEL_PMC_IDX_FIXED_CPU_CYCLES	(INTEL_PMC_IDX_FIXED + 1)  /* CPU_CLK_Unhalted.Ref: */  #define MSR_ARCH_PERFMON_FIXED_CTR2	0x30b -#define X86_PMC_IDX_FIXED_REF_CYCLES	(X86_PMC_IDX_FIXED + 2) -#define X86_PMC_MSK_FIXED_REF_CYCLES	(1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) +#define INTEL_PMC_IDX_FIXED_REF_CYCLES	(INTEL_PMC_IDX_FIXED + 2) +#define INTEL_PMC_MSK_FIXED_REF_CYCLES	(1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES)  /*   * We model BTS tracing as another fixed-mode PMC. @@ -139,7 +137,7 @@ struct x86_pmu_capability {   * values are used by actual fixed events and higher values are used   * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.   
*/ -#define X86_PMC_IDX_FIXED_BTS				(X86_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS				(INTEL_PMC_IDX_FIXED + 16)  /*   * IBS cpuid feature detection @@ -234,6 +232,7 @@ struct perf_guest_switch_msr {  extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);  extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern void perf_check_microcode(void);  #else  static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)  { @@ -247,6 +246,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)  }  static inline void perf_events_lapic_init(void)	{ } +static inline void perf_check_microcode(void) { }  #endif  #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 98391db840c6..f2b489cf1602 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -2,9 +2,9 @@  #define _ASM_X86_PGTABLE_2LEVEL_H  #define pte_ERROR(e) \ -	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) +	pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low)  #define pgd_ERROR(e) \ -	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) +	pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e))  /*   * Certain architectures need to do special things when PTEs diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 43876f16caf1..4cc9f2b7cdc3 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -9,13 +9,13 @@   */  #define pte_ERROR(e)							\ -	printk("%s:%d: bad pte %p(%08lx%08lx).\n",			\ +	pr_err("%s:%d: bad pte %p(%08lx%08lx)\n",			\  	       __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)  #define pmd_ERROR(e)							\ -	printk("%s:%d: bad pmd %p(%016Lx).\n",				\ +	pr_err("%s:%d: bad pmd %p(%016Lx)\n",				\  	       __FILE__, __LINE__, &(e), pmd_val(e))  #define pgd_ERROR(e)							\ -	printk("%s:%d: bad pgd %p(%016Lx).\n",				\ +	pr_err("%s:%d: bad pgd %p(%016Lx)\n",				\  	       __FILE__, __LINE__, &(e), pgd_val(e))  /* Rules for using set_pte: the pte being assigned *must* be @@ -47,16 +47,26 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)   * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd   * operations.   * - * Without THP if the mmap_sem is hold for reading, the - * pmd can only transition from null to not null while pmd_read_atomic runs. - * So there's no need of literally reading it atomically. + * Without THP if the mmap_sem is hold for reading, the pmd can only + * transition from null to not null while pmd_read_atomic runs. So + * we can always return atomic pmd values with this function.   *   * With THP if the mmap_sem is hold for reading, the pmd can become - * THP or null or point to a pte (and in turn become "stable") at any - * time under pmd_read_atomic, so it's mandatory to read it atomically - * with cmpxchg8b. + * trans_huge or none or point to a pte (and in turn become "stable") + * at any time under pmd_read_atomic. We could read it really + * atomically here with a atomic64_read for the THP enabled case (and + * it would be a whole lot simpler), but to avoid using cmpxchg8b we + * only return an atomic pmdval if the low part of the pmdval is later + * found stable (i.e. pointing to a pte). And we're returning a none + * pmdval if the low part of the pmd is none. 
In some cases the high + * and low part of the pmdval returned may not be consistent if THP is + * enabled (the low part may point to previously mapped hugepage, + * while the high part may point to a more recently mapped hugepage), + * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part + * of the pmd to be read atomically to decide if the pmd is unstable + * or not, with the only exception of when the low part of the pmd is + * zero in which case we return a none pmd.   */ -#ifndef CONFIG_TRANSPARENT_HUGEPAGE  static inline pmd_t pmd_read_atomic(pmd_t *pmdp)  {  	pmdval_t ret; @@ -74,12 +84,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)  	return (pmd_t) { ret };  } -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ -	return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */  static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)  { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 975f709e09ae..8251be02301e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[];  extern void paging_init(void);  #define pte_ERROR(e)					\ -	printk("%s:%d: bad pte %p(%016lx).\n",		\ +	pr_err("%s:%d: bad pte %p(%016lx)\n",		\  	       __FILE__, __LINE__, &(e), pte_val(e))  #define pmd_ERROR(e)					\ -	printk("%s:%d: bad pmd %p(%016lx).\n",		\ +	pr_err("%s:%d: bad pmd %p(%016lx)\n",		\  	       __FILE__, __LINE__, &(e), pmd_val(e))  #define pud_ERROR(e)					\ -	printk("%s:%d: bad pud %p(%016lx).\n",		\ +	pr_err("%s:%d: bad pud %p(%016lx)\n",		\  	       __FILE__, __LINE__, &(e), pud_val(e))  #define pgd_ERROR(e)					\ -	printk("%s:%d: bad pgd %p(%016lx).\n",		\ +	pr_err("%s:%d: bad pgd %p(%016lx)\n",		\  	       __FILE__, __LINE__, &(e), pgd_val(e))  struct mm_struct; diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index f8ab3eaad128..aea1d1d848c7 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -44,6 +44,7 @@   */  #define X86_CR3_PWT	0x00000008 /* Page Write Through */  #define X86_CR3_PCD	0x00000010 /* Page Cache Disable */ +#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */  /*   * Intel CPU features in CR4 @@ -61,6 +62,7 @@  #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */  #define X86_CR4_VMXE	0x00002000 /* enable VMX virtualization */  #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ +#define X86_CR4_PCIDE	0x00020000 /* enable PCID support */  #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */  #define X86_CR4_SMEP	0x00100000 /* enable SMEP support */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 39bc5777211a..d048cad9bcad 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -61,6 +61,19 @@ static inline void *current_text_addr(void)  # define ARCH_MIN_MMSTRUCT_ALIGN	0  #endif +enum tlb_infos { +	ENTRIES, +	NR_INFO +}; + +extern u16 __read_mostly tlb_lli_4k[NR_INFO]; +extern u16 __read_mostly tlb_lli_2m[NR_INFO]; +extern u16 __read_mostly tlb_lli_4m[NR_INFO]; +extern u16 __read_mostly tlb_lld_4k[NR_INFO]; +extern u16 __read_mostly tlb_lld_2m[NR_INFO]; +extern u16 __read_mostly tlb_lld_4m[NR_INFO]; +extern s8  __read_mostly tlb_flushall_shift; +  /*   *  CPU type and hardware bug flags. Kept separately for each CPU.   
*  Members of this structure are referenced in head.S, so think twice diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fce3f4ae5bd6..fe1ec5bcd846 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -21,8 +21,9 @@ struct real_mode_header {  	u32	wakeup_header;  #endif  	/* APM/BIOS reboot */ -#ifdef CONFIG_X86_32  	u32	machine_real_restart_asm; +#ifdef CONFIG_X86_64 +	u32	machine_real_restart_seg;  #endif  }; diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 92f297069e87..a82c4f1b4d83 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -18,8 +18,8 @@ extern struct machine_ops machine_ops;  void native_machine_crash_shutdown(struct pt_regs *regs);  void native_machine_shutdown(void); -void machine_real_restart(unsigned int type); -/* These must match dispatch_table in reboot_32.S */ +void __noreturn machine_real_restart(unsigned int type); +/* These must match dispatch in arch/x86/realmode/rm/reboot.S */  #define MRR_BIOS	0  #define MRR_APM		1 diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index f48394513c37..4f19a1526037 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -31,12 +31,12 @@ static inline bool cpu_has_ht_siblings(void)  	return has_siblings;  } -DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); -DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);  /* cpus sharing the last level cache: */ -DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); -DECLARE_PER_CPU(u16, cpu_llc_id); -DECLARE_PER_CPU(int, cpu_number); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);  static inline struct cpumask *cpu_sibling_mask(int cpu)  { @@ -53,10 +53,10 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)  	return per_cpu(cpu_llc_shared_map, cpu);  } -DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); -DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);  #endif  /* Static state in head.S used to set up a CPU */ @@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);  void smp_store_cpu_info(int id);  #define cpu_physical_id(cpu)	per_cpu(x86_cpu_to_apicid, cpu) -/* We don't mark CPUs online until __cpu_up(), so we need another measure */ -static inline int num_booting_cpus(void) -{ -	return cpumask_weight(cpu_callout_mask); -}  #else /* !CONFIG_SMP */  #define wbinvd_on_cpu(cpu)     wbinvd()  static inline int wbinvd_on_all_cpus(void) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 829215fef9ee..4fef20773b8f 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -4,7 +4,14 @@  #define tlb_start_vma(tlb, vma) do { } while (0)  #define tlb_end_vma(tlb, vma) do { } while (0)  #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) + +#define tlb_flush(tlb)							\ +{									\ +	if (tlb->fullmm == 0)						\ +		flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 
0UL);	\ +	else								\ +		flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL);	\ +}  #include <asm-generic/tlb.h> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 36a1a2ab87d2..74a44333545a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -73,14 +73,10 @@ static inline void __flush_tlb_one(unsigned long addr)   *  - flush_tlb_page(vma, vmaddr) flushes one page   *  - flush_tlb_range(vma, start, end) flushes a range of pages   *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - *  - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus + *  - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus   *   * ..but the i386 has somewhat limited tlb flushing capabilities,   * and page-granular flushes are available only on i486 and up. - * - * x86-64 can only flush individual pages or full VMs. For a range flush - * we always do the full VM. Might be worth trying if for a small - * range a few INVLPGs in a row are a win.   */  #ifndef CONFIG_SMP @@ -109,9 +105,17 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,  		__flush_tlb();  } +static inline void flush_tlb_mm_range(struct mm_struct *mm, +	   unsigned long start, unsigned long end, unsigned long vmflag) +{ +	if (mm == current->active_mm) +		__flush_tlb(); +} +  static inline void native_flush_tlb_others(const struct cpumask *cpumask,  					   struct mm_struct *mm, -					   unsigned long va) +					   unsigned long start, +					   unsigned long end)  {  } @@ -119,27 +123,35 @@ static inline void reset_lazy_tlbstate(void)  {  } +static inline void flush_tlb_kernel_range(unsigned long start, +					  unsigned long end) +{ +	flush_tlb_all(); +} +  #else  /* SMP */  #include <asm/smp.h>  #define local_flush_tlb() __flush_tlb() +#define flush_tlb_mm(mm)	flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) + +#define flush_tlb_range(vma, start, end)	\ +		flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) +  extern void flush_tlb_all(void);  extern void flush_tlb_current_task(void); -extern void flush_tlb_mm(struct mm_struct *);  extern void flush_tlb_page(struct vm_area_struct *, unsigned long); +extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, +				unsigned long end, unsigned long vmflag); +extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);  #define flush_tlb()	flush_tlb_current_task() -static inline void flush_tlb_range(struct vm_area_struct *vma, -				   unsigned long start, unsigned long end) -{ -	flush_tlb_mm(vma->vm_mm); -} -  void native_flush_tlb_others(const struct cpumask *cpumask, -			     struct mm_struct *mm, unsigned long va); +				struct mm_struct *mm, +				unsigned long start, unsigned long end);  #define TLBSTATE_OK	1  #define TLBSTATE_LAZY	2 @@ -159,13 +171,8 @@ static inline void reset_lazy_tlbstate(void)  #endif	/* SMP */  #ifndef CONFIG_PARAVIRT -#define flush_tlb_others(mask, mm, va)	native_flush_tlb_others(mask, mm, va) +#define flush_tlb_others(mask, mm, start, end)	\ +	native_flush_tlb_others(mask, mm, start, end)  #endif -static inline void flush_tlb_kernel_range(unsigned long start, -					  unsigned long end) -{ -	flush_tlb_all(); -} -  #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8e796fbbf9c6..d8def8b3dba0 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -17,6 +17,8 @@  /* Handles exceptions in both to and from, but doesn't 
do access_ok */  __must_check unsigned long +copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); +__must_check unsigned long  copy_user_generic_string(void *to, const void *from, unsigned len);  __must_check unsigned long  copy_user_generic_unrolled(void *to, const void *from, unsigned len); @@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len)  {  	unsigned ret; -	alternative_call(copy_user_generic_unrolled, +	/* +	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string. +	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. +	 * Otherwise, use copy_user_generic_unrolled. +	 */ +	alternative_call_2(copy_user_generic_unrolled,  			 copy_user_generic_string,  			 X86_FEATURE_REP_GOOD, +			 copy_user_enhanced_fast_string, +			 X86_FEATURE_ERMS,  			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),  				     "=d" (len)),  			 "1" (to), "2" (from), "3" (len) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1e9bed14f7ae..f3971bbcd1de 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -48,7 +48,7 @@ struct arch_uprobe_task {  #endif  }; -extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);  extern int  arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);  extern int  arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);  extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 3bb9491b7659..b47c2a82ff15 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -15,7 +15,8 @@ extern void uv_nmi_init(void);  extern void uv_system_init(void);  extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,  						 struct mm_struct *mm, -						 unsigned long va, +						 unsigned long start, +						 unsigned end,  						 unsigned int cpu);  #else	/* X86_UV */ @@ -26,7 +27,7 @@ static inline void uv_cpu_init(void)	{ }  static inline void uv_system_init(void)	{ }  static inline const struct cpumask *  uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, -		    unsigned long va, unsigned int cpu) +		    unsigned long start, unsigned long end, unsigned int cpu)  { return cpumask; }  #endif	/* X86_UV */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 6149b476d9df..a06983cdc125 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -140,6 +140,9 @@  #define IPI_RESET_LIMIT			1  /* after this # consecutive successes, bump up the throttle if it was lowered */  #define COMPLETE_THRESHOLD		5 +/* after this # of giveups (fall back to kernel IPI's) disable the use of +   the BAU for a period of time */ +#define GIVEUP_LIMIT			100  #define UV_LB_SUBNODEID			0x10 @@ -166,7 +169,6 @@  #define FLUSH_RETRY_TIMEOUT		2  #define FLUSH_GIVEUP			3  #define FLUSH_COMPLETE			4 -#define FLUSH_RETRY_BUSYBUG		5  /*   * tuning the action when the numalink network is extremely delayed @@ -175,7 +177,7 @@  						   microseconds */  #define CONGESTED_REPS			10	/* long delays averaged over  						   this many broadcasts */ -#define CONGESTED_PERIOD		30	/* time for the bau to be +#define DISABLED_PERIOD			10	/* time for the bau to be  						   disabled, in seconds */  /* see msg_type: */  #define MSG_NOOP			0 @@ 
-520,6 +522,12 @@ struct ptc_stats {  	unsigned long	s_uv2_wars;		/* uv2 workaround, perm. busy */  	unsigned long	s_uv2_wars_hw;		/* uv2 workaround, hiwater */  	unsigned long	s_uv2_war_waits;	/* uv2 workaround, long waits */ +	unsigned long	s_overipilimit;		/* over the ipi reset limit */ +	unsigned long	s_giveuplimit;		/* disables, over giveup limit*/ +	unsigned long	s_enters;		/* entries to the driver */ +	unsigned long	s_ipifordisabled;	/* fall back to IPI; disabled */ +	unsigned long	s_plugged;		/* plugged by h/w bug*/ +	unsigned long	s_congested;		/* giveup on long wait */  	/* destination statistics */  	unsigned long	d_alltlb;		/* times all tlb's on this  						   cpu were flushed */ @@ -586,8 +594,8 @@ struct bau_control {  	int			timeout_tries;  	int			ipi_attempts;  	int			conseccompletes; -	int			baudisabled; -	int			set_bau_off; +	short			nobau; +	short			baudisabled;  	short			cpu;  	short			osnode;  	short			uvhub_cpu; @@ -596,14 +604,16 @@ struct bau_control {  	short			cpus_in_socket;  	short			cpus_in_uvhub;  	short			partition_base_pnode; -	short			using_desc; /* an index, like uvhub_cpu */ -	unsigned int		inuse_map; +	short			busy;       /* all were busy (war) */  	unsigned short		message_number;  	unsigned short		uvhub_quiesce;  	short			socket_acknowledge_count[DEST_Q_SIZE];  	cycles_t		send_message; +	cycles_t		period_end; +	cycles_t		period_time;  	spinlock_t		uvhub_lock;  	spinlock_t		queue_lock; +	spinlock_t		disable_lock;  	/* tunables */  	int			max_concurr;  	int			max_concurr_const; @@ -614,9 +624,9 @@ struct bau_control {  	int			complete_threshold;  	int			cong_response_us;  	int			cong_reps; -	int			cong_period; -	unsigned long		clocks_per_100_usec; -	cycles_t		period_time; +	cycles_t		disabled_period; +	int			period_giveups; +	int			giveup_limit;  	long			period_requests;  	struct hub_and_pnode	*thp;  }; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 31f180c21ce9..74fcb963595b 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -60,6 +60,7 @@  #define SECONDARY_EXEC_WBINVD_EXITING		0x00000040  #define SECONDARY_EXEC_UNRESTRICTED_GUEST	0x00000080  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400 +#define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000  #define PIN_BASED_EXT_INTR_MASK                 0x00000001 @@ -281,6 +282,7 @@ enum vmcs_field {  #define EXIT_REASON_EPT_MISCONFIG       49  #define EXIT_REASON_WBINVD		54  #define EXIT_REASON_XSETBV		55 +#define EXIT_REASON_INVPCID		58  /*   * Interruption-information format @@ -404,6 +406,7 @@ enum vmcs_field {  #define VMX_EPTP_WB_BIT				(1ull << 14)  #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)  #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17) +#define VMX_EPT_AD_BIT					(1ull << 21)  #define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)  #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)  #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26) @@ -415,11 +418,14 @@ enum vmcs_field {  #define VMX_EPT_MAX_GAW				0x4  #define VMX_EPT_MT_EPTE_SHIFT			3  #define VMX_EPT_GAW_EPTP_SHIFT			3 +#define VMX_EPT_AD_ENABLE_BIT			(1ull << 6)  #define VMX_EPT_DEFAULT_MT			0x6ull  #define VMX_EPT_READABLE_MASK			0x1ull  #define VMX_EPT_WRITABLE_MASK			0x2ull  #define VMX_EPT_EXECUTABLE_MASK			0x4ull  #define VMX_EPT_IPAT_BIT    			(1ull << 6) +#define VMX_EPT_ACCESS_BIT				(1ull << 8) +#define VMX_EPT_DIRTY_BIT				(1ull << 9)  #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 92e54abf89e0..f90f0a587c66 
100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -9,15 +9,6 @@  #include <asm/ipi.h>  #include <linux/cpumask.h> -/* - * Need to use more than cpu 0, because we need more vectors - * when MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) -{ -	return cpu_online_mask; -} -  static int x2apic_apic_id_valid(int apicid)  {  	return 1; @@ -28,15 +19,6 @@ static int x2apic_apic_id_registered(void)  	return 1;  } -/* - * For now each logical cpu is in its own vector allocation domain. - */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	cpumask_clear(retmask); -	cpumask_set_cpu(cpu, retmask); -} -  static void  __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)  { diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c090af10ac7d..38155f667144 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -156,7 +156,6 @@ struct x86_cpuinit_ops {  /**   * struct x86_platform_ops - platform specific runtime functions   * @calibrate_tsc:		calibrate TSC - * @wallclock_init:		init the wallclock device   * @get_wallclock:		get time from HW clock like RTC etc.   * @set_wallclock:		set time back to HW clock   * @is_untracked_pat_range	exclude from PAT logic @@ -164,10 +163,10 @@ struct x86_cpuinit_ops {   * @i8042_detect		pre-detect if i8042 controller exists   * @save_sched_clock_state:	save state for sched_clock() on suspend   * @restore_sched_clock_state:	restore state for sched_clock() on resume + * @apic_post_init:		adjust apic if needed   */  struct x86_platform_ops {  	unsigned long (*calibrate_tsc)(void); -	void (*wallclock_init)(void);  	unsigned long (*get_wallclock)(void);  	int (*set_wallclock)(unsigned long nowtime);  	void (*iommu_shutdown)(void); @@ -177,6 +176,7 @@ struct x86_platform_ops {  	int (*i8042_detect)(void);  	void (*save_sched_clock_state)(void);  	void (*restore_sched_clock_state)(void); +	void (*apic_post_init)(void);  };  struct pci_dev; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8afb69319815..b2297e58c6ed 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,  		return 0;  	} -	if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { +	if (intsrc->source_irq == 0) {  		if (acpi_skip_timer_override) { -			printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); +			printk(PREFIX "BIOS IRQ0 override ignored.\n");  			return 0;  		} -		if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + +		if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity +			&& (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {  			intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;  			printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");  		} @@ -1334,17 +1336,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)  }  /* - * Force ignoring BIOS IRQ0 pin2 override + * Force ignoring BIOS IRQ0 override  */  static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)  { -	/* -	 * The ati_ixp4x0_rev() early PCI quirk should have set -	 * the acpi_skip_timer_override flag already: -	 */  	if (!acpi_skip_timer_override) { -		WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); -		pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", +		pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",  			d->ident);  		
acpi_skip_timer_override = 1;  	} @@ -1438,7 +1435,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {  	 * is enabled.  This input is incorrectly designated the  	 * ISA IRQ 0 via an interrupt source override even though  	 * it is wired to the output of the master 8259A and INTIN0 -	 * is not connected at all.  Force ignoring BIOS IRQ0 pin2 +	 * is not connected at all.  Force ignoring BIOS IRQ0  	 * override in that cases.  	 */  	{ @@ -1473,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {  		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),  		     },  	 }, +	{ +	 .callback = dmi_ignore_irq0_timer_override, +	 .ident = "FUJITSU SIEMENS", +	 .matches = { +		     DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), +		     DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), +		     }, +	 },  	{}  }; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794f0759..931280ff8299 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "SMP alternatives: " fmt +  #include <linux/module.h>  #include <linux/sched.h>  #include <linux/mutex.h> @@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str)  __setup("noreplace-paravirt", setup_noreplace_paravirt);  #endif -#define DPRINTK(fmt, args...) if (debug_alternative) \ -	printk(KERN_DEBUG fmt, args) +#define DPRINTK(fmt, ...)				\ +do {							\ +	if (debug_alternative)				\ +		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\ +} while (0)  /*   * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes @@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp)  	 * If this still occurs then you should see a hang  	 * or crash shortly after this line:  	 */ -	printk("lockdep: fixing up alternatives.\n"); +	pr_info("lockdep: fixing up alternatives\n");  #endif  	if (noreplace_smp || smp_alt_once || skip_smp_alternatives) @@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp)  	if (smp == smp_mode) {  		/* nothing */  	} else if (smp) { -		printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); +		pr_info("switching to SMP code\n");  		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);  		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);  		list_for_each_entry(mod, &smp_alt_modules, next)  			alternatives_smp_lock(mod->locks, mod->locks_end,  					      mod->text, mod->text_end);  	} else { -		printk(KERN_INFO "SMP alternatives: switching to UP code\n"); +		pr_info("switching to UP code\n");  		set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);  		set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);  		list_for_each_entry(mod, &smp_alt_modules, next) @@ -546,7 +551,7 @@ void __init alternative_instructions(void)  #ifdef CONFIG_SMP  	if (smp_alt_once) {  		if (1 == num_possible_cpus()) { -			printk(KERN_INFO "SMP alternatives: switching to UP code\n"); +			pr_info("switching to UP code\n");  			set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);  			set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); @@ -664,7 +669,7 @@ static int __kprobes stop_machine_text_poke(void *data)  	struct text_poke_param *p;  	int i; -	if (atomic_dec_and_test(&stop_machine_first)) { +	if (atomic_xchg(&stop_machine_first, 0)) {  		for (i = 0; i < tpp->nparams; i++) {  			p = &tpp->params[i];  			text_poke(p->addr, p->opcode, p->len); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index be16854591cc..aadf3359e2a7 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -2,6 +2,9 @@   * Shared support code for AMD K8 
northbridges and derivates.   * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.   */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/types.h>  #include <linux/slab.h>  #include <linux/init.h> @@ -16,6 +19,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, +	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },  	{}  };  EXPORT_SYMBOL(amd_nb_misc_ids); @@ -258,7 +262,7 @@ void amd_flush_garts(void)  	}  	spin_unlock_irqrestore(&gart_lock, flags);  	if (!flushed) -		printk("nothing to flush?\n"); +		pr_notice("nothing to flush?\n");  }  EXPORT_SYMBOL_GPL(amd_flush_garts); @@ -269,11 +273,10 @@ static __init int init_amd_nbs(void)  	err = amd_cache_northbridges();  	if (err < 0) -		printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); +		pr_notice("Cannot enumerate AMD northbridges\n");  	if (amd_cache_gart() < 0) -		printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " -		       "GART support disabled.\n"); +		pr_notice("Cannot initialize GART flush words, GART support disabled\n");  	return err;  } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39a222e094af..24deb3082328 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -75,8 +75,8 @@ physid_mask_t phys_cpu_present_map;  /*   * Map cpu index to physical APIC ID   */ -DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);  EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);  EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); @@ -88,7 +88,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);   * used for the mapping.  This is where the behaviors of x86_64 and 32   * actually diverge.  Let's keep it ugly for now.   */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);  /*   * Knob to control our willingness to enable the local APIC. @@ -2123,6 +2123,42 @@ void default_init_apic_ldr(void)  	apic_write(APIC_LDR, val);  } +int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +				   const struct cpumask *andmask, +				   unsigned int *apicid) +{ +	unsigned int cpu; + +	for_each_cpu_and(cpu, cpumask, andmask) { +		if (cpumask_test_cpu(cpu, cpu_online_mask)) +			break; +	} + +	if (likely(cpu < nr_cpu_ids)) { +		*apicid = per_cpu(x86_cpu_to_apicid, cpu); +		return 0; +	} + +	return -EINVAL; +} + +/* + * Override the generic EOI implementation with an optimized version. + * Only called during early boot when only one CPU is active and with + * interrupts disabled, so we know this does not race with actual APIC driver + * use. 
+ */ +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) +{ +	struct apic **drv; + +	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { +		/* Should happen once for each apic */ +		WARN_ON((*drv)->eoi_write == eoi_write); +		(*drv)->eoi_write = eoi_write; +	} +} +  /*   * Power management   */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 0e881c46e8c8..00c77cf78e9e 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -36,25 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  	return 1;  } -static const struct cpumask *flat_target_cpus(void) -{ -	return cpu_online_mask; -} - -static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	/* Careful. Some cpus do not strictly honor the set of cpus -	 * specified in the interrupt destination when using lowest -	 * priority interrupt delivery mode. -	 * -	 * In particular there was a hyperthreading cpu observed to -	 * deliver interrupts to the wrong hyperthread when only one -	 * hyperthread was specified in the interrupt desitination. -	 */ -	cpumask_clear(retmask); -	cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} -  /*   * Set up the logical destination ID.   * @@ -92,7 +73,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)  }  static void - flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)  {  	unsigned long mask = cpumask_bits(cpumask)[0];  	int cpu = smp_processor_id(); @@ -186,7 +167,7 @@ static struct apic apic_flat =  {  	.irq_delivery_mode		= dest_LowestPrio,  	.irq_dest_mode			= 1, /* logical */ -	.target_cpus			= flat_target_cpus, +	.target_cpus			= online_target_cpus,  	.disable_esr			= 0,  	.dest_logical			= APIC_DEST_LOGICAL,  	.check_apicid_used		= NULL, @@ -210,8 +191,7 @@ static struct apic apic_flat =  {  	.set_apic_id			= set_apic_id,  	.apic_id_mask			= 0xFFu << 24, -	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and, +	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,  	.send_IPI_mask			= flat_send_IPI_mask,  	.send_IPI_mask_allbutself	= flat_send_IPI_mask_allbutself, @@ -262,17 +242,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)  	return 0;  } -static const struct cpumask *physflat_target_cpus(void) -{ -	return cpu_online_mask; -} - -static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	cpumask_clear(retmask); -	cpumask_set_cpu(cpu, retmask); -} -  static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)  {  	default_send_IPI_mask_sequence_phys(cpumask, vector); @@ -294,38 +263,6 @@ static void physflat_send_IPI_all(int vector)  	physflat_send_IPI_mask(cpu_online_mask, vector);  } -static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ -	int cpu; - -	/* -	 * We're using fixed IRQ delivery, can only return one phys APIC ID. -	 * May as well be the first. -	 */ -	cpu = cpumask_first(cpumask); -	if ((unsigned)cpu < nr_cpu_ids) -		return per_cpu(x86_cpu_to_apicid, cpu); -	else -		return BAD_APICID; -} - -static unsigned int -physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -				const struct cpumask *andmask) -{ -	int cpu; - -	/* -	 * We're using fixed IRQ delivery, can only return one phys APIC ID. -	 * May as well be the first. 
-	 */ -	for_each_cpu_and(cpu, cpumask, andmask) { -		if (cpumask_test_cpu(cpu, cpu_online_mask)) -			break; -	} -	return per_cpu(x86_cpu_to_apicid, cpu); -} -  static int physflat_probe(void)  {  	if (apic == &apic_physflat || num_possible_cpus() > 8) @@ -345,13 +282,13 @@ static struct apic apic_physflat =  {  	.irq_delivery_mode		= dest_Fixed,  	.irq_dest_mode			= 0, /* physical */ -	.target_cpus			= physflat_target_cpus, +	.target_cpus			= online_target_cpus,  	.disable_esr			= 0,  	.dest_logical			= 0,  	.check_apicid_used		= NULL,  	.check_apicid_present		= NULL, -	.vector_allocation_domain	= physflat_vector_allocation_domain, +	.vector_allocation_domain	= default_vector_allocation_domain,  	/* not needed, but shouldn't hurt: */  	.init_apic_ldr			= flat_init_apic_ldr, @@ -370,8 +307,7 @@ static struct apic apic_physflat =  {  	.set_apic_id			= set_apic_id,  	.apic_id_mask			= 0xFFu << 24, -	.cpu_mask_to_apicid		= physflat_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and		= physflat_cpu_mask_to_apicid_and, +	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,  	.send_IPI_mask			= physflat_send_IPI_mask,  	.send_IPI_mask_allbutself	= physflat_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index a6e4c6e06c08..e145f28b4099 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,12 +100,12 @@ static unsigned long noop_check_apicid_present(int bit)  	return physid_isset(bit, phys_cpu_present_map);  } -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, +					  const struct cpumask *mask)  {  	if (cpu != 0)  		pr_warning("APIC: Vector allocated for non-BSP cpu\n"); -	cpumask_clear(retmask); -	cpumask_set_cpu(cpu, retmask); +	cpumask_copy(retmask, cpumask_of(cpu));  }  static u32 noop_apic_read(u32 reg) @@ -159,8 +159,7 @@ struct apic apic_noop = {  	.set_apic_id			= NULL,  	.apic_id_mask			= 0x0F << 24, -	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and, +	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,  	.send_IPI_mask			= noop_send_IPI_mask,  	.send_IPI_mask_allbutself	= noop_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 6ec6d5d297c3..bc552cff2578 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -72,17 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)  	return initial_apic_id >> index_msb;  } -static const struct cpumask *numachip_target_cpus(void) -{ -	return cpu_online_mask; -} - -static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	cpumask_clear(retmask); -	cpumask_set_cpu(cpu, retmask); -} -  static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)  {  	union numachip_csr_g3_ext_irq_gen int_gen; @@ -157,38 +146,6 @@ static void numachip_send_IPI_self(int vector)  	__default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);  } -static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ -	int cpu; - -	/* -	 * We're using fixed IRQ delivery, can only return one phys APIC ID. -	 * May as well be the first. 
-	 */ -	cpu = cpumask_first(cpumask); -	if (likely((unsigned)cpu < nr_cpu_ids)) -		return per_cpu(x86_cpu_to_apicid, cpu); - -	return BAD_APICID; -} - -static unsigned int -numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -				const struct cpumask *andmask) -{ -	int cpu; - -	/* -	 * We're using fixed IRQ delivery, can only return one phys APIC ID. -	 * May as well be the first. -	 */ -	for_each_cpu_and(cpu, cpumask, andmask) { -		if (cpumask_test_cpu(cpu, cpu_online_mask)) -			break; -	} -	return per_cpu(x86_cpu_to_apicid, cpu); -} -  static int __init numachip_probe(void)  {  	return apic == &apic_numachip; @@ -253,13 +210,13 @@ static struct apic apic_numachip __refconst = {  	.irq_delivery_mode		= dest_Fixed,  	.irq_dest_mode			= 0, /* physical */ -	.target_cpus			= numachip_target_cpus, +	.target_cpus			= online_target_cpus,  	.disable_esr			= 0,  	.dest_logical			= 0,  	.check_apicid_used		= NULL,  	.check_apicid_present		= NULL, -	.vector_allocation_domain	= numachip_vector_allocation_domain, +	.vector_allocation_domain	= default_vector_allocation_domain,  	.init_apic_ldr			= flat_init_apic_ldr,  	.ioapic_phys_id_map		= NULL, @@ -277,8 +234,7 @@ static struct apic apic_numachip __refconst = {  	.set_apic_id			= set_apic_id,  	.apic_id_mask			= 0xffU << 24, -	.cpu_mask_to_apicid		= numachip_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and		= numachip_cpu_mask_to_apicid_and, +	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,  	.send_IPI_mask			= numachip_send_IPI_mask,  	.send_IPI_mask_allbutself	= numachip_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 31fbdbfbf960..d50e3640d5ae 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void)  	return 1;  } -static const struct cpumask *bigsmp_target_cpus(void) -{ -#ifdef CONFIG_SMP -	return cpu_online_mask; -#else -	return cpumask_of(0); -#endif -} -  static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)  {  	return 0; @@ -105,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)  	return 1;  } -/* As we are using single CPU as destination, pick only one CPU here */ -static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ -	int cpu = cpumask_first(cpumask); - -	if (cpu < nr_cpu_ids) -		return cpu_physical_id(cpu); -	return BAD_APICID; -} - -static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -			      const struct cpumask *andmask) -{ -	int cpu; - -	/* -	 * We're using fixed IRQ delivery, can only return one phys APIC ID. -	 * May as well be the first. 
-	 */ -	for_each_cpu_and(cpu, cpumask, andmask) { -		if (cpumask_test_cpu(cpu, cpu_online_mask)) -			return cpu_physical_id(cpu); -	} -	return BAD_APICID; -} -  static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)  {  	return cpuid_apic >> index_msb; @@ -177,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {  	{ } /* NULL entry stops DMI scanning */  }; -static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	cpumask_clear(retmask); -	cpumask_set_cpu(cpu, retmask); -} -  static int probe_bigsmp(void)  {  	if (def_to_bigsmp) @@ -205,13 +164,13 @@ static struct apic apic_bigsmp = {  	/* phys delivery to target CPU: */  	.irq_dest_mode			= 0, -	.target_cpus			= bigsmp_target_cpus, +	.target_cpus			= default_target_cpus,  	.disable_esr			= 1,  	.dest_logical			= 0,  	.check_apicid_used		= bigsmp_check_apicid_used,  	.check_apicid_present		= bigsmp_check_apicid_present, -	.vector_allocation_domain	= bigsmp_vector_allocation_domain, +	.vector_allocation_domain	= default_vector_allocation_domain,  	.init_apic_ldr			= bigsmp_init_apic_ldr,  	.ioapic_phys_id_map		= bigsmp_ioapic_phys_id_map, @@ -229,8 +188,7 @@ static struct apic apic_bigsmp = {  	.set_apic_id			= NULL,  	.apic_id_mask			= 0xFF << 24, -	.cpu_mask_to_apicid		= bigsmp_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and		= bigsmp_cpu_mask_to_apicid_and, +	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,  	.send_IPI_mask			= bigsmp_send_IPI_mask,  	.send_IPI_mask_allbutself	= NULL, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index db4ab1be3c79..0874799a98c6 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void)  		WARN(1, "Command failed, status = %x\n", mip_status);  } -static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	/* Careful. Some cpus do not strictly honor the set of cpus -	 * specified in the interrupt destination when using lowest -	 * priority interrupt delivery mode. -	 * -	 * In particular there was a hyperthreading cpu observed to -	 * deliver interrupts to the wrong hyperthread when only one -	 * hyperthread was specified in the interrupt desitination. -	 */ -	cpumask_clear(retmask); -	cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - -  static void es7000_wait_for_init_deassert(atomic_t *deassert)  {  	while (!atomic_read(deassert)) @@ -540,45 +525,49 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid)  	return 1;  } -static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)  {  	unsigned int round = 0; -	int cpu, uninitialized_var(apicid); +	unsigned int cpu, uninitialized_var(apicid);  	/*  	 * The cpus in the mask must all be on the apic cluster.  	 
*/ -	for_each_cpu(cpu, cpumask) { +	for_each_cpu_and(cpu, cpumask, cpu_online_mask) {  		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);  		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {  			WARN(1, "Not a valid mask!"); -			return BAD_APICID; +			return -EINVAL;  		} -		apicid = new_apicid; +		apicid |= new_apicid;  		round++;  	} -	return apicid; +	if (!round) +		return -EINVAL; +	*dest_id = apicid; +	return 0;  } -static unsigned int +static int  es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, -			      const struct cpumask *andmask) +			      const struct cpumask *andmask, +			      unsigned int *apicid)  { -	int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);  	cpumask_var_t cpumask; +	*apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);  	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) -		return apicid; +		return 0;  	cpumask_and(cpumask, inmask, andmask); -	cpumask_and(cpumask, cpumask, cpu_online_mask); -	apicid = es7000_cpu_mask_to_apicid(cpumask); +	es7000_cpu_mask_to_apicid(cpumask, apicid);  	free_cpumask_var(cpumask); -	return apicid; +	return 0;  }  static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) @@ -638,7 +627,7 @@ static struct apic __refdata apic_es7000_cluster = {  	.check_apicid_used		= es7000_check_apicid_used,  	.check_apicid_present		= es7000_check_apicid_present, -	.vector_allocation_domain	= es7000_vector_allocation_domain, +	.vector_allocation_domain	= flat_vector_allocation_domain,  	.init_apic_ldr			= es7000_init_apic_ldr_cluster,  	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map, @@ -656,7 +645,6 @@ static struct apic __refdata apic_es7000_cluster = {  	.set_apic_id			= NULL,  	.apic_id_mask			= 0xFF << 24, -	.cpu_mask_to_apicid		= es7000_cpu_mask_to_apicid,  	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and,  	.send_IPI_mask			= es7000_send_IPI_mask, @@ -705,7 +693,7 @@ static struct apic __refdata apic_es7000 = {  	.check_apicid_used		= es7000_check_apicid_used,  	.check_apicid_present		= es7000_check_apicid_present, -	.vector_allocation_domain	= es7000_vector_allocation_domain, +	.vector_allocation_domain	= flat_vector_allocation_domain,  	.init_apic_ldr			= es7000_init_apic_ldr,  	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map, @@ -723,7 +711,6 @@ static struct apic __refdata apic_es7000 = {  	.set_apic_id			= NULL,  	.apic_id_mask			= 0xFF << 24, -	.cpu_mask_to_apicid		= es7000_cpu_mask_to_apicid,  	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and,  	.send_IPI_mask			= es7000_send_IPI_mask, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5f0ff597437c..406eee784684 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi  	entry = alloc_irq_pin_list(node);  	if (!entry) { -		printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", -				node, apic, pin); +		pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", +		       node, apic, pin);  		return -ENOMEM;  	}  	entry->apic = apic; @@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)  	ioapic_mask_entry(apic, pin);  	entry = ioapic_read_entry(apic, pin);  	if (entry.irr) -		printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", +		pr_err("Unable to reset IRR for apic: %d, pin :%d\n",  		       mpc_ioapic_id(apic), pin);  } @@ -895,7 +895,7 @@ static int irq_polarity(int idx)  		}  		case 2: /* reserved */  		{ -			
printk(KERN_WARNING "broken BIOS!!\n"); +			pr_warn("broken BIOS!!\n");  			polarity = 1;  			break;  		} @@ -906,7 +906,7 @@ static int irq_polarity(int idx)  		}  		default: /* invalid */  		{ -			printk(KERN_WARNING "broken BIOS!!\n"); +			pr_warn("broken BIOS!!\n");  			polarity = 1;  			break;  		} @@ -948,7 +948,7 @@ static int irq_trigger(int idx)  				}  				default:  				{ -					printk(KERN_WARNING "broken BIOS!!\n"); +					pr_warn("broken BIOS!!\n");  					trigger = 1;  					break;  				} @@ -962,7 +962,7 @@ static int irq_trigger(int idx)  		}  		case 2: /* reserved */  		{ -			printk(KERN_WARNING "broken BIOS!!\n"); +			pr_warn("broken BIOS!!\n");  			trigger = 1;  			break;  		} @@ -973,7 +973,7 @@ static int irq_trigger(int idx)  		}  		default: /* invalid */  		{ -			printk(KERN_WARNING "broken BIOS!!\n"); +			pr_warn("broken BIOS!!\n");  			trigger = 0;  			break;  		} @@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin)  	 * Debugging check, we are in big trouble if this message pops up!  	 */  	if (mp_irqs[idx].dstirq != pin) -		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); +		pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");  	if (test_bit(bus, mp_bus_not_pci)) {  		irq = mp_irqs[idx].srcbusirq; @@ -1112,8 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)  	 * 0x80, because int 0x80 is hm, kind of importantish. ;)  	 */  	static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; -	static int current_offset = VECTOR_OFFSET_START % 8; -	unsigned int old_vector; +	static int current_offset = VECTOR_OFFSET_START % 16;  	int cpu, err;  	cpumask_var_t tmp_mask; @@ -1123,35 +1122,45 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)  	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))  		return -ENOMEM; -	old_vector = cfg->vector; -	if (old_vector) { -		cpumask_and(tmp_mask, mask, cpu_online_mask); -		cpumask_and(tmp_mask, cfg->domain, tmp_mask); -		if (!cpumask_empty(tmp_mask)) { -			free_cpumask_var(tmp_mask); -			return 0; -		} -	} -  	/* Only try and allocate irqs on cpus that are present */  	err = -ENOSPC; -	for_each_cpu_and(cpu, mask, cpu_online_mask) { -		int new_cpu; -		int vector, offset; +	cpumask_clear(cfg->old_domain); +	cpu = cpumask_first_and(mask, cpu_online_mask); +	while (cpu < nr_cpu_ids) { +		int new_cpu, vector, offset; -		apic->vector_allocation_domain(cpu, tmp_mask); +		apic->vector_allocation_domain(cpu, tmp_mask, mask); + +		if (cpumask_subset(tmp_mask, cfg->domain)) { +			err = 0; +			if (cpumask_equal(tmp_mask, cfg->domain)) +				break; +			/* +			 * New cpumask using the vector is a proper subset of +			 * the current in use mask. So cleanup the vector +			 * allocation for the members that are not used anymore. +			 */ +			cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); +			cfg->move_in_progress = 1; +			cpumask_and(cfg->domain, cfg->domain, tmp_mask); +			break; +		}  		vector = current_vector;  		offset = current_offset;  next: -		vector += 8; +		vector += 16;  		if (vector >= first_system_vector) { -			/* If out of vectors on large boxen, must share them. 
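/*
 * Illustrative note, not part of the patch: __assign_irq_vector() now steps
 * candidate vectors in strides of 16 instead of 8. Since the x86 interrupt
 * priority class is the upper nibble of the vector number (vector >> 4), a
 * 16-vector stride keeps successive allocations in distinct priority classes.
 * The stepping logic, isolated here from the surrounding allocation loop:
 */
static int next_candidate_vector(int vector, int *offset)
{
	vector += 16;
	if (vector >= first_system_vector) {
		/* wrap around and try the next offset within the stride */
		*offset = (*offset + 1) % 16;
		vector = FIRST_EXTERNAL_VECTOR + *offset;
	}
	return vector;
}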
*/ -			offset = (offset + 1) % 8; +			offset = (offset + 1) % 16;  			vector = FIRST_EXTERNAL_VECTOR + offset;  		} -		if (unlikely(current_vector == vector)) + +		if (unlikely(current_vector == vector)) { +			cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); +			cpumask_andnot(tmp_mask, mask, cfg->old_domain); +			cpu = cpumask_first_and(tmp_mask, cpu_online_mask);  			continue; +		}  		if (test_bit(vector, used_vectors))  			goto next; @@ -1162,7 +1171,7 @@ next:  		/* Found one! */  		current_vector = vector;  		current_offset = offset; -		if (old_vector) { +		if (cfg->vector) {  			cfg->move_in_progress = 1;  			cpumask_copy(cfg->old_domain, cfg->domain);  		} @@ -1346,18 +1355,18 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,  	if (!IO_APIC_IRQ(irq))  		return; -	/* -	 * For legacy irqs, cfg->domain starts with cpu 0 for legacy -	 * controllers like 8259. Now that IO-APIC can handle this irq, update -	 * the cfg->domain. -	 */ -	if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) -		apic->vector_allocation_domain(0, cfg->domain);  	if (assign_irq_vector(irq, cfg, apic->target_cpus()))  		return; -	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); +	if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), +					 &dest)) { +		pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", +			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); +		__clear_irq_vector(irq, cfg); + +		return; +	}  	apic_printk(APIC_VERBOSE,KERN_DEBUG  		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@ -1366,7 +1375,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,  		    cfg->vector, irq, attr->trigger, attr->polarity, dest);  	if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { -		pr_warn("Failed to setup ioapic entry for ioapic  %d, pin %d\n", +		pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",  			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);  		__clear_irq_vector(irq, cfg); @@ -1469,9 +1478,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)   * Set up the timer pin, possibly with the 8259A-master behind.   */  static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, -					 unsigned int pin, int vector) +					unsigned int pin, int vector)  {  	struct IO_APIC_route_entry entry; +	unsigned int dest;  	if (irq_remapping_enabled)  		return; @@ -1482,9 +1492,13 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,  	 * We use logical delivery to get the timer IRQ  	 * to the first CPU.  	 */ +	if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), +						  apic->target_cpus(), &dest))) +		dest = BAD_APICID; +  	entry.dest_mode = apic->irq_dest_mode;  	entry.mask = 0;			/* don't mask IRQ for edge */ -	entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); +	entry.dest = dest;  	entry.delivery_mode = apic->irq_delivery_mode;  	entry.polarity = 0;  	entry.trigger = 0; @@ -1521,7 +1535,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)  		reg_03.raw = io_apic_read(ioapic_idx, 3);  	raw_spin_unlock_irqrestore(&ioapic_lock, flags); -	printk("\n");  	printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));  	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);  	printk(KERN_DEBUG ".......    
: physical APIC id: %02X\n", reg_00.bits.ID); @@ -1578,7 +1591,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)  				i,  				ir_entry->index  			); -			printk("%1d   %1d    %1d    %1d   %1d   " +			pr_cont("%1d   %1d    %1d    %1d   %1d   "  				"%1d    %1d     %X    %02X\n",  				ir_entry->format,  				ir_entry->mask, @@ -1598,7 +1611,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)  				i,  				entry.dest  			); -			printk("%1d    %1d    %1d   %1d   %1d    " +			pr_cont("%1d    %1d    %1d   %1d   %1d    "  				"%1d    %1d    %02X\n",  				entry.mask,  				entry.trigger, @@ -1651,8 +1664,8 @@ __apicdebuginit(void) print_IO_APICs(void)  			continue;  		printk(KERN_DEBUG "IRQ%d ", irq);  		for_each_irq_pin(entry, cfg->irq_2_pin) -			printk("-> %d:%d", entry->apic, entry->pin); -		printk("\n"); +			pr_cont("-> %d:%d", entry->apic, entry->pin); +		pr_cont("\n");  	}  	printk(KERN_INFO ".................................... done.\n"); @@ -1665,9 +1678,9 @@ __apicdebuginit(void) print_APIC_field(int base)  	printk(KERN_DEBUG);  	for (i = 0; i < 8; i++) -		printk(KERN_CONT "%08x", apic_read(base + i*0x10)); +		pr_cont("%08x", apic_read(base + i*0x10)); -	printk(KERN_CONT "\n"); +	pr_cont("\n");  }  __apicdebuginit(void) print_local_APIC(void *dummy) @@ -1769,7 +1782,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)  			printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);  		}  	} -	printk("\n"); +	pr_cont("\n");  }  __apicdebuginit(void) print_local_APICs(int maxcpu) @@ -2065,7 +2078,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)  		reg_00.raw = io_apic_read(ioapic_idx, 0);  		raw_spin_unlock_irqrestore(&ioapic_lock, flags);  		if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) -			printk("could not set ID!\n"); +			pr_cont("could not set ID!\n");  		else  			apic_printk(APIC_VERBOSE, " ok.\n");  	} @@ -2210,71 +2223,6 @@ void send_cleanup_vector(struct irq_cfg *cfg)  	cfg->move_in_progress = 0;  } -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ -	int apic, pin; -	struct irq_pin_list *entry; -	u8 vector = cfg->vector; - -	for_each_irq_pin(entry, cfg->irq_2_pin) { -		unsigned int reg; - -		apic = entry->apic; -		pin = entry->pin; -		/* -		 * With interrupt-remapping, destination information comes -		 * from interrupt-remapping table entry. -		 */ -		if (!irq_remapped(cfg)) -			io_apic_write(apic, 0x11 + pin*2, dest); -		reg = io_apic_read(apic, 0x10 + pin*2); -		reg &= ~IO_APIC_REDIR_VECTOR_MASK; -		reg |= vector; -		io_apic_modify(apic, 0x10 + pin*2, reg); -	} -} - -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, -			  unsigned int *dest_id) -{ -	struct irq_cfg *cfg = data->chip_data; - -	if (!cpumask_intersects(mask, cpu_online_mask)) -		return -1; - -	if (assign_irq_vector(data->irq, data->chip_data, mask)) -		return -1; - -	cpumask_copy(data->affinity, mask); - -	*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); -	return 0; -} - -static int -ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, -		    bool force) -{ -	unsigned int dest, irq = data->irq; -	unsigned long flags; -	int ret; - -	raw_spin_lock_irqsave(&ioapic_lock, flags); -	ret = __ioapic_set_affinity(data, mask, &dest); -	if (!ret) { -		/* Only the high 8 bits are valid. 
*/ -		dest = SET_APIC_LOGICAL_ID(dest); -		__target_IO_APIC_irq(irq, dest, data->chip_data); -	} -	raw_spin_unlock_irqrestore(&ioapic_lock, flags); -	return ret; -} -  asmlinkage void smp_irq_move_cleanup_interrupt(void)  {  	unsigned vector, me; @@ -2362,6 +2310,87 @@ void irq_force_complete_move(int irq)  static inline void irq_complete_move(struct irq_cfg *cfg) { }  #endif +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ +	int apic, pin; +	struct irq_pin_list *entry; +	u8 vector = cfg->vector; + +	for_each_irq_pin(entry, cfg->irq_2_pin) { +		unsigned int reg; + +		apic = entry->apic; +		pin = entry->pin; +		/* +		 * With interrupt-remapping, destination information comes +		 * from interrupt-remapping table entry. +		 */ +		if (!irq_remapped(cfg)) +			io_apic_write(apic, 0x11 + pin*2, dest); +		reg = io_apic_read(apic, 0x10 + pin*2); +		reg &= ~IO_APIC_REDIR_VECTOR_MASK; +		reg |= vector; +		io_apic_modify(apic, 0x10 + pin*2, reg); +	} +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. + */ +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, +			  unsigned int *dest_id) +{ +	struct irq_cfg *cfg = data->chip_data; +	unsigned int irq = data->irq; +	int err; + +	if (!config_enabled(CONFIG_SMP)) +		return -1; + +	if (!cpumask_intersects(mask, cpu_online_mask)) +		return -EINVAL; + +	err = assign_irq_vector(irq, cfg, mask); +	if (err) +		return err; + +	err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); +	if (err) { +		if (assign_irq_vector(irq, cfg, data->affinity)) +			pr_err("Failed to recover vector for irq %d\n", irq); +		return err; +	} + +	cpumask_copy(data->affinity, mask); + +	return 0; +} + +static int +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, +		    bool force) +{ +	unsigned int dest, irq = data->irq; +	unsigned long flags; +	int ret; + +	if (!config_enabled(CONFIG_SMP)) +		return -1; + +	raw_spin_lock_irqsave(&ioapic_lock, flags); +	ret = __ioapic_set_affinity(data, mask, &dest); +	if (!ret) { +		/* Only the high 8 bits are valid. 
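/*
 * Illustrative sketch, not part of the patch: the set_affinity callbacks
 * converted below (ioapic, MSI, DMAR, HPET, HT) now return
 * IRQ_SET_MASK_OK_NOCOPY instead of 0. The distinction, roughly as the
 * generic IRQ core is assumed to treat it - the callback already copied the
 * new mask into data->affinity itself, so the core must not copy it again:
 */
static void example_apply_affinity_result(struct irq_data *data,
					  const struct cpumask *mask, int ret)
{
	switch (ret) {
	case IRQ_SET_MASK_OK:
		/* core copies 'mask' into data->affinity on our behalf */
		cpumask_copy(data->affinity, mask);
		break;
	case IRQ_SET_MASK_OK_NOCOPY:
		/* callback has already updated data->affinity */
		break;
	}
}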
*/ +		dest = SET_APIC_LOGICAL_ID(dest); +		__target_IO_APIC_irq(irq, dest, data->chip_data); +		ret = IRQ_SET_MASK_OK_NOCOPY; +	} +	raw_spin_unlock_irqrestore(&ioapic_lock, flags); +	return ret; +} +  static void ack_apic_edge(struct irq_data *data)  {  	irq_complete_move(data->chip_data); @@ -2541,9 +2570,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)  	chip->irq_ack = ir_ack_apic_edge;  	chip->irq_eoi = ir_ack_apic_level; -#ifdef CONFIG_SMP  	chip->irq_set_affinity = set_remapped_irq_affinity; -#endif  }  #endif /* CONFIG_IRQ_REMAP */ @@ -2554,9 +2581,7 @@ static struct irq_chip ioapic_chip __read_mostly = {  	.irq_unmask		= unmask_ioapic_irq,  	.irq_ack		= ack_apic_edge,  	.irq_eoi		= ack_apic_level, -#ifdef CONFIG_SMP  	.irq_set_affinity	= ioapic_set_affinity, -#endif  	.irq_retrigger		= ioapic_retrigger_irq,  }; @@ -3038,7 +3063,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,  	if (err)  		return err; -	dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); +	err = apic->cpu_mask_to_apicid_and(cfg->domain, +					   apic->target_cpus(), &dest); +	if (err) +		return err;  	if (irq_remapped(cfg)) {  		compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); @@ -3072,7 +3100,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,  	return err;  } -#ifdef CONFIG_SMP  static int  msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)  { @@ -3092,9 +3119,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)  	__write_msi_msg(data->msi_desc, &msg); -	return 0; +	return IRQ_SET_MASK_OK_NOCOPY;  } -#endif /* CONFIG_SMP */  /*   * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, @@ -3105,9 +3131,7 @@ static struct irq_chip msi_chip = {  	.irq_unmask		= unmask_msi_irq,  	.irq_mask		= mask_msi_irq,  	.irq_ack		= ack_apic_edge, -#ifdef CONFIG_SMP  	.irq_set_affinity	= msi_set_affinity, -#endif  	.irq_retrigger		= ioapic_retrigger_irq,  }; @@ -3192,7 +3216,6 @@ void native_teardown_msi_irq(unsigned int irq)  }  #ifdef CONFIG_DMAR_TABLE -#ifdef CONFIG_SMP  static int  dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,  		      bool force) @@ -3214,19 +3237,15 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,  	dmar_msi_write(irq, &msg); -	return 0; +	return IRQ_SET_MASK_OK_NOCOPY;  } -#endif /* CONFIG_SMP */ -  static struct irq_chip dmar_msi_type = {  	.name			= "DMAR_MSI",  	.irq_unmask		= dmar_msi_unmask,  	.irq_mask		= dmar_msi_mask,  	.irq_ack		= ack_apic_edge, -#ifdef CONFIG_SMP  	.irq_set_affinity	= dmar_msi_set_affinity, -#endif  	.irq_retrigger		= ioapic_retrigger_irq,  }; @@ -3247,7 +3266,6 @@ int arch_setup_dmar_msi(unsigned int irq)  #ifdef CONFIG_HPET_TIMER -#ifdef CONFIG_SMP  static int hpet_msi_set_affinity(struct irq_data *data,  				 const struct cpumask *mask, bool force)  { @@ -3267,19 +3285,15 @@ static int hpet_msi_set_affinity(struct irq_data *data,  	hpet_msi_write(data->handler_data, &msg); -	return 0; +	return IRQ_SET_MASK_OK_NOCOPY;  } -#endif /* CONFIG_SMP */ -  static struct irq_chip hpet_msi_type = {  	.name = "HPET_MSI",  	.irq_unmask = hpet_msi_unmask,  	.irq_mask = hpet_msi_mask,  	.irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP  	.irq_set_affinity = hpet_msi_set_affinity, -#endif  	.irq_retrigger = ioapic_retrigger_irq,  }; @@ -3314,8 +3328,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)   */  #ifdef CONFIG_HT_IRQ -#ifdef CONFIG_SMP -  static void target_ht_irq(unsigned int 
irq, unsigned int dest, u8 vector)  {  	struct ht_irq_msg msg; @@ -3340,25 +3352,23 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)  		return -1;  	target_ht_irq(data->irq, dest, cfg->vector); -	return 0; +	return IRQ_SET_MASK_OK_NOCOPY;  } -#endif -  static struct irq_chip ht_irq_chip = {  	.name			= "PCI-HT",  	.irq_mask		= mask_ht_irq,  	.irq_unmask		= unmask_ht_irq,  	.irq_ack		= ack_apic_edge, -#ifdef CONFIG_SMP  	.irq_set_affinity	= ht_set_affinity, -#endif  	.irq_retrigger		= ioapic_retrigger_irq,  };  int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)  {  	struct irq_cfg *cfg; +	struct ht_irq_msg msg; +	unsigned dest;  	int err;  	if (disable_apic) @@ -3366,36 +3376,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)  	cfg = irq_cfg(irq);  	err = assign_irq_vector(irq, cfg, apic->target_cpus()); -	if (!err) { -		struct ht_irq_msg msg; -		unsigned dest; +	if (err) +		return err; + +	err = apic->cpu_mask_to_apicid_and(cfg->domain, +					   apic->target_cpus(), &dest); +	if (err) +		return err; -		dest = apic->cpu_mask_to_apicid_and(cfg->domain, -						    apic->target_cpus()); +	msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); -		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); +	msg.address_lo = +		HT_IRQ_LOW_BASE | +		HT_IRQ_LOW_DEST_ID(dest) | +		HT_IRQ_LOW_VECTOR(cfg->vector) | +		((apic->irq_dest_mode == 0) ? +			HT_IRQ_LOW_DM_PHYSICAL : +			HT_IRQ_LOW_DM_LOGICAL) | +		HT_IRQ_LOW_RQEOI_EDGE | +		((apic->irq_delivery_mode != dest_LowestPrio) ? +			HT_IRQ_LOW_MT_FIXED : +			HT_IRQ_LOW_MT_ARBITRATED) | +		HT_IRQ_LOW_IRQ_MASKED; -		msg.address_lo = -			HT_IRQ_LOW_BASE | -			HT_IRQ_LOW_DEST_ID(dest) | -			HT_IRQ_LOW_VECTOR(cfg->vector) | -			((apic->irq_dest_mode == 0) ? -				HT_IRQ_LOW_DM_PHYSICAL : -				HT_IRQ_LOW_DM_LOGICAL) | -			HT_IRQ_LOW_RQEOI_EDGE | -			((apic->irq_delivery_mode != dest_LowestPrio) ? 
-				HT_IRQ_LOW_MT_FIXED : -				HT_IRQ_LOW_MT_ARBITRATED) | -			HT_IRQ_LOW_IRQ_MASKED; +	write_ht_irq_msg(irq, &msg); -		write_ht_irq_msg(irq, &msg); +	irq_set_chip_and_handler_name(irq, &ht_irq_chip, +				      handle_edge_irq, "edge"); -		irq_set_chip_and_handler_name(irq, &ht_irq_chip, -					      handle_edge_irq, "edge"); +	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); -		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); -	} -	return err; +	return 0;  }  #endif /* CONFIG_HT_IRQ */ @@ -3563,7 +3574,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)  		/* Sanity check */  		if (reg_00.bits.ID != apic_id) { -			printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); +			pr_err("IOAPIC[%d]: Unable to change apic_id!\n", +			       ioapic);  			return -1;  		}  	} diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f00a68cca37a..d661ee95cabf 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -406,16 +406,13 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid)   * We use physical apicids here, not logical, so just return the default   * physical broadcast to stop people from breaking us   */ -static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ -	return 0x0F; -} - -static inline unsigned int +static int  numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -			     const struct cpumask *andmask) +			     const struct cpumask *andmask, +			     unsigned int *apicid)  { -	return 0x0F; +	*apicid = 0x0F; +	return 0;  }  /* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ @@ -441,20 +438,6 @@ static int probe_numaq(void)  	return found_numaq;  } -static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	/* Careful. Some cpus do not strictly honor the set of cpus -	 * specified in the interrupt destination when using lowest -	 * priority interrupt delivery mode. -	 * -	 * In particular there was a hyperthreading cpu observed to -	 * deliver interrupts to the wrong hyperthread when only one -	 * hyperthread was specified in the interrupt desitination. -	 */ -	cpumask_clear(retmask); -	cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} -  static void numaq_setup_portio_remap(void)  {  	int num_quads = num_online_nodes(); @@ -491,7 +474,7 @@ static struct apic __refdata apic_numaq = {  	.check_apicid_used		= numaq_check_apicid_used,  	.check_apicid_present		= numaq_check_apicid_present, -	.vector_allocation_domain	= numaq_vector_allocation_domain, +	.vector_allocation_domain	= flat_vector_allocation_domain,  	.init_apic_ldr			= numaq_init_apic_ldr,  	.ioapic_phys_id_map		= numaq_ioapic_phys_id_map, @@ -509,7 +492,6 @@ static struct apic __refdata apic_numaq = {  	.set_apic_id			= NULL,  	.apic_id_mask			= 0x0F << 24, -	.cpu_mask_to_apicid		= numaq_cpu_mask_to_apicid,  	.cpu_mask_to_apicid_and		= numaq_cpu_mask_to_apicid_and,  	.send_IPI_mask			= numaq_send_IPI_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1b291da09e60..eb35ef9ee63f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void)  #endif  } -static void default_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	/* -	 * Careful. Some cpus do not strictly honor the set of cpus -	 * specified in the interrupt destination when using lowest -	 * priority interrupt delivery mode. 
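/*
 * Illustrative sketch, not part of the patch: numaq, es7000, summit and the
 * default 32-bit APIC all duplicated the "careful, some CPUs do not honor a
 * partial destination in lowest-priority mode, so target all CPUs" allocation
 * domain being removed here. The shared flat_vector_allocation_domain() they
 * now point at is assumed to keep that behaviour, extended to the new
 * three-argument signature:
 */
static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
					  const struct cpumask *mask)
{
	cpumask_clear(retmask);
	cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
}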
-	 * -	 * In particular there was a hyperthreading cpu observed to -	 * deliver interrupts to the wrong hyperthread when only one -	 * hyperthread was specified in the interrupt desitination. -	 */ -	cpumask_clear(retmask); -	cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} -  /* should be called last. */  static int probe_default(void)  { @@ -105,7 +90,7 @@ static struct apic apic_default = {  	.check_apicid_used		= default_check_apicid_used,  	.check_apicid_present		= default_check_apicid_present, -	.vector_allocation_domain	= default_vector_allocation_domain, +	.vector_allocation_domain	= flat_vector_allocation_domain,  	.init_apic_ldr			= default_init_apic_ldr,  	.ioapic_phys_id_map		= default_ioapic_phys_id_map, @@ -123,8 +108,7 @@ static struct apic apic_default = {  	.set_apic_id			= NULL,  	.apic_id_mask			= 0x0F << 24, -	.cpu_mask_to_apicid		= default_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and, +	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,  	.send_IPI_mask			= default_send_IPI_mask_logical,  	.send_IPI_mask_allbutself	= default_send_IPI_mask_allbutself_logical, @@ -208,6 +192,9 @@ void __init default_setup_apic_routing(void)  	if (apic->setup_apic_routing)  		apic->setup_apic_routing(); + +	if (x86_platform.apic_post_init) +		x86_platform.apic_post_init();  }  void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 3fe986698929..1793dba7a741 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -23,11 +23,6 @@  #include <asm/ipi.h>  #include <asm/setup.h> -static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) -{ -	return hard_smp_processor_id() >> index_msb; -} -  /*   * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.   */ @@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void)  		}  	} -	if (is_vsmp_box()) { -		/* need to update phys_pkg_id */ -		apic->phys_pkg_id = apicid_phys_pkg_id; -	} +	if (x86_platform.apic_post_init) +		x86_platform.apic_post_init();  }  /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 659897c00755..77c95c0e1bf7 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -26,6 +26,8 @@   *   */ +#define pr_fmt(fmt) "summit: %s: " fmt, __func__ +  #include <linux/mm.h>  #include <linux/init.h>  #include <asm/io.h> @@ -235,8 +237,8 @@ static int summit_apic_id_registered(void)  static void summit_setup_apic_routing(void)  { -	printk("Enabling APIC mode:  Summit.  Using %d I/O APICs\n", -						nr_ioapics); +	pr_info("Enabling APIC mode:  Summit.  Using %d I/O APICs\n", +		nr_ioapics);  }  static int summit_cpu_present_to_apicid(int mps_cpu) @@ -263,43 +265,48 @@ static int summit_check_phys_apicid_present(int physical_apicid)  	return 1;  } -static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)  {  	unsigned int round = 0; -	int cpu, apicid = 0; +	unsigned int cpu, apicid = 0;  	/*  	 * The cpus in the mask must all be on the apic cluster.  	 
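/*
 * Illustrative example, not part of the patch: the summit (and es7000)
 * conversion below ORs the logical APIC IDs of all online CPUs in the mask
 * into a single destination and rejects masks that span APIC clusters.
 * With hypothetical logical IDs 0x01 and 0x02 in the same cluster:
 */
static unsigned int example_cluster_merge(void)
{
	unsigned int apicid = 0;

	apicid |= 0x01;		/* first CPU in the mask */
	apicid |= 0x02;		/* second CPU, same cluster */

	return apicid;		/* 0x03: both CPUs in one logical destination */
}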
*/ -	for_each_cpu(cpu, cpumask) { +	for_each_cpu_and(cpu, cpumask, cpu_online_mask) {  		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);  		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { -			printk("%s: Not a valid mask!\n", __func__); -			return BAD_APICID; +			pr_err("Not a valid mask!\n"); +			return -EINVAL;  		}  		apicid |= new_apicid;  		round++;  	} -	return apicid; +	if (!round) +		return -EINVAL; +	*dest_id = apicid; +	return 0;  } -static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, -			      const struct cpumask *andmask) +static int +summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, +			      const struct cpumask *andmask, +			      unsigned int *apicid)  { -	int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);  	cpumask_var_t cpumask; +	*apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);  	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) -		return apicid; +		return 0;  	cpumask_and(cpumask, inmask, andmask); -	cpumask_and(cpumask, cpumask, cpu_online_mask); -	apicid = summit_cpu_mask_to_apicid(cpumask); +	summit_cpu_mask_to_apicid(cpumask, apicid);  	free_cpumask_var(cpumask); -	return apicid; +	return 0;  }  /* @@ -320,20 +327,6 @@ static int probe_summit(void)  	return 0;  } -static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	/* Careful. Some cpus do not strictly honor the set of cpus -	 * specified in the interrupt destination when using lowest -	 * priority interrupt delivery mode. -	 * -	 * In particular there was a hyperthreading cpu observed to -	 * deliver interrupts to the wrong hyperthread when only one -	 * hyperthread was specified in the interrupt desitination. -	 */ -	cpumask_clear(retmask); -	cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} -  #ifdef CONFIG_X86_SUMMIT_NUMA  static struct rio_table_hdr *rio_table_hdr;  static struct scal_detail   *scal_devs[MAX_NUMNODES]; @@ -355,7 +348,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)  		}  	}  	if (i == rio_table_hdr->num_rio_dev) { -		printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); +		pr_err("Couldn't find owner Cyclone for Winnipeg!\n");  		return last_bus;  	} @@ -366,7 +359,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)  		}  	}  	if (i == rio_table_hdr->num_scal_dev) { -		printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); +		pr_err("Couldn't find owner Twister for Cyclone!\n");  		return last_bus;  	} @@ -396,7 +389,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)  		num_buses = 9;  		break;  	default: -		printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); +		pr_info("Unsupported Winnipeg type!\n");  		return last_bus;  	} @@ -411,13 +404,15 @@ static int build_detail_arrays(void)  	int i, scal_detail_size, rio_detail_size;  	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { -		printk(KERN_WARNING "%s: MAX_NUMNODES too low!  Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); +		pr_warn("MAX_NUMNODES too low!  
Defined as %d, but system has %d nodes\n", +			MAX_NUMNODES, rio_table_hdr->num_scal_dev);  		return 0;  	}  	switch (rio_table_hdr->version) {  	default: -		printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); +		pr_warn("Invalid Rio Grande Table Version: %d\n", +			rio_table_hdr->version);  		return 0;  	case 2:  		scal_detail_size = 11; @@ -462,7 +457,7 @@ void setup_summit(void)  		offset = *((unsigned short *)(ptr + offset));  	}  	if (!rio_table_hdr) { -		printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); +		pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n");  		return;  	} @@ -509,7 +504,7 @@ static struct apic apic_summit = {  	.check_apicid_used		= summit_check_apicid_used,  	.check_apicid_present		= summit_check_apicid_present, -	.vector_allocation_domain	= summit_vector_allocation_domain, +	.vector_allocation_domain	= flat_vector_allocation_domain,  	.init_apic_ldr			= summit_init_apic_ldr,  	.ioapic_phys_id_map		= summit_ioapic_phys_id_map, @@ -527,7 +522,6 @@ static struct apic apic_summit = {  	.set_apic_id			= NULL,  	.apic_id_mask			= 0xFF << 24, -	.cpu_mask_to_apicid		= summit_cpu_mask_to_apicid,  	.cpu_mask_to_apicid_and		= summit_cpu_mask_to_apicid_and,  	.send_IPI_mask			= summit_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index ff35cff0e1a7..c88baa4ff0e5 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)  }  static void - x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)  {  	__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);  } @@ -96,36 +96,37 @@ static void x2apic_send_IPI_all(int vector)  	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);  } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +			      const struct cpumask *andmask, +			      unsigned int *apicid)  { -	/* -	 * We're using fixed IRQ delivery, can only return one logical APIC ID. -	 * May as well be the first. -	 */ -	int cpu = cpumask_first(cpumask); +	u32 dest = 0; +	u16 cluster; +	int i; -	if ((unsigned)cpu < nr_cpu_ids) -		return per_cpu(x86_cpu_to_logical_apicid, cpu); -	else -		return BAD_APICID; -} +	for_each_cpu_and(i, cpumask, andmask) { +		if (!cpumask_test_cpu(i, cpu_online_mask)) +			continue; +		dest = per_cpu(x86_cpu_to_logical_apicid, i); +		cluster = x2apic_cluster(i); +		break; +	} -static unsigned int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -			      const struct cpumask *andmask) -{ -	int cpu; +	if (!dest) +		return -EINVAL; -	/* -	 * We're using fixed IRQ delivery, can only return one logical APIC ID. -	 * May as well be the first. 
-	 */ -	for_each_cpu_and(cpu, cpumask, andmask) { -		if (cpumask_test_cpu(cpu, cpu_online_mask)) -			break; +	for_each_cpu_and(i, cpumask, andmask) { +		if (!cpumask_test_cpu(i, cpu_online_mask)) +			continue; +		if (cluster != x2apic_cluster(i)) +			continue; +		dest |= per_cpu(x86_cpu_to_logical_apicid, i);  	} -	return per_cpu(x86_cpu_to_logical_apicid, cpu); +	*apicid = dest; + +	return 0;  }  static void init_x2apic_ldr(void) @@ -208,6 +209,32 @@ static int x2apic_cluster_probe(void)  		return 0;  } +static const struct cpumask *x2apic_cluster_target_cpus(void) +{ +	return cpu_all_mask; +} + +/* + * Each x2apic cluster is an allocation domain. + */ +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, +					     const struct cpumask *mask) +{ +	/* +	 * To minimize vector pressure, default case of boot, device bringup +	 * etc will use a single cpu for the interrupt destination. +	 * +	 * On explicit migration requests coming from irqbalance etc, +	 * interrupts will be routed to the x2apic cluster (cluster-id +	 * derived from the first cpu in the mask) members specified +	 * in the mask. +	 */ +	if (mask == x2apic_cluster_target_cpus()) +		cpumask_copy(retmask, cpumask_of(cpu)); +	else +		cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); +} +  static struct apic apic_x2apic_cluster = {  	.name				= "cluster x2apic", @@ -219,13 +246,13 @@ static struct apic apic_x2apic_cluster = {  	.irq_delivery_mode		= dest_LowestPrio,  	.irq_dest_mode			= 1, /* logical */ -	.target_cpus			= x2apic_target_cpus, +	.target_cpus			= x2apic_cluster_target_cpus,  	.disable_esr			= 0,  	.dest_logical			= APIC_DEST_LOGICAL,  	.check_apicid_used		= NULL,  	.check_apicid_present		= NULL, -	.vector_allocation_domain	= x2apic_vector_allocation_domain, +	.vector_allocation_domain	= cluster_vector_allocation_domain,  	.init_apic_ldr			= init_x2apic_ldr,  	.ioapic_phys_id_map		= NULL, @@ -243,7 +270,6 @@ static struct apic apic_x2apic_cluster = {  	.set_apic_id			= x2apic_set_apic_id,  	.apic_id_mask			= 0xFFFFFFFFu, -	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid,  	.cpu_mask_to_apicid_and		= x2apic_cpu_mask_to_apicid_and,  	.send_IPI_mask			= x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index c17e982db275..e03a1e180e81 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -76,38 +76,6 @@ static void x2apic_send_IPI_all(int vector)  	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);  } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ -	/* -	 * We're using fixed IRQ delivery, can only return one phys APIC ID. -	 * May as well be the first. -	 */ -	int cpu = cpumask_first(cpumask); - -	if ((unsigned)cpu < nr_cpu_ids) -		return per_cpu(x86_cpu_to_apicid, cpu); -	else -		return BAD_APICID; -} - -static unsigned int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -			      const struct cpumask *andmask) -{ -	int cpu; - -	/* -	 * We're using fixed IRQ delivery, can only return one phys APIC ID. -	 * May as well be the first. 
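/*
 * Illustrative example, not part of the patch: the cluster x2apic above now
 * composes one logical destination per cluster - the first online CPU in the
 * mask fixes the cluster, then the logical IDs of the other online mask
 * members in that same cluster are ORed in, and CPUs from other clusters are
 * skipped. The logical ID values below are hypothetical:
 */
static u32 example_compose_cluster_dest(void)
{
	u32 dest;

	dest  = 0x00010001;	/* first online CPU, cluster 1 */
	dest |= 0x00010002;	/* same cluster: OR in its logical ID */
	dest |= 0x00010004;
	/* a CPU with logical ID 0x00020001 (cluster 2) would be ignored */

	return dest;		/* 0x00010007 */
}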
-	 */ -	for_each_cpu_and(cpu, cpumask, andmask) { -		if (cpumask_test_cpu(cpu, cpu_online_mask)) -			break; -	} - -	return per_cpu(x86_cpu_to_apicid, cpu); -} -  static void init_x2apic_ldr(void)  {  } @@ -131,13 +99,13 @@ static struct apic apic_x2apic_phys = {  	.irq_delivery_mode		= dest_Fixed,  	.irq_dest_mode			= 0, /* physical */ -	.target_cpus			= x2apic_target_cpus, +	.target_cpus			= online_target_cpus,  	.disable_esr			= 0,  	.dest_logical			= 0,  	.check_apicid_used		= NULL,  	.check_apicid_present		= NULL, -	.vector_allocation_domain	= x2apic_vector_allocation_domain, +	.vector_allocation_domain	= default_vector_allocation_domain,  	.init_apic_ldr			= init_x2apic_ldr,  	.ioapic_phys_id_map		= NULL, @@ -155,8 +123,7 @@ static struct apic apic_x2apic_phys = {  	.set_apic_id			= x2apic_set_apic_id,  	.apic_id_mask			= 0xFFFFFFFFu, -	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid, -	.cpu_mask_to_apicid_and		= x2apic_cpu_mask_to_apicid_and, +	.cpu_mask_to_apicid_and		= default_cpu_mask_to_apicid_and,  	.send_IPI_mask			= x2apic_send_IPI_mask,  	.send_IPI_mask_allbutself	= x2apic_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c6d03f7a4401..8cfade9510a4 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -185,17 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);  unsigned long sn_rtc_cycles_per_second;  EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static const struct cpumask *uv_target_cpus(void) -{ -	return cpu_online_mask; -} - -static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ -	cpumask_clear(retmask); -	cpumask_set_cpu(cpu, retmask); -} -  static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)  {  #ifdef CONFIG_SMP @@ -280,25 +269,12 @@ static void uv_init_apic_ldr(void)  {  } -static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ -	/* -	 * We're using fixed IRQ delivery, can only return one phys APIC ID. -	 * May as well be the first. -	 */ -	int cpu = cpumask_first(cpumask); - -	if ((unsigned)cpu < nr_cpu_ids) -		return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; -	else -		return BAD_APICID; -} - -static unsigned int +static int  uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -			  const struct cpumask *andmask) +			  const struct cpumask *andmask, +			  unsigned int *apicid)  { -	int cpu; +	int unsigned cpu;  	/*  	 * We're using fixed IRQ delivery, can only return one phys APIC ID. 
@@ -308,7 +284,13 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,  		if (cpumask_test_cpu(cpu, cpu_online_mask))  			break;  	} -	return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + +	if (likely(cpu < nr_cpu_ids)) { +		*apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; +		return 0; +	} + +	return -EINVAL;  }  static unsigned int x2apic_get_apic_id(unsigned long x) @@ -362,13 +344,13 @@ static struct apic __refdata apic_x2apic_uv_x = {  	.irq_delivery_mode		= dest_Fixed,  	.irq_dest_mode			= 0, /* physical */ -	.target_cpus			= uv_target_cpus, +	.target_cpus			= online_target_cpus,  	.disable_esr			= 0,  	.dest_logical			= APIC_DEST_LOGICAL,  	.check_apicid_used		= NULL,  	.check_apicid_present		= NULL, -	.vector_allocation_domain	= uv_vector_allocation_domain, +	.vector_allocation_domain	= default_vector_allocation_domain,  	.init_apic_ldr			= uv_init_apic_ldr,  	.ioapic_phys_id_map		= NULL, @@ -386,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = {  	.set_apic_id			= set_apic_id,  	.apic_id_mask			= 0xFFFFFFFFu, -	.cpu_mask_to_apicid		= uv_cpu_mask_to_apicid,  	.cpu_mask_to_apicid_and		= uv_cpu_mask_to_apicid_and,  	.send_IPI_mask			= uv_send_IPI_mask, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 07b0c0db466c..d65464e43503 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -201,6 +201,8 @@   *    http://www.microsoft.com/whdc/archive/amp_12.mspx]   */ +#define pr_fmt(fmt) "apm: " fmt +  #include <linux/module.h>  #include <linux/poll.h> @@ -485,11 +487,11 @@ static void apm_error(char *str, int err)  		if (error_table[i].key == err)  			break;  	if (i < ERROR_COUNT) -		printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); +		pr_notice("%s: %s\n", str, error_table[i].msg);  	else if (err < 0) -		printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); +		pr_notice("%s: linux error code %i\n", str, err);  	else -		printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", +		pr_notice("%s: unknown error code %#2.2x\n",  		       str, err);  } @@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)  			static int notified;  			if (notified++ == 0) -			    printk(KERN_ERR "apm: an event queue overflowed\n"); +				pr_err("an event queue overflowed\n");  			if (++as->event_tail >= APM_MAX_EVENTS)  				as->event_tail = 0;  		} @@ -1447,7 +1449,7 @@ static void apm_mainloop(void)  static int check_apm_user(struct apm_user *as, const char *func)  {  	if (as == NULL || as->magic != APM_BIOS_MAGIC) { -		printk(KERN_ERR "apm: %s passed bad filp\n", func); +		pr_err("%s passed bad filp\n", func);  		return 1;  	}  	return 0; @@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp)  		     as1 = as1->next)  			;  		if (as1 == NULL) -			printk(KERN_ERR "apm: filp not in user list\n"); +			pr_err("filp not in user list\n");  		else  			as1->next = as->next;  	} @@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp)  	struct apm_user *as;  	as = kmalloc(sizeof(*as), GFP_KERNEL); -	if (as == NULL) { -		printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", -		       sizeof(*as)); +	if (as == NULL)  		return -ENOMEM; -	} +  	as->magic = APM_BIOS_MAGIC;  	as->event_tail = as->event_head = 0;  	as->suspends_pending = as->standbys_pending = 0; @@ -2313,16 +2313,16 @@ static int __init apm_init(void)  	}  	if (apm_info.disabled) { -		printk(KERN_NOTICE "apm: disabled on user request.\n"); +		pr_notice("disabled 
on user request.\n");  		return -ENODEV;  	}  	if ((num_online_cpus() > 1) && !power_off && !smp) { -		printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); +		pr_notice("disabled - APM is not SMP safe.\n");  		apm_info.disabled = 1;  		return -ENODEV;  	}  	if (!acpi_disabled) { -		printk(KERN_NOTICE "apm: overridden by ACPI.\n"); +		pr_notice("overridden by ACPI.\n");  		apm_info.disabled = 1;  		return -ENODEV;  	} @@ -2356,8 +2356,7 @@ static int __init apm_init(void)  	kapmd_task = kthread_create(apm, NULL, "kapmd");  	if (IS_ERR(kapmd_task)) { -		printk(KERN_ERR "apm: disabled - Unable to start kernel " -				"thread.\n"); +		pr_err("disabled - Unable to start kernel thread\n");  		err = PTR_ERR(kapmd_task);  		kapmd_task = NULL;  		remove_proc_entry("apm", NULL); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2fdfdd..d30a6a9a0121 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -14,7 +14,7 @@ CFLAGS_common.o		:= $(nostackp)  obj-y			:= intel_cacheinfo.o scattered.o topology.o  obj-y			+= proc.o capflags.o powerflags.o common.o -obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o +obj-y			+= vmware.o hypervisor.o mshyperv.o  obj-y			+= rdrand.o  obj-y			+= match.o @@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o  ifdef CONFIG_PERF_EVENTS  obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd.o -obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o  endif  obj-$(CONFIG_X86_MCE)			+= mcheck/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 146bb6218eec..9d92e19039f0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -19,6 +19,39 @@  #include "cpu.h" +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ +	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); +	u32 gprs[8] = { 0 }; +	int err; + +	WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + +	gprs[1] = msr; +	gprs[7] = 0x9c5a203a; + +	err = rdmsr_safe_regs(gprs); + +	*p = gprs[0] | ((u64)gprs[2] << 32); + +	return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ +	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); +	u32 gprs[8] = { 0 }; + +	WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + +	gprs[0] = (u32)val; +	gprs[1] = msr; +	gprs[2] = val >> 32; +	gprs[7] = 0x9c5a203a; + +	return wrmsr_safe_regs(gprs); +} +  #ifdef CONFIG_X86_32  /*   *	B step AMD K6 before B 9730xxxx have hardware bugs that can cause @@ -586,9 +619,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)  	    !cpu_has(c, X86_FEATURE_TOPOEXT)) {  		u64 val; -		if (!rdmsrl_amd_safe(0xc0011005, &val)) { +		if (!rdmsrl_safe(0xc0011005, &val)) {  			val |= 1ULL << 54; -			wrmsrl_amd_safe(0xc0011005, val); +			wrmsrl_safe(0xc0011005, val);  			rdmsrl(0xc0011005, val);  			if (val & (1ULL << 54)) {  				set_cpu_cap(c, X86_FEATURE_TOPOEXT); @@ -679,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)  		err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);  		if (err == 0) {  			mask |= (1 << 10); -			checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); +			wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);  		}  	} diff --git 
a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 46674fbb62ba..c97bb7b5a9f8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -55,8 +55,8 @@ static void __init check_fpu(void)  	if (!boot_cpu_data.hard_math) {  #ifndef CONFIG_MATH_EMULATION -		printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); -		printk(KERN_EMERG "Giving up.\n"); +		pr_emerg("No coprocessor found and no math emulation present\n"); +		pr_emerg("Giving up\n");  		for (;;) ;  #endif  		return; @@ -86,7 +86,7 @@ static void __init check_fpu(void)  	boot_cpu_data.fdiv_bug = fdiv_bug;  	if (boot_cpu_data.fdiv_bug) -		printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); +		pr_warn("Hmm, FPU with FDIV bug\n");  }  static void __init check_hlt(void) @@ -94,16 +94,16 @@ static void __init check_hlt(void)  	if (boot_cpu_data.x86 >= 5 || paravirt_enabled())  		return; -	printk(KERN_INFO "Checking 'hlt' instruction... "); +	pr_info("Checking 'hlt' instruction... ");  	if (!boot_cpu_data.hlt_works_ok) { -		printk("disabled\n"); +		pr_cont("disabled\n");  		return;  	}  	halt();  	halt();  	halt();  	halt(); -	printk(KERN_CONT "OK.\n"); +	pr_cont("OK\n");  }  /* @@ -116,7 +116,7 @@ static void __init check_popad(void)  #ifndef CONFIG_X86_POPAD_OK  	int res, inp = (int) &res; -	printk(KERN_INFO "Checking for popad bug... "); +	pr_info("Checking for popad bug... ");  	__asm__ __volatile__(  	  "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "  	  : "=&a" (res) @@ -127,9 +127,9 @@ static void __init check_popad(void)  	 * CPU hard. Too bad.  	 */  	if (res != 12345678) -		printk(KERN_CONT "Buggy.\n"); +		pr_cont("Buggy\n");  	else -		printk(KERN_CONT "OK.\n"); +		pr_cont("OK\n");  #endif  } @@ -161,7 +161,7 @@ void __init check_bugs(void)  {  	identify_boot_cpu();  #ifndef CONFIG_SMP -	printk(KERN_INFO "CPU: "); +	pr_info("CPU: ");  	print_cpu_info(&boot_cpu_data);  #endif  	check_config(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b9333b429ba..46d8786d655e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -452,6 +452,35 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)  	c->x86_cache_size = l2size;  } +u16 __read_mostly tlb_lli_4k[NR_INFO]; +u16 __read_mostly tlb_lli_2m[NR_INFO]; +u16 __read_mostly tlb_lli_4m[NR_INFO]; +u16 __read_mostly tlb_lld_4k[NR_INFO]; +u16 __read_mostly tlb_lld_2m[NR_INFO]; +u16 __read_mostly tlb_lld_4m[NR_INFO]; + +/* + * tlb_flushall_shift shows the balance point in replacing cr3 write + * with multiple 'invlpg'. It will do this replacement when + *   flush_tlb_lines <= active_lines/2^tlb_flushall_shift. + * If tlb_flushall_shift is -1, means the replacement will be disabled. 
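/*
 * Illustrative sketch, not part of the patch: the tlb_flushall_shift knob
 * added above is assumed to be consumed by the flush path roughly like this -
 * flush individual lines with invlpg only while the number of lines stays at
 * or below active_lines >> tlb_flushall_shift, otherwise do a full TLB flush.
 * 'active_lines' is approximated here by the last-level 4K dTLB size and the
 * helper name is hypothetical:
 */
static inline bool example_use_invlpg(unsigned long flush_lines)
{
	if (tlb_flushall_shift < 0)
		return false;	/* knob disabled: always flush the whole TLB */

	return flush_lines <= (tlb_lld_4k[ENTRIES] >> tlb_flushall_shift);
}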
+ */ +s8  __read_mostly tlb_flushall_shift = -1; + +void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c) +{ +	if (this_cpu->c_detect_tlb) +		this_cpu->c_detect_tlb(c); + +	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ +		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"	     \ +		"tlb_flushall_shift is 0x%x\n", +		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], +		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], +		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], +		tlb_flushall_shift); +} +  void __cpuinit detect_ht(struct cpuinfo_x86 *c)  {  #ifdef CONFIG_X86_HT @@ -911,6 +940,8 @@ void __init identify_boot_cpu(void)  #else  	vgetcpu_set_mode();  #endif +	if (boot_cpu_data.cpuid_level >= 2) +		cpu_detect_tlb(&boot_cpu_data);  }  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -947,7 +978,7 @@ static void __cpuinit __print_cpu_msr(void)  		index_max = msr_range_array[i].max;  		for (index = index_min; index < index_max; index++) { -			if (rdmsrl_amd_safe(index, &val)) +			if (rdmsrl_safe(index, &val))  				continue;  			printk(KERN_INFO " MSR%08x: %016llx\n", index, val);  		} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 8bacc7826fb3..4041c24ae7db 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -20,10 +20,19 @@ struct cpu_dev {  	void		(*c_bsp_init)(struct cpuinfo_x86 *);  	void		(*c_init)(struct cpuinfo_x86 *);  	void		(*c_identify)(struct cpuinfo_x86 *); +	void		(*c_detect_tlb)(struct cpuinfo_x86 *);  	unsigned int	(*c_size_cache)(struct cpuinfo_x86 *, unsigned int);  	int		c_x86_vendor;  }; +struct _tlb_table { +	unsigned char descriptor; +	char tlb_type; +	unsigned int entries; +	/* unsigned int ways; */ +	char info[128]; +}; +  #define cpu_dev_register(cpu_devX) \  	static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \  	__attribute__((__section__(".x86_cpu_dev.init"))) = \ diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 755f64fb0743..a8f8fa9769d6 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =  #endif  	&x86_hyper_vmware,  	&x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST +	&x86_hyper_kvm, +#endif  };  const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3e6ff6cbf42a..0a4ce2980a5a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -491,6 +491,181 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i  }  #endif +#define TLB_INST_4K	0x01 +#define TLB_INST_4M	0x02 +#define TLB_INST_2M_4M	0x03 + +#define TLB_INST_ALL	0x05 +#define TLB_INST_1G	0x06 + +#define TLB_DATA_4K	0x11 +#define TLB_DATA_4M	0x12 +#define TLB_DATA_2M_4M	0x13 +#define TLB_DATA_4K_4M	0x14 + +#define TLB_DATA_1G	0x16 + +#define TLB_DATA0_4K	0x21 +#define TLB_DATA0_4M	0x22 +#define TLB_DATA0_2M_4M	0x23 + +#define STLB_4K		0x41 + +static const struct _tlb_table intel_tlb_table[] __cpuinitconst = { +	{ 0x01, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages, 4-way set associative" }, +	{ 0x02, TLB_INST_4M,		2,	" TLB_INST 4 MByte pages, full associative" }, +	{ 0x03, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way set associative" }, +	{ 0x04, TLB_DATA_4M,		8,	" TLB_DATA 4 MByte pages, 4-way set associative" }, +	{ 0x05, TLB_DATA_4M,		32,	" TLB_DATA 4 MByte pages, 4-way set associative" }, +	{ 0x0b, TLB_INST_4M,		4,	" TLB_INST 4 MByte pages, 4-way set 
associative" }, +	{ 0x4f, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages */" }, +	{ 0x50, TLB_INST_ALL,		64,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, +	{ 0x51, TLB_INST_ALL,		128,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, +	{ 0x52, TLB_INST_ALL,		256,	" TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, +	{ 0x55, TLB_INST_2M_4M,		7,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" }, +	{ 0x56, TLB_DATA0_4M,		16,	" TLB_DATA0 4 MByte pages, 4-way set associative" }, +	{ 0x57, TLB_DATA0_4K,		16,	" TLB_DATA0 4 KByte pages, 4-way associative" }, +	{ 0x59, TLB_DATA0_4K,		16,	" TLB_DATA0 4 KByte pages, fully associative" }, +	{ 0x5a, TLB_DATA0_2M_4M,	32,	" TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, +	{ 0x5b, TLB_DATA_4K_4M,		64,	" TLB_DATA 4 KByte and 4 MByte pages" }, +	{ 0x5c, TLB_DATA_4K_4M,		128,	" TLB_DATA 4 KByte and 4 MByte pages" }, +	{ 0x5d, TLB_DATA_4K_4M,		256,	" TLB_DATA 4 KByte and 4 MByte pages" }, +	{ 0xb0, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 4-way set associative" }, +	{ 0xb1, TLB_INST_2M_4M,		4,	" TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, +	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4KByte pages, 4-way set associative" }, +	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" }, +	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" }, +	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" }, +	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, +	{ 0xca, STLB_4K,		512,	" STLB 4 KByte pages, 4-way associative" }, +	{ 0x00, 0, 0 } +}; + +static void __cpuinit intel_tlb_lookup(const unsigned char desc) +{ +	unsigned char k; +	if (desc == 0) +		return; + +	/* look up this descriptor in the table */ +	for (k = 0; intel_tlb_table[k].descriptor != desc && \ +			intel_tlb_table[k].descriptor != 0; k++) +		; + +	if (intel_tlb_table[k].tlb_type == 0) +		return; + +	switch (intel_tlb_table[k].tlb_type) { +	case STLB_4K: +		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; +		break; +	case TLB_INST_ALL: +		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; +		break; +	case TLB_INST_4K: +		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; +		break; +	case TLB_INST_4M: +		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; +		break; +	case TLB_INST_2M_4M: +		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; +		break; +	case TLB_DATA_4K: +	case TLB_DATA0_4K: +		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; +		break; +	case TLB_DATA_4M: +	case TLB_DATA0_4M: +		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; +		break; +	case TLB_DATA_2M_4M: +	case TLB_DATA0_2M_4M: +		if (tlb_lld_2m[ENTRIES] < 
intel_tlb_table[k].entries) +			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; +		break; +	case TLB_DATA_4K_4M: +		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; +		break; +	} +} + +static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) +{ +	if (!cpu_has_invlpg) { +		tlb_flushall_shift = -1; +		return; +	} +	switch ((c->x86 << 8) + c->x86_model) { +	case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ +	case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ +	case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ +	case 0x61d: /* six-core 45 nm xeon "Dunnington" */ +		tlb_flushall_shift = -1; +		break; +	case 0x61a: /* 45 nm nehalem, "Bloomfield" */ +	case 0x61e: /* 45 nm nehalem, "Lynnfield" */ +	case 0x625: /* 32 nm nehalem, "Clarkdale" */ +	case 0x62c: /* 32 nm nehalem, "Gulftown" */ +	case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ +	case 0x62f: /* 32 nm Xeon E7 */ +		tlb_flushall_shift = 6; +		break; +	case 0x62a: /* SandyBridge */ +	case 0x62d: /* SandyBridge, "Romely-EP" */ +		tlb_flushall_shift = 5; +		break; +	case 0x63a: /* Ivybridge */ +		tlb_flushall_shift = 1; +		break; +	default: +		tlb_flushall_shift = 6; +	} +} + +static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c) +{ +	int i, j, n; +	unsigned int regs[4]; +	unsigned char *desc = (unsigned char *)regs; +	/* Number of times to iterate */ +	n = cpuid_eax(2) & 0xFF; + +	for (i = 0 ; i < n ; i++) { +		cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); + +		/* If bit 31 is set, this is an unknown format */ +		for (j = 0 ; j < 3 ; j++) +			if (regs[j] & (1 << 31)) +				regs[j] = 0; + +		/* Byte 0 is level count, not a descriptor */ +		for (j = 1 ; j < 16 ; j++) +			intel_tlb_lookup(desc[j]); +	} +	intel_tlb_flushall_shift_set(c); +} +  static const struct cpu_dev __cpuinitconst intel_cpu_dev = {  	.c_vendor	= "Intel",  	.c_ident	= { "GenuineIntel" }, @@ -546,6 +721,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {  	},  	.c_size_cache	= intel_size_cache,  #endif +	.c_detect_tlb	= intel_detect_tlb,  	.c_early_init   = early_init_intel,  	.c_init		= init_intel,  	.c_x86_vendor	= X86_VENDOR_INTEL, diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index aa7548799af4..5e095f873e3e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -7,6 +7,9 @@   * Copyright 2008 Intel Corporation   * Author: Andi Kleen   */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/thread_info.h>  #include <linux/capability.h>  #include <linux/miscdevice.h> @@ -208,7 +211,7 @@ static void drain_mcelog_buffer(void)  				cpu_relax();  				if (!m->finished && retries >= 4) { -					pr_err("MCE: skipping error being logged currently!\n"); +					pr_err("skipping error being logged currently!\n");  					break;  				}  			} @@ -1165,8 +1168,9 @@ int memory_failure(unsigned long pfn, int vector, int flags)  {  	/* mce_severity() should not hand us an ACTION_REQUIRED error */  	BUG_ON(flags & MF_ACTION_REQUIRED); -	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" -		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); +	pr_err("Uncorrected memory error in page 0x%lx ignored\n" 
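
Illustrative aside on the arch/x86/kernel/cpu/intel.c hunk above (not part of this diff): intel_detect_tlb() walks CPUID leaf 2, where the low byte of EAX says how many times the leaf must be queried, a register with bit 31 set carries no descriptors, and byte 0 is the query count rather than a descriptor. A minimal user-space sketch of the same walk, using GCC's <cpuid.h> helper:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int regs[4];
        unsigned char *desc = (unsigned char *)regs;
        unsigned int i, j, n;

        if (!__get_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]))
            return 1;                        /* CPUID leaf 2 not available */

        n = regs[0] & 0xff;                  /* AL: number of queries needed */
        for (i = 0; i < n; i++) {
            __get_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);

            for (j = 0; j < 4; j++)          /* bit 31 set: register holds no descriptors */
                if (regs[j] & (1u << 31))
                    regs[j] = 0;

            for (j = 1; j < 16; j++)         /* byte 0 is the query count, skip it */
                if (desc[j])
                    printf("descriptor 0x%02x\n", desc[j]);
        }
        return 0;
    }

Each non-zero descriptor byte is then matched against a table such as intel_tlb_table[] above to record the largest last-level TLB entry counts seen.
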
+	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", +	       pfn);  	return 0;  } @@ -1184,6 +1188,7 @@ void mce_notify_process(void)  {  	unsigned long pfn;  	struct mce_info *mi = mce_find_info(); +	int flags = MF_ACTION_REQUIRED;  	if (!mi)  		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); @@ -1198,8 +1203,9 @@ void mce_notify_process(void)  	 * doomed. We still need to mark the page as poisoned and alert any  	 * other users of the page.  	 */ -	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || -			   mi->restartable == 0) { +	if (!mi->restartable) +		flags |= MF_MUST_KILL; +	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {  		pr_err("Memory error not recovered");  		force_sig(SIGBUS, current);  	} @@ -1356,11 +1362,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void)  	b = cap & MCG_BANKCNT_MASK;  	if (!banks) -		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); +		pr_info("CPU supports %d MCE banks\n", b);  	if (b > MAX_NR_BANKS) { -		printk(KERN_WARNING -		       "MCE: Using only %u machine check banks out of %u\n", +		pr_warn("Using only %u machine check banks out of %u\n",  			MAX_NR_BANKS, b);  		b = MAX_NR_BANKS;  	} @@ -1417,7 +1422,7 @@ static void __mcheck_cpu_init_generic(void)  static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)  {  	if (c->x86_vendor == X86_VENDOR_UNKNOWN) { -		pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); +		pr_info("unknown CPU type - not enabling MCE support\n");  		return -EOPNOTSUPP;  	} @@ -1572,7 +1577,7 @@ static void __mcheck_cpu_init_timer(void)  /* Handle unconfigured int18 (should never happen) */  static void unexpected_machine_check(struct pt_regs *regs, long error_code)  { -	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", +	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",  	       smp_processor_id());  } @@ -1891,8 +1896,7 @@ static int __init mcheck_enable(char *str)  			get_option(&str, &monarch_timeout);  		}  	} else { -		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", -		       str); +		pr_info("mce argument %s ignored. Please use /sys\n", str);  		return 0;  	}  	return 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index be5274490428..c4e916d77378 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,15 +1,17 @@  /* - *  (c) 2005, 2006 Advanced Micro Devices, Inc. + *  (c) 2005-2012 Advanced Micro Devices, Inc.   *  Your use of this code is subject to the terms and conditions of the   *  GNU general public license version 2. See "COPYING" or   *  http://www.gnu.org/licenses/gpl.html   *   *  Written by Jacob Shin - AMD, Inc.   
* - *  Support : jacob.shin@amd.com + *  Support: borislav.petkov@amd.com   *   *  April 2006   *     - added support for AMD Family 0x10 processors + *  May 2012 + *     - major scrubbing   *   *  All MC4_MISCi registers are shared between multi-cores   */ @@ -25,6 +27,7 @@  #include <linux/cpu.h>  #include <linux/smp.h> +#include <asm/amd_nb.h>  #include <asm/apic.h>  #include <asm/idle.h>  #include <asm/mce.h> @@ -45,23 +48,15 @@  #define MASK_BLKPTR_LO    0xFF000000  #define MCG_XBLK_ADDR     0xC0000400 -struct threshold_block { -	unsigned int		block; -	unsigned int		bank; -	unsigned int		cpu; -	u32			address; -	u16			interrupt_enable; -	bool			interrupt_capable; -	u16			threshold_limit; -	struct kobject		kobj; -	struct list_head	miscj; +static const char * const th_names[] = { +	"load_store", +	"insn_fetch", +	"combined_unit", +	"", +	"northbridge", +	"execution_unit",  }; -struct threshold_bank { -	struct kobject		*kobj; -	struct threshold_block	*blocks; -	cpumask_var_t		cpus; -};  static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);  static unsigned char shared_bank[NR_BANKS] = { @@ -84,6 +79,26 @@ struct thresh_restart {  	u16			old_limit;  }; +static const char * const bank4_names(struct threshold_block *b) +{ +	switch (b->address) { +	/* MSR4_MISC0 */ +	case 0x00000413: +		return "dram"; + +	case 0xc0000408: +		return "ht_links"; + +	case 0xc0000409: +		return "l3_cache"; + +	default: +		WARN(1, "Funny MSR: 0x%08x\n", b->address); +		return ""; +	} +}; + +  static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)  {  	/* @@ -224,8 +239,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)  			if (!block)  				per_cpu(bank_map, cpu) |= (1 << bank); -			if (shared_bank[bank] && c->cpu_core_id) -				break;  			memset(&b, 0, sizeof(b));  			b.cpu			= cpu; @@ -326,7 +339,7 @@ struct threshold_attr {  #define SHOW_FIELDS(name)						\  static ssize_t show_ ## name(struct threshold_block *b, char *buf)	\  {									\ -	return sprintf(buf, "%lx\n", (unsigned long) b->name);		\ +	return sprintf(buf, "%lu\n", (unsigned long) b->name);		\  }  SHOW_FIELDS(interrupt_enable)  SHOW_FIELDS(threshold_limit) @@ -377,38 +390,21 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)  	return size;  } -struct threshold_block_cross_cpu { -	struct threshold_block	*tb; -	long			retval; -}; - -static void local_error_count_handler(void *_tbcc) -{ -	struct threshold_block_cross_cpu *tbcc = _tbcc; -	struct threshold_block *b = tbcc->tb; -	u32 low, high; - -	rdmsr(b->address, low, high); -	tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); -} -  static ssize_t show_error_count(struct threshold_block *b, char *buf)  { -	struct threshold_block_cross_cpu tbcc = { .tb = b, }; +	u32 lo, hi; -	smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); -	return sprintf(buf, "%lx\n", tbcc.retval); -} +	rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); -static ssize_t store_error_count(struct threshold_block *b, -				 const char *buf, size_t count) -{ -	struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; - -	smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); -	return 1; +	return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - +				     (THRESHOLD_MAX - b->threshold_limit)));  } +static struct threshold_attr error_count = { +	.attr = {.name = __stringify(error_count), .mode = 0444 }, +	.show = show_error_count, +}; +  #define RW_ATTR(val)							\  static struct threshold_attr val = {					\  	.attr	= 
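
Illustrative aside on the show_error_count() rewrite above (not part of this diff): the sysfs value is derived from the high half of the threshold MSR. Assuming THRESHOLD_MAX is the 0xFFF mask used by the code being removed, the count field is preloaded with THRESHOLD_MAX - threshold_limit, so subtracting that preload gives the number of errors counted so far:

    #include <stdio.h>

    #define THRESHOLD_MAX 0xFFF   /* assumption: matches the 0xFFF mask in the removed code */

    static unsigned int error_count(unsigned int hi, unsigned int threshold_limit)
    {
        /* field is preloaded with THRESHOLD_MAX - limit and counts upward */
        return (hi & THRESHOLD_MAX) - (THRESHOLD_MAX - threshold_limit);
    }

    int main(void)
    {
        /* limit of 10, raw field 0xFF8: three errors have been counted */
        printf("%u\n", error_count(0xFF8, 10));   /* prints 3 */
        return 0;
    }

The switch from an open-coded smp_call_function_single() callback to rdmsr_on_cpu() keeps the read on the owning CPU while letting the helper do the cross-call.
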
{.name = __stringify(val), .mode = 0644 },		\ @@ -418,7 +414,6 @@ static struct threshold_attr val = {					\  RW_ATTR(interrupt_enable);  RW_ATTR(threshold_limit); -RW_ATTR(error_count);  static struct attribute *default_attrs[] = {  	&threshold_limit.attr, @@ -517,7 +512,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,  	err = kobject_init_and_add(&b->kobj, &threshold_ktype,  				   per_cpu(threshold_banks, cpu)[bank]->kobj, -				   "misc%i", block); +				   (bank == 4 ? bank4_names(b) : th_names[bank]));  	if (err)  		goto out_free;  recurse: @@ -548,98 +543,91 @@ out_free:  	return err;  } -static __cpuinit long -local_allocate_threshold_blocks(int cpu, unsigned int bank) +static __cpuinit int __threshold_add_blocks(struct threshold_bank *b)  { -	return allocate_threshold_blocks(cpu, bank, 0, -					 MSR_IA32_MC0_MISC + bank * 4); +	struct list_head *head = &b->blocks->miscj; +	struct threshold_block *pos = NULL; +	struct threshold_block *tmp = NULL; +	int err = 0; + +	err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); +	if (err) +		return err; + +	list_for_each_entry_safe(pos, tmp, head, miscj) { + +		err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); +		if (err) { +			list_for_each_entry_safe_reverse(pos, tmp, head, miscj) +				kobject_del(&pos->kobj); + +			return err; +		} +	} +	return err;  } -/* symlinks sibling shared banks to first core.  first core owns dir/files. */  static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  { -	int i, err = 0; -	struct threshold_bank *b = NULL;  	struct device *dev = per_cpu(mce_device, cpu); -	char name[32]; - -	sprintf(name, "threshold_bank%i", bank); +	struct amd_northbridge *nb = NULL; +	struct threshold_bank *b = NULL; +	const char *name = th_names[bank]; +	int err = 0; -#ifdef CONFIG_SMP -	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {	/* symlink */ -		i = cpumask_first(cpu_llc_shared_mask(cpu)); +	if (shared_bank[bank]) { -		/* first core not up yet */ -		if (cpu_data(i).cpu_core_id) -			goto out; +		nb = node_to_amd_nb(amd_get_nb_id(cpu)); +		WARN_ON(!nb); -		/* already linked */ -		if (per_cpu(threshold_banks, cpu)[bank]) -			goto out; +		/* threshold descriptor already initialized on this node? 
*/ +		if (nb->bank4) { +			/* yes, use it */ +			b = nb->bank4; +			err = kobject_add(b->kobj, &dev->kobj, name); +			if (err) +				goto out; -		b = per_cpu(threshold_banks, i)[bank]; +			per_cpu(threshold_banks, cpu)[bank] = b; +			atomic_inc(&b->cpus); -		if (!b) -			goto out; +			err = __threshold_add_blocks(b); -		err = sysfs_create_link(&dev->kobj, b->kobj, name); -		if (err)  			goto out; - -		cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu)); -		per_cpu(threshold_banks, cpu)[bank] = b; - -		goto out; +		}  	} -#endif  	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);  	if (!b) {  		err = -ENOMEM;  		goto out;  	} -	if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { -		kfree(b); -		err = -ENOMEM; -		goto out; -	}  	b->kobj = kobject_create_and_add(name, &dev->kobj); -	if (!b->kobj) +	if (!b->kobj) { +		err = -EINVAL;  		goto out_free; - -#ifndef CONFIG_SMP -	cpumask_setall(b->cpus); -#else -	cpumask_set_cpu(cpu, b->cpus); -#endif +	}  	per_cpu(threshold_banks, cpu)[bank] = b; -	err = local_allocate_threshold_blocks(cpu, bank); -	if (err) -		goto out_free; - -	for_each_cpu(i, b->cpus) { -		if (i == cpu) -			continue; +	if (shared_bank[bank]) { +		atomic_set(&b->cpus, 1); -		dev = per_cpu(mce_device, i); -		if (dev) -			err = sysfs_create_link(&dev->kobj,b->kobj, name); -		if (err) -			goto out; - -		per_cpu(threshold_banks, i)[bank] = b; +		/* nb is already initialized, see above */ +		WARN_ON(nb->bank4); +		nb->bank4 = b;  	} -	goto out; +	err = allocate_threshold_blocks(cpu, bank, 0, +					MSR_IA32_MC0_MISC + bank * 4); +	if (!err) +		goto out; -out_free: -	per_cpu(threshold_banks, cpu)[bank] = NULL; -	free_cpumask_var(b->cpus); + out_free:  	kfree(b); -out: + + out:  	return err;  } @@ -660,12 +648,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu)  	return err;  } -/* - * let's be hotplug friendly. - * in case of multiple core processors, the first core always takes ownership - *   of shared sysfs dir/files, and rest of the cores will be symlinked to it. 
- */ -  static void deallocate_threshold_block(unsigned int cpu,  						 unsigned int bank)  { @@ -686,41 +668,42 @@ static void deallocate_threshold_block(unsigned int cpu,  	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;  } +static void __threshold_remove_blocks(struct threshold_bank *b) +{ +	struct threshold_block *pos = NULL; +	struct threshold_block *tmp = NULL; + +	kobject_del(b->kobj); + +	list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) +		kobject_del(&pos->kobj); +} +  static void threshold_remove_bank(unsigned int cpu, int bank)  { +	struct amd_northbridge *nb;  	struct threshold_bank *b; -	struct device *dev; -	char name[32]; -	int i = 0;  	b = per_cpu(threshold_banks, cpu)[bank];  	if (!b)  		return; +  	if (!b->blocks)  		goto free_out; -	sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP -	/* sibling symlink */ -	if (shared_bank[bank] && b->blocks->cpu != cpu) { -		dev = per_cpu(mce_device, cpu); -		sysfs_remove_link(&dev->kobj, name); -		per_cpu(threshold_banks, cpu)[bank] = NULL; - -		return; -	} -#endif - -	/* remove all sibling symlinks before unregistering */ -	for_each_cpu(i, b->cpus) { -		if (i == cpu) -			continue; - -		dev = per_cpu(mce_device, i); -		if (dev) -			sysfs_remove_link(&dev->kobj, name); -		per_cpu(threshold_banks, i)[bank] = NULL; +	if (shared_bank[bank]) { +		if (!atomic_dec_and_test(&b->cpus)) { +			__threshold_remove_blocks(b); +			per_cpu(threshold_banks, cpu)[bank] = NULL; +			return; +		} else { +			/* +			 * the last CPU on this node using the shared bank is +			 * going away, remove that bank now. +			 */ +			nb = node_to_amd_nb(amd_get_nb_id(cpu)); +			nb->bank4 = NULL; +		}  	}  	deallocate_threshold_block(cpu, bank); @@ -728,7 +711,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  free_out:  	kobject_del(b->kobj);  	kobject_put(b->kobj); -	free_cpumask_var(b->cpus);  	kfree(b);  	per_cpu(threshold_banks, cpu)[bank] = NULL;  } diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl index dfea390e1608..c7b3fe2d72e0 100644 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/perl -w  #  # Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h  # @@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";  print OUT "#include <asm/cpufeature.h>\n\n";  print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; +%features = (); +$err = 0; +  while (defined($line = <IN>)) {  	if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {  		$macro = $1; -		$feature = $2; +		$feature = "\L$2";  		$tail = $3;  		if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { -			$feature = $1; +			$feature = "\L$1";  		} -		if ($feature ne '') { -			printf OUT "\t%-32s = \"%s\",\n", -				"[$macro]", "\L$feature"; +		next if ($feature eq ''); + +		if ($features{$feature}++) { +			print STDERR "$in: duplicate feature name: $feature\n"; +			$err++;  		} +		printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature;  	}  }  print OUT "};\n";  close(IN);  close(OUT); + +if ($err) { +	unlink($out); +	exit(1); +} + +exit(0); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index bdda2e6c673b..35ffda5d0727 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,  		/* Compute the maximum size with which we can make a range: */  		if (range_startk) -			
max_align = ffs(range_startk) - 1; +			max_align = __ffs(range_startk);  		else -			max_align = 32; +			max_align = BITS_PER_LONG - 1; -		align = fls(range_sizek) - 1; +		align = __fls(range_sizek);  		if (align > max_align)  			align = max_align; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 75772ae6c65f..e9fe907cd249 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -361,11 +361,7 @@ static void __init print_mtrr_state(void)  	}  	pr_debug("MTRR variable ranges %sabled:\n",  		 mtrr_state.enabled & 2 ? "en" : "dis"); -	if (size_or_mask & 0xffffffffUL) -		high_width = ffs(size_or_mask & 0xffffffffUL) - 1; -	else -		high_width = ffs(size_or_mask>>32) + 32 - 1; -	high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; +	high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;  	for (i = 0; i < num_var_ranges; ++i) {  		if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c4706cf9c011..29557aa06dda 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -35,17 +35,6 @@  #include "perf_event.h" -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) 					\ -do {								\ -	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ -			(unsigned long)(val));			\ -	native_write_msr((msr), (u32)((u64)(val)), 		\ -			(u32)((u64)(val) >> 32));		\ -} while (0) -#endif -  struct x86_pmu x86_pmu __read_mostly;  DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { @@ -74,7 +63,7 @@ u64 x86_perf_event_update(struct perf_event *event)  	int idx = hwc->idx;  	s64 delta; -	if (idx == X86_PMC_IDX_FIXED_BTS) +	if (idx == INTEL_PMC_IDX_FIXED_BTS)  		return 0;  	/* @@ -86,7 +75,7 @@ u64 x86_perf_event_update(struct perf_event *event)  	 */  again:  	prev_raw_count = local64_read(&hwc->prev_count); -	rdmsrl(hwc->event_base, new_raw_count); +	rdpmcl(hwc->event_base_rdpmc, new_raw_count);  	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,  					new_raw_count) != prev_raw_count) @@ -189,7 +178,7 @@ static void release_pmc_hardware(void) {}  static bool check_hw_exists(void)  { -	u64 val, val_new = 0; +	u64 val, val_new = ~0;  	int i, reg, ret = 0;  	/* @@ -222,8 +211,9 @@ static bool check_hw_exists(void)  	 * that don't trap on the MSR access and always return 0s.  	 
*/  	val = 0xabcdUL; -	ret = checking_wrmsrl(x86_pmu_event_addr(0), val); -	ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); +	reg = x86_pmu_event_addr(0); +	ret = wrmsrl_safe(reg, val); +	ret |= rdmsrl_safe(reg, &val_new);  	if (ret || val != val_new)  		goto msr_fail; @@ -240,6 +230,7 @@ bios_fail:  msr_fail:  	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); +	printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);  	return false;  } @@ -388,7 +379,7 @@ int x86_pmu_hw_config(struct perf_event *event)  		int precise = 0;  		/* Support for constant skid */ -		if (x86_pmu.pebs_active) { +		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {  			precise++;  			/* Support for IP fixup */ @@ -637,8 +628,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)  	c = sched->constraints[sched->state.event];  	/* Prefer fixed purpose counters */ -	if (x86_pmu.num_counters_fixed) { -		idx = X86_PMC_IDX_FIXED; +	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { +		idx = INTEL_PMC_IDX_FIXED;  		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {  			if (!__test_and_set_bit(idx, sched->state.used))  				goto done; @@ -646,7 +637,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)  	}  	/* Grab the first unused counter starting with idx */  	idx = sched->state.counter; -	for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { +	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {  		if (!__test_and_set_bit(idx, sched->state.used))  			goto done;  	} @@ -704,8 +695,8 @@ static bool perf_sched_next_event(struct perf_sched *sched)  /*   * Assign a counter for each event.   */ -static int perf_assign_events(struct event_constraint **constraints, int n, -			      int wmin, int wmax, int *assign) +int perf_assign_events(struct event_constraint **constraints, int n, +			int wmin, int wmax, int *assign)  {  	struct perf_sched sched; @@ -824,15 +815,17 @@ static inline void x86_assign_hw_event(struct perf_event *event,  	hwc->last_cpu = smp_processor_id();  	hwc->last_tag = ++cpuc->tags[i]; -	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { +	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {  		hwc->config_base = 0;  		hwc->event_base	= 0; -	} else if (hwc->idx >= X86_PMC_IDX_FIXED) { +	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {  		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; -		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); +		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); +		hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;  	} else {  		hwc->config_base = x86_pmu_config_addr(hwc->idx);  		hwc->event_base  = x86_pmu_event_addr(hwc->idx); +		hwc->event_base_rdpmc = hwc->idx;  	}  } @@ -930,7 +923,7 @@ int x86_perf_event_set_period(struct perf_event *event)  	s64 period = hwc->sample_period;  	int ret = 0, idx = hwc->idx; -	if (idx == X86_PMC_IDX_FIXED_BTS) +	if (idx == INTEL_PMC_IDX_FIXED_BTS)  		return 0;  	/* @@ -1316,7 +1309,6 @@ static struct attribute_group x86_pmu_format_group = {  static int __init init_hw_perf_events(void)  {  	struct x86_pmu_quirk *quirk; -	struct event_constraint *c;  	int err;  	pr_info("Performance Events: "); @@ -1347,21 +1339,8 @@ static int __init init_hw_perf_events(void)  	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)  		quirk->func(); -	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { -		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", -		     x86_pmu.num_counters, 
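
Illustrative aside on the event_base_rdpmc encoding above (not part of this diff): RDPMC selects fixed-function counters by setting bit 30 of the index in ECX, which is why fixed events store (idx - INTEL_PMC_IDX_FIXED) | 1<<30. A user-space sketch, assuming RDPMC is permitted for the caller (see the attr_rdpmc knob) and the counters are actually programmed; otherwise the instruction faults or returns meaningless values:

    /* read a performance counter directly; ecx picks the counter */
    static inline unsigned long long rdpmc(unsigned int ecx)
    {
        unsigned int lo, hi;

        asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (ecx));
        return ((unsigned long long)hi << 32) | lo;
    }

    unsigned long long read_gp_counter0(void)    { return rdpmc(0); }               /* general-purpose counter 0 */
    unsigned long long read_fixed_counter1(void) { return rdpmc(1 | (1u << 30)); }  /* fixed-function counter 1 */
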
X86_PMC_MAX_GENERIC); -		x86_pmu.num_counters = X86_PMC_MAX_GENERIC; -	} -	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - -	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { -		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", -		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); -		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; -	} - -	x86_pmu.intel_ctrl |= -		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; +	if (!x86_pmu.intel_ctrl) +		x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;  	perf_events_lapic_init();  	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); @@ -1370,22 +1349,6 @@ static int __init init_hw_perf_events(void)  		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,  				   0, x86_pmu.num_counters, 0); -	if (x86_pmu.event_constraints) { -		/* -		 * event on fixed counter2 (REF_CYCLES) only works on this -		 * counter, so do not extend mask to generic counters -		 */ -		for_each_event_constraint(c, x86_pmu.event_constraints) { -			if (c->cmask != X86_RAW_EVENT_MASK -			    || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { -				continue; -			} - -			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; -			c->weight += x86_pmu.num_counters; -		} -	} -  	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */  	x86_pmu_format_group.attrs = x86_pmu.format_attrs; @@ -1620,8 +1583,8 @@ static int x86_pmu_event_idx(struct perf_event *event)  	if (!x86_pmu.attr_rdpmc)  		return 0; -	if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { -		idx -= X86_PMC_IDX_FIXED; +	if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { +		idx -= INTEL_PMC_IDX_FIXED;  		idx |= 1 << 30;  	} @@ -1649,7 +1612,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,  			      struct device_attribute *attr,  			      const char *buf, size_t count)  { -	unsigned long val = simple_strtoul(buf, NULL, 0); +	unsigned long val; +	ssize_t ret; + +	ret = kstrtoul(buf, 0, &val); +	if (ret) +		return ret;  	if (!!val != !!x86_pmu.attr_rdpmc) {  		x86_pmu.attr_rdpmc = !!val; @@ -1682,13 +1650,20 @@ static void x86_pmu_flush_branch_stack(void)  		x86_pmu.flush_branch_stack();  } +void perf_check_microcode(void) +{ +	if (x86_pmu.check_microcode) +		x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); +  static struct pmu pmu = {  	.pmu_enable		= x86_pmu_enable,  	.pmu_disable		= x86_pmu_disable, -	.attr_groups	= x86_pmu_attr_groups, +	.attr_groups		= x86_pmu_attr_groups, -	.event_init	= x86_pmu_event_init, +	.event_init		= x86_pmu_event_init,  	.add			= x86_pmu_add,  	.del			= x86_pmu_del, @@ -1696,11 +1671,11 @@ static struct pmu pmu = {  	.stop			= x86_pmu_stop,  	.read			= x86_pmu_read, -	.start_txn	= x86_pmu_start_txn, -	.cancel_txn	= x86_pmu_cancel_txn, -	.commit_txn	= x86_pmu_commit_txn, +	.start_txn		= x86_pmu_start_txn, +	.cancel_txn		= x86_pmu_cancel_txn, +	.commit_txn		= x86_pmu_commit_txn, -	.event_idx	= x86_pmu_event_idx, +	.event_idx		= x86_pmu_event_idx,  	.flush_branch_stack	= x86_pmu_flush_branch_stack,  }; @@ -1863,7 +1838,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)  		else  			misc |= PERF_RECORD_MISC_GUEST_KERNEL;  	} else { -		if (user_mode(regs)) +		if (!kernel_ip(regs->ip))  			misc |= PERF_RECORD_MISC_USER;  		else  			misc |= PERF_RECORD_MISC_KERNEL; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7241e2fc3c17..a15df4be151f 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -14,6 
+14,18 @@  #include <linux/perf_event.h> +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) 						\ +do {									\ +	unsigned int _msr = (msr);					\ +	u64 _val = (val);						\ +	trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr),		\ +			(unsigned long long)(_val));			\ +	native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32));	\ +} while (0) +#endif +  /*   *          |   NHM/WSM    |      SNB     |   * register ------------------------------- @@ -57,7 +69,7 @@ struct amd_nb {  };  /* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS		4 +#define MAX_PEBS_EVENTS		8  /*   * A debug store configuration. @@ -349,6 +361,8 @@ struct x86_pmu {  	void		(*cpu_starting)(int cpu);  	void		(*cpu_dying)(int cpu);  	void		(*cpu_dead)(int cpu); + +	void		(*check_microcode)(void);  	void		(*flush_branch_stack)(void);  	/* @@ -360,12 +374,16 @@ struct x86_pmu {  	/*  	 * Intel DebugStore bits  	 */ -	int		bts, pebs; -	int		bts_active, pebs_active; +	int		bts		:1, +			bts_active	:1, +			pebs		:1, +			pebs_active	:1, +			pebs_broken	:1;  	int		pebs_record_size;  	void		(*drain_pebs)(struct pt_regs *regs);  	struct event_constraint *pebs_constraints;  	void		(*pebs_aliases)(struct perf_event *event); +	int 		max_pebs_events;  	/*  	 * Intel LBR @@ -468,6 +486,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,  void x86_pmu_enable_all(int added); +int perf_assign_events(struct event_constraint **constraints, int n, +			int wmin, int wmax, int *assign);  int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);  void x86_pmu_stop(struct perf_event *event, int flags); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 11a4eb9131d5..4528ae7b6ec4 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu)  	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; -	if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) +	if (boot_cpu_data.x86_max_cores < 2)  		return;  	nb_id = amd_get_nb_id(cpu); @@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = {  	NULL,  }; -static __initconst const struct x86_pmu amd_pmu = { -	.name			= "AMD", -	.handle_irq		= x86_pmu_handle_irq, -	.disable_all		= x86_pmu_disable_all, -	.enable_all		= x86_pmu_enable_all, -	.enable			= x86_pmu_enable_event, -	.disable		= x86_pmu_disable_event, -	.hw_config		= amd_pmu_hw_config, -	.schedule_events	= x86_schedule_events, -	.eventsel		= MSR_K7_EVNTSEL0, -	.perfctr		= MSR_K7_PERFCTR0, -	.event_map		= amd_pmu_event_map, -	.max_events		= ARRAY_SIZE(amd_perfmon_event_map), -	.num_counters		= AMD64_NUM_COUNTERS, -	.cntval_bits		= 48, -	.cntval_mask		= (1ULL << 48) - 1, -	.apic			= 1, -	/* use highest bit to detect overflow */ -	.max_period		= (1ULL << 47) - 1, -	.get_event_constraints	= amd_get_event_constraints, -	.put_event_constraints	= amd_put_event_constraints, - -	.format_attrs		= amd_format_attr, - -	.cpu_prepare		= amd_pmu_cpu_prepare, -	.cpu_starting		= amd_pmu_cpu_starting, -	.cpu_dead		= amd_pmu_cpu_dead, -}; -  /* AMD Family 15h */  #define AMD_EVENT_TYPE_MASK	0x000000F0ULL @@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev  	}  } -static __initconst const struct x86_pmu amd_pmu_f15h = { -	.name			= "AMD Family 15h", +static __initconst const struct x86_pmu amd_pmu = { +	.name			= "AMD",  	.handle_irq		= x86_pmu_handle_irq,  	.disable_all		= x86_pmu_disable_all,  	
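
Illustrative aside on the debug wrmsrl() macro moved into perf_event.h above (not part of this diff): the new version copies msr and val into the locals _msr/_val once, so an argument with side effects is evaluated a single time. A minimal demonstration of the hazard those locals avoid, with a hypothetical trace() standing in for trace_printk()/native_write_msr():

    #include <stdio.h>

    #define trace(m, v) printf("msr=%x val=%llx\n", (unsigned)(m), (unsigned long long)(v))

    /* old style: the arguments expand, and are evaluated, more than once */
    #define WRMSRL_OLD(msr, val) do { trace(msr, val); trace(msr, val); } while (0)

    /* new style: evaluate once, then reuse the locals */
    #define WRMSRL_NEW(msr, val) do {                \
            unsigned int _msr = (msr);               \
            unsigned long long _val = (val);         \
            trace(_msr, _val); trace(_msr, _val);    \
    } while (0)

    int main(void)
    {
        unsigned long long v = 1;

        WRMSRL_OLD(0x38f, v++);   /* v is incremented twice */
        v = 1;
        WRMSRL_NEW(0x38f, v++);   /* v is incremented exactly once */
        return 0;
    }
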
.enable_all		= x86_pmu_enable_all, @@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {  	.disable		= x86_pmu_disable_event,  	.hw_config		= amd_pmu_hw_config,  	.schedule_events	= x86_schedule_events, -	.eventsel		= MSR_F15H_PERF_CTL, -	.perfctr		= MSR_F15H_PERF_CTR, +	.eventsel		= MSR_K7_EVNTSEL0, +	.perfctr		= MSR_K7_PERFCTR0,  	.event_map		= amd_pmu_event_map,  	.max_events		= ARRAY_SIZE(amd_perfmon_event_map), -	.num_counters		= AMD64_NUM_COUNTERS_F15H, +	.num_counters		= AMD64_NUM_COUNTERS,  	.cntval_bits		= 48,  	.cntval_mask		= (1ULL << 48) - 1,  	.apic			= 1,  	/* use highest bit to detect overflow */  	.max_period		= (1ULL << 47) - 1, -	.get_event_constraints	= amd_get_event_constraints_f15h, -	/* nortbridge counters not yet implemented: */ -#if 0 +	.get_event_constraints	= amd_get_event_constraints,  	.put_event_constraints	= amd_put_event_constraints, +	.format_attrs		= amd_format_attr, +  	.cpu_prepare		= amd_pmu_cpu_prepare, -	.cpu_dead		= amd_pmu_cpu_dead, -#endif  	.cpu_starting		= amd_pmu_cpu_starting, -	.format_attrs		= amd_format_attr, +	.cpu_dead		= amd_pmu_cpu_dead,  }; +static int setup_event_constraints(void) +{ +	if (boot_cpu_data.x86 >= 0x15) +		x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; +	return 0; +} + +static int setup_perfctr_core(void) +{ +	if (!cpu_has_perfctr_core) { +		WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h, +		     KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!"); +		return -ENODEV; +	} + +	WARN(x86_pmu.get_event_constraints == amd_get_event_constraints, +	     KERN_ERR "hw perf events core counters need constraints handler!"); + +	/* +	 * If core performance counter extensions exists, we must use +	 * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also +	 * x86_pmu_addr_offset(). +	 */ +	x86_pmu.eventsel	= MSR_F15H_PERF_CTL; +	x86_pmu.perfctr		= MSR_F15H_PERF_CTR; +	x86_pmu.num_counters	= AMD64_NUM_COUNTERS_CORE; + +	printk(KERN_INFO "perf: AMD core performance counters detected\n"); + +	return 0; +} +  __init int amd_pmu_init(void)  {  	/* Performance-monitoring supported from K7 and later: */  	if (boot_cpu_data.x86 < 6)  		return -ENODEV; -	/* -	 * If core performance counter extensions exists, it must be -	 * family 15h, otherwise fail. See x86_pmu_addr_offset(). -	 */ -	switch (boot_cpu_data.x86) { -	case 0x15: -		if (!cpu_has_perfctr_core) -			return -ENODEV; -		x86_pmu = amd_pmu_f15h; -		break; -	default: -		if (cpu_has_perfctr_core) -			return -ENODEV; -		x86_pmu = amd_pmu; -		break; -	} +	x86_pmu = amd_pmu; + +	setup_event_constraints(); +	setup_perfctr_core();  	/* Events are common for all AMDs */  	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 187c294bc658..7a8b9d0abcaa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -5,6 +5,8 @@   * among events on a single PMU.   
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/stddef.h>  #include <linux/types.h>  #include <linux/init.h> @@ -21,14 +23,14 @@   */  static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =  { -  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c, -  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0, -  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e, -  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e, -  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4, -  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5, -  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c, -  [PERF_COUNT_HW_REF_CPU_CYCLES]	= 0x0300, /* pseudo-encoding */ +	[PERF_COUNT_HW_CPU_CYCLES]		= 0x003c, +	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0, +	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e, +	[PERF_COUNT_HW_CACHE_MISSES]		= 0x412e, +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4, +	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5, +	[PERF_COUNT_HW_BUS_CYCLES]		= 0x013c, +	[PERF_COUNT_HW_REF_CPU_CYCLES]		= 0x0300, /* pseudo-encoding */  };  static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -747,7 +749,7 @@ static void intel_pmu_disable_all(void)  	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); -	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) +	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))  		intel_pmu_disable_bts();  	intel_pmu_pebs_disable_all(); @@ -763,9 +765,9 @@ static void intel_pmu_enable_all(int added)  	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,  			x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); -	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { +	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {  		struct perf_event *event = -			cpuc->events[X86_PMC_IDX_FIXED_BTS]; +			cpuc->events[INTEL_PMC_IDX_FIXED_BTS];  		if (WARN_ON_ONCE(!event))  			return; @@ -871,7 +873,7 @@ static inline void intel_pmu_ack_status(u64 ack)  static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)  { -	int idx = hwc->idx - X86_PMC_IDX_FIXED; +	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;  	u64 ctrl_val, mask;  	mask = 0xfULL << (idx * 4); @@ -886,7 +888,7 @@ static void intel_pmu_disable_event(struct perf_event *event)  	struct hw_perf_event *hwc = &event->hw;  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { +	if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {  		intel_pmu_disable_bts();  		intel_pmu_drain_bts_buffer();  		return; @@ -915,7 +917,7 @@ static void intel_pmu_disable_event(struct perf_event *event)  static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)  { -	int idx = hwc->idx - X86_PMC_IDX_FIXED; +	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;  	u64 ctrl_val, bits, mask;  	/* @@ -949,7 +951,7 @@ static void intel_pmu_enable_event(struct perf_event *event)  	struct hw_perf_event *hwc = &event->hw;  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { +	if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {  		if (!__this_cpu_read(cpu_hw_events.enabled))  			return; @@ -1000,14 +1002,14 @@ static void intel_pmu_reset(void)  	local_irq_save(flags); -	printk("clearing PMU state on CPU#%d\n", smp_processor_id()); +	pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());  	for (idx = 0; idx < x86_pmu.num_counters; idx++) { -		checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); -		checking_wrmsrl(x86_pmu_event_addr(idx),  0ull); +		wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); +		wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);  	}  	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) -		
checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); +		wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);  	if (ds)  		ds->bts_index = ds->bts_buffer_base; @@ -1707,16 +1709,61 @@ static __init void intel_clovertown_quirk(void)  	 * But taken together it might just make sense to not enable PEBS on  	 * these chips.  	 */ -	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); +	pr_warn("PEBS disabled due to CPU errata\n");  	x86_pmu.pebs = 0;  	x86_pmu.pebs_constraints = NULL;  } +static int intel_snb_pebs_broken(int cpu) +{ +	u32 rev = UINT_MAX; /* default to broken for unknown models */ + +	switch (cpu_data(cpu).x86_model) { +	case 42: /* SNB */ +		rev = 0x28; +		break; + +	case 45: /* SNB-EP */ +		switch (cpu_data(cpu).x86_mask) { +		case 6: rev = 0x618; break; +		case 7: rev = 0x70c; break; +		} +	} + +	return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ +	int pebs_broken = 0; +	int cpu; + +	get_online_cpus(); +	for_each_online_cpu(cpu) { +		if ((pebs_broken = intel_snb_pebs_broken(cpu))) +			break; +	} +	put_online_cpus(); + +	if (pebs_broken == x86_pmu.pebs_broken) +		return; + +	/* +	 * Serialized by the microcode lock.. +	 */ +	if (x86_pmu.pebs_broken) { +		pr_info("PEBS enabled due to microcode update\n"); +		x86_pmu.pebs_broken = 0; +	} else { +		pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); +		x86_pmu.pebs_broken = 1; +	} +} +  static __init void intel_sandybridge_quirk(void)  { -	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); -	x86_pmu.pebs = 0; -	x86_pmu.pebs_constraints = NULL; +	x86_pmu.check_microcode = intel_snb_check_microcode; +	intel_snb_check_microcode();  }  static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { @@ -1736,8 +1783,8 @@ static __init void intel_arch_events_quirk(void)  	/* disable event that reported as not presend by cpuid */  	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {  		intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; -		printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", -				intel_arch_events_map[bit].name); +		pr_warn("CPUID marked event: \'%s\' unavailable\n", +			intel_arch_events_map[bit].name);  	}  } @@ -1756,7 +1803,7 @@ static __init void intel_nehalem_quirk(void)  		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;  		ebx.split.no_branch_misses_retired = 0;  		x86_pmu.events_maskl = ebx.full; -		printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); +		pr_info("CPU erratum AAJ80 worked around\n");  	}  } @@ -1765,6 +1812,7 @@ __init int intel_pmu_init(void)  	union cpuid10_edx edx;  	union cpuid10_eax eax;  	union cpuid10_ebx ebx; +	struct event_constraint *c;  	unsigned int unused;  	int version; @@ -1800,6 +1848,8 @@ __init int intel_pmu_init(void)  	x86_pmu.events_maskl		= ebx.full;  	x86_pmu.events_mask_len		= eax.split.mask_length; +	x86_pmu.max_pebs_events		= min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); +  	/*  	 * Quirk: v2 perfmon does not report fixed-purpose events, so  	 * assume at least 3 events: @@ -1951,5 +2001,37 @@ __init int intel_pmu_init(void)  		}  	} +	if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { +		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", +		     x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); +		x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; +	} +	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + +	if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { +		WARN(1, KERN_ERR "hw perf 
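
Illustrative aside on intel_snb_pebs_broken() above (not part of this diff): PEBS stays off on SandyBridge unless the running microcode is at least the known-good revision for the model/stepping, with unknown models defaulting to broken. A standalone restatement of that predicate:

    #include <limits.h>
    #include <stdbool.h>

    static bool snb_pebs_broken(unsigned int model, unsigned int stepping,
                                unsigned int microcode)
    {
        unsigned int rev = UINT_MAX;        /* unknown models default to broken */

        if (model == 42) {                  /* SandyBridge client */
            rev = 0x28;
        } else if (model == 45) {           /* SandyBridge-EP */
            if (stepping == 6)
                rev = 0x618;
            else if (stepping == 7)
                rev = 0x70c;
        }
        return microcode < rev;
    }

For example, a model 45 stepping 7 part on microcode 0x70b is still treated as broken; once 0x70c or newer is loaded, the check_microcode callback above clears pebs_broken again.
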
events fixed %d > max(%d), clipping!", +		     x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); +		x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; +	} + +	x86_pmu.intel_ctrl |= +		((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + +	if (x86_pmu.event_constraints) { +		/* +		 * event on fixed counter2 (REF_CYCLES) only works on this +		 * counter, so do not extend mask to generic counters +		 */ +		for_each_event_constraint(c, x86_pmu.event_constraints) { +			if (c->cmask != X86_RAW_EVENT_MASK +			    || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { +				continue; +			} + +			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; +			c->weight += x86_pmu.num_counters; +		} +	} +  	return 0;  } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 35e2192df9f4..629ae0b7ad90 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -248,7 +248,7 @@ void reserve_ds_buffers(void)   */  struct event_constraint bts_constraint = -	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); +	EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);  void intel_pmu_enable_bts(u64 config)  { @@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void)  		u64	to;  		u64	flags;  	}; -	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; +	struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];  	struct bts_record *at, *top;  	struct perf_output_handle handle;  	struct perf_event_header header; @@ -620,7 +620,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)  	 * Should not happen, we program the threshold at 1 and do not  	 * set a reset value.  	 */ -	WARN_ON_ONCE(n > 1); +	WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);  	at += n - 1;  	__intel_pmu_pebs_event(event, iregs, at); @@ -651,10 +651,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)  	 * Should not happen, we program the threshold at 1 and do not  	 * set a reset value.  	 */ -	WARN_ON_ONCE(n > MAX_PEBS_EVENTS); +	WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n);  	for ( ; at < top; at++) { -		for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { +		for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) {  			event = cpuc->events[bit];  			if (!test_bit(bit, cpuc->active_mask))  				continue; @@ -670,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)  			break;  		} -		if (!event || bit >= MAX_PEBS_EVENTS) +		if (!event || bit >= x86_pmu.max_pebs_events)  			continue;  		__intel_pmu_pebs_event(event, iregs, at); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c new file mode 100644 index 000000000000..19faffc60886 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -0,0 +1,1850 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 
255] = -1, }; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = +	EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = +	EVENT_CONSTRAINT(0, 0, 0); + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand0, filter_brand0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand1, filter_brand1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand2, filter_brand2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand3, filter_brand3, "config1:24-31"); + +/* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ +	struct pci_dev *pdev = box->pci_dev; +	int box_ctl = uncore_pci_box_ctl(box); +	u32 config; + +	pci_read_config_dword(pdev, box_ctl, &config); +	config |= SNBEP_PMON_BOX_CTL_FRZ; +	pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ +	struct pci_dev *pdev = box->pci_dev; +	int box_ctl = uncore_pci_box_ctl(box); +	u32 config; + +	pci_read_config_dword(pdev, box_ctl, &config); +	config &= ~SNBEP_PMON_BOX_CTL_FRZ; +	pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; + +	pci_write_config_dword(pdev, hwc->config_base, hwc->config | +				SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; + +	pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; +	u64 count; + +	pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); +	pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); +	return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ +	struct pci_dev *pdev = box->pci_dev; +	pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, +				SNBEP_PMON_BOX_CTL_INT); +} + +static void 
snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ +	u64 config; +	unsigned msr; + +	msr = uncore_msr_box_ctl(box); +	if (msr) { +		rdmsrl(msr, config); +		config |= SNBEP_PMON_BOX_CTL_FRZ; +		wrmsrl(msr, config); +		return; +	} +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ +	u64 config; +	unsigned msr; + +	msr = uncore_msr_box_ctl(box); +	if (msr) { +		rdmsrl(msr, config); +		config &= ~SNBEP_PMON_BOX_CTL_FRZ; +		wrmsrl(msr, config); +		return; +	} +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; +	struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + +	if (reg1->idx != EXTRA_REG_NONE) +		wrmsrl(reg1->reg, reg1->config); + +	wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; + +	wrmsrl(hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_msr_read_counter(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; +	u64 count; + +	rdmsrl(hwc->event_base, count); +	return count; +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ +	unsigned msr = uncore_msr_box_ctl(box); +	if (msr) +		wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct event_constraint * +snbep_uncore_get_constraint(struct intel_uncore_box *box, +			    struct perf_event *event) +{ +	struct intel_uncore_extra_reg *er; +	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; +	unsigned long flags; +	bool ok = false; + +	if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc)) +		return NULL; + +	er = &box->shared_regs[reg1->idx]; +	raw_spin_lock_irqsave(&er->lock, flags); +	if (!atomic_read(&er->ref) || er->config1 == reg1->config) { +		atomic_inc(&er->ref); +		er->config1 = reg1->config; +		ok = true; +	} +	raw_spin_unlock_irqrestore(&er->lock, flags); + +	if (ok) { +		if (box->phys_id >= 0) +			reg1->alloc = 1; +		return NULL; +	} +	return &constraint_empty; +} + +static void snbep_uncore_put_constraint(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct intel_uncore_extra_reg *er; +	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + +	if (box->phys_id < 0 || !reg1->alloc) +		return; + +	er = &box->shared_regs[reg1->idx]; +	atomic_dec(&er->ref); +	reg1->alloc = 0; +} + +static int snbep_uncore_hw_config(struct intel_uncore_box *box, +				  struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; +	struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + +	if (box->pmu->type == &snbep_uncore_cbox) { +		reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + +			SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; +		reg1->config = event->attr.config1 & +			SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK; +	} else if (box->pmu->type == &snbep_uncore_pcu) { +		reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; +		reg1->config = event->attr.config1 & +			SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; +	} else { +		return 0; +	} +	reg1->idx = 0; +	return 0; +} + +static struct attribute *snbep_uncore_formats_attr[] = { +	&format_attr_event.attr, +	&format_attr_umask.attr, +	&format_attr_edge.attr, +	&format_attr_inv.attr, +	&format_attr_thresh8.attr, +	NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { +	&format_attr_event.attr, +	&format_attr_umask.attr, +	&format_attr_edge.attr, +	&format_attr_inv.attr, +	
&format_attr_thresh5.attr, +	NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { +	&format_attr_event.attr, +	&format_attr_umask.attr, +	&format_attr_edge.attr, +	&format_attr_tid_en.attr, +	&format_attr_inv.attr, +	&format_attr_thresh8.attr, +	&format_attr_filter_tid.attr, +	&format_attr_filter_nid.attr, +	&format_attr_filter_state.attr, +	&format_attr_filter_opc.attr, +	NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { +	&format_attr_event.attr, +	&format_attr_occ_sel.attr, +	&format_attr_edge.attr, +	&format_attr_inv.attr, +	&format_attr_thresh5.attr, +	&format_attr_occ_invert.attr, +	&format_attr_occ_edge.attr, +	&format_attr_filter_brand0.attr, +	&format_attr_filter_brand1.attr, +	&format_attr_filter_brand2.attr, +	&format_attr_filter_brand3.attr, +	NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { +	INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"), +	INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"), +	INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), +	{ /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { +	INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"), +	INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), +	INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x02,umask=0x08"), +	INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x03,umask=0x04"), +	{ /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { +	.name = "format", +	.attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { +	.name = "format", +	.attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { +	.name = "format", +	.attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { +	.name = "format", +	.attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_ops snbep_uncore_msr_ops = { +	.init_box	= snbep_uncore_msr_init_box, +	.disable_box	= snbep_uncore_msr_disable_box, +	.enable_box	= snbep_uncore_msr_enable_box, +	.disable_event	= snbep_uncore_msr_disable_event, +	.enable_event	= snbep_uncore_msr_enable_event, +	.read_counter	= snbep_uncore_msr_read_counter, +	.get_constraint = snbep_uncore_get_constraint, +	.put_constraint = snbep_uncore_put_constraint, +	.hw_config	= snbep_uncore_hw_config, +}; + +static struct intel_uncore_ops snbep_uncore_pci_ops = { +	.init_box	= snbep_uncore_pci_init_box, +	.disable_box	= snbep_uncore_pci_disable_box, +	.enable_box	= snbep_uncore_pci_enable_box, +	.disable_event	= snbep_uncore_pci_disable_event, +	.enable_event	= snbep_uncore_pci_enable_event, +	.read_counter	= snbep_uncore_pci_read_counter, +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { +	UNCORE_EVENT_CONSTRAINT(0x01, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x02, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x04, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x05, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x07, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x11, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x12, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x13, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), +	UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), +	UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), +	UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), +	EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), +	UNCORE_EVENT_CONSTRAINT(0x21, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x23, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x31, 0x3), +	
UNCORE_EVENT_CONSTRAINT(0x32, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x33, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x34, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x35, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x36, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x37, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x38, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x39, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), +	EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { +	UNCORE_EVENT_CONSTRAINT(0x10, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x11, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x12, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x23, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x24, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x25, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x26, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x32, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x33, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x34, 0x3), +	EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { +	UNCORE_EVENT_CONSTRAINT(0x10, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x11, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x12, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x13, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x20, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x21, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x22, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x23, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x24, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x25, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x26, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x30, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x31, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x32, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x33, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x34, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x36, 0x3), +	UNCORE_EVENT_CONSTRAINT(0x37, 0x3), +	EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { +	.name		= "ubox", +	.num_counters   = 2, +	.num_boxes	= 1, +	.perf_ctr_bits	= 44, +	.fixed_ctr_bits	= 48, +	.perf_ctr	= SNBEP_U_MSR_PMON_CTR0, +	.event_ctl	= SNBEP_U_MSR_PMON_CTL0, +	.event_mask	= SNBEP_U_MSR_PMON_RAW_EVENT_MASK, +	.fixed_ctr	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, +	.fixed_ctl	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, +	.ops		= &snbep_uncore_msr_ops, +	.format_group	= &snbep_uncore_ubox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { +	.name			= "cbox", +	.num_counters		= 4, +	.num_boxes		= 8, +	.perf_ctr_bits		= 44, +	.event_ctl		= SNBEP_C0_MSR_PMON_CTL0, +	.perf_ctr		= SNBEP_C0_MSR_PMON_CTR0, +	.event_mask		= SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, +	.box_ctl		= SNBEP_C0_MSR_PMON_BOX_CTL, +	.msr_offset		= SNBEP_CBO_MSR_OFFSET, +	.num_shared_regs	= 1, +	.constraints		= snbep_uncore_cbox_constraints, +	.ops			= &snbep_uncore_msr_ops, +	.format_group		= &snbep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { +	.name			= "pcu", +	.num_counters		= 4, +	.num_boxes		= 1, +	.perf_ctr_bits		= 48, +	.perf_ctr		= SNBEP_PCU_MSR_PMON_CTR0, +	.event_ctl		= SNBEP_PCU_MSR_PMON_CTL0, +	.event_mask		= SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, +	.box_ctl		= SNBEP_PCU_MSR_PMON_BOX_CTL, +	.num_shared_regs	= 1, +	.ops			= &snbep_uncore_msr_ops, +	.format_group		= &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { +	&snbep_uncore_ubox, +	&snbep_uncore_cbox, +	&snbep_uncore_pcu, +	NULL, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT()				\ +	.perf_ctr	= SNBEP_PCI_PMON_CTR0,			\ +	.event_ctl	= SNBEP_PCI_PMON_CTL0,			\ +	.event_mask	= SNBEP_PMON_RAW_EVENT_MASK,		\ +	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,		\ +	.ops		= &snbep_uncore_pci_ops,		\ +	.format_group	= &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { +	.name		= "ha", +	
.num_counters   = 4, +	.num_boxes	= 1, +	.perf_ctr_bits	= 48, +	SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { +	.name		= "imc", +	.num_counters   = 4, +	.num_boxes	= 4, +	.perf_ctr_bits	= 48, +	.fixed_ctr_bits	= 48, +	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, +	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, +	.event_descs	= snbep_uncore_imc_events, +	SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { +	.name		= "qpi", +	.num_counters   = 4, +	.num_boxes	= 2, +	.perf_ctr_bits	= 48, +	.event_descs	= snbep_uncore_qpi_events, +	SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { +	.name		= "r2pcie", +	.num_counters   = 4, +	.num_boxes	= 1, +	.perf_ctr_bits	= 44, +	.constraints	= snbep_uncore_r2pcie_constraints, +	SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { +	.name		= "r3qpi", +	.num_counters   = 3, +	.num_boxes	= 2, +	.perf_ctr_bits	= 44, +	.constraints	= snbep_uncore_r3qpi_constraints, +	SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { +	&snbep_uncore_ha, +	&snbep_uncore_imc, +	&snbep_uncore_qpi, +	&snbep_uncore_r2pcie, +	&snbep_uncore_r3qpi, +	NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { +	{ /* Home Agent */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), +		.driver_data = (unsigned long)&snbep_uncore_ha, +	}, +	{ /* MC Channel 0 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), +		.driver_data = (unsigned long)&snbep_uncore_imc, +	}, +	{ /* MC Channel 1 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), +		.driver_data = (unsigned long)&snbep_uncore_imc, +	}, +	{ /* MC Channel 2 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), +		.driver_data = (unsigned long)&snbep_uncore_imc, +	}, +	{ /* MC Channel 3 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), +		.driver_data = (unsigned long)&snbep_uncore_imc, +	}, +	{ /* QPI Port 0 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), +		.driver_data = (unsigned long)&snbep_uncore_qpi, +	}, +	{ /* QPI Port 1 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), +		.driver_data = (unsigned long)&snbep_uncore_qpi, +	}, +	{ /* P2PCIe */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), +		.driver_data = (unsigned long)&snbep_uncore_r2pcie, +	}, +	{ /* R3QPI Link 0 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), +		.driver_data = (unsigned long)&snbep_uncore_r3qpi, +	}, +	{ /* R3QPI Link 1 */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), +		.driver_data = (unsigned long)&snbep_uncore_r3qpi, +	}, +	{ /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { +	.name		= "snbep_uncore", +	.id_table	= snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static void snbep_pci2phy_map_init(void) +{ +	struct pci_dev *ubox_dev = NULL; +	int i, bus, nodeid; +	u32 config; + +	while (1) { +		/* find the UBOX device */ +		ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, +					PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX, +					ubox_dev); +		if (!ubox_dev) +			break; +		bus = ubox_dev->bus->number; +		/* get the Node ID of the local register */ +		pci_read_config_dword(ubox_dev, 0x40, &config); +		nodeid = config; +		/* get the Node ID mapping */ +		pci_read_config_dword(ubox_dev, 0x54, &config); +	
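	/* +		 * Illustrative decode (hypothetical values): if the local node ID +		 * read from offset 0x40 is 3 and bits [11:9] of the mapping +		 * register read from offset 0x54 are also 3, the loop below +		 * matches at i == 3 and records pcibus_to_physid[bus] = 3. +		 */ +	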
	/* +		 * every three bits in the Node ID mapping register maps +		 * to a particular node. +		 */ +		for (i = 0; i < 8; i++) { +			if (nodeid == ((config >> (3 * i)) & 0x7)) { +				pcibus_to_physid[bus] = i; +				break; +			} +		} +	}; +	return; +} +/* end of Sandy Bridge-EP uncore support */ + + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; + +	if (hwc->idx < UNCORE_PMC_IDX_FIXED) +		wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); +	else +		wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	wrmsrl(event->hw.config_base, 0); +} + +static u64 snb_uncore_msr_read_counter(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	u64 count; +	rdmsrl(event->hw.event_base, count); +	return count; +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ +	if (box->pmu->pmu_idx == 0) { +		wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, +			SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); +	} +} + +static struct attribute *snb_uncore_formats_attr[] = { +	&format_attr_event.attr, +	&format_attr_umask.attr, +	&format_attr_edge.attr, +	&format_attr_inv.attr, +	&format_attr_cmask5.attr, +	NULL, +}; + +static struct attribute_group snb_uncore_format_group = { +	.name = "format", +	.attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { +	.init_box	= snb_uncore_msr_init_box, +	.disable_event	= snb_uncore_msr_disable_event, +	.enable_event	= snb_uncore_msr_enable_event, +	.read_counter	= snb_uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { +	UNCORE_EVENT_CONSTRAINT(0x80, 0x1), +	UNCORE_EVENT_CONSTRAINT(0x83, 0x1), +	EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { +	.name		= "cbox", +	.num_counters   = 2, +	.num_boxes	= 4, +	.perf_ctr_bits	= 44, +	.fixed_ctr_bits	= 48, +	.perf_ctr	= SNB_UNC_CBO_0_PER_CTR0, +	.event_ctl	= SNB_UNC_CBO_0_PERFEVTSEL0, +	.fixed_ctr	= SNB_UNC_FIXED_CTR, +	.fixed_ctl	= SNB_UNC_FIXED_CTR_CTRL, +	.single_fixed	= 1, +	.event_mask	= SNB_UNC_RAW_EVENT_MASK, +	.msr_offset	= SNB_UNC_CBO_MSR_OFFSET, +	.constraints	= snb_uncore_cbox_constraints, +	.ops		= &snb_uncore_msr_ops, +	.format_group	= &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { +	&snb_uncore_cbox, +	NULL, +}; +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ +	wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ +	wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, +		NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; + +	if (hwc->idx < UNCORE_PMC_IDX_FIXED) +		wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); +	else +		wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { +	&format_attr_event.attr, +	&format_attr_umask.attr, +	&format_attr_edge.attr, +	&format_attr_inv.attr, +	&format_attr_cmask8.attr, +	NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { +	.name = "format", +	.attrs = nhm_uncore_formats_attr, +}; + +static struct 
uncore_event_desc nhm_uncore_events[] = { +	INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"), +	INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"), +	INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"), +	INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"), +	INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"), +	INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"), +	INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), +	INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"), +	INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"), +	{ /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { +	.disable_box	= nhm_uncore_msr_disable_box, +	.enable_box	= nhm_uncore_msr_enable_box, +	.disable_event	= snb_uncore_msr_disable_event, +	.enable_event	= nhm_uncore_msr_enable_event, +	.read_counter	= snb_uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { +	.name		= "", +	.num_counters   = 8, +	.num_boxes	= 1, +	.perf_ctr_bits	= 48, +	.fixed_ctr_bits	= 48, +	.event_ctl	= NHM_UNC_PERFEVTSEL0, +	.perf_ctr	= NHM_UNC_UNCORE_PMC0, +	.fixed_ctr	= NHM_UNC_FIXED_CTR, +	.fixed_ctl	= NHM_UNC_FIXED_CTR_CTRL, +	.event_mask	= NHM_UNC_RAW_EVENT_MASK, +	.event_descs	= nhm_uncore_events, +	.ops		= &nhm_uncore_msr_ops, +	.format_group	= &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { +	&nhm_uncore, +	NULL, +}; +/* end of Nehalem uncore support */ + +static void uncore_assign_hw_event(struct intel_uncore_box *box, +				struct perf_event *event, int idx) +{ +	struct hw_perf_event *hwc = &event->hw; + +	hwc->idx = idx; +	hwc->last_tag = ++box->tags[idx]; + +	if (hwc->idx == UNCORE_PMC_IDX_FIXED) { +		hwc->event_base = uncore_fixed_ctr(box); +		hwc->config_base = uncore_fixed_ctl(box); +		return; +	} + +	hwc->config_base = uncore_event_ctl(box, hwc->idx); +	hwc->event_base  = uncore_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	u64 prev_count, new_count, delta; +	int shift; + +	if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) +		shift = 64 - uncore_fixed_ctr_bits(box); +	else +		shift = 64 - uncore_perf_ctr_bits(box); + +	/* the hrtimer might modify the previous event value */ +again: +	prev_count = local64_read(&event->hw.prev_count); +	new_count = uncore_read_counter(box, event); +	if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) +		goto again; + +	delta = (new_count << shift) - (prev_count << shift); +	delta >>= shift; + +	local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. 
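+ * + * Rough illustration: the smallest counters here are 44 bits wide, so + * even at a hypothetical 3 GHz event rate one wraps only after about + * 2^44 / 3e9, roughly 5800 seconds; with the 60 second polling period + * (UNCORE_PMU_HRTIMER_INTERVAL) every counter is read many times per + * wrap, and uncore_perf_event_update() folds the deltas in.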
+ */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ +	struct intel_uncore_box *box; +	unsigned long flags; +	int bit; + +	box = container_of(hrtimer, struct intel_uncore_box, hrtimer); +	if (!box->n_active || box->cpu != smp_processor_id()) +		return HRTIMER_NORESTART; +	/* +	 * disable local interrupt to prevent uncore_pmu_event_start/stop +	 * to interrupt the update process +	 */ +	local_irq_save(flags); + +	for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) +		uncore_perf_event_update(box, box->events[bit]); + +	local_irq_restore(flags); + +	hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); +	return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ +	__hrtimer_start_range_ns(&box->hrtimer, +			ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, +			HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ +	hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ +	hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	box->hrtimer.function = uncore_pmu_hrtimer; +} + +struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, +					  int cpu) +{ +	struct intel_uncore_box *box; +	int i, size; + +	size = sizeof(*box) + type->num_shared_regs * +		sizeof(struct intel_uncore_extra_reg); + +	box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); +	if (!box) +		return NULL; + +	for (i = 0; i < type->num_shared_regs; i++) +		raw_spin_lock_init(&box->shared_regs[i].lock); + +	uncore_pmu_init_hrtimer(box); +	atomic_set(&box->refcnt, 1); +	box->cpu = -1; +	box->phys_id = -1; + +	return box; +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ +	static struct intel_uncore_box *box; + +	box = *per_cpu_ptr(pmu->box, cpu); +	if (box) +		return box; + +	raw_spin_lock(&uncore_box_lock); +	list_for_each_entry(box, &pmu->box_list, list) { +		if (box->phys_id == topology_physical_package_id(cpu)) { +			atomic_inc(&box->refcnt); +			*per_cpu_ptr(pmu->box, cpu) = box; +			break; +		} +	} +	raw_spin_unlock(&uncore_box_lock); + +	return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ +	return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ +	/* +	 * perf core schedules event on the basis of cpu, uncore events are +	 * collected by one of the cpus inside a physical package. 
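+	 * +	 * The designated collector cpu of each package is tracked in +	 * uncore_cpu_mask and remembered in box->cpu; uncore_pmu_event_init() +	 * rewrites event->cpu to that cpu before the event is scheduled.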
+	 */ +	return uncore_pmu_to_box(uncore_event_to_pmu(event), +				 smp_processor_id()); +} + +static int uncore_collect_events(struct intel_uncore_box *box, +				struct perf_event *leader, bool dogrp) +{ +	struct perf_event *event; +	int n, max_count; + +	max_count = box->pmu->type->num_counters; +	if (box->pmu->type->fixed_ctl) +		max_count++; + +	if (box->n_events >= max_count) +		return -EINVAL; + +	n = box->n_events; +	box->event_list[n] = leader; +	n++; +	if (!dogrp) +		return n; + +	list_for_each_entry(event, &leader->sibling_list, group_entry) { +		if (event->state <= PERF_EVENT_STATE_OFF) +			continue; + +		if (n >= max_count) +			return -EINVAL; + +		box->event_list[n] = event; +		n++; +	} +	return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, +			    struct perf_event *event) +{ +	struct intel_uncore_type *type = box->pmu->type; +	struct event_constraint *c; + +	if (type->ops->get_constraint) { +		c = type->ops->get_constraint(box, event); +		if (c) +			return c; +	} + +	if (event->hw.config == ~0ULL) +		return &constraint_fixed; + +	if (type->constraints) { +		for_each_event_constraint(c, type->constraints) { +			if ((event->hw.config & c->cmask) == c->code) +				return c; +		} +	} + +	return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, +					struct perf_event *event) +{ +	if (box->pmu->type->ops->put_constraint) +		box->pmu->type->ops->put_constraint(box, event); +} + +static int uncore_assign_events(struct intel_uncore_box *box, +				int assign[], int n) +{ +	unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; +	struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; +	int i, wmin, wmax, ret = 0; +	struct hw_perf_event *hwc; + +	bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + +	for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { +		c = uncore_get_event_constraint(box, box->event_list[i]); +		constraints[i] = c; +		wmin = min(wmin, c->weight); +		wmax = max(wmax, c->weight); +	} + +	/* fastpath, try to reuse previous register */ +	for (i = 0; i < n; i++) { +		hwc = &box->event_list[i]->hw; +		c = constraints[i]; + +		/* never assigned */ +		if (hwc->idx == -1) +			break; + +		/* constraint still honored */ +		if (!test_bit(hwc->idx, c->idxmsk)) +			break; + +		/* not already used */ +		if (test_bit(hwc->idx, used_mask)) +			break; + +		__set_bit(hwc->idx, used_mask); +		if (assign) +			assign[i] = hwc->idx; +	} +	/* slow path */ +	if (i != n) +		ret = perf_assign_events(constraints, n, wmin, wmax, assign); + +	if (!assign || ret) { +		for (i = 0; i < n; i++) +			uncore_put_event_constraint(box, box->event_list[i]); +	} +	return ret ? 
-EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	int idx = event->hw.idx; + +	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +		return; + +	if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) +		return; + +	event->hw.state = 0; +	box->events[idx] = event; +	box->n_active++; +	__set_bit(idx, box->active_mask); + +	local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); +	uncore_enable_event(box, event); + +	if (box->n_active == 1) { +		uncore_enable_box(box); +		uncore_pmu_start_hrtimer(box); +	} +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	struct hw_perf_event *hwc = &event->hw; + +	if (__test_and_clear_bit(hwc->idx, box->active_mask)) { +		uncore_disable_event(box, event); +		box->n_active--; +		box->events[hwc->idx] = NULL; +		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); +		hwc->state |= PERF_HES_STOPPED; + +		if (box->n_active == 0) { +			uncore_disable_box(box); +			uncore_pmu_cancel_hrtimer(box); +		} +	} + +	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { +		/* +		 * Drain the remaining delta count out of a event +		 * that we are disabling: +		 */ +		uncore_perf_event_update(box, event); +		hwc->state |= PERF_HES_UPTODATE; +	} +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	struct hw_perf_event *hwc = &event->hw; +	int assign[UNCORE_PMC_IDX_MAX]; +	int i, n, ret; + +	if (!box) +		return -ENODEV; + +	ret = n = uncore_collect_events(box, event, false); +	if (ret < 0) +		return ret; + +	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; +	if (!(flags & PERF_EF_START)) +		hwc->state |= PERF_HES_ARCH; + +	ret = uncore_assign_events(box, assign, n); +	if (ret) +		return ret; + +	/* save events moving to new counters */ +	for (i = 0; i < box->n_events; i++) { +		event = box->event_list[i]; +		hwc = &event->hw; + +		if (hwc->idx == assign[i] && +			hwc->last_tag == box->tags[assign[i]]) +			continue; +		/* +		 * Ensure we don't accidentally enable a stopped +		 * counter simply because we rescheduled. 
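+		 * +		 * PERF_HES_ARCH doubles here as a "was already stopped" marker: +		 * the reprogramming loop below skips any event carrying it +		 * instead of calling uncore_pmu_event_start() on it.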
+		 */ +		if (hwc->state & PERF_HES_STOPPED) +			hwc->state |= PERF_HES_ARCH; + +		uncore_pmu_event_stop(event, PERF_EF_UPDATE); +	} + +	/* reprogram moved events into new counters */ +	for (i = 0; i < n; i++) { +		event = box->event_list[i]; +		hwc = &event->hw; + +		if (hwc->idx != assign[i] || +			hwc->last_tag != box->tags[assign[i]]) +			uncore_assign_hw_event(box, event, assign[i]); +		else if (i < box->n_events) +			continue; + +		if (hwc->state & PERF_HES_ARCH) +			continue; + +		uncore_pmu_event_start(event, 0); +	} +	box->n_events = n; + +	return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	int i; + +	uncore_pmu_event_stop(event, PERF_EF_UPDATE); + +	for (i = 0; i < box->n_events; i++) { +		if (event == box->event_list[i]) { +			uncore_put_event_constraint(box, event); + +			while (++i < box->n_events) +				box->event_list[i - 1] = box->event_list[i]; + +			--box->n_events; +			break; +		} +	} + +	event->hw.idx = -1; +	event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, +				struct perf_event *event) +{ +	struct perf_event *leader = event->group_leader; +	struct intel_uncore_box *fake_box; +	int ret = -EINVAL, n; + +	fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); +	if (!fake_box) +		return -ENOMEM; + +	fake_box->pmu = pmu; +	/* +	 * the event is not yet connected with its +	 * siblings therefore we must first collect +	 * existing siblings, then add the new event +	 * before we can simulate the scheduling +	 */ +	n = uncore_collect_events(fake_box, leader, true); +	if (n < 0) +		goto out; + +	fake_box->n_events = n; +	n = uncore_collect_events(fake_box, event, false); +	if (n < 0) +		goto out; + +	fake_box->n_events = n; + +	ret = uncore_assign_events(fake_box, NULL, n); +out: +	kfree(fake_box); +	return ret; +} + +int uncore_pmu_event_init(struct perf_event *event) +{ +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_box *box; +	struct hw_perf_event *hwc = &event->hw; +	int ret; + +	if (event->attr.type != event->pmu->type) +		return -ENOENT; + +	pmu = uncore_event_to_pmu(event); +	/* no device found for this pmu */ +	if (pmu->func_id < 0) +		return -ENOENT; + +	/* +	 * Uncore PMU does measure at all privilege level all the time. +	 * So it doesn't make sense to specify any exclude bits. 
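+	 * +	 * Illustrative usage (event aliases come from the events attribute +	 * group, the pmu name from uncore_pmu_register() below): +	 *   perf stat -C 0 -e uncore_imc_0/cas_count_read/ sleep 1 +	 * counts socket-wide for the package that cpu 0 belongs to.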
+	 */ +	if (event->attr.exclude_user || event->attr.exclude_kernel || +			event->attr.exclude_hv || event->attr.exclude_idle) +		return -EINVAL; + +	/* Sampling not supported yet */ +	if (hwc->sample_period) +		return -EINVAL; + +	/* +	 * Place all uncore events for a particular physical package +	 * onto a single cpu +	 */ +	if (event->cpu < 0) +		return -EINVAL; +	box = uncore_pmu_to_box(pmu, event->cpu); +	if (!box || box->cpu < 0) +		return -EINVAL; +	event->cpu = box->cpu; + +	event->hw.idx = -1; +	event->hw.last_tag = ~0ULL; +	event->hw.extra_reg.idx = EXTRA_REG_NONE; + +	if (event->attr.config == UNCORE_FIXED_EVENT) { +		/* no fixed counter */ +		if (!pmu->type->fixed_ctl) +			return -EINVAL; +		/* +		 * if there is only one fixed counter, only the first pmu +		 * can access the fixed counter +		 */ +		if (pmu->type->single_fixed && pmu->pmu_idx > 0) +			return -EINVAL; +		hwc->config = ~0ULL; +	} else { +		hwc->config = event->attr.config & pmu->type->event_mask; +		if (pmu->type->ops->hw_config) { +			ret = pmu->type->ops->hw_config(box, event); +			if (ret) +				return ret; +		} +	} + +	if (event->group_leader != event) +		ret = uncore_validate_group(pmu, event); +	else +		ret = 0; + +	return ret; +} + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ +	int ret; + +	pmu->pmu = (struct pmu) { +		.attr_groups	= pmu->type->attr_groups, +		.task_ctx_nr	= perf_invalid_context, +		.event_init	= uncore_pmu_event_init, +		.add		= uncore_pmu_event_add, +		.del		= uncore_pmu_event_del, +		.start		= uncore_pmu_event_start, +		.stop		= uncore_pmu_event_stop, +		.read		= uncore_pmu_event_read, +	}; + +	if (pmu->type->num_boxes == 1) { +		if (strlen(pmu->type->name) > 0) +			sprintf(pmu->name, "uncore_%s", pmu->type->name); +		else +			sprintf(pmu->name, "uncore"); +	} else { +		sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, +			pmu->pmu_idx); +	} + +	ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); +	return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ +	int i; + +	for (i = 0; i < type->num_boxes; i++) +		free_percpu(type->pmus[i].box); +	kfree(type->pmus); +	type->pmus = NULL; +	kfree(type->attr_groups[1]); +	type->attr_groups[1] = NULL; +} + +static void uncore_types_exit(struct intel_uncore_type **types) +{ +	int i; +	for (i = 0; types[i]; i++) +		uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ +	struct intel_uncore_pmu *pmus; +	struct attribute_group *events_group; +	struct attribute **attrs; +	int i, j; + +	pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); +	if (!pmus) +		return -ENOMEM; + +	type->unconstrainted = (struct event_constraint) +		__EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, +				0, type->num_counters, 0); + +	for (i = 0; i < type->num_boxes; i++) { +		pmus[i].func_id = -1; +		pmus[i].pmu_idx = i; +		pmus[i].type = type; +		INIT_LIST_HEAD(&pmus[i].box_list); +		pmus[i].box = alloc_percpu(struct intel_uncore_box *); +		if (!pmus[i].box) +			goto fail; +	} + +	if (type->event_descs) { +		i = 0; +		while (type->event_descs[i].attr.attr.name) +			i++; + +		events_group = kzalloc(sizeof(struct attribute *) * (i + 1) + +					sizeof(*events_group), GFP_KERNEL); +		if (!events_group) +			goto fail; + +		attrs = (struct attribute **)(events_group + 1); +		events_group->name = "events"; +		events_group->attrs = attrs; + +		for (j = 0; j < i; j++) +			attrs[j] = &type->event_descs[j].attr.attr; + +		type->attr_groups[1] = events_group; +	} + +	
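/* +	 * attr_groups[0] is the statically assigned format group (the +	 * format_group alias in perf_event_intel_uncore.h); attr_groups[1] is +	 * the events group built above, freed again in uncore_type_exit(). +	 */ +	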
type->pmus = pmus; +	return 0; +fail: +	uncore_type_exit(type); +	return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ +	int i, ret; + +	for (i = 0; types[i]; i++) { +		ret = uncore_type_init(types[i]); +		if (ret) +			goto fail; +	} +	return 0; +fail: +	while (--i >= 0) +		uncore_type_exit(types[i]); +	return ret; +} + +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int __devinit uncore_pci_add(struct intel_uncore_type *type, +				    struct pci_dev *pdev) +{ +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_box *box; +	int i, phys_id; + +	phys_id = pcibus_to_physid[pdev->bus->number]; +	if (phys_id < 0) +		return -ENODEV; + +	box = uncore_alloc_box(type, 0); +	if (!box) +		return -ENOMEM; + +	/* +	 * for performance monitoring unit with multiple boxes, +	 * each box has a different function id. +	 */ +	for (i = 0; i < type->num_boxes; i++) { +		pmu = &type->pmus[i]; +		if (pmu->func_id == pdev->devfn) +			break; +		if (pmu->func_id < 0) { +			pmu->func_id = pdev->devfn; +			break; +		} +		pmu = NULL; +	} + +	if (!pmu) { +		kfree(box); +		return -EINVAL; +	} + +	box->phys_id = phys_id; +	box->pci_dev = pdev; +	box->pmu = pmu; +	uncore_box_init(box); +	pci_set_drvdata(pdev, box); + +	raw_spin_lock(&uncore_box_lock); +	list_add_tail(&box->list, &pmu->box_list); +	raw_spin_unlock(&uncore_box_lock); + +	return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ +	struct intel_uncore_box *box = pci_get_drvdata(pdev); +	struct intel_uncore_pmu *pmu = box->pmu; +	int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + +	if (WARN_ON_ONCE(phys_id != box->phys_id)) +		return; + +	raw_spin_lock(&uncore_box_lock); +	list_del(&box->list); +	raw_spin_unlock(&uncore_box_lock); + +	for_each_possible_cpu(cpu) { +		if (*per_cpu_ptr(pmu->box, cpu) == box) { +			*per_cpu_ptr(pmu->box, cpu) = NULL; +			atomic_dec(&box->refcnt); +		} +	} + +	WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); +	kfree(box); +} + +static int __devinit uncore_pci_probe(struct pci_dev *pdev, +				const struct pci_device_id *id) +{ +	struct intel_uncore_type *type; + +	type = (struct intel_uncore_type *)id->driver_data; +	return uncore_pci_add(type, pdev); +} + +static int __init uncore_pci_init(void) +{ +	int ret; + +	switch (boot_cpu_data.x86_model) { +	case 45: /* Sandy Bridge-EP */ +		pci_uncores = snbep_pci_uncores; +		uncore_pci_driver = &snbep_uncore_pci_driver; +		snbep_pci2phy_map_init(); +		break; +	default: +		return 0; +	} + +	ret = uncore_types_init(pci_uncores); +	if (ret) +		return ret; + +	uncore_pci_driver->probe = uncore_pci_probe; +	uncore_pci_driver->remove = uncore_pci_remove; + +	ret = pci_register_driver(uncore_pci_driver); +	if (ret == 0) +		pcidrv_registered = true; +	else +		uncore_types_exit(pci_uncores); + +	return ret; +} + +static void __init uncore_pci_exit(void) +{ +	if (pcidrv_registered) { +		pcidrv_registered = false; +		pci_unregister_driver(uncore_pci_driver); +		uncore_types_exit(pci_uncores); +	} +} + +static void __cpuinit uncore_cpu_dying(int cpu) +{ +	struct intel_uncore_type *type; +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_box *box; +	int i, j; + +	for (i = 0; msr_uncores[i]; i++) { +		type = msr_uncores[i]; +		for (j = 0; j < type->num_boxes; j++) { +			pmu = &type->pmus[j]; +			box = *per_cpu_ptr(pmu->box, cpu); +			*per_cpu_ptr(pmu->box, cpu) = NULL; +			if (box && atomic_dec_and_test(&box->refcnt)) +				kfree(box); +		} +	} +} + +static int 
__cpuinit uncore_cpu_starting(int cpu) +{ +	struct intel_uncore_type *type; +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_box *box, *exist; +	int i, j, k, phys_id; + +	phys_id = topology_physical_package_id(cpu); + +	for (i = 0; msr_uncores[i]; i++) { +		type = msr_uncores[i]; +		for (j = 0; j < type->num_boxes; j++) { +			pmu = &type->pmus[j]; +			box = *per_cpu_ptr(pmu->box, cpu); +			/* called by uncore_cpu_init? */ +			if (box && box->phys_id >= 0) { +				uncore_box_init(box); +				continue; +			} + +			for_each_online_cpu(k) { +				exist = *per_cpu_ptr(pmu->box, k); +				if (exist && exist->phys_id == phys_id) { +					atomic_inc(&exist->refcnt); +					*per_cpu_ptr(pmu->box, cpu) = exist; +					kfree(box); +					box = NULL; +					break; +				} +			} + +			if (box) { +				box->phys_id = phys_id; +				uncore_box_init(box); +			} +		} +	} +	return 0; +} + +static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +{ +	struct intel_uncore_type *type; +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_box *box; +	int i, j; + +	for (i = 0; msr_uncores[i]; i++) { +		type = msr_uncores[i]; +		for (j = 0; j < type->num_boxes; j++) { +			pmu = &type->pmus[j]; +			if (pmu->func_id < 0) +				pmu->func_id = j; + +			box = uncore_alloc_box(type, cpu); +			if (!box) +				return -ENOMEM; + +			box->pmu = pmu; +			box->phys_id = phys_id; +			*per_cpu_ptr(pmu->box, cpu) = box; +		} +	} +	return 0; +} + +static void __cpuinit uncore_change_context(struct intel_uncore_type **uncores, +					    int old_cpu, int new_cpu) +{ +	struct intel_uncore_type *type; +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_box *box; +	int i, j; + +	for (i = 0; uncores[i]; i++) { +		type = uncores[i]; +		for (j = 0; j < type->num_boxes; j++) { +			pmu = &type->pmus[j]; +			if (old_cpu < 0) +				box = uncore_pmu_to_box(pmu, new_cpu); +			else +				box = uncore_pmu_to_box(pmu, old_cpu); +			if (!box) +				continue; + +			if (old_cpu < 0) { +				WARN_ON_ONCE(box->cpu != -1); +				box->cpu = new_cpu; +				continue; +			} + +			WARN_ON_ONCE(box->cpu != old_cpu); +			if (new_cpu >= 0) { +				uncore_pmu_cancel_hrtimer(box); +				perf_pmu_migrate_context(&pmu->pmu, +						old_cpu, new_cpu); +				box->cpu = new_cpu; +			} else { +				box->cpu = -1; +			} +		} +	} +} + +static void __cpuinit uncore_event_exit_cpu(int cpu) +{ +	int i, phys_id, target; + +	/* if exiting cpu is used for collecting uncore events */ +	if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) +		return; + +	/* find a new cpu to collect uncore events */ +	phys_id = topology_physical_package_id(cpu); +	target = -1; +	for_each_online_cpu(i) { +		if (i == cpu) +			continue; +		if (phys_id == topology_physical_package_id(i)) { +			target = i; +			break; +		} +	} + +	/* migrate uncore events to the new cpu */ +	if (target >= 0) +		cpumask_set_cpu(target, &uncore_cpu_mask); + +	uncore_change_context(msr_uncores, cpu, target); +	uncore_change_context(pci_uncores, cpu, target); +} + +static void __cpuinit uncore_event_init_cpu(int cpu) +{ +	int i, phys_id; + +	phys_id = topology_physical_package_id(cpu); +	for_each_cpu(i, &uncore_cpu_mask) { +		if (phys_id == topology_physical_package_id(i)) +			return; +	} + +	cpumask_set_cpu(cpu, &uncore_cpu_mask); + +	uncore_change_context(msr_uncores, -1, cpu); +	uncore_change_context(pci_uncores, -1, cpu); +} + +static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, +					 unsigned long action, void *hcpu) +{ +	unsigned int cpu = (long)hcpu; + +	/* allocate/free data structure for uncore box */ +	
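/* +	 * CPU_UP_PREPARE allocates the per-cpu box, CPU_STARTING initializes +	 * it (or shares an existing box from the same package), and +	 * CPU_UP_CANCELED/CPU_DYING drops the reference again; the second +	 * switch below only selects which online cpu collects the package's +	 * events. +	 */ +	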
switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_UP_PREPARE: +		uncore_cpu_prepare(cpu, -1); +		break; +	case CPU_STARTING: +		uncore_cpu_starting(cpu); +		break; +	case CPU_UP_CANCELED: +	case CPU_DYING: +		uncore_cpu_dying(cpu); +		break; +	default: +		break; +	} + +	/* select the cpu that collects uncore events */ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_FAILED: +	case CPU_STARTING: +		uncore_event_init_cpu(cpu); +		break; +	case CPU_DOWN_PREPARE: +		uncore_event_exit_cpu(cpu); +		break; +	default: +		break; +	} + +	return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb __cpuinitdata = { +	.notifier_call = uncore_cpu_notifier, +	/* +	 * to migrate uncore events, our notifier should be executed +	 * before perf core's notifier. +	 */ +	.priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ +	uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ +	int ret, cpu, max_cores; + +	max_cores = boot_cpu_data.x86_max_cores; +	switch (boot_cpu_data.x86_model) { +	case 26: /* Nehalem */ +	case 30: +	case 37: /* Westmere */ +	case 44: +		msr_uncores = nhm_msr_uncores; +		break; +	case 42: /* Sandy Bridge */ +		if (snb_uncore_cbox.num_boxes > max_cores) +			snb_uncore_cbox.num_boxes = max_cores; +		msr_uncores = snb_msr_uncores; +		break; +	case 45: /* Sandy Birdge-EP */ +		if (snbep_uncore_cbox.num_boxes > max_cores) +			snbep_uncore_cbox.num_boxes = max_cores; +		msr_uncores = snbep_msr_uncores; +		break; +	default: +		return 0; +	} + +	ret = uncore_types_init(msr_uncores); +	if (ret) +		return ret; + +	get_online_cpus(); + +	for_each_online_cpu(cpu) { +		int i, phys_id = topology_physical_package_id(cpu); + +		for_each_cpu(i, &uncore_cpu_mask) { +			if (phys_id == topology_physical_package_id(i)) { +				phys_id = -1; +				break; +			} +		} +		if (phys_id < 0) +			continue; + +		uncore_cpu_prepare(cpu, phys_id); +		uncore_event_init_cpu(cpu); +	} +	on_each_cpu(uncore_cpu_setup, NULL, 1); + +	register_cpu_notifier(&uncore_cpu_nb); + +	put_online_cpus(); + +	return 0; +} + +static int __init uncore_pmus_register(void) +{ +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_type *type; +	int i, j; + +	for (i = 0; msr_uncores[i]; i++) { +		type = msr_uncores[i]; +		for (j = 0; j < type->num_boxes; j++) { +			pmu = &type->pmus[j]; +			uncore_pmu_register(pmu); +		} +	} + +	for (i = 0; pci_uncores[i]; i++) { +		type = pci_uncores[i]; +		for (j = 0; j < type->num_boxes; j++) { +			pmu = &type->pmus[j]; +			uncore_pmu_register(pmu); +		} +	} + +	return 0; +} + +static int __init intel_uncore_init(void) +{ +	int ret; + +	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) +		return -ENODEV; + +	ret = uncore_pci_init(); +	if (ret) +		goto fail; +	ret = uncore_cpu_init(); +	if (ret) { +		uncore_pci_exit(); +		goto fail; +	} + +	uncore_pmus_register(); +	return 0; +fail: +	return ret; +} +device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h new file mode 100644 index 000000000000..b13e9ea81def --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -0,0 +1,424 @@ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/perf_event.h> +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN		32 +#define UNCORE_BOX_HASH_SIZE		8 + +#define UNCORE_PMU_HRTIMER_INTERVAL	(60 * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT		0xff +#define UNCORE_PMC_IDX_MAX_GENERIC	8 +#define 
UNCORE_PMC_IDX_FIXED		UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX		(UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff +#define SNB_UNC_CTL_UMASK_MASK			0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET			(1 << 18) +#define SNB_UNC_CTL_EN				(1 << 22) +#define SNB_UNC_CTL_INVERT			(1 << 23) +#define SNB_UNC_CTL_CMASK_MASK			0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK			0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN		(1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK			(SNB_UNC_CTL_EV_SEL_MASK | \ +						 SNB_UNC_CTL_UMASK_MASK | \ +						 SNB_UNC_CTL_EDGE_DET | \ +						 SNB_UNC_CTL_INVERT | \ +						 SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK			(SNB_UNC_CTL_EV_SEL_MASK | \ +						 SNB_UNC_CTL_UMASK_MASK | \ +						 SNB_UNC_CTL_EDGE_DET | \ +						 SNB_UNC_CTL_INVERT | \ +						 NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL                 0x391 +#define SNB_UNC_FIXED_CTR_CTRL                  0x394 +#define SNB_UNC_FIXED_CTR                       0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700 +#define SNB_UNC_CBO_0_PER_CTR0                  0x706 +#define SNB_UNC_CBO_MSR_OFFSET                  0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL                 0x391 +#define NHM_UNC_FIXED_CTR                       0x394 +#define NHM_UNC_FIXED_CTR_CTRL                  0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0                     0x3c0 +#define NHM_UNC_UNCORE_PMC0                     0x3b0 + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL	(1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS	(1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ		(1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN	(1 << 16) +#define SNBEP_PMON_BOX_CTL_INT		(SNBEP_PMON_BOX_CTL_RST_CTRL | \ +					 SNBEP_PMON_BOX_CTL_RST_CTRS | \ +					 SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK	0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK	0x0000ff00 +#define SNBEP_PMON_CTL_RST		(1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET		(1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT	(1 << 21)	/* only for QPI */ +#define SNBEP_PMON_CTL_EN		(1 << 22) +#define SNBEP_PMON_CTL_INVERT		(1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK	0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK	(SNBEP_PMON_CTL_EV_SEL_MASK | \ +					 SNBEP_PMON_CTL_UMASK_MASK | \ +					 SNBEP_PMON_CTL_EDGE_DET | \ +					 SNBEP_PMON_CTL_INVERT | \ +					 SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK		0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK		\ +				(SNBEP_PMON_CTL_EV_SEL_MASK | \ +				 SNBEP_PMON_CTL_UMASK_MASK | \ +				 SNBEP_PMON_CTL_EDGE_DET | \ +				 SNBEP_PMON_CTL_INVERT | \ +				 SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN		(1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK	(SNBEP_PMON_RAW_EVENT_MASK | \ +						 SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK	0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK	
0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT	(1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET	(1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK	\ +				(SNBEP_PMON_CTL_EV_SEL_MASK | \ +				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ +				 SNBEP_PMON_CTL_EDGE_DET | \ +				 SNBEP_PMON_CTL_INVERT | \ +				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ +				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ +				 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL			0xf4 +#define SNBEP_PCI_PMON_CTL0			0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0			0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0	0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1	0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH	0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL		0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR		0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0		0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1		0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0		0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1		0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0			0xc16 +#define SNBEP_U_MSR_PMON_CTL0			0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL		0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR		0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0			0xd16 +#define SNBEP_C0_MSR_PMON_CTL0			0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL		0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER		0xd14 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK	0xfffffc1f +#define SNBEP_CBO_MSR_OFFSET			0x20 + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0			0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0			0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL		0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER		0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK	0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR		0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR		0x3fd + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { +	const char *name; +	int num_counters; +	int num_boxes; +	int perf_ctr_bits; +	int fixed_ctr_bits; +	unsigned perf_ctr; +	unsigned event_ctl; +	unsigned event_mask; +	unsigned fixed_ctr; +	unsigned fixed_ctl; +	unsigned box_ctl; +	unsigned msr_offset; +	unsigned num_shared_regs:8; +	unsigned single_fixed:1; +	struct event_constraint unconstrainted; +	struct event_constraint *constraints; +	struct intel_uncore_pmu *pmus; +	struct intel_uncore_ops *ops; +	struct uncore_event_desc *event_descs; +	const struct attribute_group *attr_groups[3]; +}; + +#define format_group attr_groups[0] + +struct intel_uncore_ops { +	void (*init_box)(struct intel_uncore_box *); +	void (*disable_box)(struct intel_uncore_box *); +	void (*enable_box)(struct intel_uncore_box *); +	void (*disable_event)(struct intel_uncore_box *, struct perf_event *); +	void (*enable_event)(struct intel_uncore_box *, struct perf_event *); +	u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); +	int (*hw_config)(struct intel_uncore_box *, struct perf_event *); +	struct event_constraint *(*get_constraint)(struct intel_uncore_box *, +						   struct perf_event *); +	void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { +	struct pmu pmu; +	char name[UNCORE_PMU_NAME_LEN]; +	int pmu_idx; +	int func_id; +	struct intel_uncore_type *type; +	struct intel_uncore_box ** __percpu box; +	struct 
list_head box_list; +}; + +struct intel_uncore_extra_reg { +	raw_spinlock_t lock; +	u64 config1; +	atomic_t ref; +}; + +struct intel_uncore_box { +	int phys_id; +	int n_active;	/* number of active events */ +	int n_events; +	int cpu;	/* cpu to collect events */ +	unsigned long flags; +	atomic_t refcnt; +	struct perf_event *events[UNCORE_PMC_IDX_MAX]; +	struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; +	unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; +	u64 tags[UNCORE_PMC_IDX_MAX]; +	struct pci_dev *pci_dev; +	struct intel_uncore_pmu *pmu; +	struct hrtimer hrtimer; +	struct list_head list; +	struct intel_uncore_extra_reg shared_regs[0]; +}; + +#define UNCORE_BOX_FLAG_INITIATED	0 + +struct uncore_event_desc { +	struct kobj_attribute attr; +	const char *config; +}; + +#define INTEL_UNCORE_EVENT_DESC(_name, _config)			\ +{								\ +	.attr	= __ATTR(_name, 0444, uncore_event_show, NULL),	\ +	.config	= _config,					\ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format)			\ +static ssize_t __uncore_##_var##_show(struct kobject *kobj,		\ +				struct kobj_attribute *attr,		\ +				char *page)				\ +{									\ +	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);			\ +	return sprintf(page, _format "\n");				\ +}									\ +static struct kobj_attribute format_attr_##_var =			\ +	__ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, +				struct kobj_attribute *attr, char *buf) +{ +	struct uncore_event_desc *event = +		container_of(attr, struct uncore_event_desc, attr); +	return sprintf(buf, "%s", event->config); +} + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ +	return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ +	return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ +	return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ +	return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ +	return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline +unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ +	if (!box->pmu->type->box_ctl) +		return 0; +	return box->pmu->type->box_ctl + +		box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ +	if (!box->pmu->type->fixed_ctl) +		return 0; +	return box->pmu->type->fixed_ctl + +		box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ +	return box->pmu->type->fixed_ctr + +		box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ +	return idx + box->pmu->type->event_ctl + +		box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ +	return idx + box->pmu->type->perf_ctr + +		box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ +	if (box->pci_dev) +		return uncore_pci_fixed_ctl(box); +	else +		return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ +	if (box->pci_dev) +		return uncore_pci_fixed_ctr(box); +	else +		return 
uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ +	if (box->pci_dev) +		return uncore_pci_event_ctl(box, idx); +	else +		return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ +	if (box->pci_dev) +		return uncore_pci_perf_ctr(box, idx); +	else +		return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ +	return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ +	return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ +	return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ +	if (box->pmu->type->ops->disable_box) +		box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ +	if (box->pmu->type->ops->enable_box) +		box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, +				struct perf_event *event) +{ +	box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, +				struct perf_event *event) +{ +	box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, +				struct perf_event *event) +{ +	return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ +	if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { +		if (box->pmu->type->ops->init_box) +			box->pmu->type->ops->init_box(box); +	} +} diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 47124a73dd73..92c7e39a079f 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void)  	 * So at moment let leave metrics turned on forever -- it's  	 * ok for now but need to be revisited!  	 
* -	 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); -	 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); +	 * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0); +	 * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0);  	 */  } @@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)  	 * state we need to clear P4_CCCR_OVF, otherwise interrupt get  	 * asserted again and again  	 */ -	(void)checking_wrmsrl(hwc->config_base, +	(void)wrmsrl_safe(hwc->config_base,  		(u64)(p4_config_unpack_cccr(hwc->config)) &  			~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);  } @@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config)  	bind = &p4_pebs_bind_map[idx]; -	(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE,	(u64)bind->metric_pebs); -	(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT,	(u64)bind->metric_vert); +	(void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE,	(u64)bind->metric_pebs); +	(void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT,	(u64)bind->metric_vert);  }  static void p4_pmu_enable_event(struct perf_event *event) @@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event)  	 */  	p4_pmu_enable_pebs(hwc->config); -	(void)checking_wrmsrl(escr_addr, escr_conf); -	(void)checking_wrmsrl(hwc->config_base, +	(void)wrmsrl_safe(escr_addr, escr_conf); +	(void)wrmsrl_safe(hwc->config_base,  				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);  } @@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void)  	unsigned int low, high;  	/* If we get stripped -- indexing fails */ -	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); +	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);  	rdmsr(MSR_IA32_MISC_ENABLE, low, high);  	if (!(low & (1 << 7))) { diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 32bcfc7dd230..e4dd0f7a0453 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event)  	if (cpuc->enabled)  		val |= ARCH_PERFMON_EVENTSEL_ENABLE; -	(void)checking_wrmsrl(hwc->config_base, val); +	(void)wrmsrl_safe(hwc->config_base, val);  }  static void p6_pmu_enable_event(struct perf_event *event) @@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event)  	if (cpuc->enabled)  		val |= ARCH_PERFMON_EVENTSEL_ENABLE; -	(void)checking_wrmsrl(hwc->config_base, val); +	(void)wrmsrl_safe(hwc->config_base, val);  }  PMU_FORMAT_ATTR(event,	"config:0-7"	); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index addf9e82a7f2..ee8e9abc859f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)  	const struct cpuid_bit *cb;  	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { -		{ X86_FEATURE_DTS,		CR_EAX, 0, 0x00000006, 0 }, +		{ X86_FEATURE_DTHERM,		CR_EAX, 0, 0x00000006, 0 },  		{ X86_FEATURE_IDA,		CR_EAX, 1, 0x00000006, 0 },  		{ X86_FEATURE_ARAT,		CR_EAX, 2, 0x00000006, 0 },  		{ X86_FEATURE_PLN,		CR_EAX, 4, 0x00000006, 0 }, diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c deleted file mode 100644 index a640ae5ad201..000000000000 --- a/arch/x86/kernel/cpu/sched.c +++ /dev/null @@ -1,55 +0,0 @@ -#include <linux/sched.h> -#include <linux/math64.h> -#include <linux/percpu.h> -#include <linux/irqflags.h> - -#include <asm/cpufeature.h> -#include <asm/processor.h> - -#ifdef CONFIG_SMP - -static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); - -static 
unsigned long scale_aperfmperf(void) -{ -	struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); -	unsigned long ratio, flags; - -	local_irq_save(flags); -	get_aperfmperf(&val); -	local_irq_restore(flags); - -	ratio = calc_aperfmperf_ratio(old, &val); -	*old = val; - -	return ratio; -} - -unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) -{ -	/* -	 * do aperf/mperf on the cpu level because it includes things -	 * like turbo mode, which are relevant to full cores. -	 */ -	if (boot_cpu_has(X86_FEATURE_APERFMPERF)) -		return scale_aperfmperf(); - -	/* -	 * maybe have something cpufreq here -	 */ - -	return default_scale_freq_power(sd, cpu); -} - -unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) -{ -	/* -	 * aperf/mperf already includes the smt gain -	 */ -	if (boot_cpu_has(X86_FEATURE_APERFMPERF)) -		return SCHED_LOAD_SCALE; - -	return default_scale_smt_power(sd, cpu); -} - -#endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 571246d81edf..ae42418bc50f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,8 +27,8 @@ static int die_counter;  void printk_address(unsigned long address, int reliable)  { -	printk(" [<%p>] %s%pB\n", (void *) address, -			reliable ? "" : "? ", (void *) address); +	pr_cont(" [<%p>] %s%pB\n", +		(void *)address, reliable ? "" : "? ", (void *)address);  }  #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)  			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)  		return 1; +	print_modules();  	show_regs(regs);  #ifdef CONFIG_X86_32  	if (user_mode_vm(regs)) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0b1d783daab..1038a417ea53 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  		if (kstack_end(stack))  			break;  		if (i && ((i % STACKSLOTS_PER_LINE) == 0)) -			printk(KERN_CONT "\n"); -		printk(KERN_CONT " %08lx", *stack++); +			pr_cont("\n"); +		pr_cont(" %08lx", *stack++);  		touch_nmi_watchdog();  	} -	printk(KERN_CONT "\n"); +	pr_cont("\n");  	show_trace_log_lvl(task, regs, sp, bp, log_lvl);  } @@ -86,12 +86,11 @@ void show_regs(struct pt_regs *regs)  {  	int i; -	print_modules();  	__show_regs(regs, !user_mode_vm(regs)); -	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", -		TASK_COMM_LEN, current->comm, task_pid_nr(current), -		current_thread_info(), current, task_thread_info(current)); +	pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", +		 TASK_COMM_LEN, current->comm, task_pid_nr(current), +		 current_thread_info(), current, task_thread_info(current));  	/*  	 * When in-kernel, we also print out the stack and code at the  	 * time of the fault.. 
@@ -102,10 +101,10 @@ void show_regs(struct pt_regs *regs)  		unsigned char c;  		u8 *ip; -		printk(KERN_EMERG "Stack:\n"); +		pr_emerg("Stack:\n");  		show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); -		printk(KERN_EMERG "Code: "); +		pr_emerg("Code:");  		ip = (u8 *)regs->ip - code_prologue;  		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { @@ -116,16 +115,16 @@ void show_regs(struct pt_regs *regs)  		for (i = 0; i < code_len; i++, ip++) {  			if (ip < (u8 *)PAGE_OFFSET ||  					probe_kernel_address(ip, c)) { -				printk(KERN_CONT " Bad EIP value."); +				pr_cont("  Bad EIP value.");  				break;  			}  			if (ip == (u8 *)regs->ip) -				printk(KERN_CONT "<%02x> ", c); +				pr_cont(" <%02x>", c);  			else -				printk(KERN_CONT "%02x ", c); +				pr_cont(" %02x", c);  		}  	} -	printk(KERN_CONT "\n"); +	pr_cont("\n");  }  int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 791b76122aa8..b653675d5288 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  		if (stack >= irq_stack && stack <= irq_stack_end) {  			if (stack == irq_stack_end) {  				stack = (unsigned long *) (irq_stack_end[-1]); -				printk(KERN_CONT " <EOI> "); +				pr_cont(" <EOI> ");  			}  		} else {  		if (((long) stack & (THREAD_SIZE-1)) == 0)  			break;  		}  		if (i && ((i % STACKSLOTS_PER_LINE) == 0)) -			printk(KERN_CONT "\n"); -		printk(KERN_CONT " %016lx", *stack++); +			pr_cont("\n"); +		pr_cont(" %016lx", *stack++);  		touch_nmi_watchdog();  	}  	preempt_enable(); -	printk(KERN_CONT "\n"); +	pr_cont("\n");  	show_trace_log_lvl(task, regs, sp, bp, log_lvl);  } @@ -254,10 +254,9 @@ void show_regs(struct pt_regs *regs)  	sp = regs->sp;  	printk("CPU %d ", cpu); -	print_modules();  	__show_regs(regs, 1); -	printk("Process %s (pid: %d, threadinfo %p, task %p)\n", -		cur->comm, cur->pid, task_thread_info(cur), cur); +	printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n", +	       cur->comm, cur->pid, task_thread_info(cur), cur);  	/*  	 * When in-kernel, we also print out the stack and code at the @@ -284,16 +283,16 @@ void show_regs(struct pt_regs *regs)  		for (i = 0; i < code_len; i++, ip++) {  			if (ip < (u8 *)PAGE_OFFSET ||  					probe_kernel_address(ip, c)) { -				printk(KERN_CONT " Bad RIP value."); +				pr_cont(" Bad RIP value.");  				break;  			}  			if (ip == (u8 *)regs->ip) -				printk(KERN_CONT "<%02x> ", c); +				pr_cont("<%02x> ", c);  			else -				printk(KERN_CONT "%02x ", c); +				pr_cont("%02x ", c);  		}  	} -	printk(KERN_CONT "\n"); +	pr_cont("\n");  }  int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133b51be..69babd8c834f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1048,24 +1048,6 @@ apicinterrupt LOCAL_TIMER_VECTOR \  apicinterrupt X86_PLATFORM_IPI_VECTOR \  	x86_platform_ipi smp_x86_platform_ipi -#ifdef CONFIG_SMP -	ALIGN -	INTR_FRAME -.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ -	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 -.if NUM_INVALIDATE_TLB_VECTORS > \idx -ENTRY(invalidate_interrupt\idx) -	pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) -	jmp .Lcommon_invalidate_interrupt0 -	CFI_ADJUST_CFA_OFFSET -8 -END(invalidate_interrupt\idx) -.endif -.endr -	CFI_ENDPROC -apicinterrupt INVALIDATE_TLB_VECTOR_START, \ -	invalidate_interrupt0, smp_invalidate_interrupt -#endif -  
apicinterrupt THRESHOLD_APIC_VECTOR \  	threshold_interrupt smp_threshold_interrupt  apicinterrupt THERMAL_APIC_VECTOR \ @@ -1758,10 +1740,30 @@ end_repeat_nmi:  	 */  	call save_paranoid  	DEFAULT_FRAME 0 + +	/* +	 * Save off the CR2 register. If we take a page fault in the NMI then +	 * it could corrupt the CR2 value. If the NMI preempts a page fault +	 * handler before it was able to read the CR2 register, and then the +	 * NMI itself takes a page fault, the page fault that was preempted +	 * will read the information from the NMI page fault and not the +	 * origin fault. Save it off and restore it if it changes. +	 * Use the r12 callee-saved register. +	 */ +	movq %cr2, %r12 +  	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */  	movq %rsp,%rdi  	movq $-1,%rsi  	call do_nmi + +	/* Did the NMI take a page fault? Restore cr2 if it did */ +	movq %cr2, %rcx +	cmpq %rcx, %r12 +	je 1f +	movq %r12, %cr2 +1: +	  	testl %ebx,%ebx				/* swapgs needed? */  	jnz nmi_restore  nmi_swapgs: diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3dafc6003b7c..1f5f1d5d2a02 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -294,9 +294,9 @@ void fixup_irqs(void)  		raw_spin_unlock(&desc->lock);  		if (break_affinity && set_affinity) -			printk("Broke affinity for irq %i\n", irq); +			pr_notice("Broke affinity for irq %i\n", irq);  		else if (!set_affinity) -			printk("Cannot set affinity for irq %i\n", irq); +			pr_notice("Cannot set affinity for irq %i\n", irq);  	}  	/* diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 252981afd6c4..6e03b0d69138 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -171,79 +171,6 @@ static void __init smp_intr_init(void)  	 */  	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); -	/* IPIs for invalidation */ -#define ALLOC_INVTLB_VEC(NR) \ -	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \ -		invalidate_interrupt##NR) - -	switch (NUM_INVALIDATE_TLB_VECTORS) { -	default: -		ALLOC_INVTLB_VEC(31); -	case 31: -		ALLOC_INVTLB_VEC(30); -	case 30: -		ALLOC_INVTLB_VEC(29); -	case 29: -		ALLOC_INVTLB_VEC(28); -	case 28: -		ALLOC_INVTLB_VEC(27); -	case 27: -		ALLOC_INVTLB_VEC(26); -	case 26: -		ALLOC_INVTLB_VEC(25); -	case 25: -		ALLOC_INVTLB_VEC(24); -	case 24: -		ALLOC_INVTLB_VEC(23); -	case 23: -		ALLOC_INVTLB_VEC(22); -	case 22: -		ALLOC_INVTLB_VEC(21); -	case 21: -		ALLOC_INVTLB_VEC(20); -	case 20: -		ALLOC_INVTLB_VEC(19); -	case 19: -		ALLOC_INVTLB_VEC(18); -	case 18: -		ALLOC_INVTLB_VEC(17); -	case 17: -		ALLOC_INVTLB_VEC(16); -	case 16: -		ALLOC_INVTLB_VEC(15); -	case 15: -		ALLOC_INVTLB_VEC(14); -	case 14: -		ALLOC_INVTLB_VEC(13); -	case 13: -		ALLOC_INVTLB_VEC(12); -	case 12: -		ALLOC_INVTLB_VEC(11); -	case 11: -		ALLOC_INVTLB_VEC(10); -	case 10: -		ALLOC_INVTLB_VEC(9); -	case 9: -		ALLOC_INVTLB_VEC(8); -	case 8: -		ALLOC_INVTLB_VEC(7); -	case 7: -		ALLOC_INVTLB_VEC(6); -	case 6: -		ALLOC_INVTLB_VEC(5); -	case 5: -		ALLOC_INVTLB_VEC(4); -	case 4: -		ALLOC_INVTLB_VEC(3); -	case 3: -		ALLOC_INVTLB_VEC(2); -	case 2: -		ALLOC_INVTLB_VEC(1); -	case 1: -		ALLOC_INVTLB_VEC(0); -		break; -	} -  	/* IPI for generic function call */  	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8bfb6146f753..3f61904365cf 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags)  /**   *	kgdb_arch_handle_exception - Handle architecture specific GDB packets. 
- *	@vector: The error vector of the exception that happened. + *	@e_vector: The error vector of the exception that happened.   *	@signo: The signal number of the exception that happened.   *	@err_code: The error code of the exception that happened. - *	@remcom_in_buffer: The buffer of the packet we have read. - *	@remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. - *	@regs: The &struct pt_regs of the current process. + *	@remcomInBuffer: The buffer of the packet we have read. + *	@remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. + *	@linux_regs: The &struct pt_regs of the current process.   *   *	This function MUST handle the 'c' and 's' command packets,   *	as well packets to set / remove a hardware breakpoint, if used. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5ad2fe8..c1d61ee4b4f1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,6 +39,9 @@  #include <asm/desc.h>  #include <asm/tlbflush.h>  #include <asm/idle.h> +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/hypervisor.h>  static int kvmapf = 1; @@ -283,6 +286,22 @@ static void kvm_register_steal_time(void)  		cpu, __pa(st));  } +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ +	/** +	 * This relies on __test_and_clear_bit to modify the memory +	 * in a way that is atomic with respect to the local CPU. +	 * The hypervisor only accesses this memory from the local CPU so +	 * there's no need for lock or memory barriers. +	 * An optimization barrier is implied in apic write. +	 */ +	if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) +		return; +	apic_write(APIC_EOI, APIC_EOI_ACK); +} +  void __cpuinit kvm_guest_cpu_init(void)  {  	if (!kvm_para_available()) @@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void)  		       smp_processor_id());  	} +	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { +		unsigned long pa; +		/* Size alignment is implied but just to make it explicit. */ +		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); +		__get_cpu_var(kvm_apic_eoi) = 0; +		pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; +		wrmsrl(MSR_KVM_PV_EOI_EN, pa); +	} +  	if (has_steal_clock)  		kvm_register_steal_time();  } -static void kvm_pv_disable_apf(void *unused) +static void kvm_pv_disable_apf(void)  {  	if (!__get_cpu_var(apf_reason).enabled)  		return; @@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused)  	       smp_processor_id());  } +static void kvm_pv_guest_cpu_reboot(void *unused) +{ +	/* +	 * We disable PV EOI before we load a new kernel by kexec, +	 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. +	 * New kernel can re-enable when it boots. 
+	 */ +	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) +		wrmsrl(MSR_KVM_PV_EOI_EN, 0); +	kvm_pv_disable_apf(); +} +  static int kvm_pv_reboot_notify(struct notifier_block *nb,  				unsigned long code, void *unused)  {  	if (code == SYS_RESTART) -		on_each_cpu(kvm_pv_disable_apf, NULL, 1); +		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);  	return NOTIFY_DONE;  } @@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)  static void kvm_guest_cpu_offline(void *dummy)  {  	kvm_disable_steal_time(); -	kvm_pv_disable_apf(NULL); +	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) +		wrmsrl(MSR_KVM_PV_EOI_EN, 0); +	kvm_pv_disable_apf();  	apf_task_wake_all();  } @@ -424,6 +466,9 @@ void __init kvm_guest_init(void)  		pv_time_ops.steal_clock = kvm_steal_clock;  	} +	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) +		apic_set_eoi_write(kvm_guest_apic_eoi_write); +  #ifdef CONFIG_SMP  	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;  	register_cpu_notifier(&kvm_cpu_notifier); @@ -432,6 +477,19 @@ void __init kvm_guest_init(void)  #endif  } +static bool __init kvm_detect(void) +{ +	if (!kvm_para_available()) +		return false; +	return true; +} + +const struct hypervisor_x86 x86_hyper_kvm __refconst = { +	.name			= "KVM", +	.detect			= kvm_detect, +}; +EXPORT_SYMBOL_GPL(x86_hyper_kvm); +  static __init int activate_jump_labels(void)  {  	if (has_steal_clock) { diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fbdfc6917180..4873e62db6a1 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -87,6 +87,7 @@  #include <asm/microcode.h>  #include <asm/processor.h>  #include <asm/cpu_device_id.h> +#include <asm/perf_event.h>  MODULE_DESCRIPTION("Microcode Update Driver");  MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu)  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;  	int err = 0; -	mutex_lock(&microcode_mutex);  	if (uci->valid) {  		enum ucode_state ustate; @@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu)  			if (ustate == UCODE_ERROR)  				err = -EINVAL;  	} -	mutex_unlock(&microcode_mutex);  	return err;  } @@ -298,19 +297,31 @@ static ssize_t reload_store(struct device *dev,  			    const char *buf, size_t size)  {  	unsigned long val; -	int cpu = dev->id; -	ssize_t ret = 0; +	int cpu; +	ssize_t ret = 0, tmp_ret;  	ret = kstrtoul(buf, 0, &val);  	if (ret)  		return ret; -	if (val == 1) { -		get_online_cpus(); -		if (cpu_online(cpu)) -			ret = reload_for_cpu(cpu); -		put_online_cpus(); +	if (val != 1) +		return size; + +	get_online_cpus(); +	mutex_lock(&microcode_mutex); +	for_each_online_cpu(cpu) { +		tmp_ret = reload_for_cpu(cpu); +		if (tmp_ret != 0) +			pr_warn("Error reloading microcode on CPU %d\n", cpu); + +		/* save retval of the first encountered reload error */ +		if (!ret) +			ret = tmp_ret;  	} +	if (!ret) +		perf_check_microcode(); +	mutex_unlock(&microcode_mutex); +	put_online_cpus();  	if (!ret)  		ret = size; @@ -339,7 +350,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL);  static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);  static struct attribute *mc_default_attrs[] = { -	&dev_attr_reload.attr,  	&dev_attr_version.attr,  	&dev_attr_processor_flags.attr,  	NULL @@ -504,7 +514,7 @@ static struct notifier_block __refdata mc_cpu_notifier = {  #ifdef MODULE  /* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id microcode_id[] = { +static const struct x86_cpu_id __initconst microcode_id[] = {  #ifdef 
CONFIG_MICROCODE_INTEL  	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },  #endif @@ -516,6 +526,16 @@ static const struct x86_cpu_id microcode_id[] = {  MODULE_DEVICE_TABLE(x86cpu, microcode_id);  #endif +static struct attribute *cpu_root_microcode_attrs[] = { +	&dev_attr_reload.attr, +	NULL +}; + +static struct attribute_group cpu_root_microcode_group = { +	.name  = "microcode", +	.attrs = cpu_root_microcode_attrs, +}; +  static int __init microcode_init(void)  {  	struct cpuinfo_x86 *c = &cpu_data(0); @@ -540,16 +560,25 @@ static int __init microcode_init(void)  	mutex_lock(&microcode_mutex);  	error = subsys_interface_register(&mc_cpu_interface); - +	if (!error) +		perf_check_microcode();  	mutex_unlock(&microcode_mutex);  	put_online_cpus();  	if (error)  		goto out_pdev; +	error = sysfs_create_group(&cpu_subsys.dev_root->kobj, +				   &cpu_root_microcode_group); + +	if (error) { +		pr_err("Error creating microcode group!\n"); +		goto out_driver; +	} +  	error = microcode_dev_init();  	if (error) -		goto out_driver; +		goto out_ucode_group;  	register_syscore_ops(&mc_syscore_ops);  	register_hotcpu_notifier(&mc_cpu_notifier); @@ -559,7 +588,11 @@ static int __init microcode_init(void)  	return 0; -out_driver: + out_ucode_group: +	sysfs_remove_group(&cpu_subsys.dev_root->kobj, +			   &cpu_root_microcode_group); + + out_driver:  	get_online_cpus();  	mutex_lock(&microcode_mutex); @@ -568,7 +601,7 @@ out_driver:  	mutex_unlock(&microcode_mutex);  	put_online_cpus(); -out_pdev: + out_pdev:  	platform_device_unregister(microcode_pdev);  	return error; @@ -584,6 +617,9 @@ static void __exit microcode_exit(void)  	unregister_hotcpu_notifier(&mc_cpu_notifier);  	unregister_syscore_ops(&mc_syscore_ops); +	sysfs_remove_group(&cpu_subsys.dev_root->kobj, +			   &cpu_root_microcode_group); +  	get_online_cpus();  	mutex_lock(&microcode_mutex); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f21fd94ac897..216a4d754b0c 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -15,6 +15,9 @@      along with this program; if not, write to the Free Software      Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/moduleloader.h>  #include <linux/elf.h>  #include <linux/vmalloc.h> @@ -30,9 +33,14 @@  #include <asm/pgtable.h>  #if 0 -#define DEBUGP printk +#define DEBUGP(fmt, ...)				\ +	printk(KERN_DEBUG fmt, ##__VA_ARGS__)  #else -#define DEBUGP(fmt...) +#define DEBUGP(fmt, ...)				
\ +do {							\ +	if (0)						\ +		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\ +} while (0)  #endif  void *module_alloc(unsigned long size) @@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,  	Elf32_Sym *sym;  	uint32_t *location; -	DEBUGP("Applying relocate section %u to %u\n", relsec, -	       sechdrs[relsec].sh_info); +	DEBUGP("Applying relocate section %u to %u\n", +	       relsec, sechdrs[relsec].sh_info);  	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {  		/* This is where to make the change */  		location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -73,11 +81,11 @@ int apply_relocate(Elf32_Shdr *sechdrs,  			*location += sym->st_value;  			break;  		case R_386_PC32: -			/* Add the value, subtract its postition */ +			/* Add the value, subtract its position */  			*location += sym->st_value - (uint32_t)location;  			break;  		default: -			printk(KERN_ERR "module %s: Unknown relocation: %u\n", +			pr_err("%s: Unknown relocation: %u\n",  			       me->name, ELF32_R_TYPE(rel[i].r_info));  			return -ENOEXEC;  		} @@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  	void *loc;  	u64 val; -	DEBUGP("Applying relocate section %u to %u\n", relsec, -	       sechdrs[relsec].sh_info); +	DEBUGP("Applying relocate section %u to %u\n", +	       relsec, sechdrs[relsec].sh_info);  	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {  		/* This is where to make the change */  		loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  			+ ELF64_R_SYM(rel[i].r_info);  		DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", -			(int)ELF64_R_TYPE(rel[i].r_info), -			sym->st_value, rel[i].r_addend, (u64)loc); +		       (int)ELF64_R_TYPE(rel[i].r_info), +		       sym->st_value, rel[i].r_addend, (u64)loc);  		val = sym->st_value + rel[i].r_addend; @@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  #endif  			break;  		default: -			printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", +			pr_err("%s: Unknown rela relocation: %llu\n",  			       me->name, ELF64_R_TYPE(rel[i].r_info));  			return -ENOEXEC;  		} @@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,  	return 0;  overflow: -	printk(KERN_ERR "overflow in relocation type %d val %Lx\n", +	pr_err("overflow in relocation type %d val %Lx\n",  	       (int)ELF64_R_TYPE(rel[i].r_info), val); -	printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", +	pr_err("`%s' likely not compiled with -mcmodel=kernel\n",  	       me->name);  	return -ENOEXEC;  } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a0b2f84457be..f84f5c57de35 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)  #ifdef CONFIG_X86_32  /*   * For i386, NMIs use the same stack as the kernel, and we can - * add a workaround to the iret problem in C. Simply have 3 states - * the NMI can be in. + * add a workaround to the iret problem in C (preventing nested + * NMIs if an NMI takes a trap). Simply have 3 states the NMI + * can be in:   *   *  1) not running   *  2) executing @@ -383,32 +384,50 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)   * If an NMI hits a breakpoint that executes an iret, another   * NMI can preempt it. We do not want to allow this new NMI   * to run, but we want to execute it when the first one finishes. 
- * We set the state to "latched", and the first NMI will perform - * an cmpxchg on the state, and if it doesn't successfully - * reset the state to "not running" it will restart the next - * NMI. + * We set the state to "latched", and the exit of the first NMI will + * perform a dec_return, if the result is zero (NOT_RUNNING), then + * it will simply exit the NMI handler. If not, the dec_return + * would have set the state to NMI_EXECUTING (what we want it to + * be when we are running). In this case, we simply jump back + * to rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING + * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs + * at this point. + * + * In case the NMI takes a page fault, we need to save off the CR2 + * because the NMI could have preempted another page fault and corrupt + * the CR2 that is about to be read. As nested NMIs must be restarted + * and they can not take breakpoints or page faults, the update of the + * CR2 must be done before converting the nmi state back to NOT_RUNNING. + * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2.   */  enum nmi_states { -	NMI_NOT_RUNNING, +	NMI_NOT_RUNNING = 0,  	NMI_EXECUTING,  	NMI_LATCHED,  };  static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2);  #define nmi_nesting_preprocess(regs)					\  	do {								\ -		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\ -			__get_cpu_var(nmi_state) = NMI_LATCHED;		\ +		if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {	\ +			this_cpu_write(nmi_state, NMI_LATCHED);		\  			return;						\  		}							\ -	nmi_restart:							\ -		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\ -	} while (0) +		this_cpu_write(nmi_state, NMI_EXECUTING);		\ +		this_cpu_write(nmi_cr2, read_cr2());			\ +	} while (0);							\ +	nmi_restart:  #define nmi_nesting_postprocess()					\  	do {								\ -		if (cmpxchg(&__get_cpu_var(nmi_state),			\ -		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\ +		if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))	\ +			write_cr2(this_cpu_read(nmi_cr2));		\ +		if (this_cpu_dec_return(nmi_state))			\  			goto nmi_restart;				\  	} while (0)  #else /* x86_64 */ diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 149b8d9c6ad4..6d9582ec0324 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)  static void __init init_nmi_testsuite(void)  {  	/* trap all the unknown NMIs we may generate */ -	register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); +	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", +			__initdata);  }  static void __init cleanup_nmi_testsuite(void) @@ -64,8 +65,8 @@ static void __init test_nmi_ipi(struct cpumask *mask)  {  	unsigned long timeout; -	if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, -				 NMI_FLAG_FIRST, "nmi_selftest")) { +	if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, +				 NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {  		nmi_fail = FAILURE;  		return;  	} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9ce885996fd7..17fff18a1031 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ 
-352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = {  #endif  	.wbinvd = native_wbinvd,  	.read_msr = native_read_msr_safe, -	.rdmsr_regs = native_rdmsr_safe_regs,  	.write_msr = native_write_msr_safe, -	.wrmsr_regs = native_wrmsr_safe_regs,  	.read_tsc = native_read_tsc,  	.read_pmc = native_read_pmc,  	.read_tscp = native_read_tscp, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index b72838bae64a..299d49302e7d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -22,6 +22,8 @@   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA   */ +#define pr_fmt(fmt) "Calgary: " fmt +  #include <linux/kernel.h>  #include <linux/init.h>  #include <linux/types.h> @@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev,  		offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,  					  npages, 0, boundary_size, 0);  		if (offset == ~0UL) { -			printk(KERN_WARNING "Calgary: IOMMU full.\n"); +			pr_warn("IOMMU full\n");  			spin_unlock_irqrestore(&tbl->it_lock, flags);  			if (panic_on_overflow)  				panic("Calgary: fix the allocator.\n"); @@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,  	entry = iommu_range_alloc(dev, tbl, npages);  	if (unlikely(entry == DMA_ERROR_CODE)) { -		printk(KERN_WARNING "Calgary: failed to allocate %u pages in " -		       "iommu %p\n", npages, tbl); +		pr_warn("failed to allocate %u pages in iommu %p\n", +			npages, tbl);  		return DMA_ERROR_CODE;  	} @@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl)  		i++;  	} while ((val & 0xff) != 0xff && i < 100);  	if (i == 100) -		printk(KERN_WARNING "Calgary: PCI bus not quiesced, " -		       "continuing anyway\n"); +		pr_warn("PCI bus not quiesced, continuing anyway\n");  	/* invalidate TCE cache */  	target = calgary_reg(bbar, tar_offset(tbl->it_busno)); @@ -604,8 +605,7 @@ begin:  		i++;  	} while ((val64 & 0xff) != 0xff && i < 100);  	if (i == 100) -		printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " -		       "continuing anyway\n"); +		pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");  	/* 3. 
poll Page Migration DEBUG for SoftStopFault */  	target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); @@ -617,8 +617,7 @@ begin:  		if (++count < 100)  			goto begin;  		else { -			printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " -			       "aborting TCE cache flush sequence!\n"); +			pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");  			return; /* pray for the best */  		}  	} @@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl)  	plssr = be32_to_cpu(readl(target));  	/* If no error, the agent ID in the CSR is not valid */ -	printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " -	       "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); +	pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", +		 tbl->it_busno, csr, plssr);  }  static void calioc2_dump_error_regs(struct iommu_table *tbl) @@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl)  	target = calgary_reg(bbar, phboff | 0x800);  	mck = be32_to_cpu(readl(target)); -	printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", -	       tbl->it_busno); +	pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); -	printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", -	       csr, plssr, csmr, mck); +	pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", +		 csr, plssr, csmr, mck);  	/* dump rest of error regs */ -	printk(KERN_EMERG "Calgary: "); +	pr_emerg("");  	for (i = 0; i < ARRAY_SIZE(errregs); i++) {  		/* err regs are at 0x810 - 0x870 */  		erroff = (0x810 + (i * 0x10));  		target = calgary_reg(bbar, phboff | erroff);  		errregs[i] = be32_to_cpu(readl(target)); -		printk("0x%08x@0x%lx ", errregs[i], erroff); +		pr_cont("0x%08x@0x%lx ", errregs[i], erroff);  	} -	printk("\n"); +	pr_cont("\n");  	/* root complex status */  	target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index c0f420f76cd3..de2b7ad70273 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,15 +45,6 @@ int iommu_detected __read_mostly = 0;   */  int iommu_pass_through __read_mostly; -/* - * Group multi-function PCI devices into a single device-group for the - * iommu_device_group interface.  This tells the iommu driver to pretend - * it cannot distinguish between functions of a device, exposing only one - * group for the device.  Useful for disallowing use of individual PCI - * functions from userspace drivers. - */ -int iommu_group_mf __read_mostly; -  extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];  /* Dummy device used for NULL arguments (normally ISA). 
*/ @@ -194,8 +185,6 @@ static __init int iommu_setup(char *p)  #endif  		if (!strncmp(p, "pt", 2))  			iommu_pass_through = 1; -		if (!strncmp(p, "group_mf", 8)) -			iommu_group_mf = 1;  		gart_parse_options(p); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 735279e54e59..ef6a8456f719 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/errno.h>  #include <linux/kernel.h>  #include <linux/mm.h> @@ -145,16 +147,14 @@ void show_regs_common(void)  	/* Board Name is optional */  	board = dmi_get_system_info(DMI_BOARD_NAME); -	printk(KERN_CONT "\n"); -	printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", -		current->pid, current->comm, print_tainted(), -		init_utsname()->release, -		(int)strcspn(init_utsname()->version, " "), -		init_utsname()->version); -	printk(KERN_CONT " %s %s", vendor, product); -	if (board) -		printk(KERN_CONT "/%s", board); -	printk(KERN_CONT "\n"); +	printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n", +	       current->pid, current->comm, print_tainted(), +	       init_utsname()->release, +	       (int)strcspn(init_utsname()->version, " "), +	       init_utsname()->version, +	       vendor, product, +	       board ? "/" : "", +	       board ? board : "");  }  void flush_thread(void) @@ -645,7 +645,7 @@ static void amd_e400_idle(void)  			amd_e400_c1e_detected = true;  			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))  				mark_tsc_unstable("TSC halt in AMD C1E"); -			printk(KERN_INFO "System has AMD C1E enabled\n"); +			pr_info("System has AMD C1E enabled\n");  		}  	} @@ -659,8 +659,7 @@ static void amd_e400_idle(void)  			 */  			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,  					   &cpu); -			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", -			       cpu); +			pr_info("Switch to broadcast mode on CPU%d\n", cpu);  		}  		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)  {  #ifdef CONFIG_SMP  	if (pm_idle == poll_idle && smp_num_siblings > 1) { -		printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," -			" performance may degrade.\n"); +		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");  	}  #endif  	if (pm_idle) @@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)  		/*  		 * One CPU supports mwait => All CPUs supports mwait  		 */ -		printk(KERN_INFO "using mwait in idle threads.\n"); +		pr_info("using mwait in idle threads\n");  		pm_idle = mwait_idle;  	} else if (cpu_has_amd_erratum(amd_erratum_400)) {  		/* E400: APIC timer interrupt does not wake up CPU from C1e */ -		printk(KERN_INFO "using AMD E400 aware idle routine\n"); +		pr_info("using AMD E400 aware idle routine\n");  		pm_idle = amd_e400_idle;  	} else  		pm_idle = default_idle; @@ -715,7 +713,7 @@ static int __init idle_setup(char *str)  		return -EINVAL;  	if (!strcmp(str, "poll")) { -		printk("using polling idle threads.\n"); +		pr_info("using polling idle threads\n");  		pm_idle = poll_idle;  		boot_option_idle_override = IDLE_POLL;  	} else if (!strcmp(str, "mwait")) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7fdf099..0a980c9d7cb8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task)  {  	if (dead_task->mm) {  		if (dead_task->mm->context.size) 
{ -			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", -					dead_task->comm, -					dead_task->mm->context.ldt, -					dead_task->mm->context.size); +			pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", +				dead_task->comm, +				dead_task->mm->context.ldt, +				dead_task->mm->context.size);  			BUG();  		}  	} @@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)  			task->thread.gs = addr;  			if (doit) {  				load_gs_index(0); -				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); +				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);  			}  		}  		put_cpu(); @@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)  				/* set the selector to 0 to not confuse  				   __switch_to */  				loadsegment(fs, 0); -				ret = checking_wrmsrl(MSR_FS_BASE, addr); +				ret = wrmsrl_safe(MSR_FS_BASE, addr);  			}  		}  		put_cpu(); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 03920a15a632..1b27de563561 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,  #if defined(CONFIG_PCI) && defined(CONFIG_NUMA)  /* Set correct numa_node information for AMD NB functions */ -static void __init quirk_amd_nb_node(struct pci_dev *dev) +static void __devinit quirk_amd_nb_node(struct pci_dev *dev)  {  	struct pci_dev *nb_ht;  	unsigned int devfn; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 25b48edb847c..52190a938b4a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/module.h>  #include <linux/reboot.h>  #include <linux/init.h> @@ -20,14 +22,12 @@  #include <asm/virtext.h>  #include <asm/cpu.h>  #include <asm/nmi.h> +#include <asm/smp.h> -#ifdef CONFIG_X86_32 -# include <linux/ctype.h> -# include <linux/mc146818rtc.h> -# include <asm/realmode.h> -#else -# include <asm/x86_init.h> -#endif +#include <linux/ctype.h> +#include <linux/mc146818rtc.h> +#include <asm/realmode.h> +#include <asm/x86_init.h>  /*   * Power off function, if any @@ -49,7 +49,7 @@ int reboot_force;   */  static int reboot_default = 1; -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP  static int reboot_cpu = -1;  #endif @@ -67,8 +67,8 @@ bool port_cf9_safe = false;   * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]   * warm   Don't set the cold reboot flag   * cold   Set the cold reboot flag - * bios   Reboot by jumping through the BIOS (only for X86_32) - * smp    Reboot by executing reset on BSP or other CPU (only for X86_32) + * bios   Reboot by jumping through the BIOS + * smp    Reboot by executing reset on BSP or other CPU   * triple Force a triple fault (init)   * kbd    Use the keyboard controller. cold reset (default)   * acpi   Use the RESET_REG in the FADT @@ -95,7 +95,6 @@ static int __init reboot_setup(char *str)  			reboot_mode = 0;  			break; -#ifdef CONFIG_X86_32  #ifdef CONFIG_SMP  		case 's':  			if (isdigit(*(str+1))) { @@ -112,7 +111,6 @@ static int __init reboot_setup(char *str)  #endif /* CONFIG_SMP */  		case 'b': -#endif  		case 'a':  		case 'k':  		case 't': @@ -138,7 +136,6 @@ static int __init reboot_setup(char *str)  __setup("reboot=", reboot_setup); -#ifdef CONFIG_X86_32  /*   * Reboot options and system auto-detection code provided by   * Dell Inc. so their systems "just work". 
:-) @@ -152,16 +149,14 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)  {  	if (reboot_type != BOOT_BIOS) {  		reboot_type = BOOT_BIOS; -		printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); +		pr_info("%s series board detected. Selecting %s-method for reboots.\n", +			"BIOS", d->ident);  	}  	return 0;  } -void machine_real_restart(unsigned int type) +void __noreturn machine_real_restart(unsigned int type)  { -	void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) -		real_mode_header->machine_real_restart_asm; -  	local_irq_disable();  	/* @@ -181,25 +176,28 @@ void machine_real_restart(unsigned int type)  	/*  	 * Switch back to the initial page table.  	 */ +#ifdef CONFIG_X86_32  	load_cr3(initial_page_table); - -	/* -	 * Write 0x1234 to absolute memory location 0x472.  The BIOS reads -	 * this on booting to tell it to "Bypass memory test (also warm -	 * boot)".  This seems like a fairly standard thing that gets set by -	 * REBOOT.COM programs, and the previous reset routine did this -	 * too. */ -	*((unsigned short *)0x472) = reboot_mode; +#else +	write_cr3(real_mode_header->trampoline_pgd); +#endif  	/* Jump to the identity-mapped low memory code */ -	restart_lowmem(type); +#ifdef CONFIG_X86_32 +	asm volatile("jmpl *%0" : : +		     "rm" (real_mode_header->machine_real_restart_asm), +		     "a" (type)); +#else +	asm volatile("ljmpl *%0" : : +		     "m" (real_mode_header->machine_real_restart_asm), +		     "D" (type)); +#endif +	unreachable();  }  #ifdef CONFIG_APM_MODULE  EXPORT_SYMBOL(machine_real_restart);  #endif -#endif /* CONFIG_X86_32 */ -  /*   * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot   */ @@ -207,8 +205,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d)  {  	if (reboot_type != BOOT_CF9) {  		reboot_type = BOOT_CF9; -		printk(KERN_INFO "%s series board detected. " -		       "Selecting PCI-method for reboots.\n", d->ident); +		pr_info("%s series board detected. Selecting %s-method for reboots.\n", +			"PCI", d->ident);  	}  	return 0;  } @@ -217,17 +215,16 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)  {  	if (reboot_type != BOOT_KBD) {  		reboot_type = BOOT_KBD; -		printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); +		pr_info("%s series board detected. Selecting %s-method for reboot.\n", +			"KBD", d->ident);  	}  	return 0;  }  /* - * This is a single dmi_table handling all reboot quirks.  Note that - * REBOOT_BIOS is only available for 32bit + * This is a single dmi_table handling all reboot quirks.   */  static struct dmi_system_id __initdata reboot_dmi_table[] = { -#ifdef CONFIG_X86_32  	{	/* Handle problems with rebooting on Dell E520's */  		.callback = set_bios_reboot,  		.ident = "Dell E520", @@ -377,7 +374,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),  		},  	}, -#endif /* CONFIG_X86_32 */  	{	/* Handle reboot issue on Acer Aspire one */  		.callback = set_kbd_reboot, @@ -451,6 +447,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),  		},  	}, +	{	/* Handle problems with rebooting on the Precision M6600. 
*/ +		.callback = set_pci_reboot, +		.ident = "Dell OptiPlex 990", +		.matches = { +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), +		}, +	},  	{ }  }; @@ -576,13 +580,11 @@ static void native_machine_emergency_restart(void)  			reboot_type = BOOT_KBD;  			break; -#ifdef CONFIG_X86_32  		case BOOT_BIOS:  			machine_real_restart(MRR_BIOS);  			reboot_type = BOOT_KBD;  			break; -#endif  		case BOOT_ACPI:  			acpi_reboot(); @@ -624,12 +626,10 @@ void native_machine_shutdown(void)  	/* The boot cpu is always logical cpu 0 */  	int reboot_cpu_id = 0; -#ifdef CONFIG_X86_32  	/* See if there has been given a command line override */  	if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&  		cpu_online(reboot_cpu))  		reboot_cpu_id = reboot_cpu; -#endif  	/* Make certain the cpu I'm about to reboot on is online */  	if (!cpu_online(reboot_cpu_id)) @@ -670,7 +670,7 @@ static void __machine_emergency_restart(int emergency)  static void native_machine_restart(char *__unused)  { -	printk("machine restart\n"); +	pr_notice("machine restart\n");  	if (!reboot_force)  		machine_shutdown(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 16be6dc14db1..f4b9b80e1b95 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1031,8 +1031,6 @@ void __init setup_arch(char **cmdline_p)  	x86_init.timers.wallclock_init(); -	x86_platform.wallclock_init(); -  	mcheck_init();  	arch_init_ideal_nops(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5a98aa272184..5cdff0357746 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -21,7 +21,7 @@  #include <asm/cpu.h>  #include <asm/stackprotector.h> -DEFINE_PER_CPU(int, cpu_number); +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);  EXPORT_PER_CPU_SYMBOL(cpu_number);  #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 21af737053aa..b280908a376e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,6 +6,9 @@   *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes   *  2000-2002   x86-64 support by Andi Kleen   */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/sched.h>  #include <linux/mm.h>  #include <linux/smp.h> @@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)  		       me->comm, me->pid, where, frame,  		       regs->ip, regs->sp, regs->orig_ax);  		print_vma_addr(" in ", regs->ip); -		printk(KERN_CONT "\n"); +		pr_cont("\n");  	}  	force_sig(SIGSEGV, me); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7bd8a0823654..7c5a8c314c02 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1,4 +1,4 @@ -/* + /*   *	x86 SMP booting functions   *   *	(c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> @@ -39,6 +39,8 @@   *	Glauber Costa		:	i386 and x86_64 integration   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/init.h>  #include <linux/smp.h>  #include <linux/module.h> @@ -104,17 +106,17 @@ int smp_num_siblings = 1;  EXPORT_SYMBOL(smp_num_siblings);  /* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;  /* representing HT siblings of each logical CPU */ -DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);  /* representing HT and core 
siblings of each logical CPU */ -DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);  EXPORT_PER_CPU_SYMBOL(cpu_core_map); -DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);  /* Per CPU bogomips and other parameters */  DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); @@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void)  	 * boards)  	 */ -	pr_debug("CALLIN, before setup_local_APIC().\n"); +	pr_debug("CALLIN, before setup_local_APIC()\n");  	if (apic->smp_callin_clear_local_apic)  		apic->smp_callin_clear_local_apic();  	setup_local_APIC(); @@ -255,22 +257,13 @@ notrace static void __cpuinit start_secondary(void *unused)  	check_tsc_sync_target();  	/* -	 * We need to hold call_lock, so there is no inconsistency -	 * between the time smp_call_function() determines number of -	 * IPI recipients, and the time when the determination is made -	 * for which cpus receive the IPI. Holding this -	 * lock helps us to not include this cpu in a currently in progress -	 * smp_call_function(). -	 *  	 * We need to hold vector_lock so there the set of online cpus  	 * does not change while we are assigning vectors to cpus.  Holding  	 * this lock ensures we don't half assign or remove an irq from a cpu.  	 */ -	ipi_call_lock();  	lock_vector_lock();  	set_cpu_online(smp_processor_id(), true);  	unlock_vector_lock(); -	ipi_call_unlock();  	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;  	x86_platform.nmi_init(); @@ -432,17 +425,16 @@ static void impress_friends(void)  	/*  	 * Allow the user to impress friends.  	 */ -	pr_debug("Before bogomips.\n"); +	pr_debug("Before bogomips\n");  	for_each_possible_cpu(cpu)  		if (cpumask_test_cpu(cpu, cpu_callout_mask))  			bogosum += cpu_data(cpu).loops_per_jiffy; -	printk(KERN_INFO -		"Total of %d processors activated (%lu.%02lu BogoMIPS).\n", +	pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",  		num_online_cpus(),  		bogosum/(500000/HZ),  		(bogosum/(5000/HZ))%100); -	pr_debug("Before bogocount - setting activated=1.\n"); +	pr_debug("Before bogocount - setting activated=1\n");  }  void __inquire_remote_apic(int apicid) @@ -452,18 +444,17 @@ void __inquire_remote_apic(int apicid)  	int timeout;  	u32 status; -	printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); +	pr_info("Inquiring remote APIC 0x%x...\n", apicid);  	for (i = 0; i < ARRAY_SIZE(regs); i++) { -		printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); +		pr_info("... APIC 0x%x %s: ", apicid, names[i]);  		/*  		 * Wait for idle.  		 
*/  		status = safe_apic_wait_icr_idle();  		if (status) -			printk(KERN_CONT -			       "a previous APIC delivery may have failed\n"); +			pr_cont("a previous APIC delivery may have failed\n");  		apic_icr_write(APIC_DM_REMRD | regs[i], apicid); @@ -476,10 +467,10 @@ void __inquire_remote_apic(int apicid)  		switch (status) {  		case APIC_ICR_RR_VALID:  			status = apic_read(APIC_RRR); -			printk(KERN_CONT "%08x\n", status); +			pr_cont("%08x\n", status);  			break;  		default: -			printk(KERN_CONT "failed\n"); +			pr_cont("failed\n");  		}  	}  } @@ -513,12 +504,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)  			apic_write(APIC_ESR, 0);  		accept_status = (apic_read(APIC_ESR) & 0xEF);  	} -	pr_debug("NMI sent.\n"); +	pr_debug("NMI sent\n");  	if (send_status) -		printk(KERN_ERR "APIC never delivered???\n"); +		pr_err("APIC never delivered???\n");  	if (accept_status) -		printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); +		pr_err("APIC delivery error (%lx)\n", accept_status);  	return (send_status | accept_status);  } @@ -540,7 +531,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  		apic_read(APIC_ESR);  	} -	pr_debug("Asserting INIT.\n"); +	pr_debug("Asserting INIT\n");  	/*  	 * Turn INIT on target chip @@ -556,7 +547,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  	mdelay(10); -	pr_debug("Deasserting INIT.\n"); +	pr_debug("Deasserting INIT\n");  	/* Target chip */  	/* Send IPI */ @@ -589,14 +580,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  	/*  	 * Run STARTUP IPI loop.  	 */ -	pr_debug("#startup loops: %d.\n", num_starts); +	pr_debug("#startup loops: %d\n", num_starts);  	for (j = 1; j <= num_starts; j++) { -		pr_debug("Sending STARTUP #%d.\n", j); +		pr_debug("Sending STARTUP #%d\n", j);  		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */  			apic_write(APIC_ESR, 0);  		apic_read(APIC_ESR); -		pr_debug("After apic_write.\n"); +		pr_debug("After apic_write\n");  		/*  		 * STARTUP IPI @@ -613,7 +604,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  		 */  		udelay(300); -		pr_debug("Startup point 1.\n"); +		pr_debug("Startup point 1\n");  		pr_debug("Waiting for send to finish...\n");  		send_status = safe_apic_wait_icr_idle(); @@ -628,12 +619,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  		if (send_status || accept_status)  			break;  	} -	pr_debug("After Startup.\n"); +	pr_debug("After Startup\n");  	if (send_status) -		printk(KERN_ERR "APIC never delivered???\n"); +		pr_err("APIC never delivered???\n");  	if (accept_status) -		printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); +		pr_err("APIC delivery error (%lx)\n", accept_status);  	return (send_status | accept_status);  } @@ -647,11 +638,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid)  	if (system_state == SYSTEM_BOOTING) {  		if (node != current_node) {  			if (current_node > (-1)) -				pr_cont(" Ok.\n"); +				pr_cont(" OK\n");  			current_node = node;  			pr_info("Booting Node %3d, Processors ", node);  		} -		pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); +		pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");  		return;  	} else  		pr_info("Booting Node %d Processor %d APIC 0x%x\n", @@ -731,9 +722,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)  		/*  		 * allow APs to start initializing.  		 
*/ -		pr_debug("Before Callout %d.\n", cpu); +		pr_debug("Before Callout %d\n", cpu);  		cpumask_set_cpu(cpu, cpu_callout_mask); -		pr_debug("After Callout %d.\n", cpu); +		pr_debug("After Callout %d\n", cpu);  		/*  		 * Wait 5s total for a response @@ -761,7 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)  				pr_err("CPU%d: Stuck ??\n", cpu);  			else  				/* trampoline code not run */ -				pr_err("CPU%d: Not responding.\n", cpu); +				pr_err("CPU%d: Not responding\n", cpu);  			if (apic->inquire_remote_apic)  				apic->inquire_remote_apic(apicid);  		} @@ -806,7 +797,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)  	if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||  	    !physid_isset(apicid, phys_cpu_present_map) ||  	    !apic->apic_id_valid(apicid)) { -		printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); +		pr_err("%s: bad cpu %d\n", __func__, cpu);  		return -EINVAL;  	} @@ -887,9 +878,8 @@ static int __init smp_sanity_check(unsigned max_cpus)  		unsigned int cpu;  		unsigned nr; -		printk(KERN_WARNING -		       "More than 8 CPUs detected - skipping them.\n" -		       "Use CONFIG_X86_BIGSMP.\n"); +		pr_warn("More than 8 CPUs detected - skipping them\n" +			"Use CONFIG_X86_BIGSMP\n");  		nr = 0;  		for_each_present_cpu(cpu) { @@ -910,8 +900,7 @@ static int __init smp_sanity_check(unsigned max_cpus)  #endif  	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { -		printk(KERN_WARNING -			"weird, boot CPU (#%d) not listed by the BIOS.\n", +		pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",  			hard_smp_processor_id());  		physid_set(hard_smp_processor_id(), phys_cpu_present_map); @@ -923,11 +912,10 @@ static int __init smp_sanity_check(unsigned max_cpus)  	 */  	if (!smp_found_config && !acpi_lapic) {  		preempt_enable(); -		printk(KERN_NOTICE "SMP motherboard not detected.\n"); +		pr_notice("SMP motherboard not detected\n");  		disable_smp();  		if (APIC_init_uniprocessor()) -			printk(KERN_NOTICE "Local APIC not detected." -					   " Using dummy APIC emulation.\n"); +			pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");  		return -1;  	} @@ -936,9 +924,8 @@ static int __init smp_sanity_check(unsigned max_cpus)  	 * CPU too, but we do it for the sake of robustness anyway.  	 */  	if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { -		printk(KERN_NOTICE -			"weird, boot CPU (#%d) not listed by the BIOS.\n", -			boot_cpu_physical_apicid); +		pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", +			  boot_cpu_physical_apicid);  		physid_set(hard_smp_processor_id(), phys_cpu_present_map);  	}  	preempt_enable(); @@ -951,8 +938,7 @@ static int __init smp_sanity_check(unsigned max_cpus)  		if (!disable_apic) {  			pr_err("BIOS bug, local APIC #%d not detected!...\n",  				boot_cpu_physical_apicid); -			pr_err("... forcing use of dummy APIC emulation." -				"(tell your hw vendor)\n"); +			pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");  		}  		smpboot_clear_io_apic();  		disable_ioapic_support(); @@ -965,7 +951,7 @@ static int __init smp_sanity_check(unsigned max_cpus)  	 * If SMP should be disabled, then really disable it!  	 
*/  	if (!max_cpus) { -		printk(KERN_INFO "SMP mode deactivated.\n"); +		pr_info("SMP mode deactivated\n");  		smpboot_clear_io_apic();  		connect_bsp_APIC(); @@ -1017,7 +1003,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)  	if (smp_sanity_check(max_cpus) < 0) { -		printk(KERN_INFO "SMP disabled\n"); +		pr_info("SMP disabled\n");  		disable_smp();  		goto out;  	} @@ -1055,7 +1041,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)  	 * Set up local APIC timer on boot CPU.  	 */ -	printk(KERN_INFO "CPU%d: ", 0); +	pr_info("CPU%d: ", 0);  	print_cpu_info(&cpu_data(0));  	x86_init.timers.setup_percpu_clockev(); @@ -1105,7 +1091,7 @@ void __init native_smp_prepare_boot_cpu(void)  void __init native_smp_cpus_done(unsigned int max_cpus)  { -	pr_debug("Boot done.\n"); +	pr_debug("Boot done\n");  	nmi_selftest();  	impress_friends(); @@ -1166,8 +1152,7 @@ __init void prefill_possible_map(void)  	/* nr_cpu_ids could be reduced via nr_cpus= */  	if (possible > nr_cpu_ids) { -		printk(KERN_WARNING -			"%d Processors exceeds NR_CPUS limit of %d\n", +		pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",  			possible, nr_cpu_ids);  		possible = nr_cpu_ids;  	} @@ -1176,13 +1161,12 @@ __init void prefill_possible_map(void)  	if (!setup_max_cpus)  #endif  	if (possible > i) { -		printk(KERN_WARNING -			"%d Processors exceeds max_cpus limit of %u\n", +		pr_warn("%d Processors exceeds max_cpus limit of %u\n",  			possible, setup_max_cpus);  		possible = i;  	} -	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", +	pr_info("Allowing %d CPUs, %d hotplug CPUs\n",  		possible, max_t(int, possible - num_processors, 0));  	for (i = 0; i < possible; i++) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05b31d92f69c..b481341c9369 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -9,6 +9,9 @@  /*   * Handle hardware traps and faults.   */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/interrupt.h>  #include <linux/kallsyms.h>  #include <linux/spinlock.h> @@ -143,12 +146,11 @@ trap_signal:  #ifdef CONFIG_X86_64  	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&  	    printk_ratelimit()) { -		printk(KERN_INFO -		       "%s[%d] trap %s ip:%lx sp:%lx error:%lx", -		       tsk->comm, tsk->pid, str, -		       regs->ip, regs->sp, error_code); +		pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", +			tsk->comm, tsk->pid, str, +			regs->ip, regs->sp, error_code);  		print_vma_addr(" in ", regs->ip); -		printk("\n"); +		pr_cont("\n");  	}  #endif @@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code)  	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&  			printk_ratelimit()) { -		printk(KERN_INFO -			"%s[%d] general protection ip:%lx sp:%lx error:%lx", +		pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",  			tsk->comm, task_pid_nr(tsk),  			regs->ip, regs->sp, error_code);  		print_vma_addr(" in ", regs->ip); -		printk("\n"); +		pr_cont("\n");  	}  	force_sig(SIGSEGV, tsk); @@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)  	conditional_sti(regs);  #if 0  	/* No need to warn about this any longer. 
*/ -	printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +	pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");  #endif  } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fc0a147e3727..cfa5d4f7ca56 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/kernel.h>  #include <linux/sched.h>  #include <linux/init.h> @@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);  #ifdef CONFIG_X86_TSC  int __init notsc_setup(char *str)  { -	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " -			"cannot disable TSC completely.\n"); +	pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");  	tsc_disabled = 1;  	return 1;  } @@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void)  			goto success;  		}  	} -	printk("Fast TSC calibration failed\n"); +	pr_err("Fast TSC calibration failed\n");  	return 0;  success: @@ -392,7 +393,7 @@ success:  	 */  	delta *= PIT_TICK_RATE;  	do_div(delta, i*256*1000); -	printk("Fast TSC calibration using PIT\n"); +	pr_info("Fast TSC calibration using PIT\n");  	return delta;  } @@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void)  		 * use the reference value, as it is more precise.  		 */  		if (delta >= 90 && delta <= 110) { -			printk(KERN_INFO -			       "TSC: PIT calibration matches %s. %d loops\n", -			       hpet ? "HPET" : "PMTIMER", i + 1); +			pr_info("PIT calibration matches %s. %d loops\n", +				hpet ? "HPET" : "PMTIMER", i + 1);  			return tsc_ref_min;  		} @@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void)  	 */  	if (tsc_pit_min == ULONG_MAX) {  		/* PIT gave no useful value */ -		printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); +		pr_warn("Unable to calibrate against PIT\n");  		/* We don't have an alternative source, disable TSC */  		if (!hpet && !ref1 && !ref2) { -			printk("TSC: No reference (HPET/PMTIMER) available\n"); +			pr_notice("No reference (HPET/PMTIMER) available\n");  			return 0;  		}  		/* The alternative source failed as well, disable TSC */  		if (tsc_ref_min == ULONG_MAX) { -			printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " -			       "failed.\n"); +			pr_warn("HPET/PMTIMER calibration failed\n");  			return 0;  		}  		/* Use the alternative source */ -		printk(KERN_INFO "TSC: using %s reference calibration\n", -		       hpet ? "HPET" : "PMTIMER"); +		pr_info("using %s reference calibration\n", +			hpet ? "HPET" : "PMTIMER");  		return tsc_ref_min;  	}  	/* We don't have an alternative source, use the PIT calibration value */  	if (!hpet && !ref1 && !ref2) { -		printk(KERN_INFO "TSC: Using PIT calibration value\n"); +		pr_info("Using PIT calibration value\n");  		return tsc_pit_min;  	}  	/* The alternative source failed, use the PIT calibration value */  	if (tsc_ref_min == ULONG_MAX) { -		printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " -		       "Using PIT calibration\n"); +		pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");  		return tsc_pit_min;  	} @@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void)  	 * the PIT value as we know that there are PMTIMERs around  	 * running at double speed. At least we let the user know:  	 */ -	printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", -	       hpet ? 
"HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); -	printk(KERN_INFO "TSC: Using PIT calibration value\n"); +	pr_warn("PIT calibration deviates from %s: %lu %lu\n", +		hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); +	pr_info("Using PIT calibration value\n");  	return tsc_pit_min;  } @@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason)  		tsc_unstable = 1;  		sched_clock_stable = 0;  		disable_sched_clock_irqtime(); -		printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); +		pr_info("Marking TSC unstable due to %s\n", reason);  		/* Change only the rating, when not registered */  		if (clocksource_tsc.mult)  			clocksource_mark_unstable(&clocksource_tsc); @@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)  		goto out;  	tsc_khz = freq; -	printk(KERN_INFO "Refined TSC clocksource calibration: " -		"%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, -					(unsigned long)tsc_khz % 1000); +	pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", +		(unsigned long)tsc_khz / 1000, +		(unsigned long)tsc_khz % 1000);  out:  	clocksource_register_khz(&clocksource_tsc, tsc_khz); @@ -970,9 +968,9 @@ void __init tsc_init(void)  		return;  	} -	printk("Detected %lu.%03lu MHz processor.\n", -			(unsigned long)cpu_khz / 1000, -			(unsigned long)cpu_khz % 1000); +	pr_info("Detected %lu.%03lu MHz processor\n", +		(unsigned long)cpu_khz / 1000, +		(unsigned long)cpu_khz % 1000);  	/*  	 * Secondary CPUs do not run through tsc_init(), so set up diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index dc4e910a7d96..36fd42091fa7 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,   * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.   * @mm: the probed address space.   * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint   * Return 0 on success or a -ve number on error.   
*/ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)  {  	int ret;  	struct insn insn; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 255f58ae71e8..54abcc0baf23 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -28,6 +28,8 @@   *   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/capability.h>  #include <linux/errno.h>  #include <linux/interrupt.h> @@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)  	local_irq_enable();  	if (!current->thread.vm86_info) { -		printk("no vm86_info: BAD\n"); +		pr_alert("no vm86_info: BAD\n");  		do_exit(SIGSEGV);  	}  	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);  	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);  	tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);  	if (tmp) { -		printk("vm86: could not access userspace vm86_info\n"); +		pr_alert("could not access userspace vm86_info\n");  		do_exit(SIGSEGV);  	} diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 8eeb55a551b4..992f890283e9 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -16,6 +16,7 @@  #include <linux/pci_ids.h>  #include <linux/pci_regs.h>  #include <linux/smp.h> +#include <linux/irq.h>  #include <asm/apic.h>  #include <asm/pci-direct.h> @@ -95,6 +96,18 @@ static void __init set_vsmp_pv_ops(void)  	ctl = readl(address + 4);  	printk(KERN_INFO "vSMP CTL: capabilities:0x%08x  control:0x%08x\n",  	       cap, ctl); + +	/* If possible, let the vSMP foundation route the interrupt optimally */ +#ifdef CONFIG_SMP +	if (cap & ctl & BIT(8)) { +		ctl &= ~BIT(8); +#ifdef CONFIG_PROC_FS +		/* Don't let users change irq affinity via procfs */ +		no_irq_affinity = 1; +#endif +	} +#endif +  	if (cap & ctl & (1 << 4)) {  		/* Setup irq ops and turn on vSMP  IRQ fastpath handling */  		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); @@ -102,12 +115,11 @@ static void __init set_vsmp_pv_ops(void)  		pv_irq_ops.save_fl  = PV_CALLEE_SAVE(vsmp_save_fl);  		pv_irq_ops.restore_fl  = PV_CALLEE_SAVE(vsmp_restore_fl);  		pv_init_ops.patch = vsmp_patch; -  		ctl &= ~(1 << 4); -		writel(ctl, address + 4); -		ctl = readl(address + 4); -		printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);  	} +	writel(ctl, address + 4); +	ctl = readl(address + 4); +	pr_info("vSMP CTL: control set to:0x%08x\n", ctl);  	early_iounmap(address, 8);  } @@ -187,12 +199,36 @@ static void __init vsmp_cap_cpus(void)  #endif  } +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ +	return hard_smp_processor_id() >> index_msb; +} + +/* + * In vSMP, all cpus should be capable of handling interrupts, regardless of + * the APIC used. 
+ */ +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, +					  const struct cpumask *mask) +{ +	cpumask_setall(retmask); +} + +static void vsmp_apic_post_init(void) +{ +	/* need to update phys_pkg_id */ +	apic->phys_pkg_id = apicid_phys_pkg_id; +	apic->vector_allocation_domain = fill_vector_allocation_domain; +} +  void __init vsmp_init(void)  {  	detect_vsmp_box();  	if (!is_vsmp_box())  		return; +	x86_platform.apic_post_init = vsmp_apic_post_init; +  	vsmp_cap_cpus();  	set_vsmp_pv_ops(); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 7515cf0e1805..8d141b309046 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,6 +18,8 @@   *  use the vDSO.   */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/time.h>  #include <linux/init.h>  #include <linux/kernel.h> @@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,  static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,  			      const char *message)  { -	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -	struct task_struct *tsk; - -	if (!show_unhandled_signals || !__ratelimit(&rs)) +	if (!show_unhandled_signals)  		return; -	tsk = current; - -	printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", -	       level, tsk->comm, task_pid_nr(tsk), -	       message, regs->ip, regs->cs, -	       regs->sp, regs->ax, regs->si, regs->di); +	pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", +			      level, current->comm, task_pid_nr(current), +			      message, regs->ip, regs->cs, +			      regs->sp, regs->ax, regs->si, regs->di);  }  static int addr_to_vsyscall_nr(unsigned long addr) @@ -139,6 +136,19 @@ static int addr_to_vsyscall_nr(unsigned long addr)  	return nr;  } +#ifdef CONFIG_SECCOMP +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) +{ +	if (!seccomp_mode(&tsk->seccomp)) +		return 0; +	task_pt_regs(tsk)->orig_ax = syscall_nr; +	task_pt_regs(tsk)->ax = syscall_nr; +	return __secure_computing(syscall_nr); +} +#else +#define vsyscall_seccomp(_tsk, _nr) 0 +#endif +  static bool write_ok_or_segv(unsigned long ptr, size_t size)  {  	/* @@ -174,6 +184,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	int vsyscall_nr;  	int prev_sig_on_uaccess_error;  	long ret; +	int skip;  	/*  	 * No point in checking CS -- the only way to get here is a user mode @@ -205,9 +216,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	}  	tsk = current; -	if (seccomp_mode(&tsk->seccomp)) -		do_exit(SIGKILL); -  	/*  	 * With a real vsyscall, page faults cause SIGSEGV.  We want to  	 * preserve that behavior to make writing exploits harder. @@ -222,8 +230,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	 * address 0".  	 
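Aside (not part of the patch): the vsyscall emulation above keys everything off a fixed three-slot layout. A sketch of the conventional address-to-number mapping; the base address and 1024-byte stride are assumptions of this sketch, not taken from the hunk:

/* Sketch of the classic x86-64 vsyscall layout: three entry points, 1024
 * bytes apart, each forwarded to one real syscall when seccomp allows it. */
#include <stdio.h>

#define VSYSCALL_START	0xffffffffff600000UL
#define VSYSCALL_SIZE	1024

static const char *vsyscall_name[] = { "gettimeofday", "time", "getcpu" };

static int addr_to_vsyscall_nr(unsigned long addr)
{
	int nr;

	if ((addr & ~0xC00UL) != VSYSCALL_START)	/* only offsets 0/1024/2048 */
		return -1;
	nr = (addr & 0xC00UL) >> 10;
	return nr < 3 ? nr : -1;
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long addr = VSYSCALL_START + i * VSYSCALL_SIZE;
		int nr = addr_to_vsyscall_nr(addr);

		printf("%#lx -> vsyscall %d (%s)\n", addr, nr,
		       nr >= 0 ? vsyscall_name[nr] : "invalid");
	}
	return 0;
}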
*/  	ret = -EFAULT; +	skip = 0;  	switch (vsyscall_nr) {  	case 0: +		skip = vsyscall_seccomp(tsk, __NR_gettimeofday); +		if (skip) +			break; +  		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||  		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))  			break; @@ -234,6 +247,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  		break;  	case 1: +		skip = vsyscall_seccomp(tsk, __NR_time); +		if (skip) +			break; +  		if (!write_ok_or_segv(regs->di, sizeof(time_t)))  			break; @@ -241,6 +258,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  		break;  	case 2: +		skip = vsyscall_seccomp(tsk, __NR_getcpu); +		if (skip) +			break; +  		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||  		    !write_ok_or_segv(regs->si, sizeof(unsigned)))  			break; @@ -253,6 +274,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; +	if (skip) { +		if ((long)regs->ax <= 0L) /* seccomp errno emulation */ +			goto do_ret; +		goto done; /* seccomp trace/trap */ +	} +  	if (ret == -EFAULT) {  		/* Bad news -- userspace fed a bad pointer to a vsyscall. */  		warn_bad_vsyscall(KERN_INFO, regs, @@ -271,10 +298,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	regs->ax = ret; +do_ret:  	/* Emulate a ret instruction. */  	regs->ip = caller;  	regs->sp += 8; - +done:  	return true;  sigsegv: diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 9796c2f3d074..6020f6f5927c 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8);  EXPORT_SYMBOL(copy_user_generic_string);  EXPORT_SYMBOL(copy_user_generic_unrolled); +EXPORT_SYMBOL(copy_user_enhanced_fast_string);  EXPORT_SYMBOL(__copy_user_nocache);  EXPORT_SYMBOL(_copy_from_user);  EXPORT_SYMBOL(_copy_to_user); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 35c5e543f550..9f3167e891ef 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -29,7 +29,6 @@ void __init x86_init_uint_noop(unsigned int unused) { }  void __init x86_init_pgd_noop(pgd_t *unused) { }  int __init iommu_init_noop(void) { return 0; }  void iommu_shutdown_noop(void) { } -void wallclock_init_noop(void) { }  /*   * The platform setup functions are preset with the default functions @@ -101,7 +100,6 @@ static int default_i8042_detect(void) { return 1; };  struct x86_platform_ops x86_platform = {  	.calibrate_tsc			= native_calibrate_tsc, -	.wallclock_init			= wallclock_init_noop,  	.get_wallclock			= mach_get_cmos_time,  	.set_wallclock			= mach_set_rtc_mmss,  	.iommu_shutdown			= iommu_shutdown_noop, diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index bd18149b2b0f..3d3e20709119 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -3,6 +3,9 @@   *   * Author: Suresh Siddha <suresh.b.siddha@intel.com>   */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/bootmem.h>  #include <linux/compat.h>  #include <asm/i387.h> @@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf)  	BUG_ON(sig_xstate_size < xstate_size);  	if ((unsigned long)buf % 64) -		printk("save_i387_xstate: bad fpstate %p\n", buf); +		pr_err("%s: bad fpstate %p\n", __func__, buf);  	if (!used_math())  		return 0; @@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void)  	pcntxt_mask = eax + ((u64)edx << 32);  	if ((pcntxt_mask & XSTATE_FPSSE) != 
XSTATE_FPSSE) { -		printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", +		pr_err("FP/SSE not shown under xsave features 0x%llx\n",  		       pcntxt_mask);  		BUG();  	} @@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void)  	setup_xstate_init(); -	printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " -	       "cntxt size 0x%x\n", -	       pcntxt_mask, xstate_size); +	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", +		pcntxt_mask, xstate_size);  }  /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7df1c6d839fb..0595f1397b7c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  	unsigned f_lm = 0;  #endif  	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; +	unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;  	/* cpuid 1.edx */  	const u32 kvm_supported_word0_x86_features = @@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  		0 /* DS-CPL, VMX, SMX, EST */ |  		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |  		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | -		0 /* Reserved, DCA */ | F(XMM4_1) | +		F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |  		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |  		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |  		F(F16C) | F(RDRAND); @@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  	/* cpuid 7.0.ebx */  	const u32 kvm_supported_word9_x86_features =  		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | -		F(BMI2) | F(ERMS) | F(RTM); +		F(BMI2) | F(ERMS) | f_invpcid | F(RTM);  	/* all calls to cpuid_count() should be made on the same cpu */  	get_cpu(); @@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  			     (1 << KVM_FEATURE_NOP_IO_DELAY) |  			     (1 << KVM_FEATURE_CLOCKSOURCE2) |  			     (1 << KVM_FEATURE_ASYNC_PF) | +			     (1 << KVM_FEATURE_PV_EOI) |  			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);  		if (sched_info_on()) @@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,  	return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);  } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)  { -	u32 function, index; +	u32 function = *eax, index = *ecx;  	struct kvm_cpuid_entry2 *best; -	function = kvm_register_read(vcpu, VCPU_REGS_RAX); -	index = kvm_register_read(vcpu, VCPU_REGS_RCX); -	kvm_register_write(vcpu, VCPU_REGS_RAX, 0); -	kvm_register_write(vcpu, VCPU_REGS_RBX, 0); -	kvm_register_write(vcpu, VCPU_REGS_RCX, 0); -	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);  	best = kvm_find_cpuid_entry(vcpu, function, index);  	if (!best)  		best = check_cpuid_limit(vcpu, function, index);  	if (best) { -		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); -		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); -		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); -		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); -	} +		*eax = best->eax; +		*ebx = best->ebx; +		*ecx = best->ecx; +		*edx = best->edx; +	} else +		*eax = *ebx = *ecx = *edx = 0; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ +	u32 function, eax, ebx, ecx, edx; + +	function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); +	ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); +	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); +	
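Aside (not part of the patch): the xsave check above is a plain mask-subset test. A standalone sketch with a made-up pcntxt_mask:

/* Sketch of the "FP/SSE not shown under xsave features" check: XSTATE_FPSSE
 * must be fully contained in the mask reported by CPUID.0xD. */
#include <stdio.h>
#include <stdint.h>

#define XSTATE_FP	0x1ull
#define XSTATE_SSE	0x2ull
#define XSTATE_FPSSE	(XSTATE_FP | XSTATE_SSE)

int main(void)
{
	uint64_t pcntxt_mask = 0x7;	/* pretend FP | SSE | YMM were reported */

	if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE)
		printf("FP/SSE not shown under xsave features 0x%llx\n",
		       (unsigned long long)pcntxt_mask);
	else
		printf("enabled xstate_bv 0x%llx\n",
		       (unsigned long long)pcntxt_mask);
	return 0;
}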
kvm_register_write(vcpu, VCPU_REGS_RAX, eax); +	kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); +	kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); +	kvm_register_write(vcpu, VCPU_REGS_RDX, edx);  	kvm_x86_ops->skip_emulated_instruction(vcpu); -	trace_kvm_cpuid(function, -			kvm_register_read(vcpu, VCPU_REGS_RAX), -			kvm_register_read(vcpu, VCPU_REGS_RBX), -			kvm_register_read(vcpu, VCPU_REGS_RCX), -			kvm_register_read(vcpu, VCPU_REGS_RDX)); +	trace_kvm_cpuid(function, eax, ebx, ecx, edx);  }  EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 26d1fb437eb5..a10e46016851 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,  int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,  			      struct kvm_cpuid2 *cpuid,  			      struct kvm_cpuid_entry2 __user *entries); +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);  static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) @@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)  	return best && (best->ecx & bit(X86_FEATURE_OSVW));  } +static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) +{ +	struct kvm_cpuid_entry2 *best; + +	best = kvm_find_cpuid_entry(vcpu, 1, 0); +	return best && (best->ecx & bit(X86_FEATURE_PCID)); +} +  #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f95d242ee9f7..97d9a9914ba8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,  	return ctxt->ops->intercept(ctxt, &info, stage);  } +static void assign_masked(ulong *dest, ulong src, ulong mask) +{ +	*dest = (*dest & ~mask) | (src & mask); +} +  static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)  {  	return (1UL << (ctxt->ad_bytes << 3)) - 1;  } +static ulong stack_mask(struct x86_emulate_ctxt *ctxt) +{ +	u16 sel; +	struct desc_struct ss; + +	if (ctxt->mode == X86EMUL_MODE_PROT64) +		return ~0UL; +	ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); +	return ~0U >> ((ss.d ^ 1) * 16);  /* d=0: 0xffff; d=1: 0xffffffff */ +} + +static int stack_size(struct x86_emulate_ctxt *ctxt) +{ +	return (__fls(stack_mask(ctxt)) + 1) >> 3; +} +  /* Access/update address held in a register, based on addressing mode. 
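Aside (not part of the patch): stack_mask() and assign_masked() exist so pushes and pops only touch the bits of RSP that the stack segment really uses. A userspace sketch with illustrative mode/SS.D inputs:

/* Userspace sketch of the emulator's new stack helpers: stack_mask() yields
 * 0xffff or 0xffffffff depending on SS.D (or ~0 in 64-bit mode), and
 * assign_masked() updates only the masked low bits of RSP. */
#include <stdio.h>

typedef unsigned long ulong;

static void assign_masked(ulong *dest, ulong src, ulong mask)
{
	*dest = (*dest & ~mask) | (src & mask);
}

static ulong stack_mask(int long_mode, int ss_d)
{
	if (long_mode)
		return ~0UL;
	return ~0U >> ((ss_d ^ 1) * 16);	/* d=0: 0xffff; d=1: 0xffffffff */
}

int main(void)
{
	ulong rsp = 0x0000000012345678UL;
	ulong mask = stack_mask(0, 0);		/* 16-bit stack segment */

	/* push of 2 bytes: only the low 16 bits of RSP are decremented */
	assign_masked(&rsp, rsp - 2, mask);
	printf("mask %#lx, rsp after push %#lx\n", mask, rsp);
	return 0;
}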
*/  static inline unsigned long  address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) @@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,  	op->orig_val = op->val;  } +static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) +{ +	if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) +		ctxt->modrm_seg = VCPU_SREG_SS; +} +  static int decode_modrm(struct x86_emulate_ctxt *ctxt,  			struct operand *op)  { @@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,  			if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)  				modrm_ea += insn_fetch(s32, ctxt); -			else +			else {  				modrm_ea += ctxt->regs[base_reg]; +				adjust_modrm_seg(ctxt, base_reg); +			}  			if (index_reg != 4)  				modrm_ea += ctxt->regs[index_reg] << scale;  		} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {  			if (ctxt->mode == X86EMUL_MODE_PROT64)  				ctxt->rip_relative = 1; -		} else -			modrm_ea += ctxt->regs[ctxt->modrm_rm]; +		} else { +			base_reg = ctxt->modrm_rm; +			modrm_ea += ctxt->regs[base_reg]; +			adjust_modrm_seg(ctxt, base_reg); +		}  		switch (ctxt->modrm_mod) {  		case 0:  			if (ctxt->modrm_rm == 5) @@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,  /* allowed just for 8 bytes segments */  static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, -				   u16 selector, struct desc_struct *desc) +				   u16 selector, struct desc_struct *desc, +				   ulong *desc_addr_p)  {  	struct desc_ptr dt;  	u16 index = selector >> 3; @@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,  	if (dt.size < index * 8 + 7)  		return emulate_gp(ctxt, selector & 0xfffc); -	addr = dt.address + index * 8; +	*desc_addr_p = addr = dt.address + index * 8;  	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,  				   &ctxt->exception);  } @@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,  static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,  				   u16 selector, int seg)  { -	struct desc_struct seg_desc; +	struct desc_struct seg_desc, old_desc;  	u8 dpl, rpl, cpl;  	unsigned err_vec = GP_VECTOR;  	u32 err_code = 0;  	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ +	ulong desc_addr;  	int ret;  	memset(&seg_desc, 0, sizeof seg_desc); @@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,  		goto load;  	} -	/* NULL selector is not valid for TR, CS and SS */ -	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) +	rpl = selector & 3; +	cpl = ctxt->ops->cpl(ctxt); + +	/* NULL selector is not valid for TR, CS and SS (except for long mode) */ +	if ((seg == VCPU_SREG_CS +	     || (seg == VCPU_SREG_SS +		 && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) +	     || seg == VCPU_SREG_TR)  	    && null_selector)  		goto exception; @@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,  	if (null_selector) /* for NULL selector skip all following checks */  		goto load; -	ret = read_segment_descriptor(ctxt, selector, &seg_desc); +	ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);  	if (ret != X86EMUL_CONTINUE)  		return ret; @@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,  		goto exception;  	} -	rpl = selector & 3;  	dpl = seg_desc.dpl; -	cpl = ctxt->ops->cpl(ctxt);  	switch (seg) {  	case VCPU_SREG_SS: @@ -1384,6 +1422,12 @@ 
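Aside (not part of the patch): read_segment_descriptor() now also returns the descriptor's address, which is plain selector arithmetic. A sketch with example selector and table base values (the selector layout is the standard index:13 | TI:1 | RPL:2 encoding):

/* Sketch of the selector arithmetic behind desc_addr: the descriptor lives
 * at dt.address + index * 8.  Selector and table base are example values. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t selector = 0x002b;		/* example: index 5, TI=0, RPL=3 */
	uint64_t dt_address = 0xfffff000;	/* pretend GDT base */
	unsigned index = selector >> 3;
	int ti = (selector >> 2) & 1;
	int rpl = selector & 3;
	uint64_t desc_addr = dt_address + index * 8;

	printf("selector %#x: index %u, %s, RPL %d, descriptor at %#llx\n",
	       selector, index, ti ? "LDT" : "GDT", rpl,
	       (unsigned long long)desc_addr);
	return 0;
}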
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,  	case VCPU_SREG_TR:  		if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))  			goto exception; +		old_desc = seg_desc; +		seg_desc.type |= 2; /* busy */ +		ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, +						  sizeof(seg_desc), &ctxt->exception); +		if (ret != X86EMUL_CONTINUE) +			return ret;  		break;  	case VCPU_SREG_LDTR:  		if (seg_desc.s || seg_desc.type != 2) @@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt)  	return X86EMUL_CONTINUE;  } -static int em_push(struct x86_emulate_ctxt *ctxt) +static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)  {  	struct segmented_address addr; -	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); +	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);  	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);  	addr.seg = VCPU_SREG_SS; +	return segmented_write(ctxt, addr, data, bytes); +} + +static int em_push(struct x86_emulate_ctxt *ctxt) +{  	/* Disable writeback. */  	ctxt->dst.type = OP_NONE; -	return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); +	return push(ctxt, &ctxt->src.val, ctxt->op_bytes);  }  static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)  	return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);  } +static int em_enter(struct x86_emulate_ctxt *ctxt) +{ +	int rc; +	unsigned frame_size = ctxt->src.val; +	unsigned nesting_level = ctxt->src2.val & 31; + +	if (nesting_level) +		return X86EMUL_UNHANDLEABLE; + +	rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); +	if (rc != X86EMUL_CONTINUE) +		return rc; +	assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], +		      stack_mask(ctxt)); +	assign_masked(&ctxt->regs[VCPU_REGS_RSP], +		      ctxt->regs[VCPU_REGS_RSP] - frame_size, +		      stack_mask(ctxt)); +	return X86EMUL_CONTINUE; +} + +static int em_leave(struct x86_emulate_ctxt *ctxt) +{ +	assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], +		      stack_mask(ctxt)); +	return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); +} +  static int em_push_sreg(struct x86_emulate_ctxt *ctxt)  {  	int seg = ctxt->src2.val; @@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)  	u32 eax, ebx, ecx, edx;  	eax = ecx = 0; -	return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) -		&& ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx +	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); +	return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx  		&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx  		&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;  } @@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)  	eax = 0x00000000;  	ecx = 0x00000000; -	if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { -		/* -		 * Intel ("GenuineIntel") -		 * remark: Intel CPUs only support "syscall" in 64bit -		 * longmode. Also an 64bit guest with a -		 * 32bit compat-app running will #UD !! While this -		 * behaviour can be fixed (by emulating) into AMD -		 * response - CPUs of AMD can't behave like Intel. 
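Aside (not part of the patch): em_enter()/em_leave() only handle nesting level 0, which reduces to the classic frame prologue/epilogue. A userspace model using a small array as the guest stack:

/* Model of what em_enter()/em_leave() emulate for nesting level 0: ENTER
 * pushes RBP, copies RSP into RBP and reserves frame_size bytes; LEAVE
 * restores RSP from RBP and pops RBP.  Sizes here are in array words. */
#include <stdio.h>
#include <stdint.h>

#define STACK_WORDS 16

static uint64_t stack[STACK_WORDS];

int main(void)
{
	uint64_t rsp = STACK_WORDS;	/* word index, grows downwards */
	uint64_t rbp = 0xdeadbeef;
	unsigned frame_size = 4;	/* the ENTER imm16 operand, in words here */

	/* ENTER frame_size, 0 */
	stack[--rsp] = rbp;		/* push old RBP */
	rbp = rsp;			/* new frame pointer */
	rsp -= frame_size;		/* reserve local space */
	printf("after ENTER: rsp=%llu rbp=%llu\n",
	       (unsigned long long)rsp, (unsigned long long)rbp);

	/* LEAVE */
	rsp = rbp;			/* drop the locals */
	rbp = stack[rsp++];		/* pop old RBP */
	printf("after LEAVE: rsp=%llu rbp=%#llx\n",
	       (unsigned long long)rsp, (unsigned long long)rbp);
	return 0;
}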
-		 */ -		if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && -		    ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && -		    edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) -			return false; +	ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); +	/* +	 * Intel ("GenuineIntel") +	 * remark: Intel CPUs only support "syscall" in 64bit +	 * longmode. Also an 64bit guest with a +	 * 32bit compat-app running will #UD !! While this +	 * behaviour can be fixed (by emulating) into AMD +	 * response - CPUs of AMD can't behave like Intel. +	 */ +	if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && +	    ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && +	    edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) +		return false; -		/* AMD ("AuthenticAMD") */ -		if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && -		    ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && -		    edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) -			return true; - -		/* AMD ("AMDisbetter!") */ -		if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && -		    ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && -		    edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) -			return true; -	} +	/* AMD ("AuthenticAMD") */ +	if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && +	    ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && +	    edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) +		return true; + +	/* AMD ("AMDisbetter!") */ +	if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && +	    ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && +	    edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) +		return true;  	/* default: (not Intel, not AMD), apply Intel's stricter rules... */  	return false; @@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,  	ulong old_tss_base =  		ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);  	u32 desc_limit; +	ulong desc_addr;  	/* FIXME: old_tss_base == ~0 ? */ -	ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); +	ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);  	if (ret != X86EMUL_CONTINUE)  		return ret; -	ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); +	ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);  	if (ret != X86EMUL_CONTINUE)  		return ret; @@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)  	return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);  } +static int em_lldt(struct x86_emulate_ctxt *ctxt) +{ +	u16 sel = ctxt->src.val; + +	/* Disable writeback. */ +	ctxt->dst.type = OP_NONE; +	return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); +} + +static int em_ltr(struct x86_emulate_ctxt *ctxt) +{ +	u16 sel = ctxt->src.val; + +	/* Disable writeback. */ +	ctxt->dst.type = OP_NONE; +	return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); +} +  static int em_invlpg(struct x86_emulate_ctxt *ctxt)  {  	int rc; @@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)  	return X86EMUL_CONTINUE;  } +static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, +				  void (*get)(struct x86_emulate_ctxt *ctxt, +					      struct desc_ptr *ptr)) +{ +	struct desc_ptr desc_ptr; + +	if (ctxt->mode == X86EMUL_MODE_PROT64) +		ctxt->op_bytes = 8; +	get(ctxt, &desc_ptr); +	if (ctxt->op_bytes == 2) { +		ctxt->op_bytes = 4; +		desc_ptr.address &= 0x00ffffff; +	} +	/* Disable writeback. 
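Aside (not part of the patch): the vendor tests compare EBX/EDX/ECX against fixed constants because CPUID leaf 0 returns the 12-byte vendor string split across those registers. A sketch using what I believe are the standard little-endian encodings of "GenuineIntel":

/* Sketch of why the vendor checks are three integer compares: CPUID.0
 * returns the vendor string in EBX, EDX, ECX, four bytes each. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	uint32_t ebx = 0x756e6547;	/* "Genu" */
	uint32_t edx = 0x49656e69;	/* "ineI" */
	uint32_t ecx = 0x6c65746e;	/* "ntel" */
	char vendor[13];

	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';
	printf("vendor string: %s\n", vendor);
	return 0;
}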
*/ +	ctxt->dst.type = OP_NONE; +	return segmented_write(ctxt, ctxt->dst.addr.mem, +			       &desc_ptr, 2 + ctxt->op_bytes); +} + +static int em_sgdt(struct x86_emulate_ctxt *ctxt) +{ +	return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); +} + +static int em_sidt(struct x86_emulate_ctxt *ctxt) +{ +	return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); +} +  static int em_lgdt(struct x86_emulate_ctxt *ctxt)  {  	struct desc_ptr desc_ptr;  	int rc; +	if (ctxt->mode == X86EMUL_MODE_PROT64) +		ctxt->op_bytes = 8;  	rc = read_descriptor(ctxt, ctxt->src.addr.mem,  			     &desc_ptr.size, &desc_ptr.address,  			     ctxt->op_bytes); @@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)  	struct desc_ptr desc_ptr;  	int rc; +	if (ctxt->mode == X86EMUL_MODE_PROT64) +		ctxt->op_bytes = 8;  	rc = read_descriptor(ctxt, ctxt->src.addr.mem,  			     &desc_ptr.size, &desc_ptr.address,  			     ctxt->op_bytes); @@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt)  	return X86EMUL_CONTINUE;  } +static int em_cpuid(struct x86_emulate_ctxt *ctxt) +{ +	u32 eax, ebx, ecx, edx; + +	eax = ctxt->regs[VCPU_REGS_RAX]; +	ecx = ctxt->regs[VCPU_REGS_RCX]; +	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); +	ctxt->regs[VCPU_REGS_RAX] = eax; +	ctxt->regs[VCPU_REGS_RBX] = ebx; +	ctxt->regs[VCPU_REGS_RCX] = ecx; +	ctxt->regs[VCPU_REGS_RDX] = edx; +	return X86EMUL_CONTINUE; +} + +static int em_lahf(struct x86_emulate_ctxt *ctxt) +{ +	ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; +	ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; +	return X86EMUL_CONTINUE; +} + +static int em_bswap(struct x86_emulate_ctxt *ctxt) +{ +	switch (ctxt->op_bytes) { +#ifdef CONFIG_X86_64 +	case 8: +		asm("bswap %0" : "+r"(ctxt->dst.val)); +		break; +#endif +	default: +		asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); +		break; +	} +	return X86EMUL_CONTINUE; +} +  static bool valid_cr(int nr)  {  	switch (nr) { @@ -3424,14 +3587,14 @@ static struct opcode group5[] = {  static struct opcode group6[] = {  	DI(Prot,	sldt),  	DI(Prot,	str), -	DI(Prot | Priv,	lldt), -	DI(Prot | Priv,	ltr), +	II(Prot | Priv | SrcMem16, em_lldt, lldt), +	II(Prot | Priv | SrcMem16, em_ltr, ltr),  	N, N, N, N,  };  static struct group_dual group7 = { { -	DI(Mov | DstMem | Priv,			sgdt), -	DI(Mov | DstMem | Priv,			sidt), +	II(Mov | DstMem | Priv,			em_sgdt, sgdt), +	II(Mov | DstMem | Priv,			em_sidt, sidt),  	II(SrcMem | Priv,			em_lgdt, lgdt),  	II(SrcMem | Priv,			em_lidt, lidt),  	II(SrcNone | DstMem | Mov,		em_smsw, smsw), N, @@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = {  	D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),  	I(SrcImmFAddr | No64, em_call_far), N,  	II(ImplicitOps | Stack, em_pushf, pushf), -	II(ImplicitOps | Stack, em_popf, popf), N, N, +	II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),  	/* 0xA0 - 0xA7 */  	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),  	I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), @@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = {  	I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),  	G(ByteOp, group11), G(0, group11),  	/* 0xC8 - 0xCF */ -	N, N, N, I(ImplicitOps | Stack, em_ret_far), +	I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), +	N, I(ImplicitOps | Stack, em_ret_far),  	D(ImplicitOps), DI(SrcImmByte, intn),  	D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),  	/* 0xD0 - 0xD7 */ @@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = {  	X16(D(ByteOp | DstMem | SrcNone | ModRM| 
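Aside (not part of the patch): em_lahf() and em_bswap() implement two one-liners of x86 semantics. A portable userspace sketch in which compiler builtins stand in for the inline asm:

/* LAHF copies the low flag byte into AH; BSWAP reverses byte order, with
 * the operand size deciding how much is swapped.  Values are examples. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rax = 0x1122334455667700ULL;
	uint64_t eflags = 0x246;		/* example flags: IF, ZF, PF */
	uint64_t val64 = 0x1122334455667788ULL;
	uint32_t val32 = 0x11223344U;

	/* LAHF: AH <- low 8 bits of EFLAGS, exactly as em_lahf() does */
	rax &= ~0xff00ULL;
	rax |= (eflags & 0xff) << 8;
	printf("after lahf: rax=%#llx\n", (unsigned long long)rax);

	/* BSWAP on a 64-bit and on a 32-bit operand */
	printf("bswap64 %#llx -> %#llx\n",
	       (unsigned long long)val64,
	       (unsigned long long)__builtin_bswap64(val64));
	printf("bswap32 %#x -> %#x\n", val32, __builtin_bswap32(val32));
	return 0;
}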
Mov)),  	/* 0xA0 - 0xA7 */  	I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), -	DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), +	II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),  	D(DstMem | SrcReg | Src2ImmByte | ModRM),  	D(DstMem | SrcReg | Src2CL | ModRM), N, N,  	/* 0xA8 - 0xAF */ @@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = {  	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),  	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),  	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), -	/* 0xC0 - 0xCF */ +	/* 0xC0 - 0xC7 */  	D2bv(DstMem | SrcReg | ModRM | Lock),  	N, D(DstMem | SrcReg | ModRM | Mov),  	N, N, N, GD(0, &group9), -	N, N, N, N, N, N, N, N, +	/* 0xC8 - 0xCF */ +	X8(I(DstReg, em_bswap)),  	/* 0xD0 - 0xDF */  	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,  	/* 0xE0 - 0xEF */ @@ -4426,12 +4591,12 @@ twobyte_insn:  		break;  	case 0xb6 ... 0xb7:	/* movzx */  		ctxt->dst.bytes = ctxt->op_bytes; -		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val +		ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val  						       : (u16) ctxt->src.val;  		break;  	case 0xbe ... 0xbf:	/* movsx */  		ctxt->dst.bytes = ctxt->op_bytes; -		ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : +		ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :  							(s16) ctxt->src.val;  		break;  	case 0xc0 ... 0xc1:	/* xadd */ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 81cf4fa4a2be..1df8fb9e1d5d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s)  	pic_unlock(s);  } -int kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)  { -	struct kvm_pic *s = opaque;  	int ret = -1;  	pic_lock(s);  	if (irq >= 0 && irq < PIC_NUM_PINS) { -		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); +		int irq_level = __kvm_irq_line_state(&s->irq_states[irq], +						     irq_source_id, level); +		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);  		pic_update_irq(s);  		trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,  				      s->pics[irq >> 3].imr, ret == 0); @@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)  	return ret;  } +void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) +{ +	int i; + +	pic_lock(s); +	for (i = 0; i < PIC_NUM_PINS; i++) +		__clear_bit(irq_source_id, &s->irq_states[i]); +	pic_unlock(s); +} +  /*   * acknowledge interrupt 'irq'   */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93c15743f1ee..ce878788a39f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap)  	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));  } +static inline int __apic_test_and_set_vector(int vec, void *bitmap) +{ +	return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int __apic_test_and_clear_vector(int vec, void *bitmap) +{ +	return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} +  static inline int apic_hw_enabled(struct kvm_lapic *apic)  {  	return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; @@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap)  		return fls(word[word_offset << 2]) - 1 + (word_offset << 5);  } +static u8 count_vectors(void *bitmap) +{ +	u32 *word = 
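Aside (not part of the patch): the movzx/movsx fix keys the extension width off the source operand size rather than the ByteOp decode flag. A sketch of the difference for a one-byte source of 0x80:

/* Zero- versus sign-extension chosen by source size, as in the fixed
 * 0xb6/0xb7 and 0xbe/0xbf cases.  Plain casts model the extension. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t src_val = 0x80;	/* one-byte source operand */
	int src_bytes = 1;
	uint64_t zx, sx;

	zx = (src_bytes == 1) ? (uint8_t)src_val : (uint16_t)src_val;
	sx = (src_bytes == 1) ? (uint64_t)(int8_t)src_val
			      : (uint64_t)(int16_t)src_val;
	printf("movzx -> %#llx, movsx -> %#llx\n",
	       (unsigned long long)zx, (unsigned long long)sx);
	return 0;
}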
bitmap; +	int word_offset; +	u8 count = 0; +	for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) +		count += hweight32(word[word_offset << 2]); +	return count; +} +  static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)  {  	apic->irr_pending = true; @@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)  		apic->irr_pending = true;  } +static inline void apic_set_isr(int vec, struct kvm_lapic *apic) +{ +	if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) +		++apic->isr_count; +	BUG_ON(apic->isr_count > MAX_APIC_VECTOR); +	/* +	 * ISR (in service register) bit is set when injecting an interrupt. +	 * The highest vector is injected. Thus the latest bit set matches +	 * the highest bit in ISR. +	 */ +	apic->highest_isr_cache = vec; +} + +static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) +{ +	if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) +		--apic->isr_count; +	BUG_ON(apic->isr_count < 0); +	apic->highest_isr_cache = -1; +} +  int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)  {  	struct kvm_lapic *apic = vcpu->arch.apic; @@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)  			irq->level, irq->trig_mode);  } +static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) +{ + +	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, +				      sizeof(val)); +} + +static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) +{ + +	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, +				      sizeof(*val)); +} + +static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) +{ +	return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} + +static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) +{ +	u8 val; +	if (pv_eoi_get_user(vcpu, &val) < 0) +		apic_debug("Can't read EOI MSR value: 0x%llx\n", +			   (unsigned long long)vcpi->arch.pv_eoi.msr_val); +	return val & 0x1; +} + +static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) +{ +	if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { +		apic_debug("Can't set EOI MSR value: 0x%llx\n", +			   (unsigned long long)vcpi->arch.pv_eoi.msr_val); +		return; +	} +	__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +{ +	if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { +		apic_debug("Can't clear EOI MSR value: 0x%llx\n", +			   (unsigned long long)vcpi->arch.pv_eoi.msr_val); +		return; +	} +	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} +  static inline int apic_find_highest_isr(struct kvm_lapic *apic)  {  	int result; +	if (!apic->isr_count) +		return -1; +	if (likely(apic->highest_isr_cache != -1)) +		return apic->highest_isr_cache;  	result = find_highest_vector(apic->regs + APIC_ISR);  	ASSERT(result == -1 || result >= 16); @@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)  	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;  } -static void apic_set_eoi(struct kvm_lapic *apic) +static int apic_set_eoi(struct kvm_lapic *apic)  {  	int vector = apic_find_highest_isr(apic); + +	trace_kvm_eoi(apic, vector); +  	/*  	 * Not every write EOI will has corresponding ISR,  	 * one example is when Kernel check timer on setup_IO_APIC  	 */  	if (vector == -1) -		return; +		return vector; -	apic_clear_vector(vector, apic->regs + APIC_ISR); +	apic_clear_isr(vector, apic);  	apic_update_ppr(apic);  	if (!(apic_get_reg(apic, APIC_SPIV) & 
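Aside (not part of the patch): the new isr_count bookkeeping is a population count over the eight ISR registers, which sit 16 bytes apart (hence the "<< 2" stride over 32-bit words). A userspace model with a plain array standing in for apic->regs:

/* Model of count_vectors(): 256 vectors spread over eight 32-bit registers
 * with a 16-byte stride; isr_count is the total number of set bits. */
#include <stdio.h>
#include <stdint.h>

#define MAX_APIC_VECTOR 256

static uint32_t isr_page[8 * 4];	/* 8 registers, 16-byte stride */

static uint8_t count_vectors(const uint32_t *word)
{
	int word_offset;
	uint8_t count = 0;

	for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
		count += __builtin_popcount(word[word_offset << 2]);
	return count;
}

int main(void)
{
	int vec = 0xec;			/* pretend this vector is in service */

	isr_page[(vec >> 5) << 2] |= 1u << (vec & 31);
	printf("isr_count = %u\n", count_vectors(isr_page));
	return 0;
}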
APIC_SPIV_DIRECTED_EOI) && @@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)  		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);  	}  	kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +	return vector;  }  static void apic_send_ipi(struct kvm_lapic *apic) @@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)  		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);  	}  	apic->irr_pending = false; +	apic->isr_count = 0; +	apic->highest_isr_cache = -1;  	update_divide_count(apic);  	atomic_set(&apic->lapic_timer.pending, 0);  	if (kvm_vcpu_is_bsp(vcpu))  		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; +	vcpu->arch.pv_eoi.msr_val = 0;  	apic_update_ppr(apic);  	vcpu->arch.apic_arb_prio = 0; @@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)  	if (vector == -1)  		return -1; -	apic_set_vector(vector, apic->regs + APIC_ISR); +	apic_set_isr(vector, apic);  	apic_update_ppr(apic);  	apic_clear_irr(vector, apic);  	return vector; @@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)  	update_divide_count(apic);  	start_apic_timer(apic);  	apic->irr_pending = true; +	apic->isr_count = count_vectors(apic->regs + APIC_ISR); +	apic->highest_isr_cache = -1;  	kvm_make_request(KVM_REQ_EVENT, vcpu);  } @@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)  		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);  } +/* + * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt + * + * Detect whether guest triggered PV EOI since the + * last entry. If yes, set EOI on guests's behalf. + * Clear PV EOI in guest memory in any case. + */ +static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, +					struct kvm_lapic *apic) +{ +	bool pending; +	int vector; +	/* +	 * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host +	 * and KVM_PV_EOI_ENABLED in guest memory as follows: +	 * +	 * KVM_APIC_PV_EOI_PENDING is unset: +	 * 	-> host disabled PV EOI. +	 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: +	 * 	-> host enabled PV EOI, guest did not execute EOI yet. +	 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: +	 * 	-> host enabled PV EOI, guest executed EOI. +	 */ +	BUG_ON(!pv_eoi_enabled(vcpu)); +	pending = pv_eoi_get_pending(vcpu); +	/* +	 * Clear pending bit in any case: it will be set again on vmentry. +	 * While this might not be ideal from performance point of view, +	 * this makes sure pv eoi is only enabled when we know it's safe. +	 */ +	pv_eoi_clr_pending(vcpu); +	if (pending) +		return; +	vector = apic_set_eoi(apic); +	trace_kvm_pv_eoi(apic, vector); +} +  void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)  {  	u32 data;  	void *vapic; +	if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) +		apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); +  	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))  		return; @@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)  	apic_set_tpr(vcpu->arch.apic, data & 0xff);  } +/* + * apic_sync_pv_eoi_to_guest - called before vmentry + * + * Detect whether it's safe to enable PV EOI and + * if yes do so. + */ +static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, +					struct kvm_lapic *apic) +{ +	if (!pv_eoi_enabled(vcpu) || +	    /* IRR set or many bits in ISR: could be nested. */ +	    apic->irr_pending || +	    /* Cache not set: could be safe but we don't bother. */ +	    apic->highest_isr_cache == -1 || +	    /* Need EOI to update ioapic. 
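Aside (not part of the patch): the comment above defines a small state machine between the host-side pending bit and the guest-memory enable byte. A decision-table sketch of those states (a model only, not the real accessors):

/* Truth table for the PV EOI handshake described in
 * apic_sync_pv_eoi_from_guest(). */
#include <stdio.h>
#include <stdbool.h>

static const char *pv_eoi_state(bool host_pending, bool guest_enabled)
{
	if (!host_pending)
		return "host disabled PV EOI for this interrupt";
	if (guest_enabled)
		return "host armed PV EOI, guest has not executed EOI yet";
	return "host armed PV EOI and the guest already consumed it";
}

int main(void)
{
	int p, g;

	for (p = 0; p <= 1; p++)
		for (g = 0; g <= 1; g++)
			printf("pending=%d enabled=%d: %s\n", p, g,
			       pv_eoi_state(p, g));
	return 0;
}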
*/ +	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { +		/* +		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest +		 * so we need not do anything here. +		 */ +		return; +	} + +	pv_eoi_set_pending(apic->vcpu); +} +  void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)  {  	u32 data, tpr;  	int max_irr, max_isr; -	struct kvm_lapic *apic; +	struct kvm_lapic *apic = vcpu->arch.apic;  	void *vapic; +	apic_sync_pv_eoi_to_guest(vcpu, apic); +  	if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))  		return; -	apic = vcpu->arch.apic;  	tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;  	max_irr = apic_find_highest_irr(apic);  	if (max_irr < 0) @@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)  	return 0;  } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +{ +	u64 addr = data & ~KVM_MSR_ENABLED; +	if (!IS_ALIGNED(addr, 4)) +		return 1; + +	vcpu->arch.pv_eoi.msr_val = data; +	if (!pv_eoi_enabled(vcpu)) +		return 0; +	return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, +					 addr); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce2575d09..4af5405ae1e2 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,6 +13,15 @@ struct kvm_lapic {  	u32 divide_count;  	struct kvm_vcpu *vcpu;  	bool irr_pending; +	/* Number of bits set in ISR. */ +	s16 isr_count; +	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ +	int highest_isr_cache; +	/** +	 * APIC register page.  The layout matches the register layout seen by +	 * the guest 1:1, because it is accessed by the vmx microcode. +	 * Note: Only one register, the TPR, is used by the microcode. +	 */  	void *regs;  	gpa_t vapic_addr;  	struct page *vapic_page; @@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)  {  	return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;  } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);  #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be3cea4407ff..01ca00423938 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);  #define PTE_PREFETCH_NUM		8 -#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT_FIRST_AVAIL_BITS_SHIFT 10  #define PT64_SECOND_AVAIL_BITS_SHIFT 52  #define PT64_LEVEL_BITS 9 @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);  #define CREATE_TRACE_POINTS  #include "mmutrace.h" -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;  static u64 __read_mostly shadow_mmio_mask;  static void mmu_spte_set(u64 *sptep, u64 spte); +static void mmu_free_roots(struct kvm_vcpu *vcpu);  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)  { @@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)  }  #endif +static bool spte_is_locklessly_modifiable(u64 spte) +{ +	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); +} +  static bool spte_has_volatile_bits(u64 spte)  { +	/* +	 * Always atomicly update spte if it can be updated +	 * out of mmu-lock, it can ensure dirty bit is not lost, +	 * also, it can help us to get a stable is_writable_pte() +	 * to ensure tlb flush is not missed. 
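Aside (not part of the patch): kvm_lapic_enable_pv_eoi() treats the low bit of the MSR payload as the enable flag and the rest as a guest address that must be 4-byte aligned. A sketch with invented MSR values, assuming KVM_MSR_ENABLED is bit 0:

/* Parsing sketch for the PV EOI MSR payload: strip the enable bit, reject
 * misaligned addresses, otherwise remember where the flag byte lives. */
#include <stdio.h>
#include <stdint.h>

#define KVM_MSR_ENABLED 1ULL

static int parse_pv_eoi_msr(uint64_t data)
{
	uint64_t addr = data & ~KVM_MSR_ENABLED;

	if (addr & 0x3) {			/* !IS_ALIGNED(addr, 4) */
		printf("rejected: %#llx is not 4-byte aligned\n",
		       (unsigned long long)addr);
		return 1;			/* the real code makes the WRMSR fail */
	}
	if (!(data & KVM_MSR_ENABLED)) {
		printf("PV EOI disabled\n");
		return 0;
	}
	printf("PV EOI enabled, flag byte at guest address %#llx\n",
	       (unsigned long long)addr);
	return 0;
}

int main(void)
{
	parse_pv_eoi_msr(0x12340 | KVM_MSR_ENABLED);	/* aligned, enabled */
	parse_pv_eoi_msr(0x12342 | KVM_MSR_ENABLED);	/* misaligned: rejected */
	return 0;
}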
+	 */ +	if (spte_is_locklessly_modifiable(spte)) +		return true; +  	if (!shadow_accessed_mask)  		return false; @@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)  /* Rules for using mmu_spte_update:   * Update the state bits, it means the mapped pfn is not changged. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case.   */ -static void mmu_spte_update(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte)  { -	u64 mask, old_spte = *sptep; +	u64 old_spte = *sptep; +	bool ret = false;  	WARN_ON(!is_rmap_spte(new_spte)); -	if (!is_shadow_present_pte(old_spte)) -		return mmu_spte_set(sptep, new_spte); - -	new_spte |= old_spte & shadow_dirty_mask; - -	mask = shadow_accessed_mask; -	if (is_writable_pte(old_spte)) -		mask |= shadow_dirty_mask; +	if (!is_shadow_present_pte(old_spte)) { +		mmu_spte_set(sptep, new_spte); +		return ret; +	} -	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) +	if (!spte_has_volatile_bits(old_spte))  		__update_clear_spte_fast(sptep, new_spte);  	else  		old_spte = __update_clear_spte_slow(sptep, new_spte); +	/* +	 * For the spte updated out of mmu-lock is safe, since +	 * we always atomicly update it, see the comments in +	 * spte_has_volatile_bits(). +	 */ +	if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) +		ret = true; +  	if (!shadow_accessed_mask) -		return; +		return ret;  	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))  		kvm_set_pfn_accessed(spte_to_pfn(old_spte));  	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))  		kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + +	return ret;  }  /* @@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)  				mmu_page_header_cache);  } -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, -				    size_t size) +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)  {  	void *p; @@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,  static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)  { -	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, -				      sizeof(struct pte_list_desc)); +	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);  }  static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)  		rmap_remove(kvm, sptep);  } -static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) + +static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +{ +	if (is_large_pte(*sptep)) { +		WARN_ON(page_header(__pa(sptep))->role.level == +			PT_PAGE_TABLE_LEVEL); +		drop_spte(kvm, sptep); +		--kvm->stat.lpages; +		return true; +	} + +	return false; +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ +	if (__drop_large_spte(vcpu->kvm, sptep)) +		kvm_flush_remote_tlbs(vcpu->kvm); +} + +/* + * Write-protect on the specified @sptep, @pt_protect indicates whether + * spte writ-protection is caused by protecting shadow page table. + * @flush indicates whether tlb need be flushed. 
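Aside (not part of the patch): spte_is_locklessly_modifiable() is a both-bits-set test written as a complement trick. A sketch using the shift values from this hunk:

/* An spte may have its W bit set outside mmu_lock only when both
 * SPTE_HOST_WRITEABLE and SPTE_MMU_WRITEABLE are present, i.e. ~spte has
 * neither bit. */
#include <stdio.h>
#include <stdint.h>

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

static int spte_is_locklessly_modifiable(uint64_t spte)
{
	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

int main(void)
{
	uint64_t both = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
	uint64_t host_only = SPTE_HOST_WRITEABLE;

	printf("both bits set -> %d\n", spte_is_locklessly_modifiable(both));
	printf("host bit only -> %d\n", spte_is_locklessly_modifiable(host_only));
	return 0;
}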
+ * + * Note: write protection is difference between drity logging and spte + * protection: + * - for dirty logging, the spte can be set to writable at anytime if + *   its dirty bitmap is properly set. + * - for spte protection, the spte can be writable only after unsync-ing + *   shadow page. + * + * Return true if the spte is dropped. + */ +static bool +spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) +{ +	u64 spte = *sptep; + +	if (!is_writable_pte(spte) && +	      !(pt_protect && spte_is_locklessly_modifiable(spte))) +		return false; + +	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + +	if (__drop_large_spte(kvm, sptep)) { +		*flush |= true; +		return true; +	} + +	if (pt_protect) +		spte &= ~SPTE_MMU_WRITEABLE; +	spte = spte & ~PT_WRITABLE_MASK; + +	*flush |= mmu_spte_update(sptep, spte); +	return false; +} + +static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, +				 int level, bool pt_protect)  {  	u64 *sptep;  	struct rmap_iterator iter; -	int write_protected = 0; +	bool flush = false;  	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {  		BUG_ON(!(*sptep & PT_PRESENT_MASK)); -		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - -		if (!is_writable_pte(*sptep)) { -			sptep = rmap_get_next(&iter); -			continue; -		} - -		if (level == PT_PAGE_TABLE_LEVEL) { -			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); -			sptep = rmap_get_next(&iter); -		} else { -			BUG_ON(!is_large_pte(*sptep)); -			drop_spte(kvm, sptep); -			--kvm->stat.lpages; +		if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {  			sptep = rmap_get_first(*rmapp, &iter); +			continue;  		} -		write_protected = 1; +		sptep = rmap_get_next(&iter);  	} -	return write_protected; +	return flush;  }  /** @@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,  	while (mask) {  		rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; -		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); +		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);  		/* clear the first set bit */  		mask &= mask - 1;  	}  } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm *kvm, u64 gfn)  {  	struct kvm_memory_slot *slot;  	unsigned long *rmapp;  	int i; -	int write_protected = 0; +	bool write_protected = false;  	slot = gfn_to_memslot(kvm, gfn);  	for (i = PT_PAGE_TABLE_LEVEL;  	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {  		rmapp = __gfn_to_rmap(gfn, i, slot); -		write_protected |= __rmap_write_protect(kvm, rmapp, i); +		write_protected |= __rmap_write_protect(kvm, rmapp, i, true);  	}  	return write_protected; @@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,  			 unsigned long data)  {  	u64 *sptep; -	struct rmap_iterator iter; +	struct rmap_iterator uninitialized_var(iter);  	int young = 0;  	/* -	 * Emulate the accessed bit for EPT, by checking if this page has +	 * In case of absence of EPT Access and Dirty Bits supports, +	 * emulate the accessed bit for EPT, by checking if this page has  	 * an EPT mapping, and clearing it if it does. On the next access,  	 * a new EPT mapping will be established.  	 
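Aside (not part of the patch): kvm_mmu_write_protect_pt_masked() walks the dirty mask one set bit at a time with the mask &= mask - 1 idiom. A userspace sketch with an example mask:

/* Each set bit in "mask" selects one gfn relative to gfn_offset; clearing
 * the lowest set bit after each step visits every dirty page exactly once. */
#include <stdio.h>

int main(void)
{
	unsigned long gfn_offset = 0x1000;
	unsigned long mask = 0x29;	/* bits 0, 3, 5 -> three pages to protect */

	while (mask) {
		unsigned long gfn = gfn_offset + __builtin_ctzl(mask); /* __ffs() */

		printf("write-protect rmap of gfn %#lx\n", gfn);
		mask &= mask - 1;	/* clear the lowest set bit */
	}
	return 0;
}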
* This has some overhead, but not as much as the cost of swapping @@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,  	for (sptep = rmap_get_first(*rmapp, &iter); sptep;  	     sptep = rmap_get_next(&iter)) { -		BUG_ON(!(*sptep & PT_PRESENT_MASK)); +		BUG_ON(!is_shadow_present_pte(*sptep)); -		if (*sptep & PT_ACCESSED_MASK) { +		if (*sptep & shadow_accessed_mask) {  			young = 1; -			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); +			clear_bit((ffs(shadow_accessed_mask) - 1), +				 (unsigned long *)sptep);  		}  	} @@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,  	for (sptep = rmap_get_first(*rmapp, &iter); sptep;  	     sptep = rmap_get_next(&iter)) { -		BUG_ON(!(*sptep & PT_PRESENT_MASK)); +		BUG_ON(!is_shadow_present_pte(*sptep)); -		if (*sptep & PT_ACCESSED_MASK) { +		if (*sptep & shadow_accessed_mask) {  			young = 1;  			break;  		} @@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,  					       u64 *parent_pte, int direct)  {  	struct kvm_mmu_page *sp; -	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, -					sizeof *sp); -	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); +	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); +	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);  	if (!direct) -		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, -						  PAGE_SIZE); +		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);  	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);  	bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); @@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,  	kvm_mmu_pages_init(parent, &parents, &pages);  	while (mmu_unsync_walk(parent, &pages)) { -		int protected = 0; +		bool protected = false;  		for_each_sp(pages, sp, parents, i)  			protected |= rmap_write_protect(vcpu->kvm, sp->gfn); @@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)  	mmu_spte_set(sptep, spte);  } -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ -	if (is_large_pte(*sptep)) { -		drop_spte(vcpu->kvm, sptep); -		--vcpu->kvm->stat.lpages; -		kvm_flush_remote_tlbs(vcpu->kvm); -	} -} -  static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,  				   unsigned direct_access)  { @@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		    gfn_t gfn, pfn_t pfn, bool speculative,  		    bool can_unsync, bool host_writable)  { -	u64 spte, entry = *sptep; +	u64 spte;  	int ret = 0;  	if (set_mmio_spte(sptep, gfn, pfn, pte_access)) @@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		spte |= shadow_x_mask;  	else  		spte |= shadow_nx_mask; +  	if (pte_access & ACC_USER_MASK)  		spte |= shadow_user_mask; +  	if (level > PT_PAGE_TABLE_LEVEL)  		spte |= PT_PAGE_SIZE_MASK;  	if (tdp_enabled) @@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  			goto done;  		} -		spte |= PT_WRITABLE_MASK; +		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;  		if (!vcpu->arch.mmu.direct_map  		    && !(pte_access & ACC_WRITE_MASK)) { @@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  				 __func__, gfn);  			ret = 1;  			pte_access &= ~ACC_WRITE_MASK; -			if (is_writable_pte(spte)) -				spte &= ~PT_WRITABLE_MASK; +			spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);  
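Aside (not part of the patch): kvm_age_rmapp() now derives the accessed-bit index from shadow_accessed_mask with ffs() instead of hard-coding PT_ACCESSED_SHIFT, since EPT and legacy paging place the bit differently. A sketch using the legacy-paging position as an example value:

/* Clearing the accessed bit by index derived from the mask in use. */
#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	uint64_t shadow_accessed_mask = 1ULL << 5;	/* legacy PTE Accessed bit */
	uint64_t spte = (1ULL << 0) | shadow_accessed_mask;

	if (spte & shadow_accessed_mask) {
		int bit = ffs((int)shadow_accessed_mask) - 1;

		spte &= ~(1ULL << bit);		/* clear_bit() in the kernel */
		printf("cleared accessed bit %d, spte now %#llx\n",
		       bit, (unsigned long long)spte);
	}
	return 0;
}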
		}  	} @@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		mark_page_dirty(vcpu->kvm, gfn);  set_pte: -	mmu_spte_update(sptep, spte); -	/* -	 * If we overwrite a writable spte with a read-only one we -	 * should flush remote TLBs. Otherwise rmap_write_protect -	 * will find a read-only spte, even though the writable spte -	 * might be cached on a CPU's TLB. -	 */ -	if (is_writable_pte(entry) && !is_writable_pte(*sptep)) +	if (mmu_spte_update(sptep, spte))  		kvm_flush_remote_tlbs(vcpu->kvm);  done:  	return ret; @@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)  { +	mmu_free_roots(vcpu);  }  static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2625,18 +2685,116 @@ exit:  	return ret;  } +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +{ +	/* +	 * #PF can be fast only if the shadow page table is present and it +	 * is caused by write-protect, that means we just need change the +	 * W bit of the spte which can be done out of mmu-lock. +	 */ +	if (!(error_code & PFERR_PRESENT_MASK) || +	      !(error_code & PFERR_WRITE_MASK)) +		return false; + +	return true; +} + +static bool +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +{ +	struct kvm_mmu_page *sp = page_header(__pa(sptep)); +	gfn_t gfn; + +	WARN_ON(!sp->role.direct); + +	/* +	 * The gfn of direct spte is stable since it is calculated +	 * by sp->gfn. +	 */ +	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + +	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) +		mark_page_dirty(vcpu->kvm, gfn); + +	return true; +} + +/* + * Return value: + * - true: let the vcpu to access on the same address again. + * - false: let the real page fault path to fix it. + */ +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, +			    u32 error_code) +{ +	struct kvm_shadow_walk_iterator iterator; +	bool ret = false; +	u64 spte = 0ull; + +	if (!page_fault_can_be_fast(vcpu, error_code)) +		return false; + +	walk_shadow_page_lockless_begin(vcpu); +	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) +		if (!is_shadow_present_pte(spte) || iterator.level < level) +			break; + +	/* +	 * If the mapping has been changed, let the vcpu fault on the +	 * same address again. +	 */ +	if (!is_rmap_spte(spte)) { +		ret = true; +		goto exit; +	} + +	if (!is_last_spte(spte, level)) +		goto exit; + +	/* +	 * Check if it is a spurious fault caused by TLB lazily flushed. +	 * +	 * Need not check the access of upper level table entries since +	 * they are always ACC_ALL. +	 */ +	 if (is_writable_pte(spte)) { +		ret = true; +		goto exit; +	} + +	/* +	 * Currently, to simplify the code, only the spte write-protected +	 * by dirty-log can be fast fixed. +	 */ +	if (!spte_is_locklessly_modifiable(spte)) +		goto exit; + +	/* +	 * Currently, fast page fault only works for direct mapping since +	 * the gfn is not stable for indirect shadow page. +	 * See Documentation/virtual/kvm/locking.txt to get more detail. 
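Aside (not part of the patch): the heart of the fast page fault path is a compare-and-swap that sets the W bit only if the spte has not changed since it was read locklessly. A userspace model where a GCC builtin stands in for cmpxchg64 and the spte value is invented:

/* If a concurrent update changed the spte, the cmpxchg simply fails and the
 * fault takes the slow path on retry; no lock is needed. */
#include <stdio.h>
#include <stdint.h>

#define PT_WRITABLE_MASK (1ULL << 1)	/* x86 PTE writable bit */

static uint64_t spte = 0x8000000012345005ULL;	/* present, read-only (example) */

static int fast_fix(uint64_t observed)
{
	uint64_t old = __sync_val_compare_and_swap(&spte, observed,
						   observed | PT_WRITABLE_MASK);
	return old == observed;		/* true: fixed; false: raced, retry */
}

int main(void)
{
	uint64_t observed = spte;

	printf("fixed: %d, spte now %#llx\n", fast_fix(observed),
	       (unsigned long long)spte);
	/* a second attempt with a stale value just fails gracefully */
	printf("stale retry fixed: %d\n", fast_fix(observed));
	return 0;
}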
+	 */ +	ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); +exit: +	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, +			      spte, ret); +	walk_shadow_page_lockless_end(vcpu); + +	return ret; +} +  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,  			 gva_t gva, pfn_t *pfn, bool write, bool *writable); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, -			 bool prefault) +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, +			 gfn_t gfn, bool prefault)  {  	int r;  	int level;  	int force_pt_level;  	pfn_t pfn;  	unsigned long mmu_seq; -	bool map_writable; +	bool map_writable, write = error_code & PFERR_WRITE_MASK;  	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);  	if (likely(!force_pt_level)) { @@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,  	} else  		level = PT_PAGE_TABLE_LEVEL; +	if (fast_page_fault(vcpu, v, level, error_code)) +		return 0; +  	mmu_seq = vcpu->kvm->mmu_notifier_seq;  	smp_rmb(); @@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,  	gfn = gva >> PAGE_SHIFT;  	return nonpaging_map(vcpu, gva & PAGE_MASK, -			     error_code & PFERR_WRITE_MASK, gfn, prefault); +			     error_code, gfn, prefault);  }  static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) @@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,  	} else  		level = PT_PAGE_TABLE_LEVEL; +	if (fast_page_fault(vcpu, gpa, level, error_code)) +		return 0; +  	mmu_seq = vcpu->kvm->mmu_notifier_seq;  	smp_rmb(); @@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)  void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)  {  	struct kvm_mmu_page *sp; +	bool flush = false;  	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {  		int i; @@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)  			      !is_last_spte(pt[i], sp->role.level))  				continue; -			if (is_large_pte(pt[i])) { -				drop_spte(kvm, &pt[i]); -				--kvm->stat.lpages; -				continue; -			} - -			/* avoid RMW */ -			if (is_writable_pte(pt[i])) -				mmu_spte_update(&pt[i], -						pt[i] & ~PT_WRITABLE_MASK); +			spte_write_protect(kvm, &pt[i], &flush, false);  		}  	}  	kvm_flush_remote_tlbs(kvm); @@ -3934,6 +4090,9 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,  {  	struct kvm_mmu_page *page; +	if (list_empty(&kvm->arch.active_mmu_pages)) +		return; +  	page = container_of(kvm->arch.active_mmu_pages.prev,  			    struct kvm_mmu_page, link);  	kvm_mmu_prepare_zap_page(kvm, page, invalid_list); @@ -3942,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,  static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)  {  	struct kvm *kvm; -	struct kvm *kvm_freed = NULL;  	int nr_to_scan = sc->nr_to_scan;  	if (nr_to_scan == 0) @@ -3954,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)  		int idx;  		LIST_HEAD(invalid_list); +		/* +		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock +		 * here. We may skip a VM instance errorneosly, but we do not +		 * want to shrink a VM that only started to populate its MMU +		 * anyway. 
+		 */ +		if (kvm->arch.n_used_mmu_pages > 0) { +			if (!nr_to_scan--) +				break; +			continue; +		} +  		idx = srcu_read_lock(&kvm->srcu);  		spin_lock(&kvm->mmu_lock); -		if (!kvm_freed && nr_to_scan > 0 && -		    kvm->arch.n_used_mmu_pages > 0) { -			kvm_mmu_remove_some_alloc_mmu_pages(kvm, -							    &invalid_list); -			kvm_freed = kvm; -		} -		nr_to_scan--; +		kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);  		kvm_mmu_commit_zap_page(kvm, &invalid_list); +  		spin_unlock(&kvm->mmu_lock);  		srcu_read_unlock(&kvm->srcu, idx); + +		list_move_tail(&kvm->vm_list, &vm_list); +		break;  	} -	if (kvm_freed) -		list_move_tail(&kvm_freed->vm_list, &vm_list);  	raw_spin_unlock(&kvm_lock); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e81322a..cd6e98333ba3 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -54,8 +54,8 @@   */  TRACE_EVENT(  	kvm_mmu_pagetable_walk, -	TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), -	TP_ARGS(addr, write_fault, user_fault, fetch_fault), +	TP_PROTO(u64 addr, u32 pferr), +	TP_ARGS(addr, pferr),  	TP_STRUCT__entry(  		__field(__u64, addr) @@ -64,8 +64,7 @@ TRACE_EVENT(  	TP_fast_assign(  		__entry->addr = addr; -		__entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) -		                 | (!!fetch_fault << 4); +		__entry->pferr = pferr;  	),  	TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, @@ -243,6 +242,44 @@ TRACE_EVENT(  	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,  		  __entry->access)  ); + +#define __spte_satisfied(__spte)				\ +	(__entry->retry && is_writable_pte(__entry->__spte)) + +TRACE_EVENT( +	fast_page_fault, +	TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, +		 u64 *sptep, u64 old_spte, bool retry), +	TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + +	TP_STRUCT__entry( +		__field(int, vcpu_id) +		__field(gva_t, gva) +		__field(u32, error_code) +		__field(u64 *, sptep) +		__field(u64, old_spte) +		__field(u64, new_spte) +		__field(bool, retry) +	), + +	TP_fast_assign( +		__entry->vcpu_id = vcpu->vcpu_id; +		__entry->gva = gva; +		__entry->error_code = error_code; +		__entry->sptep = sptep; +		__entry->old_spte = old_spte; +		__entry->new_spte = *sptep; +		__entry->retry = retry; +	), + +	TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" +		  " new %llx spurious %d fixed %d", __entry->vcpu_id, +		  __entry->gva, __print_flags(__entry->error_code, "|", +		  kvm_mmu_trace_pferr_flags), __entry->sptep, +		  __entry->old_spte, __entry->new_spte, +		  __spte_satisfied(old_spte), __spte_satisfied(new_spte) +	) +);  #endif /* _TRACE_KVMMMU_H */  #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 34f970937ef1..bb7cf01cae76 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,  	const int fetch_fault = access & PFERR_FETCH_MASK;  	u16 errcode = 0; -	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, -				     fetch_fault); +	trace_kvm_mmu_pagetable_walk(addr, access);  retry_walk:  	eperm = false;  	walker->level = mmu->root_level; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 2e88438ffd83..9b7ec1150ab0 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)  static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)  { -	
if (idx < X86_PMC_IDX_FIXED) +	if (idx < INTEL_PMC_IDX_FIXED)  		return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);  	else -		return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); +		return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);  }  void kvm_deliver_pmi(struct kvm_vcpu *vcpu) @@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)  	if (pmc_is_gp(pmc))  		reprogram_gp_counter(pmc, pmc->eventsel);  	else { -		int fidx = idx - X86_PMC_IDX_FIXED; +		int fidx = idx - INTEL_PMC_IDX_FIXED;  		reprogram_fixed_counter(pmc,  				fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);  	} @@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)  		return;  	pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, -			X86_PMC_MAX_GENERIC); +			INTEL_PMC_MAX_GENERIC);  	pmu->counter_bitmask[KVM_PMC_GP] =  		((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;  	bitmap_len = (entry->eax >> 24) & 0xff; @@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)  		pmu->nr_arch_fixed_counters = 0;  	} else {  		pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), -				X86_PMC_MAX_FIXED); +				INTEL_PMC_MAX_FIXED);  		pmu->counter_bitmask[KVM_PMC_FIXED] =  			((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;  	}  	pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | -		(((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); +		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);  	pmu->global_ctrl_mask = ~pmu->global_ctrl;  } @@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)  	struct kvm_pmu *pmu = &vcpu->arch.pmu;  	memset(pmu, 0, sizeof(*pmu)); -	for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { +	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {  		pmu->gp_counters[i].type = KVM_PMC_GP;  		pmu->gp_counters[i].vcpu = vcpu;  		pmu->gp_counters[i].idx = i;  	} -	for (i = 0; i < X86_PMC_MAX_FIXED; i++) { +	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {  		pmu->fixed_counters[i].type = KVM_PMC_FIXED;  		pmu->fixed_counters[i].vcpu = vcpu; -		pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; +		pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;  	}  	init_irq_work(&pmu->irq_work, trigger_pmi);  	kvm_pmu_cpuid_update(vcpu); @@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)  	int i;  	irq_work_sync(&pmu->irq_work); -	for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { +	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {  		struct kvm_pmc *pmc = &pmu->gp_counters[i];  		stop_counter(pmc);  		pmc->counter = pmc->eventsel = 0;  	} -	for (i = 0; i < X86_PMC_MAX_FIXED; i++) +	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)  		stop_counter(&pmu->fixed_counters[i]);  	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f75af406b268..baead950d6c8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)  		break;  	case MSR_IA32_DEBUGCTLMSR:  		if (!boot_cpu_has(X86_FEATURE_LBRV)) { -			pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", -					__func__, data); +			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", +				    __func__, data);  			break;  		}  		if (data & DEBUGCTL_RESERVED_BITS) @@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)  	case MSR_VM_CR:  		return svm_set_vm_cr(vcpu, data);  	case MSR_VM_IGNNE: -		pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); +		vcpu_unimpl(vcpu, 
"unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);  		break;  	default:  		return kvm_set_msr_common(vcpu, ecx, data); @@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void)  	return false;  } +static bool svm_invpcid_supported(void) +{ +	return false; +} +  static bool svm_has_wbinvd_exit(void)  {  	return true; @@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = {  	.cpuid_update = svm_cpuid_update,  	.rdtscp_supported = svm_rdtscp_supported, +	.invpcid_supported = svm_invpcid_supported,  	.set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d2641f14c..a71faf727ff3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,  		  __entry->coalesced ? " (coalesced)" : "")  ); +TRACE_EVENT(kvm_eoi, +	    TP_PROTO(struct kvm_lapic *apic, int vector), +	    TP_ARGS(apic, vector), + +	TP_STRUCT__entry( +		__field(	__u32,		apicid		) +		__field(	int,		vector		) +	), + +	TP_fast_assign( +		__entry->apicid		= apic->vcpu->vcpu_id; +		__entry->vector		= vector; +	), + +	TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_pv_eoi, +	    TP_PROTO(struct kvm_lapic *apic, int vector), +	    TP_ARGS(apic, vector), + +	TP_STRUCT__entry( +		__field(	__u32,		apicid		) +		__field(	int,		vector		) +	), + +	TP_fast_assign( +		__entry->apicid		= apic->vcpu->vcpu_id; +		__entry->vector		= vector; +	), + +	TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); +  /*   * Tracepoint for nested VMRUN   */ @@ -710,16 +744,6 @@ TRACE_EVENT(kvm_skinit,  		  __entry->rip, __entry->slb)  ); -#define __print_insn(insn, ilen) ({		                 \ -	int i;							 \ -	const char *ret = p->buffer + p->len;			 \ -								 \ -	for (i = 0; i < ilen; ++i)				 \ -		trace_seq_printf(p, " %02x", insn[i]);		 \ -	trace_seq_printf(p, "%c", 0);				 \ -	ret;							 \ -	}) -  #define KVM_EMUL_INSN_F_CR0_PE (1 << 0)  #define KVM_EMUL_INSN_F_EFL_VM (1 << 1)  #define KVM_EMUL_INSN_F_CS_D   (1 << 2) @@ -786,7 +810,7 @@ TRACE_EVENT(kvm_emulate_insn,  	TP_printk("%x:%llx:%s (%s)%s",  		  __entry->csbase, __entry->rip, -		  __print_insn(__entry->insn, __entry->len), +		  __print_hex(__entry->insn, __entry->len),  		  __print_symbolic(__entry->flags,  				   kvm_trace_symbol_emul_flags),  		  __entry->failed ? 
" failed" : "" diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32eb58866292..c39b60707e02 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1;  module_param_named(unrestricted_guest,  			enable_unrestricted_guest, bool, S_IRUGO); -static bool __read_mostly emulate_invalid_guest_state = 0; +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + +static bool __read_mostly emulate_invalid_guest_state = true;  module_param(emulate_invalid_guest_state, bool, S_IRUGO);  static bool __read_mostly vmm_exclusive = 1; @@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr);  static void kvm_cpu_vmxoff(void);  static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +static void vmx_set_segment(struct kvm_vcpu *vcpu, +			    struct kvm_segment *var, int seg); +static void vmx_get_segment(struct kvm_vcpu *vcpu, +			    struct kvm_segment *var, int seg);  static DEFINE_PER_CPU(struct vmcs *, vmxarea);  static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)  	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;  } +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ +	return vmx_capability.ept & VMX_EPT_AD_BIT; +} +  static inline bool cpu_has_vmx_invept_individual_addr(void)  {  	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void)  		SECONDARY_EXEC_RDTSCP;  } +static inline bool cpu_has_vmx_invpcid(void) +{ +	return vmcs_config.cpu_based_2nd_exec_ctrl & +		SECONDARY_EXEC_ENABLE_INVPCID; +} +  static inline bool cpu_has_virtual_nmis(void)  {  	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -1739,6 +1757,11 @@ static bool vmx_rdtscp_supported(void)  	return cpu_has_vmx_rdtscp();  } +static bool vmx_invpcid_supported(void) +{ +	return cpu_has_vmx_invpcid() && enable_ept; +} +  /*   * Swap MSR entry in host/guest MSR entry array.   
*/ @@ -2458,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)  			SECONDARY_EXEC_ENABLE_EPT |  			SECONDARY_EXEC_UNRESTRICTED_GUEST |  			SECONDARY_EXEC_PAUSE_LOOP_EXITING | -			SECONDARY_EXEC_RDTSCP; +			SECONDARY_EXEC_RDTSCP | +			SECONDARY_EXEC_ENABLE_INVPCID;  		if (adjust_vmx_controls(min2, opt2,  					MSR_IA32_VMX_PROCBASED_CTLS2,  					&_cpu_based_2nd_exec_control) < 0) @@ -2645,8 +2669,12 @@ static __init int hardware_setup(void)  	    !cpu_has_vmx_ept_4levels()) {  		enable_ept = 0;  		enable_unrestricted_guest = 0; +		enable_ept_ad_bits = 0;  	} +	if (!cpu_has_vmx_ept_ad_bits()) +		enable_ept_ad_bits = 0; +  	if (!cpu_has_vmx_unrestricted_guest())  		enable_unrestricted_guest = 0; @@ -2770,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)  {  	unsigned long flags;  	struct vcpu_vmx *vmx = to_vmx(vcpu); +	struct kvm_segment var;  	if (enable_unrestricted_guest)  		return; @@ -2813,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)  	if (emulate_invalid_guest_state)  		goto continue_rmode; -	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); -	vmcs_write32(GUEST_SS_LIMIT, 0xffff); -	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); +	vmx_get_segment(vcpu, &var, VCPU_SREG_SS); +	vmx_set_segment(vcpu, &var, VCPU_SREG_SS); + +	vmx_get_segment(vcpu, &var, VCPU_SREG_CS); +	vmx_set_segment(vcpu, &var, VCPU_SREG_CS); + +	vmx_get_segment(vcpu, &var, VCPU_SREG_ES); +	vmx_set_segment(vcpu, &var, VCPU_SREG_ES); + +	vmx_get_segment(vcpu, &var, VCPU_SREG_DS); +	vmx_set_segment(vcpu, &var, VCPU_SREG_DS); -	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); -	vmcs_write32(GUEST_CS_LIMIT, 0xffff); -	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) -		vmcs_writel(GUEST_CS_BASE, 0xf0000); -	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); +	vmx_get_segment(vcpu, &var, VCPU_SREG_GS); +	vmx_set_segment(vcpu, &var, VCPU_SREG_GS); -	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); -	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); -	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); -	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); +	vmx_get_segment(vcpu, &var, VCPU_SREG_FS); +	vmx_set_segment(vcpu, &var, VCPU_SREG_FS);  continue_rmode:  	kvm_mmu_reset_context(vcpu); @@ -3027,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa)  	/* TODO write the value reading from MSR */  	eptp = VMX_EPT_DEFAULT_MT |  		VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; +	if (enable_ept_ad_bits) +		eptp |= VMX_EPT_AD_ENABLE_BIT;  	eptp |= (root_hpa & PAGE_MASK);  	return eptp; @@ -3153,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)  static int vmx_get_cpl(struct kvm_vcpu *vcpu)  { +	struct vcpu_vmx *vmx = to_vmx(vcpu); + +	/* +	 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations +	 * fail; use the cache instead. 
+	 */ +	if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { +		return vmx->cpl; +	} +  	if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {  		__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); -		to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); +		vmx->cpl = __vmx_get_cpl(vcpu);  	} -	return to_vmx(vcpu)->cpl; + +	return vmx->cpl;  } @@ -3165,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)  {  	u32 ar; -	if (var->unusable) +	if (var->unusable || !var->present)  		ar = 1 << 16;  	else {  		ar = var->type & 15; @@ -3177,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)  		ar |= (var->db & 1) << 14;  		ar |= (var->g & 1) << 15;  	} -	if (ar == 0) /* a 0 value means unusable */ -		ar = AR_UNUSABLE_MASK;  	return ar;  } @@ -3229,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,  	vmcs_write32(sf->ar_bytes, ar);  	__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + +	/* +	 * Fix segments for real mode guest in hosts that don't have +	 * "unrestricted_mode" or it was disabled. +	 * This is done to allow migration of the guests from hosts with +	 * unrestricted guest like Westmere to older host that don't have +	 * unrestricted guest like Nehelem. +	 */ +	if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { +		switch (seg) { +		case VCPU_SREG_CS: +			vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); +			vmcs_write32(GUEST_CS_LIMIT, 0xffff); +			if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) +				vmcs_writel(GUEST_CS_BASE, 0xf0000); +			vmcs_write16(GUEST_CS_SELECTOR, +				     vmcs_readl(GUEST_CS_BASE) >> 4); +			break; +		case VCPU_SREG_ES: +			fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); +			break; +		case VCPU_SREG_DS: +			fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); +			break; +		case VCPU_SREG_GS: +			fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); +			break; +		case VCPU_SREG_FS: +			fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); +			break; +		case VCPU_SREG_SS: +			vmcs_write16(GUEST_SS_SELECTOR, +				     vmcs_readl(GUEST_SS_BASE) >> 4); +			vmcs_write32(GUEST_SS_LIMIT, 0xffff); +			vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); +			break; +		} +	}  }  static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3731,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)  	if (!enable_ept) {  		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;  		enable_unrestricted_guest = 0; +		/* Enable INVPCID for non-ept guests may cause performance regression. */ +		exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;  	}  	if (!enable_unrestricted_guest)  		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -4489,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)  		break;  	}  	vcpu->run->exit_reason = 0; -	pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", +	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",  	       (int)(exit_qualification >> 4) & 3, cr);  	return 0;  } @@ -4769,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)  {  	unsigned long exit_qualification;  	gpa_t gpa; +	u32 error_code;  	int gla_validity;  	exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4793,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)  	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);  	trace_kvm_page_fault(gpa, exit_qualification); -	return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); + +	/* It is a write fault? */ +	error_code = exit_qualification & (1U << 1); +	/* ept page table is present? 
*/ +	error_code |= (exit_qualification >> 3) & 0x1; + +	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);  }  static u64 ept_rsvd_mask(u64 spte, int level) @@ -4908,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)  	int ret = 1;  	u32 cpu_exec_ctrl;  	bool intr_window_requested; +	unsigned count = 130;  	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);  	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; -	while (!guest_state_valid(vcpu)) { -		if (intr_window_requested -		    && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) +	while (!guest_state_valid(vcpu) && count-- != 0) { +		if (intr_window_requested && vmx_interrupt_allowed(vcpu))  			return handle_interrupt_window(&vmx->vcpu); +		if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) +			return 1; +  		err = emulate_instruction(vcpu, 0);  		if (err == EMULATE_DO_MMIO) { @@ -4924,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)  			goto out;  		} -		if (err != EMULATE_DONE) +		if (err != EMULATE_DONE) { +			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; +			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; +			vcpu->run->internal.ndata = 0;  			return 0; +		}  		if (signal_pending(current))  			goto out; @@ -4933,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)  			schedule();  	} -	vmx->emulation_required = 0; +	vmx->emulation_required = !guest_state_valid(vcpu);  out:  	return ret;  } @@ -6467,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)  			}  		}  	} + +	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); +	/* Exposing INVPCID only when PCID is exposed */ +	best = kvm_find_cpuid_entry(vcpu, 0x7, 0); +	if (vmx_invpcid_supported() && +	    best && (best->ecx & bit(X86_FEATURE_INVPCID)) && +	    guest_cpuid_has_pcid(vcpu)) { +		exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; +		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, +			     exec_control); +	} else { +		exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; +		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, +			     exec_control); +		if (best) +			best->ecx &= ~bit(X86_FEATURE_INVPCID); +	}  }  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -7201,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = {  	.cpuid_update = vmx_cpuid_update,  	.rdtscp_supported = vmx_rdtscp_supported, +	.invpcid_supported = vmx_invpcid_supported,  	.set_supported_cpuid = vmx_set_supported_cpuid, @@ -7230,23 +7345,21 @@ static int __init vmx_init(void)  	if (!vmx_io_bitmap_a)  		return -ENOMEM; +	r = -ENOMEM; +  	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); -	if (!vmx_io_bitmap_b) { -		r = -ENOMEM; +	if (!vmx_io_bitmap_b)  		goto out; -	}  	vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); -	if (!vmx_msr_bitmap_legacy) { -		r = -ENOMEM; +	if (!vmx_msr_bitmap_legacy)  		goto out1; -	} +  	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); -	if (!vmx_msr_bitmap_longmode) { -		r = -ENOMEM; +	if (!vmx_msr_bitmap_longmode)  		goto out2; -	} +  	/*  	 * Allow direct access to the PC debug port (it is often used for I/O @@ -7275,8 +7388,10 @@ static int __init vmx_init(void)  	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);  	if (enable_ept) { -		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, -				VMX_EPT_EXECUTABLE_MASK); +		kvm_mmu_set_mask_ptes(0ull, +			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, +			(enable_ept_ad_bits) ? 
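
For reference, the reworked EPT-violation exit above only needs to recover the two #PF error-code bits that page_fault_can_be_fast() inspects. A self-contained sketch of that bit translation, assuming the usual PFERR_* values:

#include <stdint.h>
#include <stdio.h>

/* x86 #PF error code bits as used by the fault paths above. */
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK   (1U << 1)

/*
 * Minimal version of the translation done in handle_ept_violation():
 * bit 1 of the EPT exit qualification means "caused by a write",
 * bit 3 means the guest-physical address was readable, i.e. present.
 */
static uint32_t ept_qual_to_error_code(uint64_t exit_qualification)
{
	uint32_t error_code;

	error_code  = exit_qualification & (1U << 1);	/* write   */
	error_code |= (exit_qualification >> 3) & 0x1;	/* present */
	return error_code;
}

int main(void)
{
	uint32_t ec = ept_qual_to_error_code((1ULL << 1) | (1ULL << 3));

	printf("present=%u write=%u\n",
	       !!(ec & PFERR_PRESENT_MASK), !!(ec & PFERR_WRITE_MASK));
	return 0;
}
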
VMX_EPT_DIRTY_BIT : 0ull, +			0ull, VMX_EPT_EXECUTABLE_MASK);  		ept_set_mmio_spte_mask();  		kvm_enable_tdp();  	} else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be6d54929fa7..59b59508ff07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  			return 1;  	} +	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) +		return 1; +  	kvm_x86_ops->set_cr0(vcpu, cr0);  	if ((cr0 ^ old_cr0) & X86_CR0_PG) { @@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)  				   kvm_read_cr3(vcpu)))  		return 1; +	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { +		if (!guest_cpuid_has_pcid(vcpu)) +			return 1; + +		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ +		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) +			return 1; +	} +  	if (kvm_x86_ops->set_cr4(vcpu, cr4))  		return 1; -	if ((cr4 ^ old_cr4) & pdptr_bits) +	if (((cr4 ^ old_cr4) & pdptr_bits) || +	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))  		kvm_mmu_reset_context(vcpu);  	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) @@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)  	}  	if (is_long_mode(vcpu)) { -		if (cr3 & CR3_L_MODE_RESERVED_BITS) -			return 1; +		if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { +			if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) +				return 1; +		} else +			if (cr3 & CR3_L_MODE_RESERVED_BITS) +				return 1;  	} else {  		if (is_pae(vcpu)) {  			if (cr3 & CR3_PAE_RESERVED_BITS) @@ -795,6 +812,7 @@ static u32 msrs_to_save[] = {  	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,  	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, +	MSR_KVM_PV_EOI_EN,  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,  	MSR_STAR,  #ifdef CONFIG_X86_64 @@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)  		break;  	}  	default: -		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " -			  "data 0x%llx\n", msr, data); +		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " +			    "data 0x%llx\n", msr, data);  		return 1;  	}  	return 0; @@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)  	case HV_X64_MSR_TPR:  		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);  	default: -		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " -			  "data 0x%llx\n", msr, data); +		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " +			    "data 0x%llx\n", msr, data);  		return 1;  	} @@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  		data &= ~(u64)0x100;	/* ignore ignne emulation enable */  		data &= ~(u64)0x8;	/* ignore TLB cache disable */  		if (data != 0) { -			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", -				data); +			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", +				    data);  			return 1;  		}  		break;  	case MSR_FAM10H_MMIO_CONF_BASE:  		if (data != 0) { -			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " -				"0x%llx\n", data); +			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " +				    "0x%llx\n", data);  			return 1;  		}  		break; @@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  			   thus reserved and should throw a #GP */  			return 1;  		} -		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", -			__func__, data); +		vcpu_unimpl(vcpu, 
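
The PCID plumbing above enforces a small set of architectural preconditions: PCIDE may only be set when the guest advertises PCID, cr3[11:0] is zero and the vcpu is in long mode, and paging may not be switched off while PCIDE is still on. A compact sketch of those checks, where has_pcid_cpuid and long_mode are stand-ins for the real vcpu queries:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_CR0_PG        (1UL << 31)
#define X86_CR4_PCIDE     (1UL << 17)
#define X86_CR3_PCID_MASK 0xFFFUL	/* low twelve bits select the PCID */

/* Conditions checked before a guest may set CR4.PCIDE (simplified). */
static bool pcide_enable_allowed(uint64_t cr3, bool has_pcid_cpuid,
				 bool long_mode)
{
	if (!has_pcid_cpuid)
		return false;			/* CPUID PCID bit must be set */
	if (cr3 & X86_CR3_PCID_MASK)
		return false;			/* cr3[11:0] must be zero */
	return long_mode;			/* EFER.LMA must be 1 */
}

/* Paging may not be disabled while PCIDE is still enabled. */
static bool cr0_write_allowed(uint64_t new_cr0, uint64_t cr4)
{
	return (new_cr0 & X86_CR0_PG) || !(cr4 & X86_CR4_PCIDE);
}

int main(void)
{
	printf("%d %d\n",
	       pcide_enable_allowed(0x1000, true, true),	/* allowed  */
	       cr0_write_allowed(0, X86_CR4_PCIDE));		/* rejected */
	return 0;
}
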
"%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", +			    __func__, data);  		break;  	case MSR_IA32_UCODE_REV:  	case MSR_IA32_UCODE_WRITE: @@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);  		break; +	case MSR_KVM_PV_EOI_EN: +		if (kvm_lapic_enable_pv_eoi(vcpu, data)) +			return 1; +		break;  	case MSR_IA32_MCG_CTL:  	case MSR_IA32_MCG_STATUS: @@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  	case MSR_K7_EVNTSEL2:  	case MSR_K7_EVNTSEL3:  		if (data != 0) -			pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " -				"0x%x data 0x%llx\n", msr, data); +			vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " +				    "0x%x data 0x%llx\n", msr, data);  		break;  	/* at least RHEL 4 unconditionally writes to the perfctr registers,  	 * so we ignore writes to make it happy. @@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  	case MSR_K7_PERFCTR1:  	case MSR_K7_PERFCTR2:  	case MSR_K7_PERFCTR3: -		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " -			"0x%x data 0x%llx\n", msr, data); +		vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " +			    "0x%x data 0x%llx\n", msr, data);  		break;  	case MSR_P6_PERFCTR0:  	case MSR_P6_PERFCTR1: @@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  			return kvm_pmu_set_msr(vcpu, msr, data);  		if (pr || data != 0) -			pr_unimpl(vcpu, "disabled perfctr wrmsr: " -				"0x%x data 0x%llx\n", msr, data); +			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " +				    "0x%x data 0x%llx\n", msr, data);  		break;  	case MSR_K7_CLK_CTL:  		/* @@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  		/* Drop writes to this legacy MSR -- see rdmsr  		 * counterpart for further detail.  		 
*/ -		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); +		vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);  		break;  	case MSR_AMD64_OSVW_ID_LENGTH:  		if (!guest_cpuid_has_osvw(vcpu)) @@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)  		if (kvm_pmu_msr(vcpu, msr))  			return kvm_pmu_set_msr(vcpu, msr, data);  		if (!ignore_msrs) { -			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", -				msr, data); +			vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", +				    msr, data);  			return 1;  		} else { -			pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", -				msr, data); +			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", +				    msr, data);  			break;  		}  	} @@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  		data = kvm->arch.hv_hypercall;  		break;  	default: -		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); +		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);  		return 1;  	} @@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  		data = vcpu->arch.hv_vapic;  		break;  	default: -		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); +		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);  		return 1;  	}  	*pdata = data; @@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)  		if (kvm_pmu_msr(vcpu, msr))  			return kvm_pmu_get_msr(vcpu, msr, pdata);  		if (!ignore_msrs) { -			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); +			vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);  			return 1;  		} else { -			pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); +			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);  			data = 0;  		}  		break; @@ -4116,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)  		value = kvm_get_cr8(vcpu);  		break;  	default: -		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); +		kvm_err("%s: unexpected cr %u\n", __func__, cr);  		return 0;  	} @@ -4145,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)  		res = kvm_set_cr8(vcpu, val);  		break;  	default: -		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); +		kvm_err("%s: unexpected cr %u\n", __func__, cr);  		res = -1;  	} @@ -4297,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,  	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);  } -static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, +static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,  			       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)  { -	struct kvm_cpuid_entry2 *cpuid = NULL; - -	if (eax && ecx) -		cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), -					    *eax, *ecx); - -	if (cpuid) { -		*eax = cpuid->eax; -		*ecx = cpuid->ecx; -		if (ebx) -			*ebx = cpuid->ebx; -		if (edx) -			*edx = cpuid->edx; -		return true; -	} - -	return false; +	kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);  }  static struct x86_emulate_ops emulate_ops = { @@ -5296,8 +5302,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  	r = kvm_mmu_reload(vcpu);  	if (unlikely(r)) { -		kvm_x86_ops->cancel_injection(vcpu); -		goto out; +		goto cancel_injection;  	}  	preempt_disable(); @@ -5322,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  		smp_wmb();  		local_irq_enable();  		preempt_enable(); -		kvm_x86_ops->cancel_injection(vcpu);  		r = 1; -		goto out; +		goto 
cancel_injection;  	}  	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -5388,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  	if (unlikely(vcpu->arch.tsc_always_catchup))  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); -	kvm_lapic_sync_from_vapic(vcpu); +	if (vcpu->arch.apic_attention) +		kvm_lapic_sync_from_vapic(vcpu);  	r = kvm_x86_ops->handle_exit(vcpu); +	return r; + +cancel_injection: +	kvm_x86_ops->cancel_injection(vcpu); +	if (unlikely(vcpu->arch.apic_attention)) +		kvm_lapic_sync_from_vapic(vcpu);  out:  	return r;  } @@ -6304,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,  	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {  		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { -			vfree(free->arch.lpage_info[i]); +			kvm_kvfree(free->arch.lpage_info[i]);  			free->arch.lpage_info[i] = NULL;  		}  	} @@ -6323,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)  				      slot->base_gfn, level) + 1;  		slot->arch.lpage_info[i] = -			vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); +			kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));  		if (!slot->arch.lpage_info[i])  			goto out_free; @@ -6350,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)  out_free:  	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { -		vfree(slot->arch.lpage_info[i]); +		kvm_kvfree(slot->arch.lpage_info[i]);  		slot->arch.lpage_info[i] = NULL;  	}  	return -ENOMEM; diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 459b58a8a15c..25b7ae8d058a 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user);   * @src: source address   * @dst: destination address   * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) + * @sum: initial sum that is added into the result (32bit unfolded)   *   * Returns an 32bit unfolded checksum of the buffer.   
*/ diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c index a311cc59b65d..8d6ef78b5d01 100644 --- a/arch/x86/lib/msr-reg-export.c +++ b/arch/x86/lib/msr-reg-export.c @@ -1,5 +1,5 @@  #include <linux/module.h>  #include <asm/msr.h> -EXPORT_SYMBOL(native_rdmsr_safe_regs); -EXPORT_SYMBOL(native_wrmsr_safe_regs); +EXPORT_SYMBOL(rdmsr_safe_regs); +EXPORT_SYMBOL(wrmsr_safe_regs); diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 69fa10623f21..f6d13eefad10 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -6,13 +6,13 @@  #ifdef CONFIG_X86_64  /* - * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); + * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]);   *   * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]   *   */  .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs)  	CFI_STARTPROC  	pushq_cfi %rbx  	pushq_cfi %rbp @@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs)  	_ASM_EXTABLE(1b, 3b)  	CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs)  .endm  #else /* X86_32 */  .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs)  	CFI_STARTPROC  	pushl_cfi %ebx  	pushl_cfi %ebp @@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs)  	_ASM_EXTABLE(1b, 3b)  	CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs)  .endm  #endif diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bc4e9d84157f..e0e6990723e9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -385,7 +385,7 @@ void free_initmem(void)  }  #ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +void __init free_initrd_mem(unsigned long start, unsigned long end)  {  	/*  	 * end could be not aligned, and We can not align that, diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a718e0d23503..931930a96160 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -919,11 +919,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,  	/*  	 * On success we use clflush, when the CPU supports it to -	 * avoid the wbindv. If the CPU does not support it and in the -	 * error case we fall back to cpa_flush_all (which uses -	 * wbindv): +	 * avoid the wbindv. If the CPU does not support it, in the +	 * error case, and during early boot (for EFI) we fall back +	 * to cpa_flush_all (which uses wbinvd):  	 */ -	if (!ret && cpu_has_clflush) { +	if (early_boot_irqs_disabled) +		__cpa_flush_all((void *)(long)cache); +	else if (!ret && cpu_has_clflush) {  		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {  			cpa_flush_array(addr, numpages, cache,  					cpa.flags, pages); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5e57e113b72c..613cd83e8c0c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -12,6 +12,7 @@  #include <asm/cache.h>  #include <asm/apic.h>  #include <asm/uv/uv.h> +#include <linux/debugfs.h>  DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)  			= { &init_mm, 0, }; @@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)   *   *	More scalable flush, from Andi Kleen   * - *	To avoid global state use 8 different call vectors. - *	Each CPU uses a specific vector to trigger flushes on other - *	CPUs. Depending on the received vector the target CPUs look into - *	the right array slot for the flush data. - * - *	With more than 8 CPUs they are hashed to the 8 available - *	vectors. The limited global vector space forces us to this right now. 
- *	In future when interrupts are split into per CPU domains this could be - *	fixed, at the cost of triggering multiple IPIs in some cases. + *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi   */ -union smp_flush_state { -	struct { -		struct mm_struct *flush_mm; -		unsigned long flush_va; -		raw_spinlock_t tlbstate_lock; -		DECLARE_BITMAP(flush_cpumask, NR_CPUS); -	}; -	char pad[INTERNODE_CACHE_BYTES]; -} ____cacheline_internodealigned_in_smp; - -/* State is put into the per CPU data section, but padded -   to a full cache line because other CPUs can access it and we don't -   want false sharing in the per cpu data segment. */ -static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; - -static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); +struct flush_tlb_info { +	struct mm_struct *flush_mm; +	unsigned long flush_start; +	unsigned long flush_end; +};  /*   * We cannot call mmdrop() because we are in interrupt context, @@ -72,28 +54,25 @@ void leave_mm(int cpu)  EXPORT_SYMBOL_GPL(leave_mm);  /* - *   * The flush IPI assumes that a thread switch happens in this order:   * [cpu0: the cpu that switches]   * 1) switch_mm() either 1a) or 1b)   * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - *	Stop ipi delivery for the old mm. This is not synchronized with - *	the other cpus, but smp_invalidate_interrupt ignore flush ipis - *	for the wrong mm, and in the worst case we perform a superfluous - *	tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - *	was in lazy tlb mode. - * 1a3) update cpu active_mm + * 1a1) set cpu_tlbstate to TLBSTATE_OK + *	Now the tlb flush NMI handler flush_tlb_func won't call leave_mm + *	if cpu0 was in lazy tlb mode. + * 1a2) update cpu active_mm   *	Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);   *	Now the other cpus will send tlb flush ipis.   * 1a4) change cr3. + * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask); + *	Stop ipi delivery for the old mm. This is not synchronized with + *	the other cpus, but flush_tlb_func ignore flush ipis for the wrong + *	mm, and in the worst case we perform a superfluous tlb flush.   * 1b) thread switch without mm change - *	cpu active_mm is correct, cpu0 already handles - *	flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK + *	cpu active_mm is correct, cpu0 already handles flush ipis. + * 1b1) set cpu_tlbstate to TLBSTATE_OK   * 1b2) test_and_set the cpu bit in cpu_vm_mask.   *	Atomically set the bit [other cpus will start sending flush ipis],   *	and test the bit. @@ -106,174 +85,62 @@ EXPORT_SYMBOL_GPL(leave_mm);   *   runs in kernel space, the cpu could load tlb entries for user space   *   pages.   * - * The good news is that cpu mmu_state is local to each cpu, no + * The good news is that cpu_tlbstate is local to each cpu, no   * write/read ordering problems.   */  /* - * TLB flush IPI: - * + * TLB flush funcation:   * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.   * 2) Leave the mm if we are in the lazy tlb mode. - * - * Interrupts are disabled. - */ - -/* - * FIXME: use of asmlinkage is not consistent.  On x86_64 it's noop - * but still used for documentation purpose but the usage is slightly - * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt - * entry calls in with the first parameter in %eax.  Maybe define - * intrlinkage?   
*/ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void smp_invalidate_interrupt(struct pt_regs *regs) +static void flush_tlb_func(void *info)  { -	unsigned int cpu; -	unsigned int sender; -	union smp_flush_state *f; - -	cpu = smp_processor_id(); -	/* -	 * orig_rax contains the negated interrupt vector. -	 * Use that to determine where the sender put the data. -	 */ -	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; -	f = &flush_state[sender]; - -	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) -		goto out; -		/* -		 * This was a BUG() but until someone can quote me the -		 * line from the intel manual that guarantees an IPI to -		 * multiple CPUs is retried _only_ on the erroring CPUs -		 * its staying as a return -		 * -		 * BUG(); -		 */ - -	if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) { -		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { -			if (f->flush_va == TLB_FLUSH_ALL) -				local_flush_tlb(); -			else -				__flush_tlb_one(f->flush_va); -		} else -			leave_mm(cpu); -	} -out: -	ack_APIC_irq(); -	smp_mb__before_clear_bit(); -	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); -	smp_mb__after_clear_bit(); -	inc_irq_stat(irq_tlb_count); -} +	struct flush_tlb_info *f = info; -static void flush_tlb_others_ipi(const struct cpumask *cpumask, -				 struct mm_struct *mm, unsigned long va) -{ -	unsigned int sender; -	union smp_flush_state *f; - -	/* Caller has disabled preemption */ -	sender = this_cpu_read(tlb_vector_offset); -	f = &flush_state[sender]; - -	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) -		raw_spin_lock(&f->tlbstate_lock); - -	f->flush_mm = mm; -	f->flush_va = va; -	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { -		/* -		 * We have to send the IPI only to -		 * CPUs affected. 
-		 */ -		apic->send_IPI_mask(to_cpumask(f->flush_cpumask), -			      INVALIDATE_TLB_VECTOR_START + sender); - -		while (!cpumask_empty(to_cpumask(f->flush_cpumask))) -			cpu_relax(); -	} +	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) +		return; + +	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { +		if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg) +			local_flush_tlb(); +		else if (!f->flush_end) +			__flush_tlb_single(f->flush_start); +		else { +			unsigned long addr; +			addr = f->flush_start; +			while (addr < f->flush_end) { +				__flush_tlb_single(addr); +				addr += PAGE_SIZE; +			} +		} +	} else +		leave_mm(smp_processor_id()); -	f->flush_mm = NULL; -	f->flush_va = 0; -	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) -		raw_spin_unlock(&f->tlbstate_lock);  }  void native_flush_tlb_others(const struct cpumask *cpumask, -			     struct mm_struct *mm, unsigned long va) +				 struct mm_struct *mm, unsigned long start, +				 unsigned long end)  { +	struct flush_tlb_info info; +	info.flush_mm = mm; +	info.flush_start = start; +	info.flush_end = end; +  	if (is_uv_system()) {  		unsigned int cpu;  		cpu = smp_processor_id(); -		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); +		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);  		if (cpumask) -			flush_tlb_others_ipi(cpumask, mm, va); +			smp_call_function_many(cpumask, flush_tlb_func, +								&info, 1);  		return;  	} -	flush_tlb_others_ipi(cpumask, mm, va); +	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);  } -static void __cpuinit calculate_tlb_offset(void) -{ -	int cpu, node, nr_node_vecs, idx = 0; -	/* -	 * we are changing tlb_vector_offset for each CPU in runtime, but this -	 * will not cause inconsistency, as the write is atomic under X86. we -	 * might see more lock contentions in a short time, but after all CPU's -	 * tlb_vector_offset are changed, everything should go normal -	 * -	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might -	 * waste some vectors. 
-	 **/ -	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) -		nr_node_vecs = 1; -	else -		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; - -	for_each_online_node(node) { -		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * -			nr_node_vecs; -		int cpu_offset = 0; -		for_each_cpu(cpu, cpumask_of_node(node)) { -			per_cpu(tlb_vector_offset, cpu) = node_offset + -				cpu_offset; -			cpu_offset++; -			cpu_offset = cpu_offset % nr_node_vecs; -		} -		idx++; -	} -} - -static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, -		unsigned long action, void *hcpu) -{ -	switch (action & 0xf) { -	case CPU_ONLINE: -	case CPU_DEAD: -		calculate_tlb_offset(); -	} -	return NOTIFY_OK; -} - -static int __cpuinit init_smp_flush(void) -{ -	int i; - -	for (i = 0; i < ARRAY_SIZE(flush_state); i++) -		raw_spin_lock_init(&flush_state[i].tlbstate_lock); - -	calculate_tlb_offset(); -	hotcpu_notifier(tlb_cpuhp_notify, 0); -	return 0; -} -core_initcall(init_smp_flush); -  void flush_tlb_current_task(void)  {  	struct mm_struct *mm = current->mm; @@ -282,27 +149,91 @@ void flush_tlb_current_task(void)  	local_flush_tlb();  	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) -		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); +		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);  	preempt_enable();  } -void flush_tlb_mm(struct mm_struct *mm) +/* + * It can find out the THP large page, or + * HUGETLB page in tlb_flush when THP disabled + */ +static inline unsigned long has_large_page(struct mm_struct *mm, +				 unsigned long start, unsigned long end) +{ +	pgd_t *pgd; +	pud_t *pud; +	pmd_t *pmd; +	unsigned long addr = ALIGN(start, HPAGE_SIZE); +	for (; addr < end; addr += HPAGE_SIZE) { +		pgd = pgd_offset(mm, addr); +		if (likely(!pgd_none(*pgd))) { +			pud = pud_offset(pgd, addr); +			if (likely(!pud_none(*pud))) { +				pmd = pmd_offset(pud, addr); +				if (likely(!pmd_none(*pmd))) +					if (pmd_large(*pmd)) +						return addr; +			} +		} +	} +	return 0; +} + +void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, +				unsigned long end, unsigned long vmflag)  { +	unsigned long addr; +	unsigned act_entries, tlb_entries = 0; +  	preempt_disable(); +	if (current->active_mm != mm) +		goto flush_all; -	if (current->active_mm == mm) { -		if (current->mm) +	if (!current->mm) { +		leave_mm(smp_processor_id()); +		goto flush_all; +	} + +	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 +					|| vmflag == VM_HUGETLB) { +		local_flush_tlb(); +		goto flush_all; +	} + +	/* In modern CPU, last level tlb used for both data/ins */ +	if (vmflag & VM_EXEC) +		tlb_entries = tlb_lli_4k[ENTRIES]; +	else +		tlb_entries = tlb_lld_4k[ENTRIES]; +	/* Assume all of TLB entries was occupied by this task */ +	act_entries = mm->total_vm > tlb_entries ? 
tlb_entries : mm->total_vm; + +	/* tlb_flushall_shift is on balance point, details in commit log */ +	if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) +		local_flush_tlb(); +	else { +		if (has_large_page(mm, start, end)) {  			local_flush_tlb(); -		else -			leave_mm(smp_processor_id()); +			goto flush_all; +		} +		/* flush range by one by one 'invlpg' */ +		for (addr = start; addr < end;	addr += PAGE_SIZE) +			__flush_tlb_single(addr); + +		if (cpumask_any_but(mm_cpumask(mm), +				smp_processor_id()) < nr_cpu_ids) +			flush_tlb_others(mm_cpumask(mm), mm, start, end); +		preempt_enable(); +		return;  	} -	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) -		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); +flush_all: +	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) +		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);  	preempt_enable();  } -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)  {  	struct mm_struct *mm = vma->vm_mm; @@ -310,13 +241,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)  	if (current->active_mm == mm) {  		if (current->mm) -			__flush_tlb_one(va); +			__flush_tlb_one(start);  		else  			leave_mm(smp_processor_id());  	}  	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) -		flush_tlb_others(mm_cpumask(mm), mm, va); +		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);  	preempt_enable();  } @@ -332,3 +263,83 @@ void flush_tlb_all(void)  {  	on_each_cpu(do_flush_tlb_all, NULL, 1);  } + +static void do_kernel_range_flush(void *info) +{ +	struct flush_tlb_info *f = info; +	unsigned long addr; + +	/* flush range by one by one 'invlpg' */ +	for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) +		__flush_tlb_single(addr); +} + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ +	unsigned act_entries; +	struct flush_tlb_info info; + +	/* In modern CPU, last level tlb used for both data/ins */ +	act_entries = tlb_lld_4k[ENTRIES]; + +	/* Balance as user space task's flush, a bit conservative */ +	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || +		(end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + +		on_each_cpu(do_flush_tlb_all, NULL, 1); +	else { +		info.flush_start = start; +		info.flush_end = end; +		on_each_cpu(do_kernel_range_flush, &info, 1); +	} +} + +#ifdef CONFIG_DEBUG_TLBFLUSH +static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, +			     size_t count, loff_t *ppos) +{ +	char buf[32]; +	unsigned int len; + +	len = sprintf(buf, "%hd\n", tlb_flushall_shift); +	return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t tlbflush_write_file(struct file *file, +		 const char __user *user_buf, size_t count, loff_t *ppos) +{ +	char buf[32]; +	ssize_t len; +	s8 shift; + +	len = min(count, sizeof(buf) - 1); +	if (copy_from_user(buf, user_buf, len)) +		return -EFAULT; + +	buf[len] = '\0'; +	if (kstrtos8(buf, 0, &shift)) +		return -EINVAL; + +	if (shift > 64) +		return -EINVAL; + +	tlb_flushall_shift = shift; +	return count; +} + +static const struct file_operations fops_tlbflush = { +	.read = tlbflush_read_file, +	.write = tlbflush_write_file, +	.llseek = default_llseek, +}; + +static int __cpuinit create_tlb_flushall_shift(void) +{ +	if (cpu_has_invlpg) { +		debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, +			arch_debugfs_dir, NULL, &fops_tlbflush); +	} +	
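
The range-flush decision in flush_tlb_mm_range() above reduces to one comparison: invalidate page by page only while the number of pages stays at or below act_entries >> tlb_flushall_shift, otherwise flush the whole TLB. A stand-alone sketch of that balance test:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/*
 * Flush page-by-page only while the range stays under
 * active_entries / 2^shift, otherwise nuke the whole TLB.
 * A shift of -1 means "always do the full flush".
 */
static bool use_full_flush(unsigned long start, unsigned long end,
			   unsigned long active_entries, int shift)
{
	if (shift == -1)
		return true;
	return ((end - start) >> PAGE_SHIFT) > (active_entries >> shift);
}

int main(void)
{
	/* 16 pages, 512 usable entries, shift 5 -> threshold is 16 pages */
	unsigned long start = 0, end = 16UL << PAGE_SHIFT;

	printf("full flush: %d\n", use_full_flush(start, end, 512, 5));
	printf("full flush: %d\n",
	       use_full_flush(start, end + (1UL << PAGE_SHIFT), 512, 5));
	return 0;
}

Raising the shift makes the full flush kick in sooner; setting it to -1 disables the per-page invlpg path entirely, which is exactly what the debugfs knob above exposes.
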
return 0; +} +late_initcall(create_tlb_flushall_shift); +#endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0597f95b6da6..33643a8bcbbb 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -309,6 +309,10 @@ void bpf_jit_compile(struct sk_filter *fp)  				else  					EMIT1_off32(0x0d, K);	/* or imm32,%eax */  				break; +			case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ +				seen |= SEEN_XREG; +				EMIT2(0x31, 0xd8);		/* xor %ebx,%eax */ +				break;  			case BPF_S_ALU_LSH_X: /* A <<= X; */  				seen |= SEEN_XREG;  				EMIT4(0x89, 0xd9, 0xd3, 0xe0);	/* mov %ebx,%ecx; shl %cl,%eax */ diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 303f08637826..b2b94438ff05 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)  			goto fail;  		}  		/* both registers must be reserved */ -		if (num_counters == AMD64_NUM_COUNTERS_F15H) { +		if (num_counters == AMD64_NUM_COUNTERS_CORE) {  			msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);  			msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);  		} else { @@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops)  	ops->create_files = setup_ibs_files;  	if (boot_cpu_data.x86 == 0x15) { -		num_counters = AMD64_NUM_COUNTERS_F15H; +		num_counters = AMD64_NUM_COUNTERS_CORE;  	} else {  		num_counters = AMD64_NUM_COUNTERS;  	} diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index fc09c2754e08..505acdd6d600 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -12,8 +12,13 @@ struct pci_root_info {  	char name[16];  	unsigned int res_num;  	struct resource *res; -	int busnum;  	struct pci_sysdata sd; +#ifdef	CONFIG_PCI_MMCONFIG +	bool mcfg_added; +	u16 segment; +	u8 start_bus; +	u8 end_bus; +#endif  };  static bool pci_use_crs = true; @@ -120,6 +125,81 @@ void __init pci_acpi_crs_quirks(void)  	       pci_use_crs ? "nocrs" : "use_crs");  } +#ifdef	CONFIG_PCI_MMCONFIG +static int __devinit check_segment(u16 seg, struct device *dev, char *estr) +{ +	if (seg) { +		dev_err(dev, +			"%s can't access PCI configuration " +			"space under this host bridge.\n", +			estr); +		return -EIO; +	} + +	/* +	 * Failure in adding MMCFG information is not fatal, +	 * just can't access extended configuration space of +	 * devices under this host bridge. 
+	 */ +	dev_warn(dev, +		 "%s can't access extended PCI configuration " +		 "space under this bridge.\n", +		 estr); + +	return 0; +} + +static int __devinit setup_mcfg_map(struct pci_root_info *info, +				    u16 seg, u8 start, u8 end, +				    phys_addr_t addr) +{ +	int result; +	struct device *dev = &info->bridge->dev; + +	info->start_bus = start; +	info->end_bus = end; +	info->mcfg_added = false; + +	/* return success if MMCFG is not in use */ +	if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg) +		return 0; + +	if (!(pci_probe & PCI_PROBE_MMCONF)) +		return check_segment(seg, dev, "MMCONFIG is disabled,"); + +	result = pci_mmconfig_insert(dev, seg, start, end, addr); +	if (result == 0) { +		/* enable MMCFG if it hasn't been enabled yet */ +		if (raw_pci_ext_ops == NULL) +			raw_pci_ext_ops = &pci_mmcfg; +		info->mcfg_added = true; +	} else if (result != -EEXIST) +		return check_segment(seg, dev, +			 "fail to add MMCONFIG information,"); + +	return 0; +} + +static void teardown_mcfg_map(struct pci_root_info *info) +{ +	if (info->mcfg_added) { +		pci_mmconfig_delete(info->segment, info->start_bus, +				    info->end_bus); +		info->mcfg_added = false; +	} +} +#else +static int __devinit setup_mcfg_map(struct pci_root_info *info, +				    u16 seg, u8 start, u8 end, +				    phys_addr_t addr) +{ +	return 0; +} +static void teardown_mcfg_map(struct pci_root_info *info) +{ +} +#endif +  static acpi_status  resource_to_addr(struct acpi_resource *resource,  			struct acpi_resource_address64 *addr) @@ -234,13 +314,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)  	}  	info->res_num++; -	if (addr.translation_offset) -		dev_info(&info->bridge->dev, "host bridge window %pR " -			 "(PCI address [%#llx-%#llx])\n", -			 res, res->start - addr.translation_offset, -			 res->end - addr.translation_offset); -	else -		dev_info(&info->bridge->dev, "host bridge window %pR\n", res);  	return AE_OK;  } @@ -332,8 +405,11 @@ static void __release_pci_root_info(struct pci_root_info *info)  	free_pci_root_info_res(info); +	teardown_mcfg_map(info); +  	kfree(info);  } +  static void release_pci_root_info(struct pci_host_bridge *bridge)  {  	struct pci_root_info *info = bridge->release_data; @@ -347,7 +423,9 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,  {  	size_t size; +	sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);  	info->bridge = device; +  	info->res_num = 0;  	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,  				info); @@ -360,8 +438,6 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,  	if (!info->res)  		return; -	sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); -  	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,  				info);  } @@ -373,7 +449,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)  	int domain = root->segment;  	int busnum = root->secondary.start;  	LIST_HEAD(resources); -	struct pci_bus *bus; +	struct pci_bus *bus = NULL;  	struct pci_sysdata *sd;  	int node;  #ifdef CONFIG_ACPI_NUMA @@ -426,6 +502,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)  	} else {  		probe_pci_root_info(info, device, busnum, domain); +		/* insert busn res at first */ +		pci_add_resource(&resources,  &root->secondary);  		/*  		 * _CRS with no apertures is normal, so only fall back to  		 * defaults or native bridge info if we're ignoring _CRS. 
@@ -437,10 +515,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)  			x86_pci_root_bus_resources(busnum, &resources);  		} -		bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, -					  &resources); +		if (!setup_mcfg_map(info, domain, (u8)root->secondary.start, +				    (u8)root->secondary.end, root->mcfg_addr)) +			bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, +						  sd, &resources); +  		if (bus) { -			bus->subordinate = pci_scan_child_bus(bus); +			pci_scan_child_bus(bus);  			pci_set_host_bridge_release(  				to_pci_host_bridge(bus->bridge),  				release_pci_root_info, info); diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 5aed49bff058..e9e6ed5cdf94 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -121,7 +121,6 @@ static int __init early_fill_mp_bus_info(void)  		link = (reg >> 8) & 0x03;  		info = alloc_pci_root_info(min_bus, max_bus, node, link); -		sprintf(info->name, "PCI Bus #%02x", min_bus);  	}  	/* get the default node and link for left over res */ @@ -300,9 +299,9 @@ static int __init early_fill_mp_bus_info(void)  		int busnum;  		struct pci_root_res *root_res; -		busnum = info->bus_min; -		printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", -		       info->bus_min, info->bus_max, info->node, info->link); +		busnum = info->busn.start; +		printk(KERN_DEBUG "bus: %pR on node %x link %x\n", +		       &info->busn, info->node, info->link);  		list_for_each_entry(root_res, &info->resources, list)  			printk(KERN_DEBUG "bus: %02x %pR\n",  				       busnum, &root_res->res); diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 306579f7d0fd..d37e2fec97e5 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -14,7 +14,7 @@ static struct pci_root_info *x86_find_pci_root_info(int bus)  		return NULL;  	list_for_each_entry(info, &pci_root_infos, list) -		if (info->bus_min == bus) +		if (info->busn.start == bus)  			return info;  	return NULL; @@ -24,6 +24,8 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)  {  	struct pci_root_info *info = x86_find_pci_root_info(bus);  	struct pci_root_res *root_res; +	struct pci_host_bridge_window *window; +	bool found = false;  	if (!info)  		goto default_resources; @@ -31,6 +33,16 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)  	printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",  	       bus); +	/* already added by acpi ? 
*/ +	list_for_each_entry(window, resources, list) +		if (window->res->flags & IORESOURCE_BUS) { +			found = true; +			break; +		} + +	if (!found) +		pci_add_resource(resources, &info->busn); +  	list_for_each_entry(root_res, &info->resources, list) {  		struct resource *res;  		struct resource *root; @@ -66,9 +78,13 @@ struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max,  	if (!info)  		return info; +	sprintf(info->name, "PCI Bus #%02x", bus_min); +  	INIT_LIST_HEAD(&info->resources); -	info->bus_min = bus_min; -	info->bus_max = bus_max; +	info->busn.name  = info->name; +	info->busn.start = bus_min; +	info->busn.end   = bus_max; +	info->busn.flags = IORESOURCE_BUS;  	info->node = node;  	info->link = link; diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index 226a466b2b2b..ff8f65b04574 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -13,8 +13,7 @@ struct pci_root_info {  	struct list_head list;  	char name[12];  	struct list_head resources; -	int bus_min; -	int bus_max; +	struct resource busn;  	int node;  	int link;  }; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 0ad990a20d4a..720e973fc34a 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -494,7 +494,7 @@ int __init pcibios_init(void)  	return 0;  } -char * __devinit  pcibios_setup(char *str) +char * __init pcibios_setup(char *str)  {  	if (!strcmp(str, "off")) {  		pci_probe = 0; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 301e325992f6..937bcece7006 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -17,6 +17,8 @@  #include <linux/bitmap.h>  #include <linux/dmi.h>  #include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/rculist.h>  #include <asm/e820.h>  #include <asm/pci_x86.h>  #include <asm/acpi.h> @@ -24,7 +26,9 @@  #define PREFIX "PCI: "  /* Indicate if the mmcfg resources have been placed into the resource table. 
*/ -static int __initdata pci_mmcfg_resources_inserted; +static bool pci_mmcfg_running_state; +static bool pci_mmcfg_arch_init_failed; +static DEFINE_MUTEX(pci_mmcfg_lock);  LIST_HEAD(pci_mmcfg_list); @@ -45,24 +49,25 @@ static __init void free_all_mmcfg(void)  		pci_mmconfig_remove(cfg);  } -static __init void list_add_sorted(struct pci_mmcfg_region *new) +static __devinit void list_add_sorted(struct pci_mmcfg_region *new)  {  	struct pci_mmcfg_region *cfg;  	/* keep list sorted by segment and starting bus number */ -	list_for_each_entry(cfg, &pci_mmcfg_list, list) { +	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {  		if (cfg->segment > new->segment ||  		    (cfg->segment == new->segment &&  		     cfg->start_bus >= new->start_bus)) { -			list_add_tail(&new->list, &cfg->list); +			list_add_tail_rcu(&new->list, &cfg->list);  			return;  		}  	} -	list_add_tail(&new->list, &pci_mmcfg_list); +	list_add_tail_rcu(&new->list, &pci_mmcfg_list);  } -static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, -							int end, u64 addr) +static __devinit struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, +							     int start, +							     int end, u64 addr)  {  	struct pci_mmcfg_region *new;  	struct resource *res; @@ -79,8 +84,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,  	new->start_bus = start;  	new->end_bus = end; -	list_add_sorted(new); -  	res = &new->res;  	res->start = addr + PCI_MMCFG_BUS_OFFSET(start);  	res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; @@ -89,9 +92,25 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,  		 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);  	res->name = new->name; -	printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at " -	       "%pR (base %#lx)\n", segment, start, end, &new->res, -	       (unsigned long) addr); +	return new; +} + +static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, +							int end, u64 addr) +{ +	struct pci_mmcfg_region *new; + +	new = pci_mmconfig_alloc(segment, start, end, addr); +	if (new) { +		mutex_lock(&pci_mmcfg_lock); +		list_add_sorted(new); +		mutex_unlock(&pci_mmcfg_lock); + +		pr_info(PREFIX +		       "MMCONFIG for domain %04x [bus %02x-%02x] at %pR " +		       "(base %#lx)\n", +		       segment, start, end, &new->res, (unsigned long)addr); +	}  	return new;  } @@ -100,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)  {  	struct pci_mmcfg_region *cfg; -	list_for_each_entry(cfg, &pci_mmcfg_list, list) +	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)  		if (cfg->segment == segment &&  		    cfg->start_bus <= bus && bus <= cfg->end_bus)  			return cfg; @@ -343,8 +362,7 @@ static int __init pci_mmcfg_check_hostbridge(void)  			name = pci_mmcfg_probes[i].probe();  		if (name) -			printk(KERN_INFO PREFIX "%s with MMCONFIG support\n", -			       name); +			pr_info(PREFIX "%s with MMCONFIG support\n", name);  	}  	/* some end_bus_number is crazy, fix it */ @@ -353,19 +371,8 @@ static int __init pci_mmcfg_check_hostbridge(void)  	return !list_empty(&pci_mmcfg_list);  } -static void __init pci_mmcfg_insert_resources(void) -{ -	struct pci_mmcfg_region *cfg; - -	list_for_each_entry(cfg, &pci_mmcfg_list, list) -		insert_resource(&iomem_resource, &cfg->res); - -	/* Mark that the resources have been inserted. 
*/ -	pci_mmcfg_resources_inserted = 1; -} - -static acpi_status __init check_mcfg_resource(struct acpi_resource *res, -					      void *data) +static acpi_status __devinit check_mcfg_resource(struct acpi_resource *res, +						 void *data)  {  	struct resource *mcfg_res = data;  	struct acpi_resource_address64 address; @@ -401,8 +408,8 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,  	return AE_OK;  } -static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, -		void *context, void **rv) +static acpi_status __devinit find_mboard_resource(acpi_handle handle, u32 lvl, +						  void *context, void **rv)  {  	struct resource *mcfg_res = context; @@ -415,7 +422,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,  	return AE_OK;  } -static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) +static int __devinit is_acpi_reserved(u64 start, u64 end, unsigned not_used)  {  	struct resource mcfg_res; @@ -434,13 +441,15 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)  typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); -static int __init is_mmconf_reserved(check_reserved_t is_reserved, -				    struct pci_mmcfg_region *cfg, int with_e820) +static int __ref is_mmconf_reserved(check_reserved_t is_reserved, +				    struct pci_mmcfg_region *cfg, +				    struct device *dev, int with_e820)  {  	u64 addr = cfg->res.start;  	u64 size = resource_size(&cfg->res);  	u64 old_size = size; -	int valid = 0, num_buses; +	int num_buses; +	char *method = with_e820 ? "E820" : "ACPI motherboard resources";  	while (!is_reserved(addr, addr + size, E820_RESERVED)) {  		size >>= 1; @@ -448,30 +457,76 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,  			break;  	} -	if (size >= (16UL<<20) || size == old_size) { -		printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n", -		       &cfg->res, -		       with_e820 ? 
"E820" : "ACPI motherboard resources"); -		valid = 1; - -		if (old_size != size) { -			/* update end_bus */ -			cfg->end_bus = cfg->start_bus + ((size>>20) - 1); -			num_buses = cfg->end_bus - cfg->start_bus + 1; -			cfg->res.end = cfg->res.start + -			    PCI_MMCFG_BUS_OFFSET(num_buses) - 1; -			snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, -				 "PCI MMCONFIG %04x [bus %02x-%02x]", -				 cfg->segment, cfg->start_bus, cfg->end_bus); -			printk(KERN_INFO PREFIX -			       "MMCONFIG for %04x [bus%02x-%02x] " -			       "at %pR (base %#lx) (size reduced!)\n", -			       cfg->segment, cfg->start_bus, cfg->end_bus, -			       &cfg->res, (unsigned long) cfg->address); -		} +	if (size < (16UL<<20) && size != old_size) +		return 0; + +	if (dev) +		dev_info(dev, "MMCONFIG at %pR reserved in %s\n", +			 &cfg->res, method); +	else +		pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n", +		       &cfg->res, method); + +	if (old_size != size) { +		/* update end_bus */ +		cfg->end_bus = cfg->start_bus + ((size>>20) - 1); +		num_buses = cfg->end_bus - cfg->start_bus + 1; +		cfg->res.end = cfg->res.start + +		    PCI_MMCFG_BUS_OFFSET(num_buses) - 1; +		snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN, +			 "PCI MMCONFIG %04x [bus %02x-%02x]", +			 cfg->segment, cfg->start_bus, cfg->end_bus); + +		if (dev) +			dev_info(dev, +				"MMCONFIG " +				"at %pR (base %#lx) (size reduced!)\n", +				&cfg->res, (unsigned long) cfg->address); +		else +			pr_info(PREFIX +				"MMCONFIG for %04x [bus%02x-%02x] " +				"at %pR (base %#lx) (size reduced!)\n", +				cfg->segment, cfg->start_bus, cfg->end_bus, +				&cfg->res, (unsigned long) cfg->address);  	} -	return valid; +	return 1; +} + +static int __ref pci_mmcfg_check_reserved(struct device *dev, +		  struct pci_mmcfg_region *cfg, int early) +{ +	if (!early && !acpi_disabled) { +		if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0)) +			return 1; + +		if (dev) +			dev_info(dev, FW_INFO +				 "MMCONFIG at %pR not reserved in " +				 "ACPI motherboard resources\n", +				 &cfg->res); +		else +			pr_info(FW_INFO PREFIX +			       "MMCONFIG at %pR not reserved in " +			       "ACPI motherboard resources\n", +			       &cfg->res); +	} + +	/* +	 * e820_all_mapped() is marked as __init. +	 * All entries from ACPI MCFG table have been checked at boot time. +	 * For MCFG information constructed from hotpluggable host bridge's +	 * _CBA method, just assume it's reserved. +	 */ +	if (pci_mmcfg_running_state) +		return 1; + +	/* Don't try to do this check unless configuration +	   type 1 is available. how about type 2 ?*/ +	if (raw_pci_ops) +		return is_mmconf_reserved(e820_all_mapped, cfg, dev, 1); + +	return 0;  }  static void __init pci_mmcfg_reject_broken(int early) @@ -479,38 +534,14 @@ static void __init pci_mmcfg_reject_broken(int early)  	struct pci_mmcfg_region *cfg;  	list_for_each_entry(cfg, &pci_mmcfg_list, list) { -		int valid = 0; - -		if (!early && !acpi_disabled) { -			valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); - -			if (valid) -				continue; -			else -				printk(KERN_ERR FW_BUG PREFIX -				       "MMCONFIG at %pR not reserved in " -				       "ACPI motherboard resources\n", -				       &cfg->res); +		if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) { +			pr_info(PREFIX "not using MMCONFIG\n"); +			free_all_mmcfg(); +			return;  		} - -		/* Don't try to do this check unless configuration -		   type 1 is available. 
how about type 2 ?*/ -		if (raw_pci_ops) -			valid = is_mmconf_reserved(e820_all_mapped, cfg, 1); - -		if (!valid) -			goto reject;  	} - -	return; - -reject: -	printk(KERN_INFO PREFIX "not using MMCONFIG\n"); -	free_all_mmcfg();  } -static int __initdata known_bridge; -  static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,  					struct acpi_mcfg_allocation *cfg)  { @@ -529,7 +560,7 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,  			return 0;  	} -	printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx " +	pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "  	       "is above 4GB, ignored\n", cfg->pci_segment,  	       cfg->start_bus_number, cfg->end_bus_number, cfg->address);  	return -EINVAL; @@ -556,7 +587,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)  		i -= sizeof(struct acpi_mcfg_allocation);  	};  	if (entries == 0) { -		printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); +		pr_err(PREFIX "MMCONFIG has no entries\n");  		return -ENODEV;  	} @@ -570,8 +601,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)  		if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,  				   cfg->end_bus_number, cfg->address) == NULL) { -			printk(KERN_WARNING PREFIX -			       "no memory for MCFG entries\n"); +			pr_warn(PREFIX "no memory for MCFG entries\n");  			free_all_mmcfg();  			return -ENOMEM;  		} @@ -582,28 +612,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)  static void __init __pci_mmcfg_init(int early)  { -	/* MMCONFIG disabled */ -	if ((pci_probe & PCI_PROBE_MMCONF) == 0) -		return; - -	/* MMCONFIG already enabled */ -	if (!early && !(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF)) -		return; - -	/* for late to exit */ -	if (known_bridge) -		return; - -	if (early) { -		if (pci_mmcfg_check_hostbridge()) -			known_bridge = 1; -	} - -	if (!known_bridge) -		acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); -  	pci_mmcfg_reject_broken(early); -  	if (list_empty(&pci_mmcfg_list))  		return; @@ -620,33 +629,48 @@ static void __init __pci_mmcfg_init(int early)  	if (pci_mmcfg_arch_init())  		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;  	else { -		/* -		 * Signal not to attempt to insert mmcfg resources because -		 * the architecture mmcfg setup could not initialize. -		 */ -		pci_mmcfg_resources_inserted = 1; +		free_all_mmcfg(); +		pci_mmcfg_arch_init_failed = true;  	}  } +static int __initdata known_bridge; +  void __init pci_mmcfg_early_init(void)  { -	__pci_mmcfg_init(1); +	if (pci_probe & PCI_PROBE_MMCONF) { +		if (pci_mmcfg_check_hostbridge()) +			known_bridge = 1; +		else +			acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); +		__pci_mmcfg_init(1); +	}  }  void __init pci_mmcfg_late_init(void)  { -	__pci_mmcfg_init(0); +	/* MMCONFIG disabled */ +	if ((pci_probe & PCI_PROBE_MMCONF) == 0) +		return; + +	if (known_bridge) +		return; + +	/* MMCONFIG hasn't been enabled yet, try again */ +	if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) { +		acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); +		__pci_mmcfg_init(0); +	}  }  static int __init pci_mmcfg_late_insert_resources(void)  { -	/* -	 * If resources are already inserted or we are not using MMCONFIG, -	 * don't insert the resources. 
-	 */ -	if ((pci_mmcfg_resources_inserted == 1) || -	    (pci_probe & PCI_PROBE_MMCONF) == 0 || -	    list_empty(&pci_mmcfg_list)) +	struct pci_mmcfg_region *cfg; + +	pci_mmcfg_running_state = true; + +	/* If we are not using MMCONFIG, don't insert the resources. */ +	if ((pci_probe & PCI_PROBE_MMCONF) == 0)  		return 1;  	/* @@ -654,7 +678,9 @@ static int __init pci_mmcfg_late_insert_resources(void)  	 * marked so it won't cause request errors when __request_region is  	 * called.  	 */ -	pci_mmcfg_insert_resources(); +	list_for_each_entry(cfg, &pci_mmcfg_list, list) +		if (!cfg->res.parent) +			insert_resource(&iomem_resource, &cfg->res);  	return 0;  } @@ -665,3 +691,101 @@ static int __init pci_mmcfg_late_insert_resources(void)   * with other system resources.   */  late_initcall(pci_mmcfg_late_insert_resources); + +/* Add MMCFG information for host bridges */ +int __devinit pci_mmconfig_insert(struct device *dev, +				  u16 seg, u8 start, u8 end, +				  phys_addr_t addr) +{ +	int rc; +	struct resource *tmp = NULL; +	struct pci_mmcfg_region *cfg; + +	if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed) +		return -ENODEV; + +	if (start > end) +		return -EINVAL; + +	mutex_lock(&pci_mmcfg_lock); +	cfg = pci_mmconfig_lookup(seg, start); +	if (cfg) { +		if (cfg->end_bus < end) +			dev_info(dev, FW_INFO +				 "MMCONFIG for " +				 "domain %04x [bus %02x-%02x] " +				 "only partially covers this bridge\n", +				  cfg->segment, cfg->start_bus, cfg->end_bus); +		mutex_unlock(&pci_mmcfg_lock); +		return -EEXIST; +	} + +	if (!addr) { +		mutex_unlock(&pci_mmcfg_lock); +		return -EINVAL; +	} + +	rc = -EBUSY; +	cfg = pci_mmconfig_alloc(seg, start, end, addr); +	if (cfg == NULL) { +		dev_warn(dev, "fail to add MMCONFIG (out of memory)\n"); +		rc = -ENOMEM; +	} else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) { +		dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n", +			 &cfg->res); +	} else { +		/* Insert resource if it's not in boot stage */ +		if (pci_mmcfg_running_state) +			tmp = insert_resource_conflict(&iomem_resource, +						       &cfg->res); + +		if (tmp) { +			dev_warn(dev, +				 "MMCONFIG %pR conflicts with " +				 "%s %pR\n", +				 &cfg->res, tmp->name, tmp); +		} else if (pci_mmcfg_arch_map(cfg)) { +			dev_warn(dev, "fail to map MMCONFIG %pR.\n", +				 &cfg->res); +		} else { +			list_add_sorted(cfg); +			dev_info(dev, "MMCONFIG at %pR (base %#lx)\n", +				 &cfg->res, (unsigned long)addr); +			cfg = NULL; +			rc = 0; +		} +	} + +	if (cfg) { +		if (cfg->res.parent) +			release_resource(&cfg->res); +		kfree(cfg); +	} + +	mutex_unlock(&pci_mmcfg_lock); + +	return rc; +} + +/* Delete MMCFG information for host bridges */ +int pci_mmconfig_delete(u16 seg, u8 start, u8 end) +{ +	struct pci_mmcfg_region *cfg; + +	mutex_lock(&pci_mmcfg_lock); +	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) +		if (cfg->segment == seg && cfg->start_bus == start && +		    cfg->end_bus == end) { +			list_del_rcu(&cfg->list); +			synchronize_rcu(); +			pci_mmcfg_arch_unmap(cfg); +			if (cfg->res.parent) +				release_resource(&cfg->res); +			mutex_unlock(&pci_mmcfg_lock); +			kfree(cfg); +			return 0; +		} +	mutex_unlock(&pci_mmcfg_lock); + +	return -ENOENT; +} diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 5372e86834c0..db63ac23e3d9 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -11,6 +11,7 @@  #include <linux/pci.h>  #include <linux/init.h> +#include <linux/rcupdate.h>  #include <asm/e820.h>  #include <asm/pci_x86.h>  #include 
<acpi/acpi.h> @@ -60,9 +61,12 @@ err:		*value = -1;  		return -EINVAL;  	} +	rcu_read_lock();  	base = get_base_addr(seg, bus, devfn); -	if (!base) +	if (!base) { +		rcu_read_unlock();  		goto err; +	}  	raw_spin_lock_irqsave(&pci_config_lock, flags); @@ -80,6 +84,7 @@ err:		*value = -1;  		break;  	}  	raw_spin_unlock_irqrestore(&pci_config_lock, flags); +	rcu_read_unlock();  	return 0;  } @@ -93,9 +98,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,  	if ((bus > 255) || (devfn > 255) || (reg > 4095))  		return -EINVAL; +	rcu_read_lock();  	base = get_base_addr(seg, bus, devfn); -	if (!base) +	if (!base) { +		rcu_read_unlock();  		return -EINVAL; +	}  	raw_spin_lock_irqsave(&pci_config_lock, flags); @@ -113,11 +121,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,  		break;  	}  	raw_spin_unlock_irqrestore(&pci_config_lock, flags); +	rcu_read_unlock();  	return 0;  } -static const struct pci_raw_ops pci_mmcfg = { +const struct pci_raw_ops pci_mmcfg = {  	.read =		pci_mmcfg_read,  	.write =	pci_mmcfg_write,  }; @@ -132,3 +141,18 @@ int __init pci_mmcfg_arch_init(void)  void __init pci_mmcfg_arch_free(void)  {  } + +int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) +{ +	return 0; +} + +void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg) +{ +	unsigned long flags; + +	/* Invalidate the cached mmcfg map entry. */ +	raw_spin_lock_irqsave(&pci_config_lock, flags); +	mmcfg_last_accessed_device = 0; +	raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index 915a493502cb..d4ebd07c306d 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -9,6 +9,7 @@  #include <linux/init.h>  #include <linux/acpi.h>  #include <linux/bitmap.h> +#include <linux/rcupdate.h>  #include <asm/e820.h>  #include <asm/pci_x86.h> @@ -34,9 +35,12 @@ err:		*value = -1;  		return -EINVAL;  	} +	rcu_read_lock();  	addr = pci_dev_base(seg, bus, devfn); -	if (!addr) +	if (!addr) { +		rcu_read_unlock();  		goto err; +	}  	switch (len) {  	case 1: @@ -49,6 +53,7 @@ err:		*value = -1;  		*value = mmio_config_readl(addr + reg);  		break;  	} +	rcu_read_unlock();  	return 0;  } @@ -62,9 +67,12 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,  	if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))  		return -EINVAL; +	rcu_read_lock();  	addr = pci_dev_base(seg, bus, devfn); -	if (!addr) +	if (!addr) { +		rcu_read_unlock();  		return -EINVAL; +	}  	switch (len) {  	case 1: @@ -77,16 +85,17 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,  		mmio_config_writel(addr + reg, value);  		break;  	} +	rcu_read_unlock();  	return 0;  } -static const struct pci_raw_ops pci_mmcfg = { +const struct pci_raw_ops pci_mmcfg = {  	.read =		pci_mmcfg_read,  	.write =	pci_mmcfg_write,  }; -static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg) +static void __iomem * __devinit mcfg_ioremap(struct pci_mmcfg_region *cfg)  {  	void __iomem *addr;  	u64 start, size; @@ -105,16 +114,14 @@ int __init pci_mmcfg_arch_init(void)  {  	struct pci_mmcfg_region *cfg; -	list_for_each_entry(cfg, &pci_mmcfg_list, list) { -		cfg->virt = mcfg_ioremap(cfg); -		if (!cfg->virt) { -			printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n", -			       &cfg->res); +	list_for_each_entry(cfg, &pci_mmcfg_list, list) +		if (pci_mmcfg_arch_map(cfg)) {  			pci_mmcfg_arch_free();  			return 0;  		} -	} +  	raw_pci_ext_ops = &pci_mmcfg; +  	return 1;  } @@ -122,10 +129,25 @@ void 
__init pci_mmcfg_arch_free(void)  {  	struct pci_mmcfg_region *cfg; -	list_for_each_entry(cfg, &pci_mmcfg_list, list) { -		if (cfg->virt) { -			iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); -			cfg->virt = NULL; -		} +	list_for_each_entry(cfg, &pci_mmcfg_list, list) +		pci_mmcfg_arch_unmap(cfg); +} + +int __devinit pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg) +{ +	cfg->virt = mcfg_ioremap(cfg); +	if (!cfg->virt) { +		pr_err(PREFIX "can't map MMCONFIG at %pR\n", &cfg->res); +		return -ENOMEM; +	} + +	return 0; +} + +void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg) +{ +	if (cfg && cfg->virt) { +		iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus)); +		cfg->virt = NULL;  	}  } diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index 140942f66b31..e14a2ff708b5 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -264,7 +264,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);  static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev)  { -	pci_set_power_state(dev, PCI_D3cold); +	pci_set_power_state(dev, PCI_D3hot);  }  DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);  DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 92660edaa1e7..2dc29f51e75a 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -234,22 +234,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map(  	return status;  } -static efi_status_t __init phys_efi_get_time(efi_time_t *tm, -					     efi_time_cap_t *tc) -{ -	unsigned long flags; -	efi_status_t status; - -	spin_lock_irqsave(&rtc_lock, flags); -	efi_call_phys_prelog(); -	status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), -				virt_to_phys(tc)); -	efi_call_phys_epilog(); -	spin_unlock_irqrestore(&rtc_lock, flags); -	return status; -} - -int efi_set_rtc_mmss(unsigned long nowtime) +static int efi_set_rtc_mmss(unsigned long nowtime)  {  	int real_seconds, real_minutes;  	efi_status_t 	status; @@ -278,7 +263,7 @@ int efi_set_rtc_mmss(unsigned long nowtime)  	return 0;  } -unsigned long efi_get_time(void) +static unsigned long efi_get_time(void)  {  	efi_status_t status;  	efi_time_t eft; @@ -621,18 +606,13 @@ static int __init efi_runtime_init(void)  	}  	/*  	 * We will only need *early* access to the following -	 * two EFI runtime services before set_virtual_address_map +	 * EFI runtime service before set_virtual_address_map  	 * is invoked.  	 */ -	efi_phys.get_time = (efi_get_time_t *)runtime->get_time;  	efi_phys.set_virtual_address_map =  		(efi_set_virtual_address_map_t *)  		runtime->set_virtual_address_map; -	/* -	 * Make efi_get_time can be called before entering -	 * virtual mode. 
-	 */ -	efi.get_time = phys_efi_get_time; +  	early_iounmap(runtime, sizeof(efi_runtime_services_t));  	return 0; @@ -720,12 +700,10 @@ void __init efi_init(void)  		efi_enabled = 0;  		return;  	} -#ifdef CONFIG_X86_32  	if (efi_native) {  		x86_platform.get_wallclock = efi_get_time;  		x86_platform.set_wallclock = efi_set_rtc_mmss;  	} -#endif  #if EFI_DEBUG  	print_efi_memmap(); diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 3c6e328483c7..028454f0c3a5 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c @@ -110,19 +110,16 @@ static struct kmsg_dumper dw_dumper;  static int dumper_registered;  static void dw_kmsg_dump(struct kmsg_dumper *dumper, -			enum kmsg_dump_reason reason, -			const char *s1, unsigned long l1, -			const char *s2, unsigned long l2) +			 enum kmsg_dump_reason reason)  { -	int i; +	static char line[1024]; +	size_t len;  	/* When run to this, we'd better re-init the HW */  	mrst_early_console_init(); -	for (i = 0; i < l1; i++) -		early_mrst_console.write(&early_mrst_console, s1 + i, 1); -	for (i = 0; i < l2; i++) -		early_mrst_console.write(&early_mrst_console, s2 + i, 1); +	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) +		early_mrst_console.write(&early_mrst_console, line, len);  }  /* Set the ratio rate to 115200, 8n1, IRQ disabled */ diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 23e5b9d7977b..599be499fdf7 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -203,7 +203,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type)  	return 0;  } -static int xo15_sci_resume(struct acpi_device *device) +static int xo15_sci_resume(struct device *dev)  {  	/* Enable all EC events */  	olpc_ec_mask_write(EC_SCI_SRC_ALL); @@ -215,6 +215,8 @@ static int xo15_sci_resume(struct acpi_device *device)  	return 0;  } +static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume); +  static const struct acpi_device_id xo15_sci_device_ids[] = {  	{"XO15EC", 0},  	{"", 0}, @@ -227,8 +229,8 @@ static struct acpi_driver xo15_sci_drv = {  	.ops = {  		.add = xo15_sci_add,  		.remove = xo15_sci_remove, -		.resume = xo15_sci_resume,  	}, +	.drv.pm = &xo15_sci_pm,  };  static int __init xo15_sci_init(void) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 59880afa851f..b8b3a37c80cd 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@  /*   *	SGI UltraViolet TLB flush routines.   * - *	(c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI. + *	(c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.   *   *	This code is released under the GNU General Public License version 2 or   *	later. 
@@ -38,8 +38,7 @@ static int timeout_base_ns[] = {  static int timeout_us;  static int nobau; -static int baudisabled; -static spinlock_t disable_lock; +static int nobau_perm;  static cycles_t congested_cycles;  /* tunables: */ @@ -47,12 +46,13 @@ static int max_concurr		= MAX_BAU_CONCURRENT;  static int max_concurr_const	= MAX_BAU_CONCURRENT;  static int plugged_delay	= PLUGGED_DELAY;  static int plugsb4reset		= PLUGSB4RESET; +static int giveup_limit		= GIVEUP_LIMIT;  static int timeoutsb4reset	= TIMEOUTSB4RESET;  static int ipi_reset_limit	= IPI_RESET_LIMIT;  static int complete_threshold	= COMPLETE_THRESHOLD;  static int congested_respns_us	= CONGESTED_RESPONSE_US;  static int congested_reps	= CONGESTED_REPS; -static int congested_period	= CONGESTED_PERIOD; +static int disabled_period	= DISABLED_PERIOD;  static struct tunables tunables[] = {  	{&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ @@ -63,7 +63,8 @@ static struct tunables tunables[] = {  	{&complete_threshold, COMPLETE_THRESHOLD},  	{&congested_respns_us, CONGESTED_RESPONSE_US},  	{&congested_reps, CONGESTED_REPS}, -	{&congested_period, CONGESTED_PERIOD} +	{&disabled_period, DISABLED_PERIOD}, +	{&giveup_limit, GIVEUP_LIMIT}  };  static struct dentry *tunables_dir; @@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);  static DEFINE_PER_CPU(struct bau_control, bau_control);  static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); +static void +set_bau_on(void) +{ +	int cpu; +	struct bau_control *bcp; + +	if (nobau_perm) { +		pr_info("BAU not initialized; cannot be turned on\n"); +		return; +	} +	nobau = 0; +	for_each_present_cpu(cpu) { +		bcp = &per_cpu(bau_control, cpu); +		bcp->nobau = 0; +	} +	pr_info("BAU turned on\n"); +	return; +} + +static void +set_bau_off(void) +{ +	int cpu; +	struct bau_control *bcp; + +	nobau = 1; +	for_each_present_cpu(cpu) { +		bcp = &per_cpu(bau_control, cpu); +		bcp->nobau = 1; +	} +	pr_info("BAU turned off\n"); +	return; +} +  /*   * Determine the first node on a uvhub. 'Nodes' are used for kernel   * memory allocation. @@ -278,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,  		 * Both sockets dump their completed count total into  		 * the message's count.  		 */ -		smaster->socket_acknowledge_count[mdp->msg_slot] = 0; +		*sp = 0;  		asp = (struct atomic_short *)&msg->acknowledge_count;  		msg_ack_count = atom_asr(socket_ack_count, asp); @@ -491,16 +526,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,  }  /* - * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. + * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. + * But not currently used.   
*/  static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)  {  	unsigned long descriptor_status; -	unsigned long descriptor_status2; -	descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); -	descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; -	descriptor_status = (descriptor_status << 1) | descriptor_status2; +	descriptor_status = +		((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;  	return descriptor_status;  } @@ -531,87 +565,11 @@ int normal_busy(struct bau_control *bcp)   */  int handle_uv2_busy(struct bau_control *bcp)  { -	int busy_one = bcp->using_desc; -	int normal = bcp->uvhub_cpu; -	int selected = -1; -	int i; -	unsigned long descriptor_status; -	unsigned long status; -	int mmr_offset; -	struct bau_desc *bau_desc_old; -	struct bau_desc *bau_desc_new; -	struct bau_control *hmaster = bcp->uvhub_master;  	struct ptc_stats *stat = bcp->statp; -	cycles_t ttm;  	stat->s_uv2_wars++; -	spin_lock(&hmaster->uvhub_lock); -	/* try for the original first */ -	if (busy_one != normal) { -		if (!normal_busy(bcp)) -			selected = normal; -	} -	if (selected < 0) { -		/* can't use the normal, select an alternate */ -		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; -		descriptor_status = read_lmmr(mmr_offset); - -		/* scan available descriptors 32-63 */ -		for (i = 0; i < UV_CPUS_PER_AS; i++) { -			if ((hmaster->inuse_map & (1 << i)) == 0) { -				status = ((descriptor_status >> -						(i * UV_ACT_STATUS_SIZE)) & -						UV_ACT_STATUS_MASK) << 1; -				if (status != UV2H_DESC_BUSY) { -					selected = i + UV_CPUS_PER_AS; -					break; -				} -			} -		} -	} - -	if (busy_one != normal) -		/* mark the busy alternate as not in-use */ -		hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); - -	if (selected >= 0) { -		/* switch to the selected descriptor */ -		if (selected != normal) { -			/* set the selected alternate as in-use */ -			hmaster->inuse_map |= -					(1 << (selected - UV_CPUS_PER_AS)); -			if (selected > stat->s_uv2_wars_hw) -				stat->s_uv2_wars_hw = selected; -		} -		bau_desc_old = bcp->descriptor_base; -		bau_desc_old += (ITEMS_PER_DESC * busy_one); -		bcp->using_desc = selected; -		bau_desc_new = bcp->descriptor_base; -		bau_desc_new += (ITEMS_PER_DESC * selected); -		*bau_desc_new = *bau_desc_old; -	} else { -		/* -		 * All are busy. Wait for the normal one for this cpu to -		 * free up. 
-		 */ -		stat->s_uv2_war_waits++; -		spin_unlock(&hmaster->uvhub_lock); -		ttm = get_cycles(); -		do { -			cpu_relax(); -		} while (normal_busy(bcp)); -		spin_lock(&hmaster->uvhub_lock); -		/* switch to the original descriptor */ -		bcp->using_desc = normal; -		bau_desc_old = bcp->descriptor_base; -		bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); -		bcp->using_desc = (ITEMS_PER_DESC * normal); -		bau_desc_new = bcp->descriptor_base; -		bau_desc_new += (ITEMS_PER_DESC * normal); -		*bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ -	} -	spin_unlock(&hmaster->uvhub_lock); -	return FLUSH_RETRY_BUSYBUG; +	bcp->busy = 1; +	return FLUSH_GIVEUP;  }  static int uv2_wait_completion(struct bau_desc *bau_desc, @@ -620,7 +578,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,  {  	unsigned long descriptor_stat;  	cycles_t ttm; -	int desc = bcp->using_desc; +	int desc = bcp->uvhub_cpu;  	long busy_reps = 0;  	struct ptc_stats *stat = bcp->statp; @@ -628,24 +586,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,  	/* spin on the status MMR, waiting for it to go idle */  	while (descriptor_stat != UV2H_DESC_IDLE) { -		/* -		 * Our software ack messages may be blocked because -		 * there are no swack resources available.  As long -		 * as none of them has timed out hardware will NACK -		 * our message and its state will stay IDLE. -		 */ -		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || -		    (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { +		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) { +			/* +			 * A h/w bug on the destination side may +			 * have prevented the message being marked +			 * pending, thus it doesn't get replied to +			 * and gets continually nacked until it times +			 * out with a SOURCE_TIMEOUT. +			 */  			stat->s_stimeout++;  			return FLUSH_GIVEUP; -		} else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { -			stat->s_strongnacks++; -			bcp->conseccompletes = 0; -			return FLUSH_GIVEUP;  		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { +			ttm = get_cycles(); + +			/* +			 * Our retries may be blocked by all destination +			 * swack resources being consumed, and a timeout +			 * pending.  In that case hardware returns the +			 * ERROR that looks like a destination timeout. +			 * Without using the extended status we have to +			 * deduce from the short time that this was a +			 * strong nack. 
+			 */ +			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { +				bcp->conseccompletes = 0; +				stat->s_plugged++; +				/* FLUSH_RETRY_PLUGGED causes hang on boot */ +				return FLUSH_GIVEUP; +			}  			stat->s_dtimeout++;  			bcp->conseccompletes = 0; -			return FLUSH_RETRY_TIMEOUT; +			/* FLUSH_RETRY_TIMEOUT causes hang on boot */ +			return FLUSH_GIVEUP;  		} else {  			busy_reps++;  			if (busy_reps > 1000000) { @@ -653,9 +625,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,  				busy_reps = 0;  				ttm = get_cycles();  				if ((ttm - bcp->send_message) > -					(bcp->clocks_per_100_usec)) { +						bcp->timeout_interval)  					return handle_uv2_busy(bcp); -				}  			}  			/*  			 * descriptor_stat is still BUSY @@ -679,7 +650,7 @@ static int wait_completion(struct bau_desc *bau_desc,  {  	int right_shift;  	unsigned long mmr_offset; -	int desc = bcp->using_desc; +	int desc = bcp->uvhub_cpu;  	if (desc < UV_CPUS_PER_AS) {  		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; @@ -758,33 +729,31 @@ static void destination_timeout(struct bau_desc *bau_desc,  }  /* - * Completions are taking a very long time due to a congested numalink - * network. + * Stop all cpus on a uvhub from using the BAU for a period of time. + * This is reversed by check_enable.   */ -static void disable_for_congestion(struct bau_control *bcp, -					struct ptc_stats *stat) +static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)  { -	/* let only one cpu do this disabling */ -	spin_lock(&disable_lock); - -	if (!baudisabled && bcp->period_requests && -	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) { -		int tcpu; -		struct bau_control *tbcp; -		/* it becomes this cpu's job to turn on the use of the -		   BAU again */ -		baudisabled = 1; -		bcp->set_bau_off = 1; -		bcp->set_bau_on_time = get_cycles(); -		bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period); +	int tcpu; +	struct bau_control *tbcp; +	struct bau_control *hmaster; +	cycles_t tm1; + +	hmaster = bcp->uvhub_master; +	spin_lock(&hmaster->disable_lock); +	if (!bcp->baudisabled) {  		stat->s_bau_disabled++; +		tm1 = get_cycles();  		for_each_present_cpu(tcpu) {  			tbcp = &per_cpu(bau_control, tcpu); -			tbcp->baudisabled = 1; +			if (tbcp->uvhub_master == hmaster) { +				tbcp->baudisabled = 1; +				tbcp->set_bau_on_time = +					tm1 + bcp->disabled_period; +			}  		}  	} - -	spin_unlock(&disable_lock); +	spin_unlock(&hmaster->disable_lock);  }  static void count_max_concurr(int stat, struct bau_control *bcp, @@ -815,16 +784,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2,  			bcp->period_requests++;  			bcp->period_time += elapsed;  			if ((elapsed > congested_cycles) && -			    (bcp->period_requests > bcp->cong_reps)) -				disable_for_congestion(bcp, stat); +			    (bcp->period_requests > bcp->cong_reps) && +			    ((bcp->period_time / bcp->period_requests) > +							congested_cycles)) { +				stat->s_congested++; +				disable_for_period(bcp, stat); +			}  		}  	} else  		stat->s_requestor--;  	if (completion_status == FLUSH_COMPLETE && try > 1)  		stat->s_retriesok++; -	else if (completion_status == FLUSH_GIVEUP) +	else if (completion_status == FLUSH_GIVEUP) {  		stat->s_giveup++; +		if (get_cycles() > bcp->period_end) +			bcp->period_giveups = 0; +		bcp->period_giveups++; +		if (bcp->period_giveups == 1) +			bcp->period_end = get_cycles() + bcp->disabled_period; +		if (bcp->period_giveups > bcp->giveup_limit) { +			disable_for_period(bcp, stat); +			stat->s_giveuplimit++; +		} 
+	}  }  /* @@ -868,7 +851,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,   * Returns 1 if it gives up entirely and the original cpu mask is to be   * returned to the kernel.   */ -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, +	struct bau_desc *bau_desc)  {  	int seq_number = 0;  	int completion_stat = 0; @@ -881,24 +865,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)  	struct bau_control *hmaster = bcp->uvhub_master;  	struct uv1_bau_msg_header *uv1_hdr = NULL;  	struct uv2_bau_msg_header *uv2_hdr = NULL; -	struct bau_desc *bau_desc; -	if (bcp->uvhub_version == 1) +	if (bcp->uvhub_version == 1) { +		uv1 = 1;  		uv1_throttle(hmaster, stat); +	}  	while (hmaster->uvhub_quiesce)  		cpu_relax();  	time1 = get_cycles(); +	if (uv1) +		uv1_hdr = &bau_desc->header.uv1_hdr; +	else +		uv2_hdr = &bau_desc->header.uv2_hdr; +  	do { -		bau_desc = bcp->descriptor_base; -		bau_desc += (ITEMS_PER_DESC * bcp->using_desc); -		if (bcp->uvhub_version == 1) { -			uv1 = 1; -			uv1_hdr = &bau_desc->header.uv1_hdr; -		} else -			uv2_hdr = &bau_desc->header.uv2_hdr; -		if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { +		if (try == 0) {  			if (uv1)  				uv1_hdr->msg_type = MSG_REGULAR;  			else @@ -916,25 +899,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)  			uv1_hdr->sequence = seq_number;  		else  			uv2_hdr->sequence = seq_number; -		index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; +		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;  		bcp->send_message = get_cycles();  		write_mmr_activation(index);  		try++;  		completion_stat = wait_completion(bau_desc, bcp, try); -		/* UV2: wait_completion() may change the bcp->using_desc */  		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);  		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {  			bcp->ipi_attempts = 0; +			stat->s_overipilimit++;  			completion_stat = FLUSH_GIVEUP;  			break;  		}  		cpu_relax();  	} while ((completion_stat == FLUSH_RETRY_PLUGGED) || -		 (completion_stat == FLUSH_RETRY_BUSYBUG) ||  		 (completion_stat == FLUSH_RETRY_TIMEOUT));  	time2 = get_cycles(); @@ -955,28 +937,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)  }  /* - * The BAU is disabled. When the disabled time period has expired, the cpu - * that disabled it must re-enable it. - * Return 0 if it is re-enabled for all cpus. + * The BAU is disabled for this uvhub. When the disabled time period has + * expired re-enable it. + * Return 0 if it is re-enabled for all cpus on this uvhub.   
*/  static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)  {  	int tcpu;  	struct bau_control *tbcp; +	struct bau_control *hmaster; -	if (bcp->set_bau_off) { -		if (get_cycles() >= bcp->set_bau_on_time) { -			stat->s_bau_reenabled++; -			baudisabled = 0; -			for_each_present_cpu(tcpu) { -				tbcp = &per_cpu(bau_control, tcpu); +	hmaster = bcp->uvhub_master; +	spin_lock(&hmaster->disable_lock); +	if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) { +		stat->s_bau_reenabled++; +		for_each_present_cpu(tcpu) { +			tbcp = &per_cpu(bau_control, tcpu); +			if (tbcp->uvhub_master == hmaster) {  				tbcp->baudisabled = 0;  				tbcp->period_requests = 0;  				tbcp->period_time = 0; +				tbcp->period_giveups = 0;  			} -			return 0;  		} +		spin_unlock(&hmaster->disable_lock); +		return 0;  	} +	spin_unlock(&hmaster->disable_lock);  	return -1;  } @@ -1068,8 +1055,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,   * done.  The returned pointer is valid till preemption is re-enabled.   */  const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, -				struct mm_struct *mm, unsigned long va, -				unsigned int cpu) +				struct mm_struct *mm, unsigned long start, +				unsigned end, unsigned int cpu)  {  	int locals = 0;  	int remotes = 0; @@ -1078,18 +1065,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,  	struct cpumask *flush_mask;  	struct ptc_stats *stat;  	struct bau_control *bcp; - -	/* kernel was booted 'nobau' */ -	if (nobau) -		return cpumask; +	unsigned long descriptor_status; +	unsigned long status;  	bcp = &per_cpu(bau_control, cpu);  	stat = bcp->statp; +	stat->s_enters++; + +	if (bcp->nobau) +		return cpumask; + +	if (bcp->busy) { +		descriptor_status = +			read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0); +		status = ((descriptor_status >> (bcp->uvhub_cpu * +			UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1; +		if (status == UV2H_DESC_BUSY) +			return cpumask; +		bcp->busy = 0; +	}  	/* bau was disabled due to slow response */  	if (bcp->baudisabled) { -		if (check_enable(bcp, stat)) +		if (check_enable(bcp, stat)) { +			stat->s_ipifordisabled++;  			return cpumask; +		}  	}  	/* @@ -1105,38 +1106,40 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,  		stat->s_ntargself++;  	bau_desc = bcp->descriptor_base; -	bau_desc += (ITEMS_PER_DESC * bcp->using_desc); +	bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);  	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);  	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))  		return NULL;  	record_send_statistics(stat, locals, hubs, remotes, bau_desc); -	bau_desc->payload.address = va; +	bau_desc->payload.address = start;  	bau_desc->payload.sending_cpu = cpu;  	/*  	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,  	 * or 1 if it gave up and the original cpumask should be returned.  	 */ -	if (!uv_flush_send_and_wait(flush_mask, bcp)) +	if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))  		return NULL;  	else  		return cpumask;  }  /* - * Search the message queue for any 'other' message with the same software - * acknowledge resource bit vector. + * Search the message queue for any 'other' unprocessed message with the + * same software acknowledge resource bit vector as the 'msg' message.   
*/  struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, -			struct bau_control *bcp, unsigned char swack_vec) +					   struct bau_control *bcp)  {  	struct bau_pq_entry *msg_next = msg + 1; +	unsigned char swack_vec = msg->swack_vec;  	if (msg_next > bcp->queue_last)  		msg_next = bcp->queue_first; -	while ((msg_next->swack_vec != 0) && (msg_next != msg)) { -		if (msg_next->swack_vec == swack_vec) +	while (msg_next != msg) { +		if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) && +				(msg_next->swack_vec == swack_vec))  			return msg_next;  		msg_next++;  		if (msg_next > bcp->queue_last) @@ -1165,32 +1168,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)  		 * This message was assigned a swack resource, but no  		 * reserved acknowlegment is pending.  		 * The bug has prevented this message from setting the MMR. -		 * And no other message has used the same sw_ack resource. -		 * Do the requested shootdown but do not reply to the msg. -		 * (the 0 means make no acknowledge)  		 */ -		bau_process_message(mdp, bcp, 0); -		return; -	} - -	/* -	 * Some message has set the MMR 'pending' bit; it might have been -	 * another message.  Look for that message. -	 */ -	other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); -	if (other_msg) { -		/* There is another.  Do not ack the current one. */ -		bau_process_message(mdp, bcp, 0);  		/* -		 * Let the natural processing of that message acknowledge -		 * it. Don't get the processing of sw_ack's out of order. +		 * Some message has set the MMR 'pending' bit; it might have +		 * been another message.  Look for that message.  		 */ -		return; +		other_msg = find_another_by_swack(msg, bcp); +		if (other_msg) { +			/* +			 * There is another. Process this one but do not +			 * ack it. +			 */ +			bau_process_message(mdp, bcp, 0); +			/* +			 * Let the natural processing of that other message +			 * acknowledge it. Don't get the processing of sw_ack's +			 * out of order. +			 */ +			return; +		}  	}  	/* -	 * There is no other message using this sw_ack, so it is safe to -	 * acknowledge it. +	 * Either the MMR shows this one pending a reply or there is no +	 * other message using this sw_ack, so it is safe to acknowledge it.  	 
*/  	bau_process_message(mdp, bcp, 1); @@ -1295,7 +1296,8 @@ static void __init enable_timeouts(void)  		 */  		mmr_image |= (1L << SOFTACK_MSHIFT);  		if (is_uv2_hub()) { -			mmr_image |= (1L << UV2_EXT_SHFT); +			/* hw bug workaround; do not use extended status */ +			mmr_image &= ~(1L << UV2_EXT_SHFT);  		}  		write_mmr_misc_control(pnode, mmr_image);  	} @@ -1338,29 +1340,34 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec)  static int ptc_seq_show(struct seq_file *file, void *data)  {  	struct ptc_stats *stat; +	struct bau_control *bcp;  	int cpu;  	cpu = *(loff_t *)data;  	if (!cpu) {  		seq_printf(file, -			"# cpu sent stime self locals remotes ncpus localhub "); +		 "# cpu bauoff sent stime self locals remotes ncpus localhub ");  		seq_printf(file,  			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");  		seq_printf(file, -		    "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); +			"numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); +		seq_printf(file, +			"rok resetp resett giveup sto bz throt disable ");  		seq_printf(file, -			"resetp resett giveup sto bz throt swack recv rtime "); +			"enable wars warshw warwaits enters ipidis plugged ");  		seq_printf(file, -			"all one mult none retry canc nocan reset rcan "); +			"ipiover glim cong swack recv rtime all one mult ");  		seq_printf(file, -			"disable enable wars warshw warwaits\n"); +			"none retry canc nocan reset rcan\n");  	}  	if (cpu < num_possible_cpus() && cpu_online(cpu)) { -		stat = &per_cpu(ptcstats, cpu); +		bcp = &per_cpu(bau_control, cpu); +		stat = bcp->statp;  		/* source side statistics */  		seq_printf(file, -			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", -			   cpu, stat->s_requestor, cycles_2_us(stat->s_time), +			"cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", +			   cpu, bcp->nobau, stat->s_requestor, +			   cycles_2_us(stat->s_time),  			   stat->s_ntargself, stat->s_ntarglocals,  			   stat->s_ntargremotes, stat->s_ntargcpu,  			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, @@ -1374,20 +1381,23 @@ static int ptc_seq_show(struct seq_file *file, void *data)  			   stat->s_resets_plug, stat->s_resets_timeout,  			   stat->s_giveup, stat->s_stimeout,  			   stat->s_busy, stat->s_throttles); +		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", +			   stat->s_bau_disabled, stat->s_bau_reenabled, +			   stat->s_uv2_wars, stat->s_uv2_wars_hw, +			   stat->s_uv2_war_waits, stat->s_enters, +			   stat->s_ipifordisabled, stat->s_plugged, +			   stat->s_overipilimit, stat->s_giveuplimit, +			   stat->s_congested);  		/* destination side statistics */  		seq_printf(file, -			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", +			"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",  			   read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),  			   stat->d_requestee, cycles_2_us(stat->d_time),  			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,  			   stat->d_nomsg, stat->d_retries, stat->d_canceled,  			   stat->d_nocanceled, stat->d_resets,  			   stat->d_rcanceled); -		seq_printf(file, "%ld %ld %ld %ld %ld\n", -			stat->s_bau_disabled, stat->s_bau_reenabled, -			stat->s_uv2_wars, stat->s_uv2_wars_hw, -			stat->s_uv2_war_waits);  	}  	return 0;  } @@ -1401,13 +1411,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,  	char *buf;  	int ret; -	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", -		"max_concur plugged_delay plugsb4reset", -		"timeoutsb4reset ipi_reset_limit complete_threshold", -		
"congested_response_us congested_reps congested_period", +	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n", +		"max_concur plugged_delay plugsb4reset timeoutsb4reset", +		"ipi_reset_limit complete_threshold congested_response_us", +		"congested_reps disabled_period giveup_limit",  		max_concurr, plugged_delay, plugsb4reset,  		timeoutsb4reset, ipi_reset_limit, complete_threshold, -		congested_respns_us, congested_reps, congested_period); +		congested_respns_us, congested_reps, disabled_period, +		giveup_limit);  	if (!buf)  		return -ENOMEM; @@ -1438,6 +1449,14 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,  		return -EFAULT;  	optstr[count - 1] = '\0'; +	if (!strcmp(optstr, "on")) { +		set_bau_on(); +		return count; +	} else if (!strcmp(optstr, "off")) { +		set_bau_off(); +		return count; +	} +  	if (strict_strtol(optstr, 10, &input_arg) < 0) {  		printk(KERN_DEBUG "%s is invalid\n", optstr);  		return -EINVAL; @@ -1570,7 +1589,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,  		bcp->complete_threshold =	complete_threshold;  		bcp->cong_response_us =		congested_respns_us;  		bcp->cong_reps =		congested_reps; -		bcp->cong_period =		congested_period; +		bcp->disabled_period =		sec_2_cycles(disabled_period); +		bcp->giveup_limit =		giveup_limit;  	}  	return count;  } @@ -1699,6 +1719,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)  			 *   fairness chaining multilevel count replied_to  			 */  		} else { +			/* +			 * BIOS uses legacy mode, but UV2 hardware always +			 * uses native mode for selective broadcasts. +			 */  			uv2_hdr = &bd2->header.uv2_hdr;  			uv2_hdr->swack_flag =	1;  			uv2_hdr->base_dest_nasid = @@ -1811,8 +1835,8 @@ static int calculate_destination_timeout(void)  		index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;  		mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);  		mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; -		base = timeout_base_ns[index]; -		ts_ns = base * mult1 * mult2; +		ts_ns = timeout_base_ns[index]; +		ts_ns *= (mult1 * mult2);  		ret = ts_ns / 1000;  	} else {  		/* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */ @@ -1836,6 +1860,8 @@ static void __init init_per_cpu_tunables(void)  	for_each_present_cpu(cpu) {  		bcp = &per_cpu(bau_control, cpu);  		bcp->baudisabled		= 0; +		if (nobau) +			bcp->nobau		= 1;  		bcp->statp			= &per_cpu(ptcstats, cpu);  		/* time interval to catch a hardware stay-busy bug */  		bcp->timeout_interval		= usec_2_cycles(2*timeout_us); @@ -1848,10 +1874,11 @@ static void __init init_per_cpu_tunables(void)  		bcp->complete_threshold		= complete_threshold;  		bcp->cong_response_us		= congested_respns_us;  		bcp->cong_reps			= congested_reps; -		bcp->cong_period		= congested_period; -		bcp->clocks_per_100_usec =	usec_2_cycles(100); +		bcp->disabled_period =		sec_2_cycles(disabled_period); +		bcp->giveup_limit =		giveup_limit;  		spin_lock_init(&bcp->queue_lock);  		spin_lock_init(&bcp->uvhub_lock); +		spin_lock_init(&bcp->disable_lock);  	}  } @@ -1972,7 +1999,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,  		}  		bcp->uvhub_master = *hmasterp;  		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; -		bcp->using_desc = bcp->uvhub_cpu;  		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {  			printk(KERN_EMERG "%d cpus per uvhub invalid\n",  				bcp->uvhub_cpu); @@ -2069,16 +2095,12 @@ static int __init uv_bau_init(void)  	if (!is_uv_system())  		return 
0; -	if (nobau) -		return 0; -  	for_each_possible_cpu(cur_cpu) {  		mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);  		zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));  	}  	nuvhubs = uv_num_possible_blades(); -	spin_lock_init(&disable_lock);  	congested_cycles = usec_2_cycles(congested_respns_us);  	uv_base_pnode = 0x7fffffff; @@ -2091,7 +2113,8 @@ static int __init uv_bau_init(void)  	enable_timeouts();  	if (init_per_cpu(nuvhubs, uv_base_pnode)) { -		nobau = 1; +		set_bau_off(); +		nobau_perm = 1;  		return 0;  	} diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index f25c2765a5c9..acf7752da952 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	unsigned long mmr_value;  	struct uv_IO_APIC_route_entry *entry;  	int mmr_pnode, err; +	unsigned int dest;  	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=  			sizeof(unsigned long)); @@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	if (err != 0)  		return err; +	err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); +	if (err != 0) +		return err; +  	if (limit == UV_AFFINITY_CPU)  		irq_set_status_flags(irq, IRQ_NO_BALANCING);  	else @@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	entry->polarity		= 0;  	entry->trigger		= 0;  	entry->mask		= 0; -	entry->dest		= apic->cpu_mask_to_apicid(eligible_cpu); +	entry->dest		= dest;  	mmr_pnode = uv_blade_to_pnode(mmr_blade);  	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -222,7 +227,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,  	if (cfg->move_in_progress)  		send_cleanup_vector(cfg); -	return 0; +	return IRQ_SET_MASK_OK_NOCOPY;  }  /* diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 5b84a2d30888..b2d534cab25f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -22,7 +22,7 @@ wakeup-objs	+= video-bios.o  realmode-y			+= header.o  realmode-y			+= trampoline_$(BITS).o  realmode-y			+= stack.o -realmode-$(CONFIG_X86_32)	+= reboot_32.o +realmode-y			+= reboot.o  realmode-$(CONFIG_ACPI_SLEEP)	+= $(wakeup-objs)  targets	+= $(realmode-y) diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index fadf48378ada..a28221d94e69 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -6,6 +6,7 @@  #include <linux/linkage.h>  #include <asm/page_types.h> +#include <asm/segment.h>  #include "realmode.h" @@ -28,8 +29,9 @@ GLOBAL(real_mode_header)  	.long	pa_wakeup_header  #endif  	/* APM/BIOS reboot */ -#ifdef CONFIG_X86_32  	.long	pa_machine_real_restart_asm +#ifdef CONFIG_X86_64 +	.long	__KERNEL32_CS  #endif  END(real_mode_header) diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot.S index 114044876b3d..f932ea61d1c8 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot.S @@ -2,6 +2,8 @@  #include <linux/init.h>  #include <asm/segment.h>  #include <asm/page_types.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h>  #include "realmode.h"  /* @@ -12,13 +14,35 @@   * doesn't work with at least one type of 486 motherboard.  It is easy   * to stop this code working; hence the copious comments.   * - * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. 
+ * This code is called with the restart type (0 = BIOS, 1 = APM) in + * the primary argument register (%eax for 32 bit, %edi for 64 bit).   */  	.section ".text32", "ax"  	.code32 - -	.balign	16  ENTRY(machine_real_restart_asm) + +#ifdef CONFIG_X86_64 +	/* Switch to trampoline GDT as it is guaranteed < 4 GiB */ +	movl	$__KERNEL_DS, %eax +	movl	%eax, %ds +	lgdtl	pa_tr_gdt + +	/* Disable paging to drop us out of long mode */ +	movl	%cr0, %eax +	andl	$~X86_CR0_PG, %eax +	movl	%eax, %cr0 +	ljmpl	$__KERNEL32_CS, $pa_machine_real_restart_paging_off + +GLOBAL(machine_real_restart_paging_off) +	xorl	%eax, %eax +	xorl	%edx, %edx +	movl	$MSR_EFER, %ecx +	wrmsr + +	movl	%edi, %eax +	 +#endif /* CONFIG_X86_64 */ +	  	/* Set up the IDT for real mode. */  	lidtl	pa_machine_real_restart_idt diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 66e6d9359826..0faad646f5fd 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -205,9 +205,9 @@ void syscall32_cpu_init(void)  {  	/* Load these always in case some future AMD CPU supports  	   SYSENTER from compat mode too. */ -	checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); -	checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); -	checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); +	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); +	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);  	wrmsrl(MSR_CSTAR, ia32_cstar_target);  } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f1814fc2cb77..9642d4a38602 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1156,9 +1156,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {  	.wbinvd = native_wbinvd,  	.read_msr = native_read_msr_safe, -	.rdmsr_regs = native_rdmsr_safe_regs,  	.write_msr = xen_write_msr_safe, -	.wrmsr_regs = native_wrmsr_safe_regs,  	.read_tsc = native_read_tsc,  	.read_pmc = native_read_pmc, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 27336dfcda8e..b65a76133f4f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1256,7 +1256,8 @@ static void xen_flush_tlb_single(unsigned long addr)  }  static void xen_flush_tlb_others(const struct cpumask *cpus, -				 struct mm_struct *mm, unsigned long va) +				 struct mm_struct *mm, unsigned long start, +				 unsigned long end)  {  	struct {  		struct mmuext_op op; @@ -1268,7 +1269,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,  	} *args;  	struct multicall_space mcs; -	trace_xen_mmu_flush_tlb_others(cpus, mm, va); +	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);  	if (cpumask_empty(cpus))  		return;		/* nothing to do */ @@ -1281,11 +1282,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,  	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);  	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); -	if (va == TLB_FLUSH_ALL) { -		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; -	} else { +	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; +	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {  		args->op.cmd = MMUEXT_INVLPG_MULTI; -		args->op.arg1.linear_addr = va; +		args->op.arg1.linear_addr = start;  	}  	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index afb250d22a6b..f58dca7a6e52 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void)  	notify_cpu_starting(cpu); -	ipi_call_lock();  	
set_cpu_online(cpu, true); -	ipi_call_unlock();  	this_cpu_write(cpu_state, CPU_ONLINE);