Diffstat (limited to 'arch/x86/kvm')
 arch/x86/kvm/emulate.c        |  749
 arch/x86/kvm/i8254.c          |  146
 arch/x86/kvm/i8254.h          |    4
 arch/x86/kvm/i8259.c          |   48
 arch/x86/kvm/irq.c            |    2
 arch/x86/kvm/irq.h            |    4
 arch/x86/kvm/kvm_cache_regs.h |    8
 arch/x86/kvm/lapic.c          |   17
 arch/x86/kvm/mmu.c            |  807
 arch/x86/kvm/mmutrace.h       |    2
 arch/x86/kvm/paging_tmpl.h    |  252
 arch/x86/kvm/svm.c            |  147
 arch/x86/kvm/timer.c          |   16
 arch/x86/kvm/vmx.c            |  261
 arch/x86/kvm/x86.c            | 1176
 arch/x86/kvm/x86.h            |    7
 16 files changed, 2173 insertions(+), 1473 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5ac0bb465ed6..b38bd8b92aa6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,6 +9,7 @@   * privileged instructions:   *   * Copyright (C) 2006 Qumranet + * Copyright 2010 Red Hat, Inc. and/or its affilates.   *   *   Avi Kivity <avi@qumranet.com>   *   Yaniv Kamay <yaniv@qumranet.com> @@ -67,6 +68,9 @@  #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */  #define SrcImmU     (9<<4)      /* Immediate operand, unsigned */  #define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */ +#define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */ +#define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */ +#define SrcAcc      (0xd<<4)	/* Source Accumulator */  #define SrcMask     (0xf<<4)  /* Generic ModRM decode. */  #define ModRM       (1<<8) @@ -88,10 +92,6 @@  #define Src2CL      (1<<29)  #define Src2ImmByte (2<<29)  #define Src2One     (3<<29) -#define Src2Imm16   (4<<29) -#define Src2Mem16   (5<<29) /* Used for Ep encoding. First argument has to be -			       in memory and second argument is located -			       immediately after the first one in memory. */  #define Src2Mask    (7<<29)  enum { @@ -124,15 +124,15 @@ static u32 opcode_table[256] = {  	/* 0x20 - 0x27 */  	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -	DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, +	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,  	/* 0x28 - 0x2F */  	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -	0, 0, 0, 0, +	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,  	/* 0x30 - 0x37 */  	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -	0, 0, 0, 0, +	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,  	/* 0x38 - 0x3F */  	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,  	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -170,20 +170,20 @@ static u32 opcode_table[256] = {  	/* 0x88 - 0x8F */  	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,  	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, -	DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, -	DstReg | SrcMem | ModRM | Mov, Group | Group1A, +	DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, +	ImplicitOps | SrcMem16 | ModRM, Group | Group1A,  	/* 0x90 - 0x97 */  	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,  	/* 0x98 - 0x9F */ -	0, 0, SrcImm | Src2Imm16 | No64, 0, +	0, 0, SrcImmFAddr | No64, 0,  	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,  	/* 0xA0 - 0xA7 */ -	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, -	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, +	ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, +	ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,  	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,  	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,  	/* 0xA8 - 0xAF */ -	0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, +	DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,  	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,  	ByteOp | DstDI | String, DstDI | String,  	/* 0xB0 - 0xB7 */ @@ 
-215,7 +215,7 @@ static u32 opcode_table[256] = {  	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,  	/* 0xE8 - 0xEF */  	SrcImm | Stack, SrcImm | ImplicitOps, -	SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, +	SrcImmFAddr | No64, SrcImmByte | ImplicitOps,  	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,  	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,  	/* 0xF0 - 0xF7 */ @@ -337,20 +337,20 @@ static u32 group_table[] = {  	[Group1A*8] =  	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,  	[Group3_Byte*8] = -	ByteOp | SrcImm | DstMem | ModRM, 0, +	ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,  	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,  	0, 0, 0, 0,  	[Group3*8] = -	DstMem | SrcImm | ModRM, 0, +	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,  	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,  	0, 0, 0, 0,  	[Group4*8] = -	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, +	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,  	0, 0, 0, 0, 0, 0,  	[Group5*8] = -	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, +	DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,  	SrcMem | ModRM | Stack, 0, -	SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, +	SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,  	SrcMem | ModRM | Stack, 0,  	[Group7*8] =  	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, @@ -576,6 +576,13 @@ static u32 group2_table[] = {  	(_type)_x;							\  }) +#define insn_fetch_arr(_arr, _size, _eip)                                \ +({	rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));		\ +	if (rc != X86EMUL_CONTINUE)					\ +		goto done;						\ +	(_eip) += (_size);						\ +}) +  static inline unsigned long ad_mask(struct decode_cache *c)  {  	return (1UL << (c->ad_bytes << 3)) - 1; @@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)  	c->seg_override = seg;  } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, +			      struct x86_emulate_ops *ops, int seg)  {  	if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)  		return 0; -	return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); +	return ops->get_cached_segment_base(seg, ctxt->vcpu);  }  static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, +				       struct x86_emulate_ops *ops,  				       struct decode_cache *c)  {  	if (!c->has_seg_override)  		return 0; -	return seg_base(ctxt, c->seg_override); +	return seg_base(ctxt, ops, c->seg_override); +} + +static unsigned long es_base(struct x86_emulate_ctxt *ctxt, +			     struct x86_emulate_ops *ops) +{ +	return seg_base(ctxt, ops, VCPU_SREG_ES); +} + +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, +			     struct x86_emulate_ops *ops) +{ +	return seg_base(ctxt, ops, VCPU_SREG_SS); +} + +static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, +				      u32 error, bool valid) +{ +	ctxt->exception = vec; +	ctxt->error_code = error; +	ctxt->error_code_valid = valid; +	ctxt->restart = false; +} + +static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) +{ +	emulate_exception(ctxt, GP_VECTOR, err, true);  } -static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, +		       int err)  { -	return seg_base(ctxt, VCPU_SREG_ES); +	ctxt->cr2 = addr; +	emulate_exception(ctxt, 
PF_VECTOR, err, true);  } -static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +static void emulate_ud(struct x86_emulate_ctxt *ctxt)  { -	return seg_base(ctxt, VCPU_SREG_SS); +	emulate_exception(ctxt, UD_VECTOR, 0, false); +} + +static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) +{ +	emulate_exception(ctxt, TS_VECTOR, err, true);  }  static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, @@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  	/* we cannot decode insn before we complete previous rep insn */  	WARN_ON(ctxt->restart); -	/* Shadow copy of register state. Committed on successful emulation. */ -	memset(c, 0, sizeof(struct decode_cache));  	c->eip = ctxt->eip;  	c->fetch.start = c->fetch.end = c->eip; -	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); -	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); +	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);  	switch (mode) {  	case X86EMUL_MODE_REAL: @@ -1060,7 +1099,7 @@ done_prefixes:  		set_seg_override(c, VCPU_SREG_DS);  	if (!(!c->twobyte && c->b == 0x8d)) -		c->modrm_ea += seg_override_base(ctxt, c); +		c->modrm_ea += seg_override_base(ctxt, ops, c);  	if (c->ad_bytes != 8)  		c->modrm_ea = (u32)c->modrm_ea; @@ -1148,6 +1187,25 @@ done_prefixes:  		else  			c->src.val = insn_fetch(u8, 1, c->eip);  		break; +	case SrcAcc: +		c->src.type = OP_REG; +		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; +		c->src.ptr = &c->regs[VCPU_REGS_RAX]; +		switch (c->src.bytes) { +			case 1: +				c->src.val = *(u8 *)c->src.ptr; +				break; +			case 2: +				c->src.val = *(u16 *)c->src.ptr; +				break; +			case 4: +				c->src.val = *(u32 *)c->src.ptr; +				break; +			case 8: +				c->src.val = *(u64 *)c->src.ptr; +				break; +		} +		break;  	case SrcOne:  		c->src.bytes = 1;  		c->src.val = 1; @@ -1156,10 +1214,21 @@ done_prefixes:  		c->src.type = OP_MEM;  		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;  		c->src.ptr = (unsigned long *) -			register_address(c,  seg_override_base(ctxt, c), +			register_address(c,  seg_override_base(ctxt, ops, c),  					 c->regs[VCPU_REGS_RSI]);  		c->src.val = 0;  		break; +	case SrcImmFAddr: +		c->src.type = OP_IMM; +		c->src.ptr = (unsigned long *)c->eip; +		c->src.bytes = c->op_bytes + 2; +		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); +		break; +	case SrcMemFAddr: +		c->src.type = OP_MEM; +		c->src.ptr = (unsigned long *)c->modrm_ea; +		c->src.bytes = c->op_bytes + 2; +		break;  	}  	/* @@ -1179,22 +1248,10 @@ done_prefixes:  		c->src2.bytes = 1;  		c->src2.val = insn_fetch(u8, 1, c->eip);  		break; -	case Src2Imm16: -		c->src2.type = OP_IMM; -		c->src2.ptr = (unsigned long *)c->eip; -		c->src2.bytes = 2; -		c->src2.val = insn_fetch(u16, 2, c->eip); -		break;  	case Src2One:  		c->src2.bytes = 1;  		c->src2.val = 1;  		break; -	case Src2Mem16: -		c->src2.type = OP_MEM; -		c->src2.bytes = 2; -		c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); -		c->src2.val = 0; -		break;  	}  	/* Decode and fetch the destination operand: register or memory. */ @@ -1253,7 +1310,7 @@ done_prefixes:  		c->dst.type = OP_MEM;  		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;  		c->dst.ptr = (unsigned long *) -			register_address(c, es_base(ctxt), +			register_address(c, es_base(ctxt, ops),  					 c->regs[VCPU_REGS_RDI]);  		c->dst.val = 0;  		break; @@ -1263,6 +1320,37 @@ done:  	return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0;  } +static int read_emulated(struct x86_emulate_ctxt *ctxt, +			 struct x86_emulate_ops *ops, +			 unsigned long addr, void *dest, unsigned size) +{ +	int rc; +	struct read_cache *mc = &ctxt->decode.mem_read; +	u32 err; + +	while (size) { +		int n = min(size, 8u); +		size -= n; +		if (mc->pos < mc->end) +			goto read_cached; + +		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, +					ctxt->vcpu); +		if (rc == X86EMUL_PROPAGATE_FAULT) +			emulate_pf(ctxt, addr, err); +		if (rc != X86EMUL_CONTINUE) +			return rc; +		mc->end += n; + +	read_cached: +		memcpy(dest, mc->data + mc->pos, n); +		mc->pos += n; +		dest += n; +		addr += n; +	} +	return X86EMUL_CONTINUE; +} +  static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,  			   struct x86_emulate_ops *ops,  			   unsigned int size, unsigned short port, @@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,  	get_descriptor_table_ptr(ctxt, ops, selector, &dt);  	if (dt.size < index * 8 + 7) { -		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); +		emulate_gp(ctxt, selector & 0xfffc);  		return X86EMUL_PROPAGATE_FAULT;  	}  	addr = dt.address + index * 8;  	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);  	if (ret == X86EMUL_PROPAGATE_FAULT) -		kvm_inject_page_fault(ctxt->vcpu, addr, err); +		emulate_pf(ctxt, addr, err);         return ret;  } @@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,  	get_descriptor_table_ptr(ctxt, ops, selector, &dt);  	if (dt.size < index * 8 + 7) { -		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); +		emulate_gp(ctxt, selector & 0xfffc);  		return X86EMUL_PROPAGATE_FAULT;  	}  	addr = dt.address + index * 8;  	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);  	if (ret == X86EMUL_PROPAGATE_FAULT) -		kvm_inject_page_fault(ctxt->vcpu, addr, err); +		emulate_pf(ctxt, addr, err);  	return ret;  } @@ -1481,11 +1569,70 @@ load:  	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);  	return X86EMUL_CONTINUE;  exception: -	kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); +	emulate_exception(ctxt, err_vec, err_code, true);  	return X86EMUL_PROPAGATE_FAULT;  } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +static inline int writeback(struct x86_emulate_ctxt *ctxt, +			    struct x86_emulate_ops *ops) +{ +	int rc; +	struct decode_cache *c = &ctxt->decode; +	u32 err; + +	switch (c->dst.type) { +	case OP_REG: +		/* The 4-byte case *is* correct: +		 * in 64-bit mode we zero-extend. 
+		 */ +		switch (c->dst.bytes) { +		case 1: +			*(u8 *)c->dst.ptr = (u8)c->dst.val; +			break; +		case 2: +			*(u16 *)c->dst.ptr = (u16)c->dst.val; +			break; +		case 4: +			*c->dst.ptr = (u32)c->dst.val; +			break;	/* 64b: zero-ext */ +		case 8: +			*c->dst.ptr = c->dst.val; +			break; +		} +		break; +	case OP_MEM: +		if (c->lock_prefix) +			rc = ops->cmpxchg_emulated( +					(unsigned long)c->dst.ptr, +					&c->dst.orig_val, +					&c->dst.val, +					c->dst.bytes, +					&err, +					ctxt->vcpu); +		else +			rc = ops->write_emulated( +					(unsigned long)c->dst.ptr, +					&c->dst.val, +					c->dst.bytes, +					&err, +					ctxt->vcpu); +		if (rc == X86EMUL_PROPAGATE_FAULT) +			emulate_pf(ctxt, +					      (unsigned long)c->dst.ptr, err); +		if (rc != X86EMUL_CONTINUE) +			return rc; +		break; +	case OP_NONE: +		/* no writeback */ +		break; +	default: +		break; +	} +	return X86EMUL_CONTINUE; +} + +static inline void emulate_push(struct x86_emulate_ctxt *ctxt, +				struct x86_emulate_ops *ops)  {  	struct decode_cache *c = &ctxt->decode; @@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)  	c->dst.bytes = c->op_bytes;  	c->dst.val = c->src.val;  	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); -	c->dst.ptr = (void *) register_address(c, ss_base(ctxt), +	c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),  					       c->regs[VCPU_REGS_RSP]);  } @@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,  	struct decode_cache *c = &ctxt->decode;  	int rc; -	rc = ops->read_emulated(register_address(c, ss_base(ctxt), -						 c->regs[VCPU_REGS_RSP]), -				dest, len, ctxt->vcpu); +	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), +						       c->regs[VCPU_REGS_RSP]), +			   dest, len);  	if (rc != X86EMUL_CONTINUE)  		return rc; @@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,  		break;  	case X86EMUL_MODE_VM86:  		if (iopl < 3) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			return X86EMUL_PROPAGATE_FAULT;  		}  		change_mask |= EFLG_IF; @@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,  	return rc;  } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, +			      struct x86_emulate_ops *ops, int seg)  {  	struct decode_cache *c = &ctxt->decode; -	struct kvm_segment segment; -	kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); +	c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); -	c->src.val = segment.selector; -	emulate_push(ctxt); +	emulate_push(ctxt, ops);  }  static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,  	return rc;  } -static void emulate_pusha(struct x86_emulate_ctxt *ctxt) +static int emulate_pusha(struct x86_emulate_ctxt *ctxt, +			  struct x86_emulate_ops *ops)  {  	struct decode_cache *c = &ctxt->decode;  	unsigned long old_esp = c->regs[VCPU_REGS_RSP]; +	int rc = X86EMUL_CONTINUE;  	int reg = VCPU_REGS_RAX;  	while (reg <= VCPU_REGS_RDI) {  		(reg == VCPU_REGS_RSP) ?  		(c->src.val = old_esp) : (c->src.val = c->regs[reg]); -		emulate_push(ctxt); +		emulate_push(ctxt, ops); + +		rc = writeback(ctxt, ops); +		if (rc != X86EMUL_CONTINUE) +			return rc; +  		++reg;  	} + +	/* Disable writeback. 
*/ +	c->dst.type = OP_NONE; + +	return rc;  }  static int emulate_popa(struct x86_emulate_ctxt *ctxt, @@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,  		old_eip = c->eip;  		c->eip = c->src.val;  		c->src.val = old_eip; -		emulate_push(ctxt); +		emulate_push(ctxt, ops);  		break;  	}  	case 4: /* jmp abs */  		c->eip = c->src.val;  		break;  	case 6:	/* push */ -		emulate_push(ctxt); +		emulate_push(ctxt, ops);  		break;  	}  	return X86EMUL_CONTINUE; @@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,  	return rc;  } -static inline int writeback(struct x86_emulate_ctxt *ctxt, -			    struct x86_emulate_ops *ops) -{ -	int rc; -	struct decode_cache *c = &ctxt->decode; - -	switch (c->dst.type) { -	case OP_REG: -		/* The 4-byte case *is* correct: -		 * in 64-bit mode we zero-extend. -		 */ -		switch (c->dst.bytes) { -		case 1: -			*(u8 *)c->dst.ptr = (u8)c->dst.val; -			break; -		case 2: -			*(u16 *)c->dst.ptr = (u16)c->dst.val; -			break; -		case 4: -			*c->dst.ptr = (u32)c->dst.val; -			break;	/* 64b: zero-ext */ -		case 8: -			*c->dst.ptr = c->dst.val; -			break; -		} -		break; -	case OP_MEM: -		if (c->lock_prefix) -			rc = ops->cmpxchg_emulated( -					(unsigned long)c->dst.ptr, -					&c->dst.orig_val, -					&c->dst.val, -					c->dst.bytes, -					ctxt->vcpu); -		else -			rc = ops->write_emulated( -					(unsigned long)c->dst.ptr, -					&c->dst.val, -					c->dst.bytes, -					ctxt->vcpu); -		if (rc != X86EMUL_CONTINUE) -			return rc; -		break; -	case OP_NONE: -		/* no writeback */ -		break; -	default: -		break; -	} -	return X86EMUL_CONTINUE; -} - -static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) -{ -	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); -	/* -	 * an sti; sti; sequence only disable interrupts for the first -	 * instruction. So, if the last instruction, be it emulated or -	 * not, left the system with the INT_STI flag enabled, it -	 * means that the last instruction is an sti. We should not -	 * leave the flag on in this case. 
The same goes for mov ss -	 */ -	if (!(int_shadow & mask)) -		ctxt->interruptibility = mask; -} -  static inline void  setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, -	struct kvm_segment *cs, struct kvm_segment *ss) +			struct x86_emulate_ops *ops, struct desc_struct *cs, +			struct desc_struct *ss)  { -	memset(cs, 0, sizeof(struct kvm_segment)); -	kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); -	memset(ss, 0, sizeof(struct kvm_segment)); +	memset(cs, 0, sizeof(struct desc_struct)); +	ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); +	memset(ss, 0, sizeof(struct desc_struct));  	cs->l = 0;		/* will be adjusted later */ -	cs->base = 0;		/* flat segment */ +	set_desc_base(cs, 0);	/* flat segment */  	cs->g = 1;		/* 4kb granularity */ -	cs->limit = 0xffffffff;	/* 4GB limit */ +	set_desc_limit(cs, 0xfffff);	/* 4GB limit */  	cs->type = 0x0b;	/* Read, Execute, Accessed */  	cs->s = 1;  	cs->dpl = 0;		/* will be adjusted later */ -	cs->present = 1; -	cs->db = 1; +	cs->p = 1; +	cs->d = 1; -	ss->unusable = 0; -	ss->base = 0;		/* flat segment */ -	ss->limit = 0xffffffff;	/* 4GB limit */ +	set_desc_base(ss, 0);	/* flat segment */ +	set_desc_limit(ss, 0xfffff);	/* 4GB limit */  	ss->g = 1;		/* 4kb granularity */  	ss->s = 1;  	ss->type = 0x03;	/* Read/Write, Accessed */ -	ss->db = 1;		/* 32bit stack segment */ +	ss->d = 1;		/* 32bit stack segment */  	ss->dpl = 0; -	ss->present = 1; +	ss->p = 1;  }  static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) +emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  {  	struct decode_cache *c = &ctxt->decode; -	struct kvm_segment cs, ss; +	struct desc_struct cs, ss;  	u64 msr_data; +	u16 cs_sel, ss_sel;  	/* syscall is not available in real mode */  	if (ctxt->mode == X86EMUL_MODE_REAL ||  	    ctxt->mode == X86EMUL_MODE_VM86) { -		kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +		emulate_ud(ctxt);  		return X86EMUL_PROPAGATE_FAULT;  	} -	setup_syscalls_segments(ctxt, &cs, &ss); +	setup_syscalls_segments(ctxt, ops, &cs, &ss); -	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); +	ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);  	msr_data >>= 32; -	cs.selector = (u16)(msr_data & 0xfffc); -	ss.selector = (u16)(msr_data + 8); +	cs_sel = (u16)(msr_data & 0xfffc); +	ss_sel = (u16)(msr_data + 8);  	if (is_long_mode(ctxt->vcpu)) { -		cs.db = 0; +		cs.d = 0;  		cs.l = 1;  	} -	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); -	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); +	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); +	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); +	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); +	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);  	c->regs[VCPU_REGS_RCX] = c->eip;  	if (is_long_mode(ctxt->vcpu)) {  #ifdef CONFIG_X86_64  		c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; -		kvm_x86_ops->get_msr(ctxt->vcpu, -			ctxt->mode == X86EMUL_MODE_PROT64 ? -			MSR_LSTAR : MSR_CSTAR, &msr_data); +		ops->get_msr(ctxt->vcpu, +			     ctxt->mode == X86EMUL_MODE_PROT64 ? 
+			     MSR_LSTAR : MSR_CSTAR, &msr_data);  		c->eip = msr_data; -		kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); +		ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);  		ctxt->eflags &= ~(msr_data | EFLG_RF);  #endif  	} else {  		/* legacy mode */ -		kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); +		ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);  		c->eip = (u32)msr_data;  		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)  }  static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt) +emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  {  	struct decode_cache *c = &ctxt->decode; -	struct kvm_segment cs, ss; +	struct desc_struct cs, ss;  	u64 msr_data; +	u16 cs_sel, ss_sel;  	/* inject #GP if in real mode */  	if (ctxt->mode == X86EMUL_MODE_REAL) { -		kvm_inject_gp(ctxt->vcpu, 0); +		emulate_gp(ctxt, 0);  		return X86EMUL_PROPAGATE_FAULT;  	} @@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)  	* Therefore, we inject an #UD.  	*/  	if (ctxt->mode == X86EMUL_MODE_PROT64) { -		kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +		emulate_ud(ctxt);  		return X86EMUL_PROPAGATE_FAULT;  	} -	setup_syscalls_segments(ctxt, &cs, &ss); +	setup_syscalls_segments(ctxt, ops, &cs, &ss); -	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); +	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);  	switch (ctxt->mode) {  	case X86EMUL_MODE_PROT32:  		if ((msr_data & 0xfffc) == 0x0) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			return X86EMUL_PROPAGATE_FAULT;  		}  		break;  	case X86EMUL_MODE_PROT64:  		if (msr_data == 0x0) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			return X86EMUL_PROPAGATE_FAULT;  		}  		break;  	}  	ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); -	cs.selector = (u16)msr_data; -	cs.selector &= ~SELECTOR_RPL_MASK; -	ss.selector = cs.selector + 8; -	ss.selector &= ~SELECTOR_RPL_MASK; +	cs_sel = (u16)msr_data; +	cs_sel &= ~SELECTOR_RPL_MASK; +	ss_sel = cs_sel + 8; +	ss_sel &= ~SELECTOR_RPL_MASK;  	if (ctxt->mode == X86EMUL_MODE_PROT64  		|| is_long_mode(ctxt->vcpu)) { -		cs.db = 0; +		cs.d = 0;  		cs.l = 1;  	} -	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); -	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); +	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); +	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); +	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); +	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); -	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); +	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);  	c->eip = msr_data; -	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); +	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);  	c->regs[VCPU_REGS_RSP] = msr_data;  	return X86EMUL_CONTINUE;  }  static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt) +emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  {  	struct decode_cache *c = &ctxt->decode; -	struct kvm_segment cs, ss; +	struct desc_struct cs, ss;  	u64 msr_data;  	int usermode; +	u16 cs_sel, ss_sel;  	/* inject #GP if in real mode or Virtual 8086 mode */  	if (ctxt->mode == X86EMUL_MODE_REAL ||  	    ctxt->mode == X86EMUL_MODE_VM86) { -		kvm_inject_gp(ctxt->vcpu, 0); +		emulate_gp(ctxt, 0);  		return X86EMUL_PROPAGATE_FAULT;  	} -	setup_syscalls_segments(ctxt, &cs, &ss); +	
setup_syscalls_segments(ctxt, ops, &cs, &ss);  	if ((c->rex_prefix & 0x8) != 0x0)  		usermode = X86EMUL_MODE_PROT64; @@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)  	cs.dpl = 3;  	ss.dpl = 3; -	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); +	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);  	switch (usermode) {  	case X86EMUL_MODE_PROT32: -		cs.selector = (u16)(msr_data + 16); +		cs_sel = (u16)(msr_data + 16);  		if ((msr_data & 0xfffc) == 0x0) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			return X86EMUL_PROPAGATE_FAULT;  		} -		ss.selector = (u16)(msr_data + 24); +		ss_sel = (u16)(msr_data + 24);  		break;  	case X86EMUL_MODE_PROT64: -		cs.selector = (u16)(msr_data + 32); +		cs_sel = (u16)(msr_data + 32);  		if (msr_data == 0x0) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			return X86EMUL_PROPAGATE_FAULT;  		} -		ss.selector = cs.selector + 8; -		cs.db = 0; +		ss_sel = cs_sel + 8; +		cs.d = 0;  		cs.l = 1;  		break;  	} -	cs.selector |= SELECTOR_RPL_MASK; -	ss.selector |= SELECTOR_RPL_MASK; +	cs_sel |= SELECTOR_RPL_MASK; +	ss_sel |= SELECTOR_RPL_MASK; -	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); -	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); +	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); +	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); +	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); +	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); -	c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; -	c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; +	c->eip = c->regs[VCPU_REGS_RDX]; +	c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];  	return X86EMUL_CONTINUE;  } @@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,  					    struct x86_emulate_ops *ops,  					    u16 port, u16 len)  { -	struct kvm_segment tr_seg; +	struct desc_struct tr_seg;  	int r;  	u16 io_bitmap_ptr;  	u8 perm, bit_idx = port & 0x7;  	unsigned mask = (1 << len) - 1; -	kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); -	if (tr_seg.unusable) +	ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); +	if (!tr_seg.p)  		return false; -	if (tr_seg.limit < 103) +	if (desc_limit_scaled(&tr_seg) < 103)  		return false; -	r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, -			  NULL); +	r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, +			  ctxt->vcpu, NULL);  	if (r != X86EMUL_CONTINUE)  		return false; -	if (io_bitmap_ptr + port/8 > tr_seg.limit) +	if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))  		return false; -	r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, -			  ctxt->vcpu, NULL); +	r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, +			  &perm, 1, ctxt->vcpu, NULL);  	if (r != X86EMUL_CONTINUE)  		return false;  	if ((perm >> bit_idx) & mask) @@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,  	return true;  } -static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, -				      struct x86_emulate_ops *ops, -				      int seg) -{ -	struct desc_struct desc; -	if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) -		return get_desc_base(&desc); -	else -		return ~0; -} -  static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,  				struct x86_emulate_ops *ops,  				struct tss_segment_16 *tss) @@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,  		
	    &err);  	if (ret == X86EMUL_PROPAGATE_FAULT) {  		/* FIXME: need to provide precise fault address */ -		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); +		emulate_pf(ctxt, old_tss_base, err);  		return ret;  	} @@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,  			     &err);  	if (ret == X86EMUL_PROPAGATE_FAULT) {  		/* FIXME: need to provide precise fault address */ -		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); +		emulate_pf(ctxt, old_tss_base, err);  		return ret;  	} @@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,  			    &err);  	if (ret == X86EMUL_PROPAGATE_FAULT) {  		/* FIXME: need to provide precise fault address */ -		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); +		emulate_pf(ctxt, new_tss_base, err);  		return ret;  	} @@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,  				     ctxt->vcpu, &err);  		if (ret == X86EMUL_PROPAGATE_FAULT) {  			/* FIXME: need to provide precise fault address */ -			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); +			emulate_pf(ctxt, new_tss_base, err);  			return ret;  		}  	} @@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,  	struct decode_cache *c = &ctxt->decode;  	int ret; -	ops->set_cr(3, tss->cr3, ctxt->vcpu); +	if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { +		emulate_gp(ctxt, 0); +		return X86EMUL_PROPAGATE_FAULT; +	}  	c->eip = tss->eip;  	ctxt->eflags = tss->eflags | 2;  	c->regs[VCPU_REGS_RAX] = tss->eax; @@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,  			    &err);  	if (ret == X86EMUL_PROPAGATE_FAULT) {  		/* FIXME: need to provide precise fault address */ -		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); +		emulate_pf(ctxt, old_tss_base, err);  		return ret;  	} @@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,  			     &err);  	if (ret == X86EMUL_PROPAGATE_FAULT) {  		/* FIXME: need to provide precise fault address */ -		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); +		emulate_pf(ctxt, old_tss_base, err);  		return ret;  	} @@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,  			    &err);  	if (ret == X86EMUL_PROPAGATE_FAULT) {  		/* FIXME: need to provide precise fault address */ -		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); +		emulate_pf(ctxt, new_tss_base, err);  		return ret;  	} @@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,  				     ctxt->vcpu, &err);  		if (ret == X86EMUL_PROPAGATE_FAULT) {  			/* FIXME: need to provide precise fault address */ -			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); +			emulate_pf(ctxt, new_tss_base, err);  			return ret;  		}  	} @@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,  	int ret;  	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);  	ulong old_tss_base = -		get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); +		ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);  	u32 desc_limit;  	/* FIXME: old_tss_base == ~0 ? 
*/ @@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,  	if (reason != TASK_SWITCH_IRET) {  		if ((tss_selector & 3) > next_tss_desc.dpl ||  		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			return X86EMUL_PROPAGATE_FAULT;  		}  	} @@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,  	if (!next_tss_desc.p ||  	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||  	     desc_limit < 0x2b)) { -		kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, -				      tss_selector & 0xfffc); +		emulate_ts(ctxt, tss_selector & 0xfffc);  		return X86EMUL_PROPAGATE_FAULT;  	} @@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,  		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;  		c->lock_prefix = 0;  		c->src.val = (unsigned long) error_code; -		emulate_push(ctxt); +		emulate_push(ctxt, ops);  	}  	return ret; @@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,  	struct decode_cache *c = &ctxt->decode;  	int rc; -	memset(c, 0, sizeof(struct decode_cache));  	c->eip = ctxt->eip; -	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);  	c->dst.type = OP_NONE;  	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,  				     has_error_code, error_code);  	if (rc == X86EMUL_CONTINUE) { -		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); -		kvm_rip_write(ctxt->vcpu, c->eip);  		rc = writeback(ctxt, ops); +		if (rc == X86EMUL_CONTINUE) +			ctxt->eip = c->eip;  	}  	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; @@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  	int rc = X86EMUL_CONTINUE;  	int saved_dst_type = c->dst.type; -	ctxt->interruptibility = 0; - -	/* Shadow copy of register state. Committed on successful emulation. -	 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't -	 * modify them. 
-	 */ - -	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); +	ctxt->decode.mem_read.pos = 0;  	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { -		kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +		emulate_ud(ctxt);  		goto done;  	}  	/* LOCK prefix is allowed only with some instructions */  	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { -		kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +		emulate_ud(ctxt);  		goto done;  	}  	/* Privileged instruction can be executed only in CPL=0 */  	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { -		kvm_inject_gp(ctxt->vcpu, 0); +		emulate_gp(ctxt, 0);  		goto done;  	} @@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {  		string_done:  			ctxt->restart = false; -			kvm_rip_write(ctxt->vcpu, c->eip); +			ctxt->eip = c->eip;  			goto done;  		}  		/* The second termination condition only applies for REPE @@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  	}  	if (c->src.type == OP_MEM) { -		rc = ops->read_emulated((unsigned long)c->src.ptr, -					&c->src.val, -					c->src.bytes, -					ctxt->vcpu); +		rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, +					c->src.valptr, c->src.bytes);  		if (rc != X86EMUL_CONTINUE)  			goto done;  		c->src.orig_val = c->src.val;  	}  	if (c->src2.type == OP_MEM) { -		rc = ops->read_emulated((unsigned long)c->src2.ptr, -					&c->src2.val, -					c->src2.bytes, -					ctxt->vcpu); +		rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, +					&c->src2.val, c->src2.bytes);  		if (rc != X86EMUL_CONTINUE)  			goto done;  	} @@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)  	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {  		/* optimisation - avoid slow emulated read if Mov */ -		rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, -					c->dst.bytes, ctxt->vcpu); +		rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, +				   &c->dst.val, c->dst.bytes);  		if (rc != X86EMUL_CONTINUE)  			goto done;  	} @@ -2571,7 +2650,7 @@ special_insn:  		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);  		break;  	case 0x06:		/* push es */ -		emulate_push_sreg(ctxt, VCPU_SREG_ES); +		emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);  		break;  	case 0x07:		/* pop es */  		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); @@ -2583,14 +2662,14 @@ special_insn:  		emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);  		break;  	case 0x0e:		/* push cs */ -		emulate_push_sreg(ctxt, VCPU_SREG_CS); +		emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);  		break;  	case 0x10 ... 0x15:  	      adc:		/* adc */  		emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);  		break;  	case 0x16:		/* push ss */ -		emulate_push_sreg(ctxt, VCPU_SREG_SS); +		emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);  		break;  	case 0x17:		/* pop ss */  		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); @@ -2602,7 +2681,7 @@ special_insn:  		emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);  		break;  	case 0x1e:		/* push ds */ -		emulate_push_sreg(ctxt, VCPU_SREG_DS); +		emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);  		break;  	case 0x1f:		/* pop ds */  		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); @@ -2632,7 +2711,7 @@ special_insn:  		emulate_1op("dec", c->dst, ctxt->eflags);  		break;  	case 0x50 ... 0x57:  /* push reg */ -		emulate_push(ctxt); +		emulate_push(ctxt, ops);  		break;  	case 0x58 ... 
0x5f: /* pop reg */  	pop_instruction: @@ -2641,7 +2720,9 @@ special_insn:  			goto done;  		break;  	case 0x60:	/* pusha */ -		emulate_pusha(ctxt); +		rc = emulate_pusha(ctxt, ops); +		if (rc != X86EMUL_CONTINUE) +			goto done;  		break;  	case 0x61:	/* popa */  		rc = emulate_popa(ctxt, ops); @@ -2655,14 +2736,14 @@ special_insn:  		break;  	case 0x68: /* push imm */  	case 0x6a: /* push imm8 */ -		emulate_push(ctxt); +		emulate_push(ctxt, ops);  		break;  	case 0x6c:		/* insb */  	case 0x6d:		/* insw/insd */  		c->dst.bytes = min(c->dst.bytes, 4u);  		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],  					  c->dst.bytes)) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			goto done;  		}  		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, @@ -2674,7 +2755,7 @@ special_insn:  		c->src.bytes = min(c->src.bytes, 4u);  		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],  					  c->src.bytes)) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			goto done;  		}  		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], @@ -2707,6 +2788,7 @@ special_insn:  		}  		break;  	case 0x84 ... 0x85: +	test:  		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);  		break;  	case 0x86 ... 0x87:	/* xchg */ @@ -2735,18 +2817,13 @@ special_insn:  		break;  	case 0x88 ... 0x8b:	/* mov */  		goto mov; -	case 0x8c: { /* mov r/m, sreg */ -		struct kvm_segment segreg; - -		if (c->modrm_reg <= VCPU_SREG_GS) -			kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); -		else { -			kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +	case 0x8c:  /* mov r/m, sreg */ +		if (c->modrm_reg > VCPU_SREG_GS) { +			emulate_ud(ctxt);  			goto done;  		} -		c->dst.val = segreg.selector; +		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);  		break; -	}  	case 0x8d: /* lea r16/r32, m */  		c->dst.val = c->modrm_ea;  		break; @@ -2757,12 +2834,12 @@ special_insn:  		if (c->modrm_reg == VCPU_SREG_CS ||  		    c->modrm_reg > VCPU_SREG_GS) { -			kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +			emulate_ud(ctxt);  			goto done;  		}  		if (c->modrm_reg == VCPU_SREG_SS) -			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); +			ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;  		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); @@ -2775,19 +2852,19 @@ special_insn:  			goto done;  		break;  	case 0x90: /* nop / xchg r8,rax */ -		if (!(c->rex_prefix & 1)) { /* nop */ -			c->dst.type = OP_NONE; +		if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { +			c->dst.type = OP_NONE;  /* nop */  			break;  		}  	case 0x91 ... 0x97: /* xchg reg,rax */ -		c->src.type = c->dst.type = OP_REG; -		c->src.bytes = c->dst.bytes = c->op_bytes; +		c->src.type = OP_REG; +		c->src.bytes = c->op_bytes;  		c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];  		c->src.val = *(c->src.ptr);  		goto xchg;  	case 0x9c: /* pushf */  		c->src.val =  (unsigned long) ctxt->eflags; -		emulate_push(ctxt); +		emulate_push(ctxt, ops);  		break;  	case 0x9d: /* popf */  		c->dst.type = OP_REG; @@ -2797,19 +2874,15 @@ special_insn:  		if (rc != X86EMUL_CONTINUE)  			goto done;  		break; -	case 0xa0 ... 0xa1:	/* mov */ -		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; -		c->dst.val = c->src.val; -		break; -	case 0xa2 ... 0xa3:	/* mov */ -		c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; -		break; +	case 0xa0 ... 0xa3:	/* mov */  	case 0xa4 ... 0xa5:	/* movs */  		goto mov;  	case 0xa6 ... 0xa7:	/* cmps */  		c->dst.type = OP_NONE; /* Disable writeback. 
*/  		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);  		goto cmp; +	case 0xa8 ... 0xa9:	/* test ax, imm */ +		goto test;  	case 0xaa ... 0xab:	/* stos */  		c->dst.val = c->regs[VCPU_REGS_RAX];  		break; @@ -2855,19 +2928,23 @@ special_insn:  		long int rel = c->src.val;  		c->src.val = (unsigned long) c->eip;  		jmp_rel(c, rel); -		emulate_push(ctxt); +		emulate_push(ctxt, ops);  		break;  	}  	case 0xe9: /* jmp rel */  		goto jmp; -	case 0xea: /* jmp far */ +	case 0xea: { /* jmp far */ +		unsigned short sel;  	jump_far: -		if (load_segment_descriptor(ctxt, ops, c->src2.val, -					    VCPU_SREG_CS)) +		memcpy(&sel, c->src.valptr + c->op_bytes, 2); + +		if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))  			goto done; -		c->eip = c->src.val; +		c->eip = 0; +		memcpy(&c->eip, c->src.valptr, c->op_bytes);  		break; +	}  	case 0xeb:  	      jmp:		/* jmp rel short */  		jmp_rel(c, c->src.val); @@ -2879,20 +2956,20 @@ special_insn:  	do_io_in:  		c->dst.bytes = min(c->dst.bytes, 4u);  		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			goto done;  		}  		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,  				     &c->dst.val))  			goto done; /* IO is needed */  		break; -	case 0xee: /* out al,dx */ -	case 0xef: /* out (e/r)ax,dx */ +	case 0xee: /* out dx,al */ +	case 0xef: /* out dx,(e/r)ax */  		c->src.val = c->regs[VCPU_REGS_RDX];  	do_io_out:  		c->dst.bytes = min(c->dst.bytes, 4u);  		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { -			kvm_inject_gp(ctxt->vcpu, 0); +			emulate_gp(ctxt, 0);  			goto done;  		}  		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, @@ -2916,18 +2993,20 @@ special_insn:  		c->dst.type = OP_NONE;	/* Disable writeback. */  		break;  	case 0xfa: /* cli */ -		if (emulator_bad_iopl(ctxt, ops)) -			kvm_inject_gp(ctxt->vcpu, 0); -		else { +		if (emulator_bad_iopl(ctxt, ops)) { +			emulate_gp(ctxt, 0); +			goto done; +		} else {  			ctxt->eflags &= ~X86_EFLAGS_IF;  			c->dst.type = OP_NONE;	/* Disable writeback. */  		}  		break;  	case 0xfb: /* sti */ -		if (emulator_bad_iopl(ctxt, ops)) -			kvm_inject_gp(ctxt->vcpu, 0); -		else { -			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); +		if (emulator_bad_iopl(ctxt, ops)) { +			emulate_gp(ctxt, 0); +			goto done; +		} else { +			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;  			ctxt->eflags |= X86_EFLAGS_IF;  			c->dst.type = OP_NONE;	/* Disable writeback. */  		} @@ -2964,11 +3043,12 @@ writeback:  	c->dst.type = saved_dst_type;  	if ((c->d & SrcMask) == SrcSI) -		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, -				&c->src); +		string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), +				VCPU_REGS_RSI, &c->src);  	if ((c->d & DstMask) == DstDI) -		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); +		string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, +				&c->dst);  	if (c->rep_prefix && (c->d & String)) {  		struct read_cache *rc = &ctxt->decode.io_read; @@ -2981,11 +3061,12 @@ writeback:  		    (rc->end != 0 && rc->end == rc->pos))  			ctxt->restart = false;  	} - -	/* Commit shadow register state. */ -	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); -	kvm_rip_write(ctxt->vcpu, c->eip); -	ops->set_rflags(ctxt->vcpu, ctxt->eflags); +	/* +	 * reset read cache here in case string instruction is restared +	 * without decoding +	 */ +	ctxt->decode.mem_read.end = 0; +	ctxt->eip = c->eip;  done:  	return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; @@ -3051,7 +3132,7 @@ twobyte_insn:  			c->dst.type = OP_NONE;  			break;  		case 5: /* not defined */ -			kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +			emulate_ud(ctxt);  			goto done;  		case 7: /* invlpg*/  			emulate_invlpg(ctxt->vcpu, c->modrm_ea); @@ -3063,7 +3144,7 @@ twobyte_insn:  		}  		break;  	case 0x05: 		/* syscall */ -		rc = emulate_syscall(ctxt); +		rc = emulate_syscall(ctxt, ops);  		if (rc != X86EMUL_CONTINUE)  			goto done;  		else @@ -3073,8 +3154,11 @@ twobyte_insn:  		emulate_clts(ctxt->vcpu);  		c->dst.type = OP_NONE;  		break; -	case 0x08:		/* invd */  	case 0x09:		/* wbinvd */ +		kvm_emulate_wbinvd(ctxt->vcpu); +		c->dst.type = OP_NONE; +		break; +	case 0x08:		/* invd */  	case 0x0d:		/* GrpP (prefetch) */  	case 0x18:		/* Grp16 (prefetch/nop) */  		c->dst.type = OP_NONE; @@ -3084,7 +3168,7 @@ twobyte_insn:  		case 1:  		case 5 ... 7:  		case 9 ... 15: -			kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +			emulate_ud(ctxt);  			goto done;  		}  		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); @@ -3093,31 +3177,42 @@ twobyte_insn:  	case 0x21: /* mov from dr to reg */  		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&  		    (c->modrm_reg == 4 || c->modrm_reg == 5)) { -			kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +			emulate_ud(ctxt);  			goto done;  		} -		emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); +		ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);  		c->dst.type = OP_NONE;	/* no writeback */  		break;  	case 0x22: /* mov reg, cr */ -		ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); +		if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { +			emulate_gp(ctxt, 0); +			goto done; +		}  		c->dst.type = OP_NONE;  		break;  	case 0x23: /* mov from reg to dr */  		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&  		    (c->modrm_reg == 4 || c->modrm_reg == 5)) { -			kvm_queue_exception(ctxt->vcpu, UD_VECTOR); +			emulate_ud(ctxt); +			goto done; +		} + +		if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & +				((ctxt->mode == X86EMUL_MODE_PROT64) ? 
+				 ~0ULL : ~0U), ctxt->vcpu) < 0) { +			/* #UD condition is already handled by the code above */ +			emulate_gp(ctxt, 0);  			goto done;  		} -		emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); +  		c->dst.type = OP_NONE;	/* no writeback */  		break;  	case 0x30:  		/* wrmsr */  		msr_data = (u32)c->regs[VCPU_REGS_RAX]  			| ((u64)c->regs[VCPU_REGS_RDX] << 32); -		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { -			kvm_inject_gp(ctxt->vcpu, 0); +		if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { +			emulate_gp(ctxt, 0);  			goto done;  		}  		rc = X86EMUL_CONTINUE; @@ -3125,8 +3220,8 @@ twobyte_insn:  		break;  	case 0x32:  		/* rdmsr */ -		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { -			kvm_inject_gp(ctxt->vcpu, 0); +		if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { +			emulate_gp(ctxt, 0);  			goto done;  		} else {  			c->regs[VCPU_REGS_RAX] = (u32)msr_data; @@ -3136,14 +3231,14 @@ twobyte_insn:  		c->dst.type = OP_NONE;  		break;  	case 0x34:		/* sysenter */ -		rc = emulate_sysenter(ctxt); +		rc = emulate_sysenter(ctxt, ops);  		if (rc != X86EMUL_CONTINUE)  			goto done;  		else  			goto writeback;  		break;  	case 0x35:		/* sysexit */ -		rc = emulate_sysexit(ctxt); +		rc = emulate_sysexit(ctxt, ops);  		if (rc != X86EMUL_CONTINUE)  			goto done;  		else @@ -3160,7 +3255,7 @@ twobyte_insn:  		c->dst.type = OP_NONE;  		break;  	case 0xa0:	  /* push fs */ -		emulate_push_sreg(ctxt, VCPU_SREG_FS); +		emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);  		break;  	case 0xa1:	 /* pop fs */  		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -3179,7 +3274,7 @@ twobyte_insn:  		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);  		break;  	case 0xa8:	/* push gs */ -		emulate_push_sreg(ctxt, VCPU_SREG_GS); +		emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);  		break;  	case 0xa9:	/* pop gs */  		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0150affad25d..0fd6378981f4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,6 +5,7 @@   * Copyright (c) 2006 Intel Corporation   * Copyright (c) 2007 Keir Fraser, XenSource Inc   * Copyright (c) 2008 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates.   *   * Permission is hereby granted, free of charge, to any person obtaining a copy   * of this software and associated documentation files (the "Software"), to deal @@ -33,6 +34,7 @@  #include <linux/kvm_host.h>  #include <linux/slab.h> +#include <linux/workqueue.h>  #include "irq.h"  #include "i8254.h" @@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)  {  	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,  						 irq_ack_notifier); -	raw_spin_lock(&ps->inject_lock); -	if (atomic_dec_return(&ps->pit_timer.pending) < 0) +	int value; + +	spin_lock(&ps->inject_lock); +	value = atomic_dec_return(&ps->pit_timer.pending); +	if (value < 0) +		/* spurious acks can be generated if, for example, the +		 * PIC is being reset.  Handle it gracefully here +		 */  		atomic_inc(&ps->pit_timer.pending); +	else if (value > 0) +		/* in this case, we had multiple outstanding pit interrupts +		 * that we needed to inject.  
Reinject +		 */ +		queue_work(ps->pit->wq, &ps->pit->expired);  	ps->irq_ack = 1; -	raw_spin_unlock(&ps->inject_lock); +	spin_unlock(&ps->inject_lock);  }  void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)  		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);  } -static void destroy_pit_timer(struct kvm_timer *pt) +static void destroy_pit_timer(struct kvm_pit *pit)  { -	pr_debug("execute del timer!\n"); -	hrtimer_cancel(&pt->timer); +	hrtimer_cancel(&pit->pit_state.pit_timer.timer); +	cancel_work_sync(&pit->expired);  }  static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {  	.is_periodic = kpit_is_periodic,  }; +static void pit_do_work(struct work_struct *work) +{ +	struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); +	struct kvm *kvm = pit->kvm; +	struct kvm_vcpu *vcpu; +	int i; +	struct kvm_kpit_state *ps = &pit->pit_state; +	int inject = 0; + +	/* Try to inject pending interrupts when +	 * last one has been acked. +	 */ +	spin_lock(&ps->inject_lock); +	if (ps->irq_ack) { +		ps->irq_ack = 0; +		inject = 1; +	} +	spin_unlock(&ps->inject_lock); +	if (inject) { +		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); +		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + +		/* +		 * Provides NMI watchdog support via Virtual Wire mode. +		 * The route is: PIT -> PIC -> LVT0 in NMI mode. +		 * +		 * Note: Our Virtual Wire implementation is simplified, only +		 * propagating PIT interrupts to all VCPUs when they have set +		 * LVT0 to NMI delivery. Other PIC interrupts are just sent to +		 * VCPU0, and only if its LVT0 is in EXTINT mode. +		 */ +		if (kvm->arch.vapics_in_nmi_mode > 0) +			kvm_for_each_vcpu(i, vcpu, kvm) +				kvm_apic_nmi_wd_deliver(vcpu); +	} +} + +static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) +{ +	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); +	struct kvm_pit *pt = ktimer->kvm->arch.vpit; + +	if (ktimer->reinject || !atomic_read(&ktimer->pending)) { +		atomic_inc(&ktimer->pending); +		queue_work(pt->wq, &pt->expired); +	} + +	if (ktimer->t_ops->is_periodic(ktimer)) { +		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); +		return HRTIMER_RESTART; +	} else +		return HRTIMER_NORESTART; +} +  static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)  {  	struct kvm_timer *pt = &ps->pit_timer; @@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)  	/* TODO The new value only affected after the retriggered */  	hrtimer_cancel(&pt->timer); +	cancel_work_sync(&ps->pit->expired);  	pt->period = interval;  	ps->is_periodic = is_period; -	pt->timer.function = kvm_timer_fn; +	pt->timer.function = pit_timer_fn;  	pt->t_ops = &kpit_ops;  	pt->kvm = ps->pit->kvm; -	pt->vcpu = pt->kvm->bsp_vcpu;  	atomic_set(&pt->pending, 0);  	ps->irq_ack = 1; @@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)  		}  		break;  	default: -		destroy_pit_timer(&ps->pit_timer); +		destroy_pit_timer(kvm->arch.vpit);  	}  } @@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)  	mutex_init(&pit->pit_state.lock);  	mutex_lock(&pit->pit_state.lock); -	raw_spin_lock_init(&pit->pit_state.inject_lock); +	spin_lock_init(&pit->pit_state.inject_lock); + +	pit->wq = create_singlethread_workqueue("kvm-pit-wq"); +	if (!pit->wq) { +		mutex_unlock(&pit->pit_state.lock); +		kfree(pit); +		return 
NULL; +	} +	INIT_WORK(&pit->expired, pit_do_work);  	kvm->arch.vpit = pit;  	pit->kvm = kvm; @@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)  	struct hrtimer *timer;  	if (kvm->arch.vpit) { +		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); +		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, +					      &kvm->arch.vpit->speaker_dev);  		kvm_unregister_irq_mask_notifier(kvm, 0,  					       &kvm->arch.vpit->mask_notifier);  		kvm_unregister_irq_ack_notifier(kvm, @@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)  		mutex_lock(&kvm->arch.vpit->pit_state.lock);  		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;  		hrtimer_cancel(timer); +		cancel_work_sync(&kvm->arch.vpit->expired);  		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);  		mutex_unlock(&kvm->arch.vpit->pit_state.lock); +		destroy_workqueue(kvm->arch.vpit->wq);  		kfree(kvm->arch.vpit);  	}  } - -static void __inject_pit_timer_intr(struct kvm *kvm) -{ -	struct kvm_vcpu *vcpu; -	int i; - -	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); -	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - -	/* -	 * Provides NMI watchdog support via Virtual Wire mode. -	 * The route is: PIT -> PIC -> LVT0 in NMI mode. -	 * -	 * Note: Our Virtual Wire implementation is simplified, only -	 * propagating PIT interrupts to all VCPUs when they have set -	 * LVT0 to NMI delivery. Other PIC interrupts are just sent to -	 * VCPU0, and only if its LVT0 is in EXTINT mode. -	 */ -	if (kvm->arch.vapics_in_nmi_mode > 0) -		kvm_for_each_vcpu(i, vcpu, kvm) -			kvm_apic_nmi_wd_deliver(vcpu); -} - -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) -{ -	struct kvm_pit *pit = vcpu->kvm->arch.vpit; -	struct kvm *kvm = vcpu->kvm; -	struct kvm_kpit_state *ps; - -	if (pit) { -		int inject = 0; -		ps = &pit->pit_state; - -		/* Try to inject pending interrupts when -		 * last one has been acked. -		 */ -		raw_spin_lock(&ps->inject_lock); -		if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { -			ps->irq_ack = 0; -			inject = 1; -		} -		raw_spin_unlock(&ps->inject_lock); -		if (inject) -			__inject_pit_timer_intr(kvm); -	} -} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 900d6b0ba7c2..46d08ca0b48f 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state {  	u32    speaker_data_on;  	struct mutex lock;  	struct kvm_pit *pit; -	raw_spinlock_t inject_lock; +	spinlock_t inject_lock;  	unsigned long irq_ack;  	struct kvm_irq_ack_notifier irq_ack_notifier;  }; @@ -40,6 +40,8 @@ struct kvm_pit {  	struct kvm_kpit_state pit_state;  	int irq_source_id;  	struct kvm_irq_mask_notifier mask_notifier; +	struct workqueue_struct *wq; +	struct work_struct expired;  };  #define KVM_PIT_BASE_ADDRESS	    0x40 diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93825ff3338f..8d10c063d7f2 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,6 +3,7 @@   *   * Copyright (c) 2003-2004 Fabrice Bellard   * Copyright (c) 2007 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates.   
*   * Permission is hereby granted, free of charge, to any person obtaining a copy   * of this software and associated documentation files (the "Software"), to deal @@ -33,6 +34,8 @@  #include <linux/kvm_host.h>  #include "trace.h" +static void pic_irq_request(struct kvm *kvm, int level); +  static void pic_lock(struct kvm_pic *s)  	__acquires(&s->lock)  { @@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)  	__releases(&s->lock)  {  	bool wakeup = s->wakeup_needed; -	struct kvm_vcpu *vcpu; +	struct kvm_vcpu *vcpu, *found = NULL; +	int i;  	s->wakeup_needed = false;  	raw_spin_unlock(&s->lock);  	if (wakeup) { -		vcpu = s->kvm->bsp_vcpu; -		if (vcpu) -			kvm_vcpu_kick(vcpu); +		kvm_for_each_vcpu(i, vcpu, s->kvm) { +			if (kvm_apic_accept_pic_intr(vcpu)) { +				found = vcpu; +				break; +			} +		} + +		if (!found) +			found = s->kvm->bsp_vcpu; + +		kvm_vcpu_kick(found);  	}  } @@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)  		pic_set_irq1(&s->pics[0], 2, 0);  	}  	irq = pic_get_irq(&s->pics[0]); -	if (irq >= 0) -		s->irq_request(s->irq_request_opaque, 1); -	else -		s->irq_request(s->irq_request_opaque, 0); +	pic_irq_request(s->kvm, irq >= 0);  }  void kvm_pic_update_irq(struct kvm_pic *s) @@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)  void kvm_pic_reset(struct kvm_kpic_state *s)  {  	int irq; -	struct kvm *kvm = s->pics_state->irq_request_opaque; -	struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; +	struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;  	u8 irr = s->irr, isr = s->imr;  	s->last_irr = 0; @@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)  			/*  			 * deassert a pending interrupt  			 */ -			s->pics_state->irq_request(s->pics_state-> -						   irq_request_opaque, 0); +			pic_irq_request(s->pics_state->kvm, 0);  			s->init_state = 1;  			s->init4 = val & 1;  			if (val & 0x02) @@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)  		}  	} else  		switch (s->init_state) { -		case 0:		/* normal mode */ +		case 0: { /* normal mode */ +			u8 imr_diff = s->imr ^ val, +				off = (s == &s->pics_state->pics[0]) ? 0 : 8;  			s->imr = val; +			for (irq = 0; irq < PIC_NUM_PINS/2; irq++) +				if (imr_diff & (1 << irq)) +					kvm_fire_mask_notifiers( +						s->pics_state->kvm, +						SELECT_PIC(irq + off), +						irq + off, +						!!(s->imr & (1 << irq)));  			pic_update_irq(s->pics_state);  			break; +		}  		case 1:  			s->irq_base = val & 0xf8;  			s->init_state = 2; @@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,  /*   * callback when PIC0 irq status changed   */ -static void pic_irq_request(void *opaque, int level) +static void pic_irq_request(struct kvm *kvm, int level)  { -	struct kvm *kvm = opaque;  	struct kvm_vcpu *vcpu = kvm->bsp_vcpu;  	struct kvm_pic *s = pic_irqchip(kvm);  	int irq = pic_get_irq(&s->pics[0]); @@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)  	s->kvm = kvm;  	s->pics[0].elcr_mask = 0xf8;  	s->pics[1].elcr_mask = 0xde; -	s->irq_request = pic_irq_request; -	s->irq_request_opaque = kvm;  	s->pics[0].pics_state = s;  	s->pics[1].pics_state = s; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 96dfbb6ad2a9..2095a049835e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,6 +1,7 @@  /*   * irq.c: API for in kernel interrupt controller   * Copyright (c) 2007, Intel Corporation. + * Copyright 2009 Red Hat, Inc. and/or its affilates.   
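/*
 * Editorial sketch (not part of the patch): the vcpu-selection rule that
 * pic_unlock() uses after the change above.  Instead of always kicking the
 * BSP, it kicks the first vcpu whose local APIC will accept PIC (ExtINT)
 * interrupts and falls back to the BSP only when no such vcpu exists.  The
 * types below are toy stand-ins, not the kernel's structures.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_vcpu {
	int  id;
	bool accepts_pic_intr;	/* analogue of kvm_apic_accept_pic_intr() */
};

static struct toy_vcpu *pick_vcpu_to_kick(struct toy_vcpu *vcpus, int n)
{
	/* first vcpu that will take an ExtINT wins ... */
	for (int i = 0; i < n; i++)
		if (vcpus[i].accepts_pic_intr)
			return &vcpus[i];
	/* ... otherwise fall back to the BSP (vcpu 0 in this sketch) */
	return n ? &vcpus[0] : NULL;
}

int main(void)
{
	struct toy_vcpu vcpus[] = {
		{ .id = 0, .accepts_pic_intr = false },
		{ .id = 1, .accepts_pic_intr = true  },
	};
	struct toy_vcpu *v = pick_vcpu_to_kick(vcpus, 2);

	printf("kick vcpu %d\n", v ? v->id : -1);	/* prints "kick vcpu 1" */
	return 0;
}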
*   * This program is free software; you can redistribute it and/or modify it   * under the terms and conditions of the GNU General Public License, @@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)  {  	kvm_inject_apic_timer_irqs(vcpu); -	kvm_inject_pit_timer_irqs(vcpu);  	/* TODO: PIT, RTC etc. */  }  EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index cd1f362f413d..ffed06871c5c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -38,8 +38,6 @@  struct kvm;  struct kvm_vcpu; -typedef void irq_request_func(void *opaque, int level); -  struct kvm_kpic_state {  	u8 last_irr;	/* edge detection */  	u8 irr;		/* interrupt request register */ @@ -67,8 +65,6 @@ struct kvm_pic {  	unsigned pending_acks;  	struct kvm *kvm;  	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ -	irq_request_func *irq_request; -	void *irq_request_opaque;  	int output;		/* intr from master PIC */  	struct kvm_io_device dev;  	void (*ack_notifier)(void *opaque, int irq); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cff851cf5322..6491ac8e755b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)  static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)  { +	might_sleep();  /* on svm */ +  	if (!test_bit(VCPU_EXREG_PDPTR,  		      (unsigned long *)&vcpu->arch.regs_avail))  		kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); @@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)  	return kvm_read_cr4_bits(vcpu, ~0UL);  } +static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) +{ +	return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) +		| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); +} +  #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1eb7a4ae0c9c..77d8c0f4817d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,6 +5,7 @@   * Copyright (C) 2006 Qumranet, Inc.   * Copyright (C) 2007 Novell   * Copyright (C) 2007 Intel + * Copyright 2009 Red Hat, Inc. and/or its affilates.   
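/*
 * Editorial sketch (not part of the patch): what the new kvm_read_edx_eax()
 * helper in kvm_cache_regs.h computes -- the 64-bit value a guest passes in
 * EDX:EAX, assembled from the two 32-bit register halves.  Plain C stand-in,
 * not the kernel helper itself.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t edx_eax(uint32_t eax, uint32_t edx)
{
	return (uint64_t)eax | ((uint64_t)edx << 32);	/* low half | high half */
}

int main(void)
{
	/* prints 0x123456789abcdef */
	printf("%#llx\n", (unsigned long long)edx_eax(0x89abcdefu, 0x01234567u));
	return 0;
}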
*   * Authors:   *   Dor Laor <dor.laor@qumranet.com> @@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,  		   "dest_mode 0x%x, short_hand 0x%x\n",  		   target, source, dest, dest_mode, short_hand); -	ASSERT(!target); +	ASSERT(target);  	switch (short_hand) {  	case APIC_DEST_NOSHORT:  		if (dest_mode == 0) @@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)  	struct kvm_vcpu *vcpu = apic->vcpu;  	struct kvm_run *run = vcpu->run; -	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); +	kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);  	run->tpr_access.rip = kvm_rip_read(vcpu);  	run->tpr_access.is_write = write;  } @@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)  	u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);  	int r = 0; -	if (kvm_vcpu_is_bsp(vcpu)) { -		if (!apic_hw_enabled(vcpu->arch.apic)) -			r = 1; -		if ((lvt0 & APIC_LVT_MASKED) == 0 && -		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) -			r = 1; -	} +	if (!apic_hw_enabled(vcpu->arch.apic)) +		r = 1; +	if ((lvt0 & APIC_LVT_MASKED) == 0 && +	    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) +		r = 1;  	return r;  } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1ed0a1a5913..311f6dad8951 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,6 +7,7 @@   * MMU support   *   * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates.   *   * Authors:   *   Yaniv Kamay  <yaniv@qumranet.com> @@ -32,6 +33,7 @@  #include <linux/compiler.h>  #include <linux/srcu.h>  #include <linux/slab.h> +#include <linux/uaccess.h>  #include <asm/page.h>  #include <asm/cmpxchg.h> @@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);  #define PT_FIRST_AVAIL_BITS_SHIFT 9  #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define VALID_PAGE(x) ((x) != INVALID_PAGE) -  #define PT64_LEVEL_BITS 9  #define PT64_LEVEL_SHIFT(level) \ @@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {  	     shadow_walk_okay(&(_walker));			\  	     shadow_walk_next(&(_walker))) -typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); +typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);  static struct kmem_cache *pte_chain_cache;  static struct kmem_cache *rmap_desc_cache; @@ -281,13 +281,38 @@ static gfn_t pse36_gfn_delta(u32 gpte)  static void __set_spte(u64 *sptep, u64 spte)  { +	set_64bit(sptep, spte); +} + +static u64 __xchg_spte(u64 *sptep, u64 new_spte) +{  #ifdef CONFIG_X86_64 -	set_64bit((unsigned long *)sptep, spte); +	return xchg(sptep, new_spte);  #else -	set_64bit((unsigned long long *)sptep, spte); +	u64 old_spte; + +	do { +		old_spte = *sptep; +	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); + +	return old_spte;  #endif  } +static void update_spte(u64 *sptep, u64 new_spte) +{ +	u64 old_spte; + +	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || +	      !is_rmap_spte(*sptep)) +		__set_spte(sptep, new_spte); +	else { +		old_spte = __xchg_spte(sptep, new_spte); +		if (old_spte & shadow_accessed_mask) +			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); +	} +} +  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,  				  struct kmem_cache *base_cache, int min)  { @@ -304,10 +329,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,  	return 0;  } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, +				  struct 
kmem_cache *cache)  {  	while (mc->nobjs) -		kfree(mc->objects[--mc->nobjs]); +		kmem_cache_free(cache, mc->objects[--mc->nobjs]);  }  static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -355,10 +381,11 @@ out:  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)  { -	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); -	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); +	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); +	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);  	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); -	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); +	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, +				mmu_page_header_cache);  }  static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -379,7 +406,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)  static void mmu_free_pte_chain(struct kvm_pte_chain *pc)  { -	kfree(pc); +	kmem_cache_free(pte_chain_cache, pc);  }  static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) @@ -390,7 +417,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)  static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)  { -	kfree(rd); +	kmem_cache_free(rmap_desc_cache, rd); +} + +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) +{ +	if (!sp->role.direct) +		return sp->gfns[index]; + +	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); +} + +static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +{ +	if (sp->role.direct) +		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); +	else +		sp->gfns[index] = gfn;  }  /* @@ -403,8 +446,8 @@ static int *slot_largepage_idx(gfn_t gfn,  {  	unsigned long idx; -	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - -	      (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); +	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - +	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));  	return &slot->lpage_info[level - 2][idx].write_count;  } @@ -414,9 +457,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)  	int *write_count;  	int i; -	gfn = unalias_gfn(kvm, gfn); - -	slot = gfn_to_memslot_unaliased(kvm, gfn); +	slot = gfn_to_memslot(kvm, gfn);  	for (i = PT_DIRECTORY_LEVEL;  	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {  		write_count   = slot_largepage_idx(gfn, slot, i); @@ -430,8 +471,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)  	int *write_count;  	int i; -	gfn = unalias_gfn(kvm, gfn); -	slot = gfn_to_memslot_unaliased(kvm, gfn); +	slot = gfn_to_memslot(kvm, gfn);  	for (i = PT_DIRECTORY_LEVEL;  	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {  		write_count   = slot_largepage_idx(gfn, slot, i); @@ -447,8 +487,7 @@ static int has_wrprotected_page(struct kvm *kvm,  	struct kvm_memory_slot *slot;  	int *largepage_idx; -	gfn = unalias_gfn(kvm, gfn); -	slot = gfn_to_memslot_unaliased(kvm, gfn); +	slot = gfn_to_memslot(kvm, gfn);  	if (slot) {  		largepage_idx = slot_largepage_idx(gfn, slot, level);  		return *largepage_idx; @@ -501,7 +540,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)  /*   * Take gfn and return the reverse mapping to it. 
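/*
 * Editorial sketch (not part of the patch): why update_spte() above exchanges
 * the spte instead of plainly storing it.  The CPU may set the accessed bit
 * in a shadow pte at any time; a plain 64-bit store would silently discard
 * it, while an atomic exchange returns the old value so the bit can be folded
 * back (here it is just reported).  On 32-bit hosts the patch emulates the
 * 64-bit xchg with a cmpxchg64 loop, see __xchg_spte() above.  ACCESSED_BIT
 * and the helper name are illustrative stand-ins.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ACCESSED_BIT	(1ull << 5)	/* stand-in for shadow_accessed_mask */

static _Atomic uint64_t spte = 0x1000 | ACCESSED_BIT;	/* hw set A since last write */

static void toy_update_spte(uint64_t new_spte)
{
	uint64_t old_spte;

	if (new_spte & ACCESSED_BIT) {
		/* new value keeps the A bit anyway: a plain store is enough */
		atomic_store(&spte, new_spte);
		return;
	}
	/* analogue of __xchg_spte(): never lose a concurrently-set A bit */
	old_spte = atomic_exchange(&spte, new_spte);
	if (old_spte & ACCESSED_BIT)
		printf("page was accessed, tell the MM about it\n");
}

int main(void)
{
	toy_update_spte(0x1000);	/* write-protect-style update, A bit cleared */
	return 0;
}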
- * Note: gfn must be unaliased before this function get called   */  static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) @@ -513,8 +551,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)  	if (likely(level == PT_PAGE_TABLE_LEVEL))  		return &slot->rmap[gfn - slot->base_gfn]; -	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - -		(slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); +	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - +		(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));  	return &slot->lpage_info[level - 2][idx].rmap_pde;  } @@ -541,9 +579,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)  	if (!is_rmap_spte(*spte))  		return count; -	gfn = unalias_gfn(vcpu->kvm, gfn);  	sp = page_header(__pa(spte)); -	sp->gfns[spte - sp->spt] = gfn; +	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);  	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);  	if (!*rmapp) {  		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); @@ -600,19 +637,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)  	struct kvm_rmap_desc *desc;  	struct kvm_rmap_desc *prev_desc;  	struct kvm_mmu_page *sp; -	pfn_t pfn; +	gfn_t gfn;  	unsigned long *rmapp;  	int i; -	if (!is_rmap_spte(*spte)) -		return;  	sp = page_header(__pa(spte)); -	pfn = spte_to_pfn(*spte); -	if (*spte & shadow_accessed_mask) -		kvm_set_pfn_accessed(pfn); -	if (is_writable_pte(*spte)) -		kvm_set_pfn_dirty(pfn); -	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); +	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); +	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);  	if (!*rmapp) {  		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);  		BUG(); @@ -644,6 +675,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)  	}  } +static void set_spte_track_bits(u64 *sptep, u64 new_spte) +{ +	pfn_t pfn; +	u64 old_spte = *sptep; + +	if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || +	      old_spte & shadow_accessed_mask) { +		__set_spte(sptep, new_spte); +	} else +		old_spte = __xchg_spte(sptep, new_spte); + +	if (!is_rmap_spte(old_spte)) +		return; +	pfn = spte_to_pfn(old_spte); +	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) +		kvm_set_pfn_accessed(pfn); +	if (is_writable_pte(old_spte)) +		kvm_set_pfn_dirty(pfn); +} + +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ +	set_spte_track_bits(sptep, new_spte); +	rmap_remove(kvm, sptep); +} +  static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)  {  	struct kvm_rmap_desc *desc; @@ -676,7 +733,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)  	u64 *spte;  	int i, write_protected = 0; -	gfn = unalias_gfn(kvm, gfn);  	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);  	spte = rmap_next(kvm, rmapp, NULL); @@ -685,7 +741,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)  		BUG_ON(!(*spte & PT_PRESENT_MASK));  		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);  		if (is_writable_pte(*spte)) { -			__set_spte(spte, *spte & ~PT_WRITABLE_MASK); +			update_spte(spte, *spte & ~PT_WRITABLE_MASK);  			write_protected = 1;  		}  		spte = rmap_next(kvm, rmapp, spte); @@ -709,9 +765,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)  			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));  			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);  			if (is_writable_pte(*spte)) { -				rmap_remove(kvm, spte); +				drop_spte(kvm, spte, +					  
shadow_trap_nonpresent_pte);  				--kvm->stat.lpages; -				__set_spte(spte, shadow_trap_nonpresent_pte);  				spte = NULL;  				write_protected = 1;  			} @@ -731,8 +787,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,  	while ((spte = rmap_next(kvm, rmapp, NULL))) {  		BUG_ON(!(*spte & PT_PRESENT_MASK));  		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); -		rmap_remove(kvm, spte); -		__set_spte(spte, shadow_trap_nonpresent_pte); +		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);  		need_tlb_flush = 1;  	}  	return need_tlb_flush; @@ -754,8 +809,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,  		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);  		need_flush = 1;  		if (pte_write(*ptep)) { -			rmap_remove(kvm, spte); -			__set_spte(spte, shadow_trap_nonpresent_pte); +			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);  			spte = rmap_next(kvm, rmapp, NULL);  		} else {  			new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -763,9 +817,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,  			new_spte &= ~PT_WRITABLE_MASK;  			new_spte &= ~SPTE_HOST_WRITEABLE; -			if (is_writable_pte(*spte)) -				kvm_set_pfn_dirty(spte_to_pfn(*spte)); -			__set_spte(spte, new_spte); +			new_spte &= ~shadow_accessed_mask; +			set_spte_track_bits(spte, new_spte);  			spte = rmap_next(kvm, rmapp, spte);  		}  	} @@ -799,8 +852,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,  			ret = handler(kvm, &memslot->rmap[gfn_offset], data);  			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { -				int idx = gfn_offset; -				idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); +				unsigned long idx; +				int sh; + +				sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); +				idx = ((memslot->base_gfn+gfn_offset) >> sh) - +					(memslot->base_gfn >> sh);  				ret |= handler(kvm,  					&memslot->lpage_info[j][idx].rmap_pde,  					data); @@ -863,7 +920,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)  	sp = page_header(__pa(spte)); -	gfn = unalias_gfn(vcpu->kvm, gfn);  	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);  	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); @@ -894,10 +950,12 @@ static int is_empty_shadow_page(u64 *spt)  static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)  {  	ASSERT(is_empty_shadow_page(sp->spt)); +	hlist_del(&sp->hash_link);  	list_del(&sp->link);  	__free_page(virt_to_page(sp->spt)); -	__free_page(virt_to_page(sp->gfns)); -	kfree(sp); +	if (!sp->role.direct) +		__free_page(virt_to_page(sp->gfns)); +	kmem_cache_free(mmu_page_header_cache, sp);  	++kvm->arch.n_free_mmu_pages;  } @@ -907,13 +965,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)  }  static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, -					       u64 *parent_pte) +					       u64 *parent_pte, int direct)  {  	struct kvm_mmu_page *sp;  	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);  	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); -	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); +	if (!direct) +		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, +						  PAGE_SIZE);  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);  	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);  	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); @@ -998,7 +1058,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,  	BUG();  } -  static void 
mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)  {  	struct kvm_pte_chain *pte_chain; @@ -1008,63 +1067,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)  	if (!sp->multimapped && sp->parent_pte) {  		parent_sp = page_header(__pa(sp->parent_pte)); -		fn(parent_sp); -		mmu_parent_walk(parent_sp, fn); +		fn(parent_sp, sp->parent_pte);  		return;  	} +  	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)  		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { -			if (!pte_chain->parent_ptes[i]) +			u64 *spte = pte_chain->parent_ptes[i]; + +			if (!spte)  				break; -			parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); -			fn(parent_sp); -			mmu_parent_walk(parent_sp, fn); +			parent_sp = page_header(__pa(spte)); +			fn(parent_sp, spte);  		}  } -static void kvm_mmu_update_unsync_bitmap(u64 *spte) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)  { -	unsigned int index; -	struct kvm_mmu_page *sp = page_header(__pa(spte)); - -	index = spte - sp->spt; -	if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) -		sp->unsync_children++; -	WARN_ON(!sp->unsync_children); +	mmu_parent_walk(sp, mark_unsync);  } -static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)  { -	struct kvm_pte_chain *pte_chain; -	struct hlist_node *node; -	int i; +	unsigned int index; -	if (!sp->parent_pte) +	index = spte - sp->spt; +	if (__test_and_set_bit(index, sp->unsync_child_bitmap))  		return; - -	if (!sp->multimapped) { -		kvm_mmu_update_unsync_bitmap(sp->parent_pte); +	if (sp->unsync_children++)  		return; -	} - -	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) -		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { -			if (!pte_chain->parent_ptes[i]) -				break; -			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); -		} -} - -static int unsync_walk_fn(struct kvm_mmu_page *sp) -{ -	kvm_mmu_update_parents_unsync(sp); -	return 1; -} - -static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) -{ -	mmu_parent_walk(sp, unsync_walk_fn); -	kvm_mmu_update_parents_unsync(sp); +	kvm_mmu_mark_parents_unsync(sp);  }  static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, @@ -1077,7 +1110,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,  }  static int nonpaging_sync_page(struct kvm_vcpu *vcpu, -			       struct kvm_mmu_page *sp) +			       struct kvm_mmu_page *sp, bool clear_unsync)  {  	return 1;  } @@ -1123,35 +1156,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,  	int i, ret, nr_unsync_leaf = 0;  	for_each_unsync_children(sp->unsync_child_bitmap, i) { +		struct kvm_mmu_page *child;  		u64 ent = sp->spt[i]; -		if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { -			struct kvm_mmu_page *child; -			child = page_header(ent & PT64_BASE_ADDR_MASK); - -			if (child->unsync_children) { -				if (mmu_pages_add(pvec, child, i)) -					return -ENOSPC; - -				ret = __mmu_unsync_walk(child, pvec); -				if (!ret) -					__clear_bit(i, sp->unsync_child_bitmap); -				else if (ret > 0) -					nr_unsync_leaf += ret; -				else -					return ret; -			} +		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) +			goto clear_child_bitmap; + +		child = page_header(ent & PT64_BASE_ADDR_MASK); + +		if (child->unsync_children) { +			if (mmu_pages_add(pvec, child, i)) +				return -ENOSPC; + +			ret = __mmu_unsync_walk(child, pvec); +			if (!ret) +				goto clear_child_bitmap; +			else if (ret > 0) 
+				nr_unsync_leaf += ret; +			else +				return ret; +		} else if (child->unsync) { +			nr_unsync_leaf++; +			if (mmu_pages_add(pvec, child, i)) +				return -ENOSPC; +		} else +			 goto clear_child_bitmap; -			if (child->unsync) { -				nr_unsync_leaf++; -				if (mmu_pages_add(pvec, child, i)) -					return -ENOSPC; -			} -		} +		continue; + +clear_child_bitmap: +		__clear_bit(i, sp->unsync_child_bitmap); +		sp->unsync_children--; +		WARN_ON((int)sp->unsync_children < 0);  	} -	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) -		sp->unsync_children = 0;  	return nr_unsync_leaf;  } @@ -1166,26 +1204,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,  	return __mmu_unsync_walk(sp, pvec);  } -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ -	unsigned index; -	struct hlist_head *bucket; -	struct kvm_mmu_page *sp; -	struct hlist_node *node; - -	pgprintk("%s: looking for gfn %lx\n", __func__, gfn); -	index = kvm_page_table_hashfn(gfn); -	bucket = &kvm->arch.mmu_page_hash[index]; -	hlist_for_each_entry(sp, node, bucket, hash_link) -		if (sp->gfn == gfn && !sp->role.direct -		    && !sp->role.invalid) { -			pgprintk("%s: found role %x\n", -				 __func__, sp->role.word); -			return sp; -		} -	return NULL; -} -  static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)  {  	WARN_ON(!sp->unsync); @@ -1194,20 +1212,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)  	--kvm->stat.mmu_unsync;  } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, +				    struct list_head *invalid_list); +static void kvm_mmu_commit_zap_page(struct kvm *kvm, +				    struct list_head *invalid_list); -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +#define for_each_gfn_sp(kvm, sp, gfn, pos)				\ +  hlist_for_each_entry(sp, pos,						\ +   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\ +	if ((sp)->gfn != (gfn)) {} else + +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\ +  hlist_for_each_entry(sp, pos,						\ +   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\ +		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\ +			(sp)->role.invalid) {} else + +/* @sp->gfn should be write-protected at the call site */ +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +			   struct list_head *invalid_list, bool clear_unsync)  {  	if (sp->role.cr4_pae != !!is_pae(vcpu)) { -		kvm_mmu_zap_page(vcpu->kvm, sp); +		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);  		return 1;  	} -	if (rmap_write_protect(vcpu->kvm, sp->gfn)) -		kvm_flush_remote_tlbs(vcpu->kvm); -	kvm_unlink_unsync_page(vcpu->kvm, sp); -	if (vcpu->arch.mmu.sync_page(vcpu, sp)) { -		kvm_mmu_zap_page(vcpu->kvm, sp); +	if (clear_unsync) +		kvm_unlink_unsync_page(vcpu->kvm, sp); + +	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { +		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);  		return 1;  	} @@ -1215,6 +1249,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)  	return 0;  } +static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, +				   struct kvm_mmu_page *sp) +{ +	LIST_HEAD(invalid_list); +	int ret; + +	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); +	if (ret) +		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + +	return ret; +} + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +			 struct 
list_head *invalid_list) +{ +	return __kvm_sync_page(vcpu, sp, invalid_list, true); +} + +/* @gfn should be write-protected at the call site */ +static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn) +{ +	struct kvm_mmu_page *s; +	struct hlist_node *node; +	LIST_HEAD(invalid_list); +	bool flush = false; + +	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { +		if (!s->unsync) +			continue; + +		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); +		if ((s->role.cr4_pae != !!is_pae(vcpu)) || +			(vcpu->arch.mmu.sync_page(vcpu, s, true))) { +			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); +			continue; +		} +		kvm_unlink_unsync_page(vcpu->kvm, s); +		flush = true; +	} + +	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); +	if (flush) +		kvm_mmu_flush_tlb(vcpu); +} +  struct mmu_page_path {  	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];  	unsigned int idx[PT64_ROOT_LEVEL-1]; @@ -1281,6 +1361,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,  	struct kvm_mmu_page *sp;  	struct mmu_page_path parents;  	struct kvm_mmu_pages pages; +	LIST_HEAD(invalid_list);  	kvm_mmu_pages_init(parent, &parents, &pages);  	while (mmu_unsync_walk(parent, &pages)) { @@ -1293,9 +1374,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,  			kvm_flush_remote_tlbs(vcpu->kvm);  		for_each_sp(pages, sp, parents, i) { -			kvm_sync_page(vcpu, sp); +			kvm_sync_page(vcpu, sp, &invalid_list);  			mmu_pages_clear_parents(&parents);  		} +		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);  		cond_resched_lock(&vcpu->kvm->mmu_lock);  		kvm_mmu_pages_init(parent, &parents, &pages);  	} @@ -1310,11 +1392,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,  					     u64 *parent_pte)  {  	union kvm_mmu_page_role role; -	unsigned index;  	unsigned quadrant; -	struct hlist_head *bucket;  	struct kvm_mmu_page *sp; -	struct hlist_node *node, *tmp; +	struct hlist_node *node; +	bool need_sync = false;  	role = vcpu->arch.mmu.base_role;  	role.level = level; @@ -1322,40 +1403,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,  	if (role.direct)  		role.cr4_pae = 0;  	role.access = access; -	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { +	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {  		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));  		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;  		role.quadrant = quadrant;  	} -	index = kvm_page_table_hashfn(gfn); -	bucket = &vcpu->kvm->arch.mmu_page_hash[index]; -	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) -		if (sp->gfn == gfn) { -			if (sp->unsync) -				if (kvm_sync_page(vcpu, sp)) -					continue; +	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { +		if (!need_sync && sp->unsync) +			need_sync = true; -			if (sp->role.word != role.word) -				continue; +		if (sp->role.word != role.word) +			continue; -			mmu_page_add_parent_pte(vcpu, sp, parent_pte); -			if (sp->unsync_children) { -				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); -				kvm_mmu_mark_parents_unsync(sp); -			} -			trace_kvm_mmu_get_page(sp, false); -			return sp; -		} +		if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) +			break; + +		mmu_page_add_parent_pte(vcpu, sp, parent_pte); +		if (sp->unsync_children) { +			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); +			kvm_mmu_mark_parents_unsync(sp); +		} else if (sp->unsync) +			kvm_mmu_mark_parents_unsync(sp); + +		trace_kvm_mmu_get_page(sp, false); +		return sp; +	}  	++vcpu->kvm->stat.mmu_cache_miss; -	sp = kvm_mmu_alloc_page(vcpu, 
parent_pte); +	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);  	if (!sp)  		return sp;  	sp->gfn = gfn;  	sp->role = role; -	hlist_add_head(&sp->hash_link, bucket); +	hlist_add_head(&sp->hash_link, +		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);  	if (!direct) {  		if (rmap_write_protect(vcpu->kvm, gfn))  			kvm_flush_remote_tlbs(vcpu->kvm); +		if (level > PT_PAGE_TABLE_LEVEL && need_sync) +			kvm_sync_pages(vcpu, gfn); +  		account_shadowed(vcpu->kvm, gfn);  	}  	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) @@ -1402,6 +1488,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)  	--iterator->level;  } +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +{ +	u64 spte; + +	spte = __pa(sp->spt) +		| PT_PRESENT_MASK | PT_ACCESSED_MASK +		| PT_WRITABLE_MASK | PT_USER_MASK; +	__set_spte(sptep, spte); +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ +	if (is_large_pte(*sptep)) { +		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); +		kvm_flush_remote_tlbs(vcpu->kvm); +	} +} + +static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, +				   unsigned direct_access) +{ +	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { +		struct kvm_mmu_page *child; + +		/* +		 * For the direct sp, if the guest pte's dirty bit +		 * changed form clean to dirty, it will corrupt the +		 * sp's access: allow writable in the read-only sp, +		 * so we should update the spte at this point to get +		 * a new sp with the correct access. +		 */ +		child = page_header(*sptep & PT64_BASE_ADDR_MASK); +		if (child->role.access == direct_access) +			return; + +		mmu_page_remove_parent_pte(child, sptep); +		__set_spte(sptep, shadow_trap_nonpresent_pte); +		kvm_flush_remote_tlbs(vcpu->kvm); +	} +} +  static void kvm_mmu_page_unlink_children(struct kvm *kvm,  					 struct kvm_mmu_page *sp)  { @@ -1422,7 +1549,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,  			} else {  				if (is_large_pte(ent))  					--kvm->stat.lpages; -				rmap_remove(kvm, &pt[i]); +				drop_spte(kvm, &pt[i], +					  shadow_trap_nonpresent_pte);  			}  		}  		pt[i] = shadow_trap_nonpresent_pte; @@ -1464,7 +1592,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)  }  static int mmu_zap_unsync_children(struct kvm *kvm, -				   struct kvm_mmu_page *parent) +				   struct kvm_mmu_page *parent, +				   struct list_head *invalid_list)  {  	int i, zapped = 0;  	struct mmu_page_path parents; @@ -1478,7 +1607,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,  		struct kvm_mmu_page *sp;  		for_each_sp(pages, sp, parents, i) { -			kvm_mmu_zap_page(kvm, sp); +			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);  			mmu_pages_clear_parents(&parents);  			zapped++;  		} @@ -1488,32 +1617,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,  	return zapped;  } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, +				    struct list_head *invalid_list)  {  	int ret; -	trace_kvm_mmu_zap_page(sp); +	trace_kvm_mmu_prepare_zap_page(sp);  	++kvm->stat.mmu_shadow_zapped; -	ret = mmu_zap_unsync_children(kvm, sp); +	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);  	kvm_mmu_page_unlink_children(kvm, sp);  	kvm_mmu_unlink_parents(kvm, sp); -	kvm_flush_remote_tlbs(kvm);  	if (!sp->role.invalid && !sp->role.direct)  		unaccount_shadowed(kvm, sp->gfn);  	if (sp->unsync)  		kvm_unlink_unsync_page(kvm, 
sp);  	if (!sp->root_count) { -		hlist_del(&sp->hash_link); -		kvm_mmu_free_page(kvm, sp); +		/* Count self */ +		ret++; +		list_move(&sp->link, invalid_list);  	} else { -		sp->role.invalid = 1;  		list_move(&sp->link, &kvm->arch.active_mmu_pages);  		kvm_reload_remote_mmus(kvm);  	} + +	sp->role.invalid = 1;  	kvm_mmu_reset_last_pte_updated(kvm);  	return ret;  } +static void kvm_mmu_commit_zap_page(struct kvm *kvm, +				    struct list_head *invalid_list) +{ +	struct kvm_mmu_page *sp; + +	if (list_empty(invalid_list)) +		return; + +	kvm_flush_remote_tlbs(kvm); + +	do { +		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); +		WARN_ON(!sp->role.invalid || sp->root_count); +		kvm_mmu_free_page(kvm, sp); +	} while (!list_empty(invalid_list)); + +} +  /*   * Changing the number of mmu pages allocated to the vm   * Note: if kvm_nr_mmu_pages is too small, you will get dead lock @@ -1521,6 +1670,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)  {  	int used_pages; +	LIST_HEAD(invalid_list);  	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;  	used_pages = max(0, used_pages); @@ -1538,9 +1688,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)  			page = container_of(kvm->arch.active_mmu_pages.prev,  					    struct kvm_mmu_page, link); -			used_pages -= kvm_mmu_zap_page(kvm, page); -			used_pages--; +			used_pages -= kvm_mmu_prepare_zap_page(kvm, page, +							       &invalid_list);  		} +		kvm_mmu_commit_zap_page(kvm, &invalid_list);  		kvm_nr_mmu_pages = used_pages;  		kvm->arch.n_free_mmu_pages = 0;  	} @@ -1553,47 +1704,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)  static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)  { -	unsigned index; -	struct hlist_head *bucket;  	struct kvm_mmu_page *sp; -	struct hlist_node *node, *n; +	struct hlist_node *node; +	LIST_HEAD(invalid_list);  	int r;  	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);  	r = 0; -	index = kvm_page_table_hashfn(gfn); -	bucket = &kvm->arch.mmu_page_hash[index]; -restart: -	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) -		if (sp->gfn == gfn && !sp->role.direct) { -			pgprintk("%s: gfn %lx role %x\n", __func__, gfn, -				 sp->role.word); -			r = 1; -			if (kvm_mmu_zap_page(kvm, sp)) -				goto restart; -		} + +	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { +		pgprintk("%s: gfn %lx role %x\n", __func__, gfn, +			 sp->role.word); +		r = 1; +		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); +	} +	kvm_mmu_commit_zap_page(kvm, &invalid_list);  	return r;  }  static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)  { -	unsigned index; -	struct hlist_head *bucket;  	struct kvm_mmu_page *sp; -	struct hlist_node *node, *nn; +	struct hlist_node *node; +	LIST_HEAD(invalid_list); -	index = kvm_page_table_hashfn(gfn); -	bucket = &kvm->arch.mmu_page_hash[index]; -restart: -	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { -		if (sp->gfn == gfn && !sp->role.direct -		    && !sp->role.invalid) { -			pgprintk("%s: zap %lx %x\n", -				 __func__, gfn, sp->role.word); -			if (kvm_mmu_zap_page(kvm, sp)) -				goto restart; -		} +	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { +		pgprintk("%s: zap %lx %x\n", +			 __func__, gfn, sp->role.word); +		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);  	} +	kvm_mmu_commit_zap_page(kvm, &invalid_list);  }  static void page_header_update_slot(struct kvm 
*kvm, void *pte, gfn_t gfn) @@ -1723,47 +1863,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)  }  EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); -static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)  { -	unsigned index; -	struct hlist_head *bucket; -	struct kvm_mmu_page *s; -	struct hlist_node *node, *n; - -	index = kvm_page_table_hashfn(sp->gfn); -	bucket = &vcpu->kvm->arch.mmu_page_hash[index]; -	/* don't unsync if pagetable is shadowed with multiple roles */ -	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { -		if (s->gfn != sp->gfn || s->role.direct) -			continue; -		if (s->role.word != sp->role.word) -			return 1; -	}  	trace_kvm_mmu_unsync_page(sp);  	++vcpu->kvm->stat.mmu_unsync;  	sp->unsync = 1;  	kvm_mmu_mark_parents_unsync(sp); -  	mmu_convert_notrap(sp); -	return 0; +} + +static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn) +{ +	struct kvm_mmu_page *s; +	struct hlist_node *node; + +	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { +		if (s->unsync) +			continue; +		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); +		__kvm_unsync_page(vcpu, s); +	}  }  static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,  				  bool can_unsync)  { -	struct kvm_mmu_page *shadow; +	struct kvm_mmu_page *s; +	struct hlist_node *node; +	bool need_unsync = false; -	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); -	if (shadow) { -		if (shadow->role.level != PT_PAGE_TABLE_LEVEL) +	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { +		if (!can_unsync)  			return 1; -		if (shadow->unsync) -			return 0; -		if (can_unsync && oos_shadow) -			return kvm_unsync_page(vcpu, shadow); -		return 1; + +		if (s->role.level != PT_PAGE_TABLE_LEVEL) +			return 1; + +		if (!need_unsync && !s->unsync) { +			if (!oos_shadow) +				return 1; +			need_unsync = true; +		}  	} +	if (need_unsync) +		kvm_unsync_pages(vcpu, gfn);  	return 0;  } @@ -1804,13 +1948,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  	spte |= (u64)pfn << PAGE_SHIFT;  	if ((pte_access & ACC_WRITE_MASK) -	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) { +	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu) +		&& !user_fault)) {  		if (level > PT_PAGE_TABLE_LEVEL &&  		    has_wrprotected_page(vcpu->kvm, gfn, level)) {  			ret = 1; -			spte = shadow_trap_nonpresent_pte; -			goto set_pte; +			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); +			goto done;  		}  		spte |= PT_WRITABLE_MASK; @@ -1841,7 +1986,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		mark_page_dirty(vcpu->kvm, gfn);  set_pte: -	__set_spte(sptep, spte); +	if (is_writable_pte(*sptep) && !is_writable_pte(spte)) +		kvm_set_pfn_dirty(pfn); +	update_spte(sptep, spte); +done:  	return ret;  } @@ -1853,7 +2001,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  			 bool reset_host_protection)  {  	int was_rmapped = 0; -	int was_writable = is_writable_pte(*sptep);  	int rmap_count;  	pgprintk("%s: spte %llx access %x write_fault %d" @@ -1878,8 +2025,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		} else if (pfn != spte_to_pfn(*sptep)) {  			pgprintk("hfn old %lx new %lx\n",  				 spte_to_pfn(*sptep), pfn); -			rmap_remove(vcpu->kvm, sptep); -			__set_spte(sptep, shadow_trap_nonpresent_pte); +			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);  			kvm_flush_remote_tlbs(vcpu->kvm);  		} else  			was_rmapped = 1; @@ -1890,7 
+2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  		      reset_host_protection)) {  		if (write_fault)  			*ptwrite = 1; -		kvm_x86_ops->tlb_flush(vcpu); +		kvm_mmu_flush_tlb(vcpu);  	}  	pgprintk("%s: setting spte %llx\n", __func__, *sptep); @@ -1904,15 +2050,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,  	page_header_update_slot(vcpu->kvm, sptep, gfn);  	if (!was_rmapped) {  		rmap_count = rmap_add(vcpu, sptep, gfn); -		kvm_release_pfn_clean(pfn);  		if (rmap_count > RMAP_RECYCLE_THRESHOLD)  			rmap_recycle(vcpu, sptep, gfn); -	} else { -		if (was_writable) -			kvm_release_pfn_dirty(pfn); -		else -			kvm_release_pfn_clean(pfn);  	} +	kvm_release_pfn_clean(pfn);  	if (speculative) {  		vcpu->arch.last_pte_updated = sptep;  		vcpu->arch.last_pte_gfn = gfn; @@ -1941,7 +2082,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,  		}  		if (*iterator.sptep == shadow_trap_nonpresent_pte) { -			pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; +			u64 base_addr = iterator.addr; + +			base_addr &= PT64_LVL_ADDR_MASK(iterator.level); +			pseudo_gfn = base_addr >> PAGE_SHIFT;  			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,  					      iterator.level - 1,  					      1, ACC_ALL, iterator.sptep); @@ -1960,6 +2104,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,  	return pt_write;  } +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +{ +	char buf[1]; +	void __user *hva; +	int r; + +	/* Touch the page, so send SIGBUS */ +	hva = (void __user *)gfn_to_hva(kvm, gfn); +	r = copy_from_user(buf, hva, 1); +} + +static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +{ +	kvm_release_pfn_clean(pfn); +	if (is_hwpoison_pfn(pfn)) { +		kvm_send_hwpoison_signal(kvm, gfn); +		return 0; +	} else if (is_fault_pfn(pfn)) +		return -EFAULT; + +	return 1; +} +  static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)  {  	int r; @@ -1983,10 +2150,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)  	pfn = gfn_to_pfn(vcpu->kvm, gfn);  	/* mmio */ -	if (is_error_pfn(pfn)) { -		kvm_release_pfn_clean(pfn); -		return 1; -	} +	if (is_error_pfn(pfn)) +		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);  	spin_lock(&vcpu->kvm->mmu_lock);  	if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2009,6 +2174,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)  {  	int i;  	struct kvm_mmu_page *sp; +	LIST_HEAD(invalid_list);  	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))  		return; @@ -2018,8 +2184,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)  		sp = page_header(root);  		--sp->root_count; -		if (!sp->root_count && sp->role.invalid) -			kvm_mmu_zap_page(vcpu->kvm, sp); +		if (!sp->root_count && sp->role.invalid) { +			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); +			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); +		}  		vcpu->arch.mmu.root_hpa = INVALID_PAGE;  		spin_unlock(&vcpu->kvm->mmu_lock);  		return; @@ -2032,10 +2200,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)  			sp = page_header(root);  			--sp->root_count;  			if (!sp->root_count && sp->role.invalid) -				kvm_mmu_zap_page(vcpu->kvm, sp); +				kvm_mmu_prepare_zap_page(vcpu->kvm, sp, +							 &invalid_list);  		}  		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;  	} +	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);  	spin_unlock(&vcpu->kvm->mmu_lock);  	vcpu->arch.mmu.root_hpa = INVALID_PAGE;  } @@ -2045,7 +2215,7 @@ static int mmu_check_root(struct 
kvm_vcpu *vcpu, gfn_t root_gfn)  	int ret = 0;  	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { -		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); +		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);  		ret = 1;  	} @@ -2073,6 +2243,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)  			root_gfn = 0;  		}  		spin_lock(&vcpu->kvm->mmu_lock); +		kvm_mmu_free_some_pages(vcpu);  		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,  				      PT64_ROOT_LEVEL, direct,  				      ACC_ALL, NULL); @@ -2103,6 +2274,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)  			root_gfn = i << 30;  		}  		spin_lock(&vcpu->kvm->mmu_lock); +		kvm_mmu_free_some_pages(vcpu);  		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,  				      PT32_ROOT_LEVEL, direct,  				      ACC_ALL, NULL); @@ -2198,10 +2370,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,  	mmu_seq = vcpu->kvm->mmu_notifier_seq;  	smp_rmb();  	pfn = gfn_to_pfn(vcpu->kvm, gfn); -	if (is_error_pfn(pfn)) { -		kvm_release_pfn_clean(pfn); -		return 1; -	} +	if (is_error_pfn(pfn)) +		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);  	spin_lock(&vcpu->kvm->mmu_lock);  	if (mmu_notifier_retry(vcpu, mmu_seq))  		goto out_unlock; @@ -2243,7 +2413,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)  void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)  {  	++vcpu->stat.tlb_flush; -	kvm_x86_ops->tlb_flush(vcpu); +	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);  }  static void paging_new_cr3(struct kvm_vcpu *vcpu) @@ -2457,10 +2627,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)  static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)  {  	ASSERT(vcpu); -	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { +	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) +		/* mmu.free() should set root_hpa = INVALID_PAGE */  		vcpu->arch.mmu.free(vcpu); -		vcpu->arch.mmu.root_hpa = INVALID_PAGE; -	}  }  int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -2477,9 +2646,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)  	r = mmu_topup_memory_caches(vcpu);  	if (r)  		goto out; -	spin_lock(&vcpu->kvm->mmu_lock); -	kvm_mmu_free_some_pages(vcpu); -	spin_unlock(&vcpu->kvm->mmu_lock);  	r = mmu_alloc_roots(vcpu);  	spin_lock(&vcpu->kvm->mmu_lock);  	mmu_sync_roots(vcpu); @@ -2508,7 +2674,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,  	pte = *spte;  	if (is_shadow_present_pte(pte)) {  		if (is_last_spte(pte, sp->role.level)) -			rmap_remove(vcpu->kvm, spte); +			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);  		else {  			child = page_header(pte & PT64_BASE_ADDR_MASK);  			mmu_page_remove_parent_pte(child, spte); @@ -2529,6 +2695,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,  		return;          } +	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) +		return; +  	++vcpu->kvm->stat.mmu_pte_updated;  	if (!sp->role.cr4_pae)  		paging32_update_pte(vcpu, sp, spte, new); @@ -2549,11 +2718,15 @@ static bool need_remote_flush(u64 old, u64 new)  	return (old & ~new & PT64_PERM_MASK) != 0;  } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, +				    bool remote_flush, bool local_flush)  { -	if (need_remote_flush(old, new)) +	if (zap_page) +		return; + +	if (remote_flush)  		kvm_flush_remote_tlbs(vcpu->kvm); -	else +	else if (local_flush)  		kvm_mmu_flush_tlb(vcpu);  } @@ -2603,10 +2776,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,  		       bool guest_initiated)  {  	gfn_t gfn = gpa >> PAGE_SHIFT; +	union kvm_mmu_page_role mask = { 
.word = 0 };  	struct kvm_mmu_page *sp; -	struct hlist_node *node, *n; -	struct hlist_head *bucket; -	unsigned index; +	struct hlist_node *node; +	LIST_HEAD(invalid_list);  	u64 entry, gentry;  	u64 *spte;  	unsigned offset = offset_in_page(gpa); @@ -2619,6 +2792,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,  	int npte;  	int r;  	int invlpg_counter; +	bool remote_flush, local_flush, zap_page; + +	zap_page = remote_flush = local_flush = false;  	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -2674,13 +2850,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,  			vcpu->arch.last_pte_updated = NULL;  		}  	} -	index = kvm_page_table_hashfn(gfn); -	bucket = &vcpu->kvm->arch.mmu_page_hash[index]; -restart: -	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { -		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) -			continue; +	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; +	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {  		pte_size = sp->role.cr4_pae ? 8 : 4;  		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);  		misaligned |= bytes < 4; @@ -2697,8 +2869,8 @@ restart:  			 */  			pgprintk("misaligned: gpa %llx bytes %d role %x\n",  				 gpa, bytes, sp->role.word); -			if (kvm_mmu_zap_page(vcpu->kvm, sp)) -				goto restart; +			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, +						     &invalid_list);  			++vcpu->kvm->stat.mmu_flooded;  			continue;  		} @@ -2722,16 +2894,22 @@ restart:  			if (quadrant != sp->role.quadrant)  				continue;  		} +		local_flush = true;  		spte = &sp->spt[page_offset / sizeof(*spte)];  		while (npte--) {  			entry = *spte;  			mmu_pte_write_zap_pte(vcpu, sp, spte); -			if (gentry) +			if (gentry && +			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word) +			      & mask.word))  				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); -			mmu_pte_write_flush_tlb(vcpu, entry, *spte); +			if (!remote_flush && need_remote_flush(entry, *spte)) +				remote_flush = true;  			++spte;  		}  	} +	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); +	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);  	kvm_mmu_audit(vcpu, "post pte write");  	spin_unlock(&vcpu->kvm->mmu_lock);  	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { @@ -2759,15 +2937,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);  void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)  { -	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && +	int free_pages; +	LIST_HEAD(invalid_list); + +	free_pages = vcpu->kvm->arch.n_free_mmu_pages; +	while (free_pages < KVM_REFILL_PAGES &&  	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {  		struct kvm_mmu_page *sp;  		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,  				  struct kvm_mmu_page, link); -		kvm_mmu_zap_page(vcpu->kvm, sp); +		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, +						       &invalid_list);  		++vcpu->kvm->stat.mmu_recycled;  	} +	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);  }  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -2795,11 +2979,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)  		return 1;  	case EMULATE_DO_MMIO:  		++vcpu->stat.mmio_exits; -		return 0; +		/* fall through */  	case EMULATE_FAIL: -		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; -		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; -		vcpu->run->internal.ndata = 0;  		return 0;  	default:  		BUG(); @@ -2896,7 +3077,7 @@ void 
kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)  		pt = sp->spt;  		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)  			/* avoid RMW */ -			if (pt[i] & PT_WRITABLE_MASK) +			if (is_writable_pte(pt[i]))  				pt[i] &= ~PT_WRITABLE_MASK;  	}  	kvm_flush_remote_tlbs(kvm); @@ -2905,25 +3086,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)  void kvm_mmu_zap_all(struct kvm *kvm)  {  	struct kvm_mmu_page *sp, *node; +	LIST_HEAD(invalid_list);  	spin_lock(&kvm->mmu_lock);  restart:  	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) -		if (kvm_mmu_zap_page(kvm, sp)) +		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))  			goto restart; +	kvm_mmu_commit_zap_page(kvm, &invalid_list);  	spin_unlock(&kvm->mmu_lock); - -	kvm_flush_remote_tlbs(kvm);  } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, +					       struct list_head *invalid_list)  {  	struct kvm_mmu_page *page;  	page = container_of(kvm->arch.active_mmu_pages.prev,  			    struct kvm_mmu_page, link); -	return kvm_mmu_zap_page(kvm, page) + 1; +	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);  }  static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) @@ -2936,6 +3118,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)  	list_for_each_entry(kvm, &vm_list, vm_list) {  		int npages, idx, freed_pages; +		LIST_HEAD(invalid_list);  		idx = srcu_read_lock(&kvm->srcu);  		spin_lock(&kvm->mmu_lock); @@ -2943,12 +3126,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)  			 kvm->arch.n_free_mmu_pages;  		cache_count += npages;  		if (!kvm_freed && nr_to_scan > 0 && npages > 0) { -			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); +			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, +							  &invalid_list);  			cache_count -= freed_pages;  			kvm_freed = kvm;  		}  		nr_to_scan--; +		kvm_mmu_commit_zap_page(kvm, &invalid_list);  		spin_unlock(&kvm->mmu_lock);  		srcu_read_unlock(&kvm->srcu, idx);  	} @@ -3074,7 +3259,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,  static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)  { -	kvm_set_cr3(vcpu, vcpu->arch.cr3); +	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);  	return 1;  } @@ -3331,9 +3516,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)  	struct kvm_mmu_page *rev_sp;  	gfn_t gfn; -	if (*sptep & PT_WRITABLE_MASK) { +	if (is_writable_pte(*sptep)) {  		rev_sp = page_header(__pa(sptep)); -		gfn = rev_sp->gfns[sptep - rev_sp->spt]; +		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);  		if (!gfn_to_memslot(kvm, gfn)) {  			if (!printk_ratelimit()) @@ -3347,8 +3532,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)  			return;  		} -		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], -				    rev_sp->role.level); +		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);  		if (!*rmapp) {  			if (!printk_ratelimit())  				return; @@ -3381,7 +3565,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)  			if (!(ent & PT_PRESENT_MASK))  				continue; -			if (!(ent & PT_WRITABLE_MASK)) +			if (!is_writable_pte(ent))  				continue;  			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);  		} @@ -3409,13 +3593,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)  		if (sp->unsync)  			continue; -		gfn = unalias_gfn(vcpu->kvm, sp->gfn); -		slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); +		slot = 
gfn_to_memslot(vcpu->kvm, sp->gfn);  		rmapp = &slot->rmap[gfn - slot->base_gfn];  		spte = rmap_next(vcpu->kvm, rmapp, NULL);  		while (spte) { -			if (*spte & PT_WRITABLE_MASK) +			if (is_writable_pte(*spte))  				printk(KERN_ERR "%s: (%s) shadow page has "  				"writable mappings: gfn %lx role %x\n",  			       __func__, audit_msg, sp->gfn, diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 42f07b1bfbc9..3aab0f0930ef 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,  	TP_ARGS(sp)  ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,  	TP_PROTO(struct kvm_mmu_page *sp),  	TP_ARGS(sp) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2331bdc2b549..51ef9097960d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,6 +7,7 @@   * MMU support   *   * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates.   *   * Authors:   *   Yaniv Kamay  <yaniv@qumranet.com> @@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,  {  	pt_element_t pte;  	gfn_t table_gfn; -	unsigned index, pt_access, pte_access; +	unsigned index, pt_access, uninitialized_var(pte_access);  	gpa_t pte_gpa; -	int rsvd_fault = 0; +	bool eperm, present, rsvd_fault;  	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,  				     fetch_fault);  walk: +	present = true; +	eperm = rsvd_fault = false;  	walker->level = vcpu->arch.mmu.root_level;  	pte = vcpu->arch.cr3;  #if PTTYPE == 64  	if (!is_long_mode(vcpu)) {  		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);  		trace_kvm_mmu_paging_element(pte, walker->level); -		if (!is_present_gpte(pte)) -			goto not_present; +		if (!is_present_gpte(pte)) { +			present = false; +			goto error; +		}  		--walker->level;  	}  #endif @@ -150,37 +155,42 @@ walk:  		walker->table_gfn[walker->level - 1] = table_gfn;  		walker->pte_gpa[walker->level - 1] = pte_gpa; -		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) -			goto not_present; +		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { +			present = false; +			break; +		}  		trace_kvm_mmu_paging_element(pte, walker->level); -		if (!is_present_gpte(pte)) -			goto not_present; +		if (!is_present_gpte(pte)) { +			present = false; +			break; +		} -		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); -		if (rsvd_fault) -			goto access_error; +		if (is_rsvd_bits_set(vcpu, pte, walker->level)) { +			rsvd_fault = true; +			break; +		}  		if (write_fault && !is_writable_pte(pte))  			if (user_fault || is_write_protection(vcpu)) -				goto access_error; +				eperm = true;  		if (user_fault && !(pte & PT_USER_MASK)) -			goto access_error; +			eperm = true;  #if PTTYPE == 64  		if (fetch_fault && (pte & PT64_NX_MASK)) -			goto access_error; +			eperm = true;  #endif -		if (!(pte & PT_ACCESSED_MASK)) { +		if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {  			trace_kvm_mmu_set_accessed_bit(table_gfn, index,  						       sizeof(pte)); -			mark_page_dirty(vcpu->kvm, table_gfn);  			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,  			    index, pte, pte|PT_ACCESSED_MASK))  				goto walk; +			mark_page_dirty(vcpu->kvm, table_gfn);  			pte |= PT_ACCESSED_MASK;  		} @@ -213,15 +223,18 @@ walk:  		--walker->level;  	} +	if (!present || eperm || rsvd_fault) +		goto error; +  	if (write_fault && !is_dirty_gpte(pte)) {  		bool ret;  		
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); -		mark_page_dirty(vcpu->kvm, table_gfn);  		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,  			    pte|PT_DIRTY_MASK);  		if (ret)  			goto walk; +		mark_page_dirty(vcpu->kvm, table_gfn);  		pte |= PT_DIRTY_MASK;  		walker->ptes[walker->level - 1] = pte;  	} @@ -229,22 +242,18 @@ walk:  	walker->pt_access = pt_access;  	walker->pte_access = pte_access;  	pgprintk("%s: pte %llx pte_access %x pt_access %x\n", -		 __func__, (u64)pte, pt_access, pte_access); +		 __func__, (u64)pte, pte_access, pt_access);  	return 1; -not_present: +error:  	walker->error_code = 0; -	goto err; - -access_error: -	walker->error_code = PFERR_PRESENT_MASK; - -err: +	if (present) +		walker->error_code |= PFERR_PRESENT_MASK;  	if (write_fault)  		walker->error_code |= PFERR_WRITE_MASK;  	if (user_fault)  		walker->error_code |= PFERR_USER_MASK; -	if (fetch_fault) +	if (fetch_fault && is_nx(vcpu))  		walker->error_code |= PFERR_FETCH_MASK;  	if (rsvd_fault)  		walker->error_code |= PFERR_RSVD_MASK; @@ -252,7 +261,7 @@ err:  	return 0;  } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,  			      u64 *spte, const void *pte)  {  	pt_element_t gpte; @@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,  	gpte = *(const pt_element_t *)pte;  	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {  		if (!is_present_gpte(gpte)) { -			if (page->unsync) +			if (sp->unsync)  				new_spte = shadow_trap_nonpresent_pte;  			else  				new_spte = shadow_notrap_nonpresent_pte; @@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,  		return;  	}  	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); -	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); +	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);  	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)  		return;  	pfn = vcpu->arch.update_pte.pfn; @@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,  	 * we call mmu_set_spte() with reset_host_protection = true beacuse that  	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).  	 */ -	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, -		     gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, +	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, +		     is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,  		     gpte_to_gfn(gpte), pfn, true, true);  } +static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, +				struct guest_walker *gw, int level) +{ +	int r; +	pt_element_t curr_pte; + +	r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], +				  &curr_pte, sizeof(curr_pte)); +	return r || curr_pte != gw->ptes[level - 1]; +} +  /*   * Fetch a shadow pte for a specific level in the paging hierarchy.   
*/ @@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,  			 int *ptwrite, pfn_t pfn)  {  	unsigned access = gw->pt_access; -	struct kvm_mmu_page *shadow_page; -	u64 spte, *sptep = NULL; -	int direct; -	gfn_t table_gfn; -	int r; -	int level; -	pt_element_t curr_pte; -	struct kvm_shadow_walk_iterator iterator; +	struct kvm_mmu_page *sp = NULL; +	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); +	int top_level; +	unsigned direct_access; +	struct kvm_shadow_walk_iterator it;  	if (!is_present_gpte(gw->ptes[gw->level - 1]))  		return NULL; -	for_each_shadow_entry(vcpu, addr, iterator) { -		level = iterator.level; -		sptep = iterator.sptep; -		if (iterator.level == hlevel) { -			mmu_set_spte(vcpu, sptep, access, -				     gw->pte_access & access, -				     user_fault, write_fault, -				     gw->ptes[gw->level-1] & PT_DIRTY_MASK, -				     ptwrite, level, -				     gw->gfn, pfn, false, true); -			break; -		} +	direct_access = gw->pt_access & gw->pte_access; +	if (!dirty) +		direct_access &= ~ACC_WRITE_MASK; -		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) -			continue; +	top_level = vcpu->arch.mmu.root_level; +	if (top_level == PT32E_ROOT_LEVEL) +		top_level = PT32_ROOT_LEVEL; +	/* +	 * Verify that the top-level gpte is still there.  Since the page +	 * is a root page, it is either write protected (and cannot be +	 * changed from now on) or it is invalid (in which case, we don't +	 * really care if it changes underneath us after this point). +	 */ +	if (FNAME(gpte_changed)(vcpu, gw, top_level)) +		goto out_gpte_changed; -		if (is_large_pte(*sptep)) { -			rmap_remove(vcpu->kvm, sptep); -			__set_spte(sptep, shadow_trap_nonpresent_pte); -			kvm_flush_remote_tlbs(vcpu->kvm); -		} +	for (shadow_walk_init(&it, vcpu, addr); +	     shadow_walk_okay(&it) && it.level > gw->level; +	     shadow_walk_next(&it)) { +		gfn_t table_gfn; -		if (level <= gw->level) { -			int delta = level - gw->level + 1; -			direct = 1; -			if (!is_dirty_gpte(gw->ptes[level - delta])) -				access &= ~ACC_WRITE_MASK; -			table_gfn = gpte_to_gfn(gw->ptes[level - delta]); -			/* advance table_gfn when emulating 1gb pages with 4k */ -			if (delta == 0) -				table_gfn += PT_INDEX(addr, level); -			access &= gw->pte_access; -		} else { -			direct = 0; -			table_gfn = gw->table_gfn[level - 2]; -		} -		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, -					       direct, access, sptep); -		if (!direct) { -			r = kvm_read_guest_atomic(vcpu->kvm, -						  gw->pte_gpa[level - 2], -						  &curr_pte, sizeof(curr_pte)); -			if (r || curr_pte != gw->ptes[level - 2]) { -				kvm_mmu_put_page(shadow_page, sptep); -				kvm_release_pfn_clean(pfn); -				sptep = NULL; -				break; -			} +		drop_large_spte(vcpu, it.sptep); + +		sp = NULL; +		if (!is_shadow_present_pte(*it.sptep)) { +			table_gfn = gw->table_gfn[it.level - 2]; +			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, +					      false, access, it.sptep);  		} -		spte = __pa(shadow_page->spt) -			| PT_PRESENT_MASK | PT_ACCESSED_MASK -			| PT_WRITABLE_MASK | PT_USER_MASK; -		*sptep = spte; +		/* +		 * Verify that the gpte in the page we've just write +		 * protected is still there. 
+		 */ +		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) +			goto out_gpte_changed; + +		if (sp) +			link_shadow_page(it.sptep, sp);  	} -	return sptep; +	for (; +	     shadow_walk_okay(&it) && it.level > hlevel; +	     shadow_walk_next(&it)) { +		gfn_t direct_gfn; + +		validate_direct_spte(vcpu, it.sptep, direct_access); + +		drop_large_spte(vcpu, it.sptep); + +		if (is_shadow_present_pte(*it.sptep)) +			continue; + +		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + +		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, +				      true, direct_access, it.sptep); +		link_shadow_page(it.sptep, sp); +	} + +	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, +		     user_fault, write_fault, dirty, ptwrite, it.level, +		     gw->gfn, pfn, false, true); + +	return it.sptep; + +out_gpte_changed: +	if (sp) +		kvm_mmu_put_page(sp, it.sptep); +	kvm_release_pfn_clean(pfn); +	return NULL;  }  /* @@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,  	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);  	/* mmio */ -	if (is_error_pfn(pfn)) { -		pgprintk("gfn %lx is mmio\n", walker.gfn); -		kvm_release_pfn_clean(pfn); -		return 1; -	} +	if (is_error_pfn(pfn)) +		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);  	spin_lock(&vcpu->kvm->mmu_lock);  	if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,  	kvm_mmu_free_some_pages(vcpu);  	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,  			     level, &write_pt, pfn); +	(void)sptep;  	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,  		 sptep, *sptep, write_pt); @@ -464,6 +493,7 @@ out_unlock:  static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)  {  	struct kvm_shadow_walk_iterator iterator; +	struct kvm_mmu_page *sp;  	gpa_t pte_gpa = -1;  	int level;  	u64 *sptep; @@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)  		level = iterator.level;  		sptep = iterator.sptep; +		sp = page_header(__pa(sptep));  		if (is_last_spte(*sptep, level)) { -			struct kvm_mmu_page *sp = page_header(__pa(sptep));  			int offset, shift; +			if (!sp->unsync) +				break; +  			shift = PAGE_SHIFT -  				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;  			offset = sp->role.quadrant << shift; @@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)  			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);  			if (is_shadow_present_pte(*sptep)) { -				rmap_remove(vcpu->kvm, sptep);  				if (is_large_pte(*sptep))  					--vcpu->kvm->stat.lpages; +				drop_spte(vcpu->kvm, sptep, +					  shadow_trap_nonpresent_pte);  				need_flush = 1; -			} -			__set_spte(sptep, shadow_trap_nonpresent_pte); +			} else +				__set_spte(sptep, shadow_trap_nonpresent_pte);  			break;  		} -		if (!is_shadow_present_pte(*sptep)) +		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)  			break;  	} @@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,   * Using the cached information from sp->gfns is safe because:   * - The spte has a reference to the struct page, so the pfn for a given gfn   *   can't change unless all sptes pointing to it are nuked first. - * - Alias changes zap the entire shadow cache.   
*/ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +			    bool clear_unsync)  {  	int i, offset, nr_present;  	bool reset_host_protection; @@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)  	offset = nr_present = 0; +	/* direct kvm_mmu_page can not be unsync. */ +	BUG_ON(sp->role.direct); +  	if (PTTYPE == 32)  		offset = sp->role.quadrant << PT64_LEVEL_BITS; @@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)  		unsigned pte_access;  		pt_element_t gpte;  		gpa_t pte_gpa; -		gfn_t gfn = sp->gfns[i]; +		gfn_t gfn;  		if (!is_shadow_present_pte(sp->spt[i]))  			continue; @@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)  					  sizeof(pt_element_t)))  			return -EINVAL; -		if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || -		    !(gpte & PT_ACCESSED_MASK)) { +		gfn = gpte_to_gfn(gpte); +		if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) +		      || gfn != sp->gfns[i] || !is_present_gpte(gpte) +		      || !(gpte & PT_ACCESSED_MASK)) {  			u64 nonpresent; -			rmap_remove(vcpu->kvm, &sp->spt[i]); -			if (is_present_gpte(gpte)) +			if (is_present_gpte(gpte) || !clear_unsync)  				nonpresent = shadow_trap_nonpresent_pte;  			else  				nonpresent = shadow_notrap_nonpresent_pte; -			__set_spte(&sp->spt[i], nonpresent); +			drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);  			continue;  		} diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd26..bc5b9b8d4a33 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,6 +4,7 @@   * AMD SVM support   *   * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates.   *   * Authors:   *   Yaniv Kamay  <yaniv@qumranet.com> @@ -130,7 +131,7 @@ static struct svm_direct_access_msrs {  	u32 index;   /* Index of the MSR */  	bool always; /* True if intercept is always on */  } direct_access_msrs[] = { -	{ .index = MSR_K6_STAR,				.always = true  }, +	{ .index = MSR_STAR,				.always = true  },  	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },  #ifdef CONFIG_X86_64  	{ .index = MSR_GS_BASE,				.always = true  }, @@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)  static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)  { +	vcpu->arch.efer = efer;  	if (!npt_enabled && !(efer & EFER_LMA))  		efer &= ~EFER_LME;  	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; -	vcpu->arch.efer = efer;  }  static int is_external_interrupt(u32 info) @@ -383,8 +384,7 @@ static void svm_init_erratum_383(void)  	int err;  	u64 val; -	/* Only Fam10h is affected */ -	if (boot_cpu_data.x86 != 0x10) +	if (!cpu_has_amd_erratum(amd_erratum_383))  		return;  	/* Use _safe variants to not break nested virtualization */ @@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void)  	if (nested) {  		printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); -		kvm_enable_efer_bits(EFER_SVME); +		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);  	}  	for_each_possible_cpu(cpu) { @@ -806,7 +806,7 @@ static void init_vmcb(struct vcpu_svm *svm)  	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.  	 */  	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; -	kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); +	(void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);  	save->cr4 = X86_CR4_PAE;  	/* rdx = ?? 
*/ @@ -903,13 +903,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)  	svm->asid_generation = 0;  	init_vmcb(svm); -	fx_init(&svm->vcpu); +	err = fx_init(&svm->vcpu); +	if (err) +		goto free_page4; +  	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;  	if (kvm_vcpu_is_bsp(&svm->vcpu))  		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;  	return &svm->vcpu; +free_page4: +	__free_page(hsave_page);  free_page3:  	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);  free_page2: @@ -1488,7 +1493,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)  		 */  		pr_err("KVM: Guest triggered AMD Erratum 383\n"); -		set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); +		kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);  		return;  	} @@ -1535,7 +1540,7 @@ static int io_interception(struct vcpu_svm *svm)  	string = (io_info & SVM_IOIO_STR_MASK) != 0;  	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;  	if (string || in) -		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); +		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;  	port = io_info >> 16;  	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -1957,7 +1962,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)  		svm->vmcb->save.cr3 = hsave->save.cr3;  		svm->vcpu.arch.cr3 = hsave->save.cr3;  	} else { -		kvm_set_cr3(&svm->vcpu, hsave->save.cr3); +		(void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);  	}  	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);  	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); @@ -2080,7 +2085,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)  		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;  		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;  	} else -		kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); +		(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);  	/* Guest paging mode is active - reset mmu */  	kvm_mmu_reset_context(&svm->vcpu); @@ -2386,16 +2391,12 @@ static int iret_interception(struct vcpu_svm *svm)  static int invlpg_interception(struct vcpu_svm *svm)  { -	if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) -		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); -	return 1; +	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;  }  static int emulate_on_interception(struct vcpu_svm *svm)  { -	if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) -		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); -	return 1; +	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;  }  static int cr8_write_interception(struct vcpu_svm *svm) @@ -2431,7 +2432,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)  		*data = tsc_offset + native_read_tsc();  		break;  	} -	case MSR_K6_STAR: +	case MSR_STAR:  		*data = svm->vmcb->save.star;  		break;  #ifdef CONFIG_X86_64 @@ -2555,7 +2556,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)  		break;  	} -	case MSR_K6_STAR: +	case MSR_STAR:  		svm->vmcb->save.star = data;  		break;  #ifdef CONFIG_X86_64 @@ -2726,6 +2727,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {  	[SVM_EXIT_NPF]				= pf_interception,  }; +void dump_vmcb(struct kvm_vcpu *vcpu) +{ +	struct vcpu_svm *svm = to_svm(vcpu); +	struct vmcb_control_area *control = &svm->vmcb->control; +	struct vmcb_save_area *save = &svm->vmcb->save; + +	pr_err("VMCB Control Area:\n"); +	pr_err("cr_read:            %04x\n", control->intercept_cr_read); +	pr_err("cr_write:           %04x\n", control->intercept_cr_write); +	
pr_err("dr_read:            %04x\n", control->intercept_dr_read); +	pr_err("dr_write:           %04x\n", control->intercept_dr_write); +	pr_err("exceptions:         %08x\n", control->intercept_exceptions); +	pr_err("intercepts:         %016llx\n", control->intercept); +	pr_err("pause filter count: %d\n", control->pause_filter_count); +	pr_err("iopm_base_pa:       %016llx\n", control->iopm_base_pa); +	pr_err("msrpm_base_pa:      %016llx\n", control->msrpm_base_pa); +	pr_err("tsc_offset:         %016llx\n", control->tsc_offset); +	pr_err("asid:               %d\n", control->asid); +	pr_err("tlb_ctl:            %d\n", control->tlb_ctl); +	pr_err("int_ctl:            %08x\n", control->int_ctl); +	pr_err("int_vector:         %08x\n", control->int_vector); +	pr_err("int_state:          %08x\n", control->int_state); +	pr_err("exit_code:          %08x\n", control->exit_code); +	pr_err("exit_info1:         %016llx\n", control->exit_info_1); +	pr_err("exit_info2:         %016llx\n", control->exit_info_2); +	pr_err("exit_int_info:      %08x\n", control->exit_int_info); +	pr_err("exit_int_info_err:  %08x\n", control->exit_int_info_err); +	pr_err("nested_ctl:         %lld\n", control->nested_ctl); +	pr_err("nested_cr3:         %016llx\n", control->nested_cr3); +	pr_err("event_inj:          %08x\n", control->event_inj); +	pr_err("event_inj_err:      %08x\n", control->event_inj_err); +	pr_err("lbr_ctl:            %lld\n", control->lbr_ctl); +	pr_err("next_rip:           %016llx\n", control->next_rip); +	pr_err("VMCB State Save Area:\n"); +	pr_err("es:   s: %04x a: %04x l: %08x b: %016llx\n", +		save->es.selector, save->es.attrib, +		save->es.limit, save->es.base); +	pr_err("cs:   s: %04x a: %04x l: %08x b: %016llx\n", +		save->cs.selector, save->cs.attrib, +		save->cs.limit, save->cs.base); +	pr_err("ss:   s: %04x a: %04x l: %08x b: %016llx\n", +		save->ss.selector, save->ss.attrib, +		save->ss.limit, save->ss.base); +	pr_err("ds:   s: %04x a: %04x l: %08x b: %016llx\n", +		save->ds.selector, save->ds.attrib, +		save->ds.limit, save->ds.base); +	pr_err("fs:   s: %04x a: %04x l: %08x b: %016llx\n", +		save->fs.selector, save->fs.attrib, +		save->fs.limit, save->fs.base); +	pr_err("gs:   s: %04x a: %04x l: %08x b: %016llx\n", +		save->gs.selector, save->gs.attrib, +		save->gs.limit, save->gs.base); +	pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", +		save->gdtr.selector, save->gdtr.attrib, +		save->gdtr.limit, save->gdtr.base); +	pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", +		save->ldtr.selector, save->ldtr.attrib, +		save->ldtr.limit, save->ldtr.base); +	pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", +		save->idtr.selector, save->idtr.attrib, +		save->idtr.limit, save->idtr.base); +	pr_err("tr:   s: %04x a: %04x l: %08x b: %016llx\n", +		save->tr.selector, save->tr.attrib, +		save->tr.limit, save->tr.base); +	pr_err("cpl:            %d                efer:         %016llx\n", +		save->cpl, save->efer); +	pr_err("cr0:            %016llx cr2:          %016llx\n", +		save->cr0, save->cr2); +	pr_err("cr3:            %016llx cr4:          %016llx\n", +		save->cr3, save->cr4); +	pr_err("dr6:            %016llx dr7:          %016llx\n", +		save->dr6, save->dr7); +	pr_err("rip:            %016llx rflags:       %016llx\n", +		save->rip, save->rflags); +	pr_err("rsp:            %016llx rax:          %016llx\n", +		save->rsp, save->rax); +	pr_err("star:           %016llx lstar:        %016llx\n", +		save->star, save->lstar); +	pr_err("cstar:          %016llx sfmask:       %016llx\n", +		
save->cstar, save->sfmask); +	pr_err("kernel_gs_base: %016llx sysenter_cs:  %016llx\n", +		save->kernel_gs_base, save->sysenter_cs); +	pr_err("sysenter_esp:   %016llx sysenter_eip: %016llx\n", +		save->sysenter_esp, save->sysenter_eip); +	pr_err("gpat:           %016llx dbgctl:       %016llx\n", +		save->g_pat, save->dbgctl); +	pr_err("br_from:        %016llx br_to:        %016llx\n", +		save->br_from, save->br_to); +	pr_err("excp_from:      %016llx excp_to:      %016llx\n", +		save->last_excp_from, save->last_excp_to); + +} +  static int handle_exit(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu); @@ -2770,6 +2864,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)  		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;  		kvm_run->fail_entry.hardware_entry_failure_reason  			= svm->vmcb->control.exit_code; +		pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); +		dump_vmcb(vcpu);  		return 0;  	} @@ -2826,9 +2922,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)  {  	struct vmcb_control_area *control; -	trace_kvm_inj_virq(irq); - -	++svm->vcpu.stat.irq_injections;  	control = &svm->vmcb->control;  	control->int_vector = irq;  	control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -2842,6 +2935,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)  	BUG_ON(!(gif_set(svm))); +	trace_kvm_inj_virq(vcpu->arch.interrupt.nr); +	++vcpu->stat.irq_injections; +  	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |  		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;  } @@ -3327,6 +3423,11 @@ static bool svm_rdtscp_supported(void)  	return false;  } +static bool svm_has_wbinvd_exit(void) +{ +	return true; +} +  static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)  {  	struct vcpu_svm *svm = to_svm(vcpu); @@ -3411,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = {  	.rdtscp_supported = svm_rdtscp_supported,  	.set_supported_cpuid = svm_set_supported_cpuid, + +	.has_wbinvd_exit = svm_has_wbinvd_exit,  };  static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 4ddadb1a5ffe..e16a0dbe74d8 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -1,3 +1,17 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * timer support + * + * Copyright 2010 Red Hat, Inc. and/or its affilates. + * + * This work is licensed under the terms of the GNU GPL, version 2.  See + * the COPYING file in the top-level directory. + */ +  #include <linux/kvm_host.h>  #include <linux/kvm.h>  #include <linux/hrtimer.h> @@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)  	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {  		atomic_inc(&ktimer->pending);  		/* FIXME: this code should not know anything about vcpus */ -		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); +		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);  	}  	if (waitqueue_active(q)) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe78..49b25eee25ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,6 +5,7 @@   * machines without emulation or binary translation.   *   * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates.   
*   * Authors:   *   Avi Kivity   <avi@qumranet.com> @@ -36,6 +37,8 @@  #include <asm/vmx.h>  #include <asm/virtext.h>  #include <asm/mce.h> +#include <asm/i387.h> +#include <asm/xcr.h>  #include "trace.h" @@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,  static int __read_mostly emulate_invalid_guest_state = 0;  module_param(emulate_invalid_guest_state, bool, S_IRUGO); +static int __read_mostly vmm_exclusive = 1; +module_param(vmm_exclusive, bool, S_IRUGO); +  #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\  	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)  #define KVM_GUEST_CR0_MASK						\ @@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)  static int init_rmode(struct kvm *kvm);  static u64 construct_eptp(unsigned long root_hpa); +static void kvm_cpu_vmxon(u64 addr); +static void kvm_cpu_vmxoff(void);  static DEFINE_PER_CPU(struct vmcs *, vmxarea);  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);  static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +static DEFINE_PER_CPU(struct desc_ptr, host_gdt);  static unsigned long *vmx_io_bitmap_a;  static unsigned long *vmx_io_bitmap_b; @@ -231,14 +240,14 @@ static u64 host_efer;  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);  /* - * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it   * away by decrementing the array size.   */  static const u32 vmx_msr_index[] = {  #ifdef CONFIG_X86_64  	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,  #endif -	MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, +	MSR_EFER, MSR_TSC_AUX, MSR_STAR,  };  #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)  	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;  } +static inline bool cpu_has_vmx_ept_4levels(void) +{ +	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} +  static inline bool cpu_has_vmx_invept_individual_addr(void)  {  	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)  	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;  } +static inline bool cpu_has_vmx_invvpid_single(void) +{ +	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invvpid_global(void) +{ +	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} +  static inline bool cpu_has_vmx_ept(void)  {  	return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)  	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;  } +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ +	return vmcs_config.cpu_based_2nd_exec_ctrl & +		SECONDARY_EXEC_WBINVD_EXITING; +} +  static inline bool report_flexpriority(void)  {  	return flexpriority_enabled; @@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)  		       vmcs, phys_addr);  } +static void vmcs_load(struct vmcs *vmcs) +{ +	u64 phys_addr = __pa(vmcs); +	u8 error; + +	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" +			: "=g"(error) : "a"(&phys_addr), "m"(phys_addr) +			: "cc", "memory"); +	if (error) +		printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", +		       vmcs, phys_addr); +} +  static void __vcpu_clear(void *arg)  {  	struct vcpu_vmx *vmx = arg; @@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)  	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);  } -static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 
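/*
 * The hunk below splits vpid_sync_vcpu_all() into two helpers: a
 * single-context INVVPID issued only when cpu_has_vmx_invvpid_single()
 * is set, and a global INVVPID fallback.  vpid_sync_context() selects
 * between them, and callers later in this patch (vmx_flush_tlb() and
 * vmx_vcpu_reset()) switch from vpid_sync_vcpu_all() to
 * vpid_sync_context().
 */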
+static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)  {  	if (vmx->vpid == 0)  		return; -	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); +	if (cpu_has_vmx_invvpid_single()) +		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); +} + +static inline void vpid_sync_vcpu_global(void) +{ +	if (cpu_has_vmx_invvpid_global()) +		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(struct vcpu_vmx *vmx) +{ +	if (cpu_has_vmx_invvpid_single()) +		vpid_sync_vcpu_single(vmx); +	else +		vpid_sync_vcpu_global();  }  static inline void ept_sync_global(void) @@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)  		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);  	}  #endif +	if (current_thread_info()->status & TS_USEDFPU) +		clts(); +	load_gdt(&__get_cpu_var(host_gdt));  }  static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)  static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu); -	u64 phys_addr = __pa(vmx->vmcs);  	u64 tsc_this, delta, new_offset; +	u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); -	if (vcpu->cpu != cpu) { +	if (!vmm_exclusive) +		kvm_cpu_vmxon(phys_addr); +	else if (vcpu->cpu != cpu)  		vcpu_clear(vmx); -		kvm_migrate_timers(vcpu); -		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); -		local_irq_disable(); -		list_add(&vmx->local_vcpus_link, -			 &per_cpu(vcpus_on_cpu, cpu)); -		local_irq_enable(); -	}  	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { -		u8 error; -  		per_cpu(current_vmcs, cpu) = vmx->vmcs; -		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" -			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) -			      : "cc"); -		if (error) -			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", -			       vmx->vmcs, phys_addr); +		vmcs_load(vmx->vmcs);  	}  	if (vcpu->cpu != cpu) {  		struct desc_ptr dt;  		unsigned long sysenter_esp; +		kvm_migrate_timers(vcpu); +		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); +		local_irq_disable(); +		list_add(&vmx->local_vcpus_link, +			 &per_cpu(vcpus_on_cpu, cpu)); +		local_irq_enable(); +  		vcpu->cpu = cpu;  		/*  		 * Linux uses per-cpu TSS and GDT, so set these when switching @@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  static void vmx_vcpu_put(struct kvm_vcpu *vcpu)  {  	__vmx_load_host_state(to_vmx(vcpu)); +	if (!vmm_exclusive) { +		__vcpu_clear(to_vmx(vcpu)); +		kvm_cpu_vmxoff(); +	}  }  static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -1057,10 +1117,10 @@ static void setup_msrs(struct vcpu_vmx *vmx)  		if (index >= 0 && vmx->rdtscp_enabled)  			move_msr_up(vmx, index, save_nmsrs++);  		/* -		 * MSR_K6_STAR is only needed on long mode guests, and only +		 * MSR_STAR is only needed on long mode guests, and only  		 * if efer.sce is enabled.  		 
*/ -		index = __find_msr_index(vmx, MSR_K6_STAR); +		index = __find_msr_index(vmx, MSR_STAR);  		if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))  			move_msr_up(vmx, index, save_nmsrs++);  	} @@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)  	/* locked but not enabled */  } +static void kvm_cpu_vmxon(u64 addr) +{ +	asm volatile (ASM_VMX_VMXON_RAX +			: : "a"(&addr), "m"(addr) +			: "memory", "cc"); +} +  static int hardware_enable(void *garbage)  {  	int cpu = raw_smp_processor_id(); @@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)  		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);  	}  	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ -	asm volatile (ASM_VMX_VMXON_RAX -		      : : "a"(&phys_addr), "m"(phys_addr) -		      : "memory", "cc"); -	ept_sync_global(); +	if (vmm_exclusive) { +		kvm_cpu_vmxon(phys_addr); +		ept_sync_global(); +	} + +	store_gdt(&__get_cpu_var(host_gdt));  	return 0;  } @@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)  static void kvm_cpu_vmxoff(void)  {  	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); -	write_cr4(read_cr4() & ~X86_CR4_VMXE);  }  static void hardware_disable(void *garbage)  { -	vmclear_local_vcpus(); -	kvm_cpu_vmxoff(); +	if (vmm_exclusive) { +		vmclear_local_vcpus(); +		kvm_cpu_vmxoff(); +	} +	write_cr4(read_cr4() & ~X86_CR4_VMXE);  }  static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)  	if (!cpu_has_vmx_vpid())  		enable_vpid = 0; -	if (!cpu_has_vmx_ept()) { +	if (!cpu_has_vmx_ept() || +	    !cpu_has_vmx_ept_4levels()) {  		enable_ept = 0;  		enable_unrestricted_guest = 0;  	} @@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)  		gfn_t base_gfn;  		slots = kvm_memslots(kvm); -		base_gfn = kvm->memslots->memslots[0].base_gfn + +		base_gfn = slots->memslots[0].base_gfn +  				 kvm->memslots->memslots[0].npages - 3;  		return base_gfn << PAGE_SHIFT;  	} @@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)  static void vmx_flush_tlb(struct kvm_vcpu *vcpu)  { -	vpid_sync_vcpu_all(to_vmx(vcpu)); -	if (enable_ept) +	vpid_sync_context(to_vmx(vcpu)); +	if (enable_ept) { +		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) +			return;  		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); +	}  }  static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)  	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);  	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */ -	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */ +	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */  	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */  	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */ @@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)  static int init_rmode(struct kvm *kvm)  { +	int idx, ret = 0; + +	idx = srcu_read_lock(&kvm->srcu);  	if (!init_rmode_tss(kvm)) -		return 0; +		goto exit;  	if (!init_rmode_identity_map(kvm)) -		return 0; -	return 1; +		goto exit; + +	ret = 1; +exit: +	srcu_read_unlock(&kvm->srcu, idx); +	return ret;  }  static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu);  	u64 msr; -	int ret, idx; +	int ret;  	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); -	idx = srcu_read_lock(&vcpu->kvm->srcu);  	if (!init_rmode(vmx->vcpu.kvm)) {  		ret = -ENOMEM;  		goto out; @@ 
-2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)  		msr |= MSR_IA32_APICBASE_BSP;  	kvm_set_apic_base(&vmx->vcpu, msr); -	fx_init(&vmx->vcpu); +	ret = fx_init(&vmx->vcpu); +	if (ret != 0) +		goto out;  	seg_setup(VCPU_SREG_CS);  	/* @@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)  	vmx_fpu_activate(&vmx->vcpu);  	update_exception_bitmap(&vmx->vcpu); -	vpid_sync_vcpu_all(vmx); +	vpid_sync_context(vmx);  	ret = 0; @@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)  	vmx->emulation_required = 0;  out: -	srcu_read_unlock(&vcpu->kvm->srcu, idx);  	return ret;  } @@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)  {  	if (!cpu_has_virtual_nmis())  		return to_vmx(vcpu)->soft_vnmi_blocked; -	else -		return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & -			  GUEST_INTR_STATE_NMI); +	return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)	& GUEST_INTR_STATE_NMI;  }  static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)  	++vcpu->stat.io_exits;  	if (string || in) -		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); +		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;  	port = exit_qualification >> 16;  	size = (exit_qualification & 7) + 1; @@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)  	hypercall[2] = 0xc1;  } +static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) +{ +	if (err) +		kvm_inject_gp(vcpu, 0); +	else +		skip_emulated_instruction(vcpu); +} +  static int handle_cr(struct kvm_vcpu *vcpu)  {  	unsigned long exit_qualification, val;  	int cr;  	int reg; +	int err;  	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);  	cr = exit_qualification & 15; @@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)  		trace_kvm_cr_write(cr, val);  		switch (cr) {  		case 0: -			kvm_set_cr0(vcpu, val); -			skip_emulated_instruction(vcpu); +			err = kvm_set_cr0(vcpu, val); +			complete_insn_gp(vcpu, err);  			return 1;  		case 3: -			kvm_set_cr3(vcpu, val); -			skip_emulated_instruction(vcpu); +			err = kvm_set_cr3(vcpu, val); +			complete_insn_gp(vcpu, err);  			return 1;  		case 4: -			kvm_set_cr4(vcpu, val); -			skip_emulated_instruction(vcpu); +			err = kvm_set_cr4(vcpu, val); +			complete_insn_gp(vcpu, err);  			return 1;  		case 8: {  				u8 cr8_prev = kvm_get_cr8(vcpu); @@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)  static int handle_wbinvd(struct kvm_vcpu *vcpu)  {  	skip_emulated_instruction(vcpu); -	/* TODO: Add support for VT-d/pass-through device */ +	kvm_emulate_wbinvd(vcpu);  	return 1;  } -static int handle_apic_access(struct kvm_vcpu *vcpu) +static int handle_xsetbv(struct kvm_vcpu *vcpu)  { -	unsigned long exit_qualification; -	enum emulation_result er; -	unsigned long offset; +	u64 new_bv = kvm_read_edx_eax(vcpu); +	u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); -	exit_qualification = vmcs_readl(EXIT_QUALIFICATION); -	offset = exit_qualification & 0xffful; - -	er = emulate_instruction(vcpu, 0, 0, 0); - -	if (er !=  EMULATE_DONE) { -		printk(KERN_ERR -		       "Fail to handle apic access vmexit! 
Offset is 0x%lx\n", -		       offset); -		return -ENOEXEC; -	} +	if (kvm_set_xcr(vcpu, index, new_bv) == 0) +		skip_emulated_instruction(vcpu);  	return 1;  } +static int handle_apic_access(struct kvm_vcpu *vcpu) +{ +	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; +} +  static int handle_task_switch(struct kvm_vcpu *vcpu)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)  			goto out;  		} -		if (err != EMULATE_DONE) { -			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; -			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; -			vcpu->run->internal.ndata = 0; -			ret = 0; -			goto out; -		} +		if (err != EMULATE_DONE) +			return 0;  		if (signal_pending(current))  			goto out; @@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {  	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,  	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,  	[EXIT_REASON_WBINVD]                  = handle_wbinvd, +	[EXIT_REASON_XSETBV]                  = handle_xsetbv,  	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,  	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,  	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation, @@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)  	if (enable_ept && is_paging(vcpu))  		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); +	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { +		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; +		vcpu->run->fail_entry.hardware_entry_failure_reason +			= exit_reason; +		return 0; +	} +  	if (unlikely(vmx->fail)) {  		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;  		vcpu->run->fail_entry.hardware_entry_failure_reason @@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)  	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)  		vmx_set_interrupt_shadow(vcpu, 0); -	/* -	 * Loading guest fpu may have cleared host cr0.ts -	 */ -	vmcs_writel(HOST_CR0, read_cr0()); -  	asm(  		/* Store host registers */  		"push %%"R"dx; push %%"R"bp;" @@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)  	kmem_cache_free(kvm_vcpu_cache, vmx);  } +static inline void vmcs_init(struct vmcs *vmcs) +{ +	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); + +	if (!vmm_exclusive) +		kvm_cpu_vmxon(phys_addr); + +	vmcs_clear(vmcs); + +	if (!vmm_exclusive) +		kvm_cpu_vmxoff(); +} +  static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)  {  	int err; @@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)  	if (!vmx->vmcs)  		goto free_msrs; -	vmcs_clear(vmx->vmcs); +	vmcs_init(vmx->vmcs);  	cpu = get_cpu();  	vmx_vcpu_load(&vmx->vcpu, cpu); @@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {  	.rdtscp_supported = vmx_rdtscp_supported,  	.set_supported_cpuid = vmx_set_supported_cpuid, + +	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,  };  static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fa89c39c64f..25f19078b321 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,6 +6,7 @@   * Copyright (C) 2006 Qumranet, Inc.   * Copyright (C) 2008 Qumranet, Inc.   * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affilates.   
*   * Authors:   *   Avi Kivity   <avi@qumranet.com> @@ -41,17 +42,19 @@  #include <linux/srcu.h>  #include <linux/slab.h>  #include <linux/perf_event.h> +#include <linux/uaccess.h>  #include <trace/events/kvm.h>  #define CREATE_TRACE_POINTS  #include "trace.h"  #include <asm/debugreg.h> -#include <asm/uaccess.h>  #include <asm/msr.h>  #include <asm/desc.h>  #include <asm/mtrr.h>  #include <asm/mce.h> +#include <asm/i387.h> +#include <asm/xcr.h>  #define MAX_IO_MSRS 256  #define CR0_RESERVED_BITS						\ @@ -62,6 +65,7 @@  	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\  			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\  			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\ +			  | X86_CR4_OSXSAVE \  			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))  #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {  	{ NULL }  }; +u64 __read_mostly host_xcr0; + +static inline u32 bit(int bitno) +{ +	return 1 << (bitno & 31); +} +  static void kvm_on_user_return(struct user_return_notifier *urn)  {  	unsigned slot; @@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,  	prev_nr = vcpu->arch.exception.nr;  	if (prev_nr == DF_VECTOR) {  		/* triple fault -> shutdown */ -		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); +		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);  		return;  	}  	class1 = exception_class(prev_nr); @@ -414,121 +425,163 @@ out:  	return changed;  } -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)  { +	unsigned long old_cr0 = kvm_read_cr0(vcpu); +	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | +				    X86_CR0_CD | X86_CR0_NW; +  	cr0 |= X86_CR0_ET;  #ifdef CONFIG_X86_64 -	if (cr0 & 0xffffffff00000000UL) { -		kvm_inject_gp(vcpu, 0); -		return; -	} +	if (cr0 & 0xffffffff00000000UL) +		return 1;  #endif  	cr0 &= ~CR0_RESERVED_BITS; -	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { -		kvm_inject_gp(vcpu, 0); -		return; -	} +	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) +		return 1; -	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { -		kvm_inject_gp(vcpu, 0); -		return; -	} +	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) +		return 1;  	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {  #ifdef CONFIG_X86_64  		if ((vcpu->arch.efer & EFER_LME)) {  			int cs_db, cs_l; -			if (!is_pae(vcpu)) { -				kvm_inject_gp(vcpu, 0); -				return; -			} +			if (!is_pae(vcpu)) +				return 1;  			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); -			if (cs_l) { -				kvm_inject_gp(vcpu, 0); -				return; - -			} +			if (cs_l) +				return 1;  		} else  #endif -		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { -			kvm_inject_gp(vcpu, 0); -			return; -		} - +		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) +			return 1;  	}  	kvm_x86_ops->set_cr0(vcpu, cr0); -	kvm_mmu_reset_context(vcpu); -	return; +	if ((cr0 ^ old_cr0) & update_bits) +		kvm_mmu_reset_context(vcpu); +	return 0;  }  EXPORT_SYMBOL_GPL(kvm_set_cr0);  void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)  { -	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); +	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));  }  EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)  { -	unsigned long old_cr4 = kvm_read_cr4(vcpu); -	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; +	u64 xcr0; -	if (cr4 & 
CR4_RESERVED_BITS) { +	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */ +	if (index != XCR_XFEATURE_ENABLED_MASK) +		return 1; +	xcr0 = xcr; +	if (kvm_x86_ops->get_cpl(vcpu) != 0) +		return 1; +	if (!(xcr0 & XSTATE_FP)) +		return 1; +	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) +		return 1; +	if (xcr0 & ~host_xcr0) +		return 1; +	vcpu->arch.xcr0 = xcr0; +	vcpu->guest_xcr0_loaded = 0; +	return 0; +} + +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ +	if (__kvm_set_xcr(vcpu, index, xcr)) {  		kvm_inject_gp(vcpu, 0); +		return 1; +	} +	return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_xcr); + +static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ +	struct kvm_cpuid_entry2 *best; + +	best = kvm_find_cpuid_entry(vcpu, 1, 0); +	return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static void update_cpuid(struct kvm_vcpu *vcpu) +{ +	struct kvm_cpuid_entry2 *best; + +	best = kvm_find_cpuid_entry(vcpu, 1, 0); +	if (!best)  		return; + +	/* Update OSXSAVE bit */ +	if (cpu_has_xsave && best->function == 0x1) { +		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); +		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) +			best->ecx |= bit(X86_FEATURE_OSXSAVE);  	} +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ +	unsigned long old_cr4 = kvm_read_cr4(vcpu); +	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + +	if (cr4 & CR4_RESERVED_BITS) +		return 1; + +	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) +		return 1;  	if (is_long_mode(vcpu)) { -		if (!(cr4 & X86_CR4_PAE)) { -			kvm_inject_gp(vcpu, 0); -			return; -		} +		if (!(cr4 & X86_CR4_PAE)) +			return 1;  	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)  		   && ((cr4 ^ old_cr4) & pdptr_bits) -		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) { -		kvm_inject_gp(vcpu, 0); -		return; -	} +		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) +		return 1; + +	if (cr4 & X86_CR4_VMXE) +		return 1; -	if (cr4 & X86_CR4_VMXE) { -		kvm_inject_gp(vcpu, 0); -		return; -	}  	kvm_x86_ops->set_cr4(vcpu, cr4); -	vcpu->arch.cr4 = cr4; -	kvm_mmu_reset_context(vcpu); + +	if ((cr4 ^ old_cr4) & pdptr_bits) +		kvm_mmu_reset_context(vcpu); + +	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) +		update_cpuid(vcpu); + +	return 0;  }  EXPORT_SYMBOL_GPL(kvm_set_cr4); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)  {  	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {  		kvm_mmu_sync_roots(vcpu);  		kvm_mmu_flush_tlb(vcpu); -		return; +		return 0;  	}  	if (is_long_mode(vcpu)) { -		if (cr3 & CR3_L_MODE_RESERVED_BITS) { -			kvm_inject_gp(vcpu, 0); -			return; -		} +		if (cr3 & CR3_L_MODE_RESERVED_BITS) +			return 1;  	} else {  		if (is_pae(vcpu)) { -			if (cr3 & CR3_PAE_RESERVED_BITS) { -				kvm_inject_gp(vcpu, 0); -				return; -			} -			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { -				kvm_inject_gp(vcpu, 0); -				return; -			} +			if (cr3 & CR3_PAE_RESERVED_BITS) +				return 1; +			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) +				return 1;  		}  		/*  		 * We don't check reserved bits in nonpae mode, because @@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)  	 * to debug) behavior on the guest side.  	 
*/  	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) -		kvm_inject_gp(vcpu, 0); -	else { -		vcpu->arch.cr3 = cr3; -		vcpu->arch.mmu.new_cr3(vcpu); -	} +		return 1; +	vcpu->arch.cr3 = cr3; +	vcpu->arch.mmu.new_cr3(vcpu); +	return 0;  }  EXPORT_SYMBOL_GPL(kvm_set_cr3); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)  { -	if (cr8 & CR8_RESERVED_BITS) { -		kvm_inject_gp(vcpu, 0); -		return; -	} +	if (cr8 & CR8_RESERVED_BITS) +		return 1;  	if (irqchip_in_kernel(vcpu->kvm))  		kvm_lapic_set_tpr(vcpu, cr8);  	else  		vcpu->arch.cr8 = cr8; +	return 0; +} + +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ +	if (__kvm_set_cr8(vcpu, cr8)) +		kvm_inject_gp(vcpu, 0);  }  EXPORT_SYMBOL_GPL(kvm_set_cr8); @@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)  }  EXPORT_SYMBOL_GPL(kvm_get_cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)  {  	switch (dr) {  	case 0 ... 3: @@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)  			vcpu->arch.eff_db[dr] = val;  		break;  	case 4: -		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { -			kvm_queue_exception(vcpu, UD_VECTOR); -			return 1; -		} +		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) +			return 1; /* #UD */  		/* fall through */  	case 6: -		if (val & 0xffffffff00000000ULL) { -			kvm_inject_gp(vcpu, 0); -			return 1; -		} +		if (val & 0xffffffff00000000ULL) +			return -1; /* #GP */  		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;  		break;  	case 5: -		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { -			kvm_queue_exception(vcpu, UD_VECTOR); -			return 1; -		} +		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) +			return 1; /* #UD */  		/* fall through */  	default: /* 7 */ -		if (val & 0xffffffff00000000ULL) { -			kvm_inject_gp(vcpu, 0); -			return 1; -		} +		if (val & 0xffffffff00000000ULL) +			return -1; /* #GP */  		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;  		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {  			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); @@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)  	return 0;  } + +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ +	int res; + +	res = __kvm_set_dr(vcpu, dr, val); +	if (res > 0) +		kvm_queue_exception(vcpu, UD_VECTOR); +	else if (res < 0) +		kvm_inject_gp(vcpu, 0); + +	return res; +}  EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)  {  	switch (dr) {  	case 0 ... 
3:  		*val = vcpu->arch.db[dr];  		break;  	case 4: -		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { -			kvm_queue_exception(vcpu, UD_VECTOR); +		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))  			return 1; -		}  		/* fall through */  	case 6:  		*val = vcpu->arch.dr6;  		break;  	case 5: -		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { -			kvm_queue_exception(vcpu, UD_VECTOR); +		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))  			return 1; -		}  		/* fall through */  	default: /* 7 */  		*val = vcpu->arch.dr7; @@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)  	return 0;  } -EXPORT_SYMBOL_GPL(kvm_get_dr); -static inline u32 bit(int bitno) +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)  { -	return 1 << (bitno & 31); +	if (_kvm_get_dr(vcpu, dr, val)) { +		kvm_queue_exception(vcpu, UD_VECTOR); +		return 1; +	} +	return 0;  } +EXPORT_SYMBOL_GPL(kvm_get_dr);  /*   * List of msr numbers which we expose to userspace through KVM_GET_MSRS @@ -671,7 +733,7 @@ static u32 msrs_to_save[] = {  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,  	HV_X64_MSR_APIC_ASSIST_PAGE,  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, -	MSR_K6_STAR, +	MSR_STAR,  #ifdef CONFIG_X86_64  	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,  #endif @@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;  static u32 emulated_msrs[] = {  	MSR_IA32_MISC_ENABLE, +	MSR_IA32_MCG_STATUS, +	MSR_IA32_MCG_CTL,  };  static int set_efer(struct kvm_vcpu *vcpu, u64 efer)  { +	u64 old_efer = vcpu->arch.efer; +  	if (efer & efer_reserved_bits)  		return 1; @@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)  	kvm_x86_ops->set_efer(vcpu, efer); -	vcpu->arch.efer = efer; -  	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;  	kvm_mmu_reset_context(vcpu); +	/* Update reserved bits */ +	if ((efer ^ old_efer) & EFER_NX) +		kvm_mmu_reset_context(vcpu); +  	return 0;  } @@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)  	if (!vcpu->time_page)  		return 0; -	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); +	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);  	return 1;  } @@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,  {  	int i, idx; -	vcpu_load(vcpu); -  	idx = srcu_read_lock(&vcpu->kvm->srcu);  	for (i = 0; i < msrs->nmsrs; ++i)  		if (do_msr(vcpu, entries[i].index, &entries[i].data))  			break;  	srcu_read_unlock(&vcpu->kvm->srcu, idx); -	vcpu_put(vcpu); -  	return i;  } @@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)  	case KVM_CAP_PCI_SEGMENT:  	case KVM_CAP_DEBUGREGS:  	case KVM_CAP_X86_ROBUST_SINGLESTEP: +	case KVM_CAP_XSAVE:  		r = 1;  		break;  	case KVM_CAP_COALESCED_MMIO: @@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)  	case KVM_CAP_MCE:  		r = KVM_MAX_MCE_BANKS;  		break; +	case KVM_CAP_XCRS: +		r = cpu_has_xsave; +		break;  	default:  		r = 0;  		break; @@ -1717,8 +1785,28 @@ out:  	return r;  } +static void wbinvd_ipi(void *garbage) +{ +	wbinvd(); +} + +static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ +	return vcpu->kvm->arch.iommu_domain && +		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); +} +  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  { +	/* Address WBINVD may be executed by guest */ +	if (need_emulate_wbinvd(vcpu)) { +		if (kvm_x86_ops->has_wbinvd_exit()) +			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); +		else if (vcpu->cpu != -1 && vcpu->cpu != cpu) +			
smp_call_function_single(vcpu->cpu, +					wbinvd_ipi, NULL, 1); +	} +  	kvm_x86_ops->vcpu_load(vcpu, cpu);  	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {  		unsigned long khz = cpufreq_quick_get(cpu); @@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)  { -	kvm_put_guest_fpu(vcpu);  	kvm_x86_ops->vcpu_put(vcpu); +	kvm_put_guest_fpu(vcpu);  }  static int is_efer_nx(void) @@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,  	if (copy_from_user(cpuid_entries, entries,  			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))  		goto out_free; -	vcpu_load(vcpu);  	for (i = 0; i < cpuid->nent; i++) {  		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;  		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,  	r = 0;  	kvm_apic_set_version(vcpu);  	kvm_x86_ops->cpuid_update(vcpu); -	vcpu_put(vcpu); +	update_cpuid(vcpu);  out_free:  	vfree(cpuid_entries); @@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,  	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,  			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))  		goto out; -	vcpu_load(vcpu);  	vcpu->arch.cpuid_nent = cpuid->nent;  	kvm_apic_set_version(vcpu);  	kvm_x86_ops->cpuid_update(vcpu); -	vcpu_put(vcpu); +	update_cpuid(vcpu);  	return 0;  out: @@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,  {  	int r; -	vcpu_load(vcpu);  	r = -E2BIG;  	if (cpuid->nent < vcpu->arch.cpuid_nent)  		goto out; @@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,  out:  	cpuid->nent = vcpu->arch.cpuid_nent; -	vcpu_put(vcpu);  	return r;  } @@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);  	/* cpuid 1.ecx */  	const u32 kvm_supported_word4_x86_features = -		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | +		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |  		0 /* DS-CPL, VMX, SMX, EST */ |  		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |  		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |  		0 /* Reserved, DCA */ | F(XMM4_1) |  		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | -		0 /* Reserved, XSAVE, OSXSAVE */; +		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);  	/* cpuid 0x80000001.ecx */  	const u32 kvm_supported_word6_x86_features =  		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | @@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  	switch (function) {  	case 0: -		entry->eax = min(entry->eax, (u32)0xb); +		entry->eax = min(entry->eax, (u32)0xd);  		break;  	case 1:  		entry->edx &= kvm_supported_word0_x86_features; @@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,  		}  		break;  	} +	case 0xd: { +		int i; + +		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; +		for (i = 1; *nent < maxnent; ++i) { +			if (entry[i - 1].eax == 0 && i != 2) +				break; +			do_cpuid_1_ent(&entry[i], function, i); +			entry[i].flags |= +			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX; +			++*nent; +		} +		break; +	}  	case KVM_CPUID_SIGNATURE: {  		char signature[12] = "KVMKVMKVM\0\0";  		u32 *sigptr = (u32 *)signature; @@ -2081,9 +2179,7 @@ out:  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,  				    struct kvm_lapic_state *s)  { -	
vcpu_load(vcpu);  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); -	vcpu_put(vcpu);  	return 0;  } @@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,  static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,  				    struct kvm_lapic_state *s)  { -	vcpu_load(vcpu);  	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);  	kvm_apic_post_state_restore(vcpu);  	update_cr8_intercept(vcpu); -	vcpu_put(vcpu);  	return 0;  } @@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,  		return -EINVAL;  	if (irqchip_in_kernel(vcpu->kvm))  		return -ENXIO; -	vcpu_load(vcpu);  	kvm_queue_interrupt(vcpu, irq->irq, false); -	vcpu_put(vcpu); -  	return 0;  }  static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)  { -	vcpu_load(vcpu);  	kvm_inject_nmi(vcpu); -	vcpu_put(vcpu);  	return 0;  } @@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,  	int r;  	unsigned bank_num = mcg_cap & 0xff, bank; -	vcpu_load(vcpu);  	r = -EINVAL;  	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)  		goto out; @@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,  	for (bank = 0; bank < bank_num; bank++)  		vcpu->arch.mce_banks[bank*4] = ~(u64)0;  out: -	vcpu_put(vcpu);  	return r;  } @@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,  			printk(KERN_DEBUG "kvm: set_mce: "  			       "injects mce exception while "  			       "previous one is in progress!\n"); -			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); +			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);  			return 0;  		}  		if (banks[1] & MCI_STATUS_VAL) @@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,  					       struct kvm_vcpu_events *events)  { -	vcpu_load(vcpu); -  	events->exception.injected =  		vcpu->arch.exception.pending &&  		!kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,  	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING  			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR  			 | KVM_VCPUEVENT_VALID_SHADOW); - -	vcpu_put(vcpu);  }  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,  			      | KVM_VCPUEVENT_VALID_SHADOW))  		return -EINVAL; -	vcpu_load(vcpu); -  	vcpu->arch.exception.pending = events->exception.injected;  	vcpu->arch.exception.nr = events->exception.nr;  	vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,  	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)  		vcpu->arch.sipi_vector = events->sipi_vector; -	vcpu_put(vcpu); -  	return 0;  }  static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,  					     struct kvm_debugregs *dbgregs)  { -	vcpu_load(vcpu); -  	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));  	dbgregs->dr6 = vcpu->arch.dr6;  	dbgregs->dr7 = vcpu->arch.dr7;  	dbgregs->flags = 0; - -	vcpu_put(vcpu);  }  static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,  	if (dbgregs->flags)  		return -EINVAL; -	vcpu_load(vcpu); -  	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));  	vcpu->arch.dr6 = dbgregs->dr6;  	vcpu->arch.dr7 = dbgregs->dr7; -	
vcpu_put(vcpu); +	return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, +					 struct kvm_xsave *guest_xsave) +{ +	if (cpu_has_xsave) +		memcpy(guest_xsave->region, +			&vcpu->arch.guest_fpu.state->xsave, +			sizeof(struct xsave_struct)); +	else { +		memcpy(guest_xsave->region, +			&vcpu->arch.guest_fpu.state->fxsave, +			sizeof(struct i387_fxsave_struct)); +		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = +			XSTATE_FPSSE; +	} +} + +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, +					struct kvm_xsave *guest_xsave) +{ +	u64 xstate_bv = +		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; +	if (cpu_has_xsave) +		memcpy(&vcpu->arch.guest_fpu.state->xsave, +			guest_xsave->region, sizeof(struct xsave_struct)); +	else { +		if (xstate_bv & ~XSTATE_FPSSE) +			return -EINVAL; +		memcpy(&vcpu->arch.guest_fpu.state->fxsave, +			guest_xsave->region, sizeof(struct i387_fxsave_struct)); +	}  	return 0;  } +static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, +					struct kvm_xcrs *guest_xcrs) +{ +	if (!cpu_has_xsave) { +		guest_xcrs->nr_xcrs = 0; +		return; +	} + +	guest_xcrs->nr_xcrs = 1; +	guest_xcrs->flags = 0; +	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; +	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; +} + +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, +				       struct kvm_xcrs *guest_xcrs) +{ +	int i, r = 0; + +	if (!cpu_has_xsave) +		return -EINVAL; + +	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) +		return -EINVAL; + +	for (i = 0; i < guest_xcrs->nr_xcrs; i++) +		/* Only support XCR0 currently */ +		if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { +			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, +				guest_xcrs->xcrs[0].value); +			break; +		} +	if (r) +		r = -EINVAL; +	return r; +} +  long kvm_arch_vcpu_ioctl(struct file *filp,  			 unsigned int ioctl, unsigned long arg)  {  	struct kvm_vcpu *vcpu = filp->private_data;  	void __user *argp = (void __user *)arg;  	int r; -	struct kvm_lapic_state *lapic = NULL; +	union { +		struct kvm_lapic_state *lapic; +		struct kvm_xsave *xsave; +		struct kvm_xcrs *xcrs; +		void *buffer; +	} u; +	u.buffer = NULL;  	switch (ioctl) {  	case KVM_GET_LAPIC: {  		r = -EINVAL;  		if (!vcpu->arch.apic)  			goto out; -		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); +		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);  		r = -ENOMEM; -		if (!lapic) +		if (!u.lapic)  			goto out; -		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); +		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);  		if (r)  			goto out;  		r = -EFAULT; -		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) +		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))  			goto out;  		r = 0;  		break; @@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,  		r = -EINVAL;  		if (!vcpu->arch.apic)  			goto out; -		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); +		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);  		r = -ENOMEM; -		if (!lapic) +		if (!u.lapic)  			goto out;  		r = -EFAULT; -		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) +		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))  			goto out; -		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); +		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);  		if (r)  			goto out;  		r = 0; @@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,  		r = -EFAULT;  		if (copy_from_user(&mce, argp, sizeof mce))  		
	goto out; -		vcpu_load(vcpu);  		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); -		vcpu_put(vcpu);  		break;  	}  	case KVM_GET_VCPU_EVENTS: { @@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,  		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);  		break;  	} +	case KVM_GET_XSAVE: { +		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); +		r = -ENOMEM; +		if (!u.xsave) +			break; + +		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + +		r = -EFAULT; +		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) +			break; +		r = 0; +		break; +	} +	case KVM_SET_XSAVE: { +		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); +		r = -ENOMEM; +		if (!u.xsave) +			break; + +		r = -EFAULT; +		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) +			break; + +		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); +		break; +	} +	case KVM_GET_XCRS: { +		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); +		r = -ENOMEM; +		if (!u.xcrs) +			break; + +		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + +		r = -EFAULT; +		if (copy_to_user(argp, u.xcrs, +				 sizeof(struct kvm_xcrs))) +			break; +		r = 0; +		break; +	} +	case KVM_SET_XCRS: { +		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); +		r = -ENOMEM; +		if (!u.xcrs) +			break; + +		r = -EFAULT; +		if (copy_from_user(u.xcrs, argp, +				   sizeof(struct kvm_xcrs))) +			break; + +		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); +		break; +	}  	default:  		r = -EINVAL;  	}  out: -	kfree(lapic); +	kfree(u.buffer);  	return r;  } @@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)  	return kvm->arch.n_alloc_mmu_pages;  } -gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) -{ -	int i; -	struct kvm_mem_alias *alias; -	struct kvm_mem_aliases *aliases; - -	aliases = kvm_aliases(kvm); - -	for (i = 0; i < aliases->naliases; ++i) { -		alias = &aliases->aliases[i]; -		if (alias->flags & KVM_ALIAS_INVALID) -			continue; -		if (gfn >= alias->base_gfn -		    && gfn < alias->base_gfn + alias->npages) -			return alias->target_gfn + gfn - alias->base_gfn; -	} -	return gfn; -} - -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) -{ -	int i; -	struct kvm_mem_alias *alias; -	struct kvm_mem_aliases *aliases; - -	aliases = kvm_aliases(kvm); - -	for (i = 0; i < aliases->naliases; ++i) { -		alias = &aliases->aliases[i]; -		if (gfn >= alias->base_gfn -		    && gfn < alias->base_gfn + alias->npages) -			return alias->target_gfn + gfn - alias->base_gfn; -	} -	return gfn; -} - -/* - * Set a new alias region.  Aliases map a portion of physical memory into - * another portion.  This is useful for memory windows, for example the PC - * VGA region. 
- */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, -					 struct kvm_memory_alias *alias) -{ -	int r, n; -	struct kvm_mem_alias *p; -	struct kvm_mem_aliases *aliases, *old_aliases; - -	r = -EINVAL; -	/* General sanity checks */ -	if (alias->memory_size & (PAGE_SIZE - 1)) -		goto out; -	if (alias->guest_phys_addr & (PAGE_SIZE - 1)) -		goto out; -	if (alias->slot >= KVM_ALIAS_SLOTS) -		goto out; -	if (alias->guest_phys_addr + alias->memory_size -	    < alias->guest_phys_addr) -		goto out; -	if (alias->target_phys_addr + alias->memory_size -	    < alias->target_phys_addr) -		goto out; - -	r = -ENOMEM; -	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); -	if (!aliases) -		goto out; - -	mutex_lock(&kvm->slots_lock); - -	/* invalidate any gfn reference in case of deletion/shrinking */ -	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); -	aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; -	old_aliases = kvm->arch.aliases; -	rcu_assign_pointer(kvm->arch.aliases, aliases); -	synchronize_srcu_expedited(&kvm->srcu); -	kvm_mmu_zap_all(kvm); -	kfree(old_aliases); - -	r = -ENOMEM; -	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); -	if (!aliases) -		goto out_unlock; - -	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - -	p = &aliases->aliases[alias->slot]; -	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; -	p->npages = alias->memory_size >> PAGE_SHIFT; -	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; -	p->flags &= ~(KVM_ALIAS_INVALID); - -	for (n = KVM_ALIAS_SLOTS; n > 0; --n) -		if (aliases->aliases[n - 1].npages) -			break; -	aliases->naliases = n; - -	old_aliases = kvm->arch.aliases; -	rcu_assign_pointer(kvm->arch.aliases, aliases); -	synchronize_srcu_expedited(&kvm->srcu); -	kfree(old_aliases); -	r = 0; - -out_unlock: -	mutex_unlock(&kvm->slots_lock); -out: -	return r; -} -  static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)  {  	int r; @@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,  	struct kvm_memory_slot *memslot;  	unsigned long n;  	unsigned long is_dirty = 0; -	unsigned long *dirty_bitmap = NULL;  	mutex_lock(&kvm->slots_lock); @@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,  	n = kvm_dirty_bitmap_bytes(memslot); -	r = -ENOMEM; -	dirty_bitmap = vmalloc(n); -	if (!dirty_bitmap) -		goto out; -	memset(dirty_bitmap, 0, n); -  	for (i = 0; !is_dirty && i < n/sizeof(long); i++)  		is_dirty = memslot->dirty_bitmap[i];  	/* If nothing is dirty, don't bother messing with page tables. 
*/  	if (is_dirty) {  		struct kvm_memslots *slots, *old_slots; +		unsigned long *dirty_bitmap;  		spin_lock(&kvm->mmu_lock);  		kvm_mmu_slot_remove_write_access(kvm, log->slot);  		spin_unlock(&kvm->mmu_lock); -		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); -		if (!slots) -			goto out_free; +		r = -ENOMEM; +		dirty_bitmap = vmalloc(n); +		if (!dirty_bitmap) +			goto out; +		memset(dirty_bitmap, 0, n); +		r = -ENOMEM; +		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); +		if (!slots) { +			vfree(dirty_bitmap); +			goto out; +		}  		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));  		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; @@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,  		synchronize_srcu_expedited(&kvm->srcu);  		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;  		kfree(old_slots); + +		r = -EFAULT; +		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { +			vfree(dirty_bitmap); +			goto out; +		} +		vfree(dirty_bitmap); +	} else { +		r = -EFAULT; +		if (clear_user(log->dirty_bitmap, n)) +			goto out;  	}  	r = 0; -	if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) -		r = -EFAULT; -out_free: -	vfree(dirty_bitmap);  out:  	mutex_unlock(&kvm->slots_lock);  	return r; @@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,  	union {  		struct kvm_pit_state ps;  		struct kvm_pit_state2 ps2; -		struct kvm_memory_alias alias;  		struct kvm_pit_config pit_config;  	} u; @@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,  			goto out;  		break;  	} -	case KVM_SET_MEMORY_REGION: { -		struct kvm_memory_region kvm_mem; -		struct kvm_userspace_memory_region kvm_userspace_mem; - -		r = -EFAULT; -		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) -			goto out; -		kvm_userspace_mem.slot = kvm_mem.slot; -		kvm_userspace_mem.flags = kvm_mem.flags; -		kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; -		kvm_userspace_mem.memory_size = kvm_mem.memory_size; -		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); -		if (r) -			goto out; -		break; -	}  	case KVM_SET_NR_MMU_PAGES:  		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);  		if (r) @@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,  	case KVM_GET_NR_MMU_PAGES:  		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);  		break; -	case KVM_SET_MEMORY_ALIAS: -		r = -EFAULT; -		if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) -			goto out; -		r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); -		if (r) -			goto out; -		break;  	case KVM_CREATE_IRQCHIP: {  		struct kvm_pic *vpic; @@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,  		}  		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);  		if (ret < 0) { -			r = X86EMUL_UNHANDLEABLE; +			r = X86EMUL_IO_NEEDED;  			goto out;  		} @@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,  		}  		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);  		if (ret < 0) { -			r = X86EMUL_UNHANDLEABLE; +			r = X86EMUL_IO_NEEDED;  			goto out;  		} @@ -3330,10 +3407,10 @@ out:  static int emulator_read_emulated(unsigned long addr,  				  void *val,  				  unsigned int bytes, +				  unsigned int *error_code,  				  struct kvm_vcpu *vcpu)  {  	gpa_t                 gpa; -	u32 error_code;  	if (vcpu->mmio_read_completed) {  		memcpy(val, vcpu->mmio_data, bytes); @@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,  		return X86EMUL_CONTINUE;  	} -	gpa = 
kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); +	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); -	if (gpa == UNMAPPED_GVA) { -		kvm_inject_page_fault(vcpu, addr, error_code); +	if (gpa == UNMAPPED_GVA)  		return X86EMUL_PROPAGATE_FAULT; -	}  	/* For APIC access vmexit */  	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3370,11 +3445,12 @@ mmio:  	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);  	vcpu->mmio_needed = 1; -	vcpu->mmio_phys_addr = gpa; -	vcpu->mmio_size = bytes; -	vcpu->mmio_is_write = 0; +	vcpu->run->exit_reason = KVM_EXIT_MMIO; +	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; +	vcpu->run->mmio.len = vcpu->mmio_size = bytes; +	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; -	return X86EMUL_UNHANDLEABLE; +	return X86EMUL_IO_NEEDED;  }  int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,  static int emulator_write_emulated_onepage(unsigned long addr,  					   const void *val,  					   unsigned int bytes, +					   unsigned int *error_code,  					   struct kvm_vcpu *vcpu)  {  	gpa_t                 gpa; -	u32 error_code; -	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); +	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); -	if (gpa == UNMAPPED_GVA) { -		kvm_inject_page_fault(vcpu, addr, error_code); +	if (gpa == UNMAPPED_GVA)  		return X86EMUL_PROPAGATE_FAULT; -	}  	/* For APIC access vmexit */  	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3420,10 +3494,11 @@ mmio:  		return X86EMUL_CONTINUE;  	vcpu->mmio_needed = 1; -	vcpu->mmio_phys_addr = gpa; -	vcpu->mmio_size = bytes; -	vcpu->mmio_is_write = 1; -	memcpy(vcpu->mmio_data, val, bytes); +	vcpu->run->exit_reason = KVM_EXIT_MMIO; +	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; +	vcpu->run->mmio.len = vcpu->mmio_size = bytes; +	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; +	memcpy(vcpu->run->mmio.data, val, bytes);  	return X86EMUL_CONTINUE;  } @@ -3431,6 +3506,7 @@ mmio:  int emulator_write_emulated(unsigned long addr,  			    const void *val,  			    unsigned int bytes, +			    unsigned int *error_code,  			    struct kvm_vcpu *vcpu)  {  	/* Crossing a page boundary? 
*/ @@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,  		int rc, now;  		now = -addr & ~PAGE_MASK; -		rc = emulator_write_emulated_onepage(addr, val, now, vcpu); +		rc = emulator_write_emulated_onepage(addr, val, now, error_code, +						     vcpu);  		if (rc != X86EMUL_CONTINUE)  			return rc;  		addr += now;  		val += now;  		bytes -= now;  	} -	return emulator_write_emulated_onepage(addr, val, bytes, vcpu); +	return emulator_write_emulated_onepage(addr, val, bytes, error_code, +					       vcpu);  } -EXPORT_SYMBOL_GPL(emulator_write_emulated);  #define CMPXCHG_TYPE(t, ptr, old, new) \  	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) @@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,  				     const void *old,  				     const void *new,  				     unsigned int bytes, +				     unsigned int *error_code,  				     struct kvm_vcpu *vcpu)  {  	gpa_t gpa; @@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,  		goto emul_write;  	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); +	if (is_error_page(page)) { +		kvm_release_page_clean(page); +		goto emul_write; +	}  	kaddr = kmap_atomic(page, KM_USER0);  	kaddr += offset_in_page(gpa); @@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,  emul_write:  	printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); -	return emulator_write_emulated(addr, new, bytes, vcpu); +	return emulator_write_emulated(addr, new, bytes, error_code, vcpu);  }  static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)  	return X86EMUL_CONTINUE;  } -int emulate_clts(struct kvm_vcpu *vcpu) +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)  { -	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); -	kvm_x86_ops->fpu_activate(vcpu); +	if (!need_emulate_wbinvd(vcpu)) +		return X86EMUL_CONTINUE; + +	if (kvm_x86_ops->has_wbinvd_exit()) { +		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, +				wbinvd_ipi, NULL, 1); +		cpumask_clear(vcpu->arch.wbinvd_dirty_mask); +	} +	wbinvd();  	return X86EMUL_CONTINUE;  } +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +int emulate_clts(struct kvm_vcpu *vcpu)  { -	return kvm_get_dr(ctxt->vcpu, dr, dest); +	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); +	kvm_x86_ops->fpu_activate(vcpu); +	return X86EMUL_CONTINUE;  } -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)  { -	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U; - -	return kvm_set_dr(ctxt->vcpu, dr, value & mask); +	return _kvm_get_dr(vcpu, dr, dest);  } -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) +int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)  { -	u8 opcodes[4]; -	unsigned long rip = kvm_rip_read(vcpu); -	unsigned long rip_linear; - -	if (!printk_ratelimit()) -		return; -	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - -	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); - -	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", -	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); +	return __kvm_set_dr(vcpu, dr, value);  } -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);  static u64 mk_cr_64(u64 curr_cr, u32 new_val)  { @@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)  	return value;  } -static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)  { +	int res = 0; +  	switch (cr) {  	case 0: -		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); +		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));  		break;  	case 2:  		vcpu->arch.cr2 = val;  		break;  	case 3: -		kvm_set_cr3(vcpu, val); +		res = kvm_set_cr3(vcpu, val);  		break;  	case 4: -		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); +		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));  		break;  	case 8: -		kvm_set_cr8(vcpu, val & 0xfUL); +		res = __kvm_set_cr8(vcpu, val & 0xfUL);  		break;  	default:  		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); +		res = -1;  	} + +	return res;  }  static int emulator_get_cpl(struct kvm_vcpu *vcpu) @@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)  	kvm_x86_ops->get_gdt(vcpu, dt);  } +static unsigned long emulator_get_cached_segment_base(int seg, +						      struct kvm_vcpu *vcpu) +{ +	return get_segment_base(vcpu, seg); +} +  static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,  					   struct kvm_vcpu *vcpu)  { @@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,  	kvm_set_segment(vcpu, &kvm_seg, seg);  } -static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ -	kvm_x86_ops->set_rflags(vcpu, rflags); -} -  static struct x86_emulate_ops emulate_ops = {  	.read_std            = kvm_read_guest_virt_system,  	.write_std           = kvm_write_guest_virt_system, @@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {  	.set_cached_descriptor = emulator_set_cached_descriptor,  	.get_segment_selector = emulator_get_segment_selector,  	.set_segment_selector = emulator_set_segment_selector, +	.get_cached_segment_base = emulator_get_cached_segment_base,  	.get_gdt             = emulator_get_gdt,  	.get_cr              = emulator_get_cr,  	.set_cr              = emulator_set_cr,  	.cpl                 = emulator_get_cpl, -	.set_rflags          = emulator_set_rflags, +	.get_dr              = emulator_get_dr, +	.set_dr              = emulator_set_dr, +	.set_msr             = kvm_set_msr, +	.get_msr             = kvm_get_msr,  };  static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)  	vcpu->arch.regs_dirty = ~0;  } +static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +{ +	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, 
mask); +	/* +	 * an sti; sti; sequence only disables interrupts for the first +	 * instruction. So, if the last instruction, be it emulated or +	 * not, left the system with the INT_STI flag enabled, it +	 * means that the last instruction is an sti. We should not +	 * leave the flag on in this case. The same goes for mov ss +	 */ +	if (!(int_shadow & mask)) +		kvm_x86_ops->set_interrupt_shadow(vcpu, mask); +} + +static void inject_emulated_exception(struct kvm_vcpu *vcpu) +{ +	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; +	if (ctxt->exception == PF_VECTOR) +		kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); +	else if (ctxt->error_code_valid) +		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); +	else +		kvm_queue_exception(vcpu, ctxt->exception); +} + +static int handle_emulation_failure(struct kvm_vcpu *vcpu) +{ +	++vcpu->stat.insn_emulation_fail; +	trace_kvm_emulate_insn_failed(vcpu); +	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; +	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; +	vcpu->run->internal.ndata = 0; +	kvm_queue_exception(vcpu, UD_VECTOR); +	return EMULATE_FAIL; +} + +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +{ +	gpa_t gpa; + +	if (tdp_enabled) +		return false; + +	/* +	 * if emulation was due to access to a shadowed page table +	 * and it failed, try to unshadow the page and re-enter the +	 * guest to let the CPU execute the instruction. +	 */ +	if (kvm_mmu_unprotect_page_virt(vcpu, gva)) +		return true; + +	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + +	if (gpa == UNMAPPED_GVA) +		return true; /* let cpu generate fault */ + +	if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) +		return true; + +	return false; +} +  int emulate_instruction(struct kvm_vcpu *vcpu,  			unsigned long cr2,  			u16 error_code,  			int emulation_type)  { -	int r, shadow_mask; -	struct decode_cache *c; -	struct kvm_run *run = vcpu->run; +	int r; +	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;  	kvm_clear_exception_queue(vcpu);  	vcpu->arch.mmio_fault_cr2 = cr2; @@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,  	 */  	cache_all_regs(vcpu); -	vcpu->mmio_is_write = 0; -  	if (!(emulation_type & EMULTYPE_NO_DECODE)) {  		int cs_db, cs_l;  		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,  			? X86EMUL_MODE_VM86 : cs_l  			? X86EMUL_MODE_PROT64 :	cs_db  			? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; +		memset(c, 0, sizeof(struct decode_cache)); +		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); +		vcpu->arch.emulate_ctxt.interruptibility = 0; +		vcpu->arch.emulate_ctxt.exception = -1;  		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);  		trace_kvm_emulate_insn_start(vcpu);  		/* Only allow emulation of specific instructions on #UD  		 * (namely VMMCALL, sysenter, sysexit, syscall)*/ -		c = &vcpu->arch.emulate_ctxt.decode;  		if (emulation_type & EMULTYPE_TRAP_UD) {  			if (!c->twobyte)  				return EMULATE_FAIL; @@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,  		++vcpu->stat.insn_emulation;  		if (r)  { -			++vcpu->stat.insn_emulation_fail; -			trace_kvm_emulate_insn_failed(vcpu); -			if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) +			if (reexecute_instruction(vcpu, cr2))  				return EMULATE_DONE; -			return EMULATE_FAIL; +			if (emulation_type & EMULTYPE_SKIP) +				return EMULATE_FAIL; +			return handle_emulation_failure(vcpu);  		}  	} @@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,  		return EMULATE_DONE;  	} +	/* this is needed for the vmware backdoor interface to work since it +	   changes register values during IO operations */ +	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); +  restart:  	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); -	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; -	if (r == 0) -		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); +	if (r) { /* emulation failed */ +		if (reexecute_instruction(vcpu, cr2)) +			return EMULATE_DONE; -	if (vcpu->arch.pio.count) { -		if (!vcpu->arch.pio.in) -			vcpu->arch.pio.count = 0; -		return EMULATE_DO_MMIO; +		return handle_emulation_failure(vcpu);  	} -	if (r || vcpu->mmio_is_write) { -		run->exit_reason = KVM_EXIT_MMIO; -		run->mmio.phys_addr = vcpu->mmio_phys_addr; -		memcpy(run->mmio.data, vcpu->mmio_data, 8); -		run->mmio.len = vcpu->mmio_size; -		run->mmio.is_write = vcpu->mmio_is_write; +	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); +	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); +	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); +	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + +	if (vcpu->arch.emulate_ctxt.exception >= 0) { +		inject_emulated_exception(vcpu); +		return EMULATE_DONE;  	} -	if (r) { -		if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) -			goto done; -		if (!vcpu->mmio_needed) { -			++vcpu->stat.insn_emulation_fail; -			trace_kvm_emulate_insn_failed(vcpu); -			kvm_report_emulation_failure(vcpu, "mmio"); -			return EMULATE_FAIL; -		} +	if (vcpu->arch.pio.count) { +		if (!vcpu->arch.pio.in) +			vcpu->arch.pio.count = 0;  		return EMULATE_DO_MMIO;  	} -	if (vcpu->mmio_is_write) { -		vcpu->mmio_needed = 0; +	if (vcpu->mmio_needed) { +		if (vcpu->mmio_is_write) +			vcpu->mmio_needed = 0;  		return EMULATE_DO_MMIO;  	} -done: -	if (vcpu->arch.exception.pending) -		vcpu->arch.emulate_ctxt.restart = false; -  	if (vcpu->arch.emulate_ctxt.restart)  		goto restart; @@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)  	perf_register_guest_info_callbacks(&kvm_guest_cbs); +	if (cpu_has_xsave) +		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); +  	return 0;  out: @@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)  	kvm_x86_ops->patch_hypercall(vcpu, instruction); -	return emulator_write_emulated(rip, instruction, 3, vcpu); +	return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);  }  void realmode_lgdt(struct kvm_vcpu *vcpu, 
u16 limit, unsigned long base) @@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)  	}  } +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ +	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && +			!vcpu->guest_xcr0_loaded) { +		/* kvm_set_xcr() also depends on this */ +		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); +		vcpu->guest_xcr0_loaded = 1; +	} +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ +	if (vcpu->guest_xcr0_loaded) { +		if (vcpu->arch.xcr0 != host_xcr0) +			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); +		vcpu->guest_xcr0_loaded = 0; +	} +} +  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  {  	int r;  	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&  		vcpu->run->request_interrupt_window; -	if (vcpu->requests) -		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) -			kvm_mmu_unload(vcpu); - -	r = kvm_mmu_reload(vcpu); -	if (unlikely(r)) -		goto out; -  	if (vcpu->requests) { -		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) +		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) +			kvm_mmu_unload(vcpu); +		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))  			__kvm_migrate_timers(vcpu); -		if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) +		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))  			kvm_write_guest_time(vcpu); -		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) +		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))  			kvm_mmu_sync_roots(vcpu); -		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) +		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))  			kvm_x86_ops->tlb_flush(vcpu); -		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, -				       &vcpu->requests)) { +		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {  			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;  			r = 0;  			goto out;  		} -		if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { +		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {  			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;  			r = 0;  			goto out;  		} -		if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { +		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {  			vcpu->fpu_active = 0;  			kvm_x86_ops->fpu_deactivate(vcpu);  		}  	} +	r = kvm_mmu_reload(vcpu); +	if (unlikely(r)) +		goto out; +  	preempt_disable();  	kvm_x86_ops->prepare_guest_switch(vcpu);  	if (vcpu->fpu_active)  		kvm_load_guest_fpu(vcpu); +	kvm_load_guest_xcr0(vcpu); -	local_irq_disable(); +	atomic_set(&vcpu->guest_mode, 1); +	smp_wmb(); -	clear_bit(KVM_REQ_KICK, &vcpu->requests); -	smp_mb__after_clear_bit(); +	local_irq_disable(); -	if (vcpu->requests || need_resched() || signal_pending(current)) { -		set_bit(KVM_REQ_KICK, &vcpu->requests); +	if (!atomic_read(&vcpu->guest_mode) || vcpu->requests +	    || need_resched() || signal_pending(current)) { +		atomic_set(&vcpu->guest_mode, 0); +		smp_wmb();  		local_irq_enable();  		preempt_enable();  		r = 1; @@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  	if (hw_breakpoint_active())  		hw_breakpoint_restore(); -	set_bit(KVM_REQ_KICK, &vcpu->requests); +	atomic_set(&vcpu->guest_mode, 0); +	smp_wmb();  	local_irq_enable();  	++vcpu->stat.exits; @@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)  			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);  			kvm_vcpu_block(vcpu);  			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); -			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) +			if (kvm_check_request(KVM_REQ_UNHALT, 
vcpu))  			{  				switch(vcpu->arch.mp_state) {  				case KVM_MP_STATE_HALTED: @@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  	int r;  	sigset_t sigsaved; -	vcpu_load(vcpu); -  	if (vcpu->sigset_active)  		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);  		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); -		if (r == EMULATE_DO_MMIO) { +		if (r != EMULATE_DONE) {  			r = 0;  			goto out;  		} @@ -4759,14 +4924,11 @@ out:  	if (vcpu->sigset_active)  		sigprocmask(SIG_SETMASK, &sigsaved, NULL); -	vcpu_put(vcpu);  	return r;  }  int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  { -	vcpu_load(vcpu); -  	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);  	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);  	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	regs->rip = kvm_rip_read(vcpu);  	regs->rflags = kvm_get_rflags(vcpu); -	vcpu_put(vcpu); -  	return 0;  }  int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  { -	vcpu_load(vcpu); -  	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);  	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);  	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  	vcpu->arch.exception.pending = false; -	vcpu_put(vcpu); -  	return 0;  } @@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,  {  	struct desc_ptr dt; -	vcpu_load(vcpu); -  	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);  	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);  	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,  		set_bit(vcpu->arch.interrupt.nr,  			(unsigned long *)sregs->interrupt_bitmap); -	vcpu_put(vcpu); -  	return 0;  }  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,  				    struct kvm_mp_state *mp_state)  { -	vcpu_load(vcpu);  	mp_state->mp_state = vcpu->arch.mp_state; -	vcpu_put(vcpu);  	return 0;  }  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,  				    struct kvm_mp_state *mp_state)  { -	vcpu_load(vcpu);  	vcpu->arch.mp_state = mp_state->mp_state; -	vcpu_put(vcpu);  	return 0;  }  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,  		    bool has_error_code, u32 error_code)  { +	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;  	int cs_db, cs_l, ret;  	cache_all_regs(vcpu); @@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,  		? X86EMUL_MODE_VM86 : cs_l  		? X86EMUL_MODE_PROT64 :	cs_db  		? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; +	memset(c, 0, sizeof(struct decode_cache)); +	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);  	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,  				   tss_selector, reason, has_error_code, @@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,  	if (ret)  		return EMULATE_FAIL; +	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); +	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);  	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);  	return EMULATE_DONE;  } @@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  	int pending_vec, max_bits;  	struct desc_ptr dt; -	vcpu_load(vcpu); -  	dt.size = sregs->idt.limit;  	dt.address = sregs->idt.base;  	kvm_x86_ops->set_idt(vcpu, &dt); @@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  	    !is_protmode(vcpu))  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -	vcpu_put(vcpu); -  	return 0;  } @@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,  	unsigned long rflags;  	int i, r; -	vcpu_load(vcpu); -  	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {  		r = -EBUSY;  		if (vcpu->arch.exception.pending) -			goto unlock_out; +			goto out;  		if (dbg->control & KVM_GUESTDBG_INJECT_DB)  			kvm_queue_exception(vcpu, DB_VECTOR);  		else @@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,  	r = 0; -unlock_out: -	vcpu_put(vcpu); +out:  	return r;  }  /* - * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { -	u16	cwd; -	u16	swd; -	u16	twd; -	u16	fop; -	u64	rip; -	u64	rdp; -	u32	mxcsr; -	u32	mxcsr_mask; -	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 -	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */ -#else -	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - -/*   * Translate a guest virtual address to a guest physical address.   
*/  int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,  	gpa_t gpa;  	int idx; -	vcpu_load(vcpu);  	idx = srcu_read_lock(&vcpu->kvm->srcu);  	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);  	srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,  	tr->valid = gpa != UNMAPPED_GVA;  	tr->writeable = 1;  	tr->usermode = 0; -	vcpu_put(vcpu);  	return 0;  }  int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)  { -	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - -	vcpu_load(vcpu); +	struct i387_fxsave_struct *fxsave = +			&vcpu->arch.guest_fpu.state->fxsave;  	memcpy(fpu->fpr, fxsave->st_space, 128);  	fpu->fcw = fxsave->cwd; @@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)  	fpu->last_dp = fxsave->rdp;  	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); -	vcpu_put(vcpu); -  	return 0;  }  int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)  { -	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - -	vcpu_load(vcpu); +	struct i387_fxsave_struct *fxsave = +			&vcpu->arch.guest_fpu.state->fxsave;  	memcpy(fxsave->st_space, fpu->fpr, 128);  	fxsave->cwd = fpu->fcw; @@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)  	fxsave->rdp = fpu->last_dp;  	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); -	vcpu_put(vcpu); -  	return 0;  } -void fx_init(struct kvm_vcpu *vcpu) +int fx_init(struct kvm_vcpu *vcpu)  { -	unsigned after_mxcsr_mask; +	int err; + +	err = fpu_alloc(&vcpu->arch.guest_fpu); +	if (err) +		return err; + +	fpu_finit(&vcpu->arch.guest_fpu);  	/* -	 * Touch the fpu the first time in non atomic context as if -	 * this is the first fpu instruction the exception handler -	 * will fire before the instruction returns and it'll have to -	 * allocate ram with GFP_KERNEL. +	 * Ensure guest xcr0 is valid for loading  	 */ -	if (!used_math()) -		kvm_fx_save(&vcpu->arch.host_fx_image); - -	/* Initialize guest FPU by resetting ours and saving into guest's */ -	preempt_disable(); -	kvm_fx_save(&vcpu->arch.host_fx_image); -	kvm_fx_finit(); -	kvm_fx_save(&vcpu->arch.guest_fx_image); -	kvm_fx_restore(&vcpu->arch.host_fx_image); -	preempt_enable(); +	vcpu->arch.xcr0 = XSTATE_FP;  	vcpu->arch.cr0 |= X86_CR0_ET; -	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); -	vcpu->arch.guest_fx_image.mxcsr = 0x1f80; -	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, -	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); + +	return 0;  }  EXPORT_SYMBOL_GPL(fx_init); +static void fx_free(struct kvm_vcpu *vcpu) +{ +	fpu_free(&vcpu->arch.guest_fpu); +} +  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)  {  	if (vcpu->guest_fpu_loaded)  		return; +	/* +	 * Restore all possible states in the guest, +	 * and assume host would use all available bits. +	 * Guest xcr0 would be loaded later. 
+	 */ +	kvm_put_guest_xcr0(vcpu);  	vcpu->guest_fpu_loaded = 1; -	kvm_fx_save(&vcpu->arch.host_fx_image); -	kvm_fx_restore(&vcpu->arch.guest_fx_image); +	unlazy_fpu(current); +	fpu_restore_checking(&vcpu->arch.guest_fpu);  	trace_kvm_fpu(1);  }  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)  { +	kvm_put_guest_xcr0(vcpu); +  	if (!vcpu->guest_fpu_loaded)  		return;  	vcpu->guest_fpu_loaded = 0; -	kvm_fx_save(&vcpu->arch.guest_fx_image); -	kvm_fx_restore(&vcpu->arch.host_fx_image); +	fpu_save_init(&vcpu->arch.guest_fpu);  	++vcpu->stat.fpu_reload; -	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); +	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);  	trace_kvm_fpu(0);  } @@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)  		vcpu->arch.time_page = NULL;  	} +	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); +	fx_free(vcpu);  	kvm_x86_ops->vcpu_free(vcpu);  } @@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)  {  	int r; -	/* We do fxsave: this must be aligned. */ -	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); -  	vcpu->arch.mtrr_state.have_fixed = 1;  	vcpu_load(vcpu);  	r = kvm_arch_vcpu_reset(vcpu); @@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)  	kvm_mmu_unload(vcpu);  	vcpu_put(vcpu); +	fx_free(vcpu);  	kvm_x86_ops->vcpu_free(vcpu);  } @@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)  	}  	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; +	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) +		goto fail_free_mce_banks; +  	return 0; +fail_free_mce_banks: +	kfree(vcpu->arch.mce_banks);  fail_free_lapic:  	kvm_free_lapic(vcpu);  fail_mmu_destroy: @@ -5364,12 +5490,6 @@ struct  kvm *kvm_arch_create_vm(void)  	if (!kvm)  		return ERR_PTR(-ENOMEM); -	kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); -	if (!kvm->arch.aliases) { -		kfree(kvm); -		return ERR_PTR(-ENOMEM); -	} -  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);  	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)  void kvm_arch_sync_events(struct kvm *kvm)  {  	kvm_free_all_assigned_devices(kvm); +	kvm_free_pit(kvm);  }  void kvm_arch_destroy_vm(struct kvm *kvm)  {  	kvm_iommu_unmap_guest(kvm); -	kvm_free_pit(kvm);  	kfree(kvm->arch.vpic);  	kfree(kvm->arch.vioapic);  	kvm_free_vcpus(kvm); @@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)  	if (kvm->arch.ept_identity_pagetable)  		put_page(kvm->arch.ept_identity_pagetable);  	cleanup_srcu_struct(&kvm->srcu); -	kfree(kvm->arch.aliases);  	kfree(kvm);  } @@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,  				int user_alloc)  {  	int npages = memslot->npages; +	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + +	/* Prevent internal slot pages from being moved by fork()/COW. */ +	if (memslot->id >= KVM_MEMORY_SLOTS) +		map_flags = MAP_SHARED | MAP_ANONYMOUS;  	/*To keep backward compatibility with older userspace,  	 *x86 needs to handle !user_alloc case. 
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,  			userspace_addr = do_mmap(NULL, 0,  						 npages * PAGE_SIZE,  						 PROT_READ | PROT_WRITE, -						 MAP_PRIVATE | MAP_ANONYMOUS, +						 map_flags,  						 0);  			up_write(&current->mm->mmap_sem); @@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)  	me = get_cpu();  	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) -		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) +		if (atomic_xchg(&vcpu->guest_mode, 0))  			smp_send_reschedule(cpu);  	put_cpu();  } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f4b54458285b..b7a404722d2b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)  	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);  } -static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) -{ -	return rcu_dereference_check(kvm->arch.aliases, -			srcu_read_lock_held(&kvm->srcu) -			|| lockdep_is_held(&kvm->slots_lock)); -} -  void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);  void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
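The x86.c hunks above add four new vcpu ioctls (KVM_GET_XSAVE, KVM_SET_XSAVE, KVM_GET_XCRS, KVM_SET_XCRS). As a rough sketch of how userspace might exercise the xsave pair -- not part of this patch; it assumes kernel headers that already export struct kvm_xsave and these ioctl numbers, and a vcpu_fd obtained earlier via KVM_CREATE_VM/KVM_CREATE_VCPU -- the helper below round-trips the 4 KiB xsave region through a vcpu:

/*
 * Illustrative userspace sketch only: save and restore the guest
 * extended FPU state through the new KVM_GET_XSAVE / KVM_SET_XSAVE
 * vcpu ioctls. vcpu_fd is assumed to be an already-created vcpu.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

static int xsave_round_trip(int vcpu_fd)
{
	struct kvm_xsave xsave;	/* 1024 x u32 == the 4 KiB xsave region */

	if (ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave) < 0) {
		perror("KVM_GET_XSAVE");
		return -1;
	}
	/* ... transport the region during save/restore or migration ... */
	if (ioctl(vcpu_fd, KVM_SET_XSAVE, &xsave) < 0) {
		perror("KVM_SET_XSAVE");
		return -1;
	}
	return 0;
}

Note that, per kvm_vcpu_ioctl_x86_set_xsave() above, a restore on a host without XSAVE support is rejected with -EINVAL if the region's xstate_bv header claims anything beyond FP/SSE state.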