From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 2 Aug 2016 20:11:27 +0000 (-0400)
Subject: Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
X-Git-Tag: v4.8-rc1~60
X-Git-Url: http://git.cascardo.info/?p=cascardo%2Flinux.git;a=commitdiff_plain;h=221bb8a46e230b9824204ae86537183d9991ff2a

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:

 - ARM: GICv3 ITS emulation and various fixes.  Removal of the
   old VGIC implementation.

 - s390: support for trapping software breakpoints, nested
   virtualization (vSIE), the STHYI opcode, initial extensions
   for CPU model support.

 - MIPS: support for MIPS64 hosts (32-bit guests only) and lots
   of cleanups, preliminary to this and the upcoming support for
   hardware virtualization extensions.

 - x86: support for execute-only mappings in nested EPT; reduced
   vmexit latency for TSC deadline timer (by about 30%) on Intel
   hosts; support for more than 255 vCPUs.

 - PPC: bugfixes.

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (302 commits)
  KVM: PPC: Introduce KVM_CAP_PPC_HTM
  MIPS: Select HAVE_KVM for MIPS64_R{2,6}
  MIPS: KVM: Reset CP0_PageMask during host TLB flush
  MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX()
  MIPS: KVM: Sign extend MFC0/RDHWR results
  MIPS: KVM: Fix 64-bit big endian dynamic translation
  MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase
  MIPS: KVM: Use 64-bit CP0_EBase when appropriate
  MIPS: KVM: Set CP0_Status.KX on MIPS64
  MIPS: KVM: Make entry code MIPS64 friendly
  MIPS: KVM: Use kmap instead of CKSEG0ADDR()
  MIPS: KVM: Use virt_to_phys() to get commpage PFN
  MIPS: Fix definition of KSEGX() for 64-bit
  KVM: VMX: Add VMCS to CPU's loaded VMCSs before VMPTRLD
  kvm: x86: nVMX: maintain internal copy of current VMCS
  KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE
  KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures
  KVM: arm64: vgic-its: Simplify MAPI error handling
  KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi similar to other handlers
  KVM: arm64: vgic-its: Turn device_id validation into generic ID validation
  ...
---

221bb8a46e230b9824204ae86537183d9991ff2a
diff --cc arch/powerpc/include/asm/paca.h
index ad171e979ab0,4b17bd058e01..148303e7771f
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@@ -25,7 -25,7 +25,8 @@@
  #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
  #include <asm/kvm_book3s_asm.h>
  #endif
 +#include <asm/accounting.h>
+ #include <asm/hmi.h>
  
  register struct paca_struct *local_paca asm("r13");
  
diff --cc arch/powerpc/kernel/Makefile
index fe4c075bcf50,6972a23433d3..b2027a5cf508
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@@ -41,7 -41,8 +41,7 @@@ obj-$(CONFIG_VDSO32)		+= vdso32
  obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
  obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
  obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
- obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
+ obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o hmi.o
 -obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
  obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
  obj-$(CONFIG_PPC64)		+= vdso64/
  obj-$(CONFIG_ALTIVEC)		+= vecemu.o
diff --cc arch/powerpc/kernel/exceptions-64s.S
index 6200e4925d26,0eba47e074b9..694def6c9d61
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@@ -669,8 -680,8 +669,10 @@@ _GLOBAL(__replay_interrupt
  BEGIN_FTR_SECTION
  	cmpwi	r3,0xe80
  	beq	h_doorbell_common
 +	cmpwi	r3,0xea0
 +	beq	h_virt_irq_common
+ 	cmpwi	r3,0xe60
+ 	beq	hmi_exception_common
  FTR_SECTION_ELSE
  	cmpwi	r3,0xa00
  	beq	doorbell_super_common
@@@ -1161,18 -1172,9 +1163,18 @@@ fwnmi_data_area
  	. = 0x8000
  #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
  
 +	STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
 +	STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
 +
 +#ifdef CONFIG_CBE_RAS
 +	STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception)
 +	STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
 +	STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
 +#endif /* CONFIG_CBE_RAS */
 +
  	.globl hmi_exception_early
  hmi_exception_early:
- 	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+ 	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
  	mr	r10,r1			/* Save r1			*/
  	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
  	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
diff --cc arch/powerpc/kernel/idle_book3s.S
index 335eb6cedae5,000000000000..8a56a51fc0cb
mode 100644,000000..100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@@ -1,662 -1,0 +1,664 @@@
 +/*
 + *  This file contains idle entry/exit functions for POWER7,
 + *  POWER8 and POWER9 CPUs.
 + *
 + *  This program is free software; you can redistribute it and/or
 + *  modify it under the terms of the GNU General Public License
 + *  as published by the Free Software Foundation; either version
 + *  2 of the License, or (at your option) any later version.
 + */
 +
 +#include <linux/threads.h>
 +#include <asm/processor.h>
 +#include <asm/page.h>
 +#include <asm/cputable.h>
 +#include <asm/thread_info.h>
 +#include <asm/ppc_asm.h>
 +#include <asm/asm-offsets.h>
 +#include <asm/ppc-opcode.h>
 +#include <asm/hw_irq.h>
 +#include <asm/kvm_book3s_asm.h>
 +#include <asm/opal.h>
 +#include <asm/cpuidle.h>
 +#include <asm/book3s/64/mmu-hash.h>
 +#include <asm/mmu.h>
 +
 +#undef DEBUG
 +
 +/*
 + * Use unused space in the interrupt stack to save and restore
 + * registers for winkle support.
 + */
 +#define _SDR1	GPR3
 +#define _RPR	GPR4
 +#define _SPURR	GPR5
 +#define _PURR	GPR6
 +#define _TSCR	GPR7
 +#define _DSCR	GPR8
 +#define _AMOR	GPR9
 +#define _WORT	GPR10
 +#define _WORC	GPR11
 +#define _PTCR	GPR12
 +
 +#define PSSCR_HV_TEMPLATE	PSSCR_ESL | PSSCR_EC | \
 +				PSSCR_PSLL_MASK | PSSCR_TR_MASK | \
 +				PSSCR_MTL_MASK
 +
 +/* Idle state entry routines */
 +
 +#define	IDLE_STATE_ENTER_SEQ(IDLE_INST)				\
 +	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
 +	std	r0,0(r1);					\
 +	ptesync;						\
 +	ld	r0,0(r1);					\
 +1:	cmp	cr0,r0,r0;					\
 +	bne	1b;						\
 +	IDLE_INST;						\
 +	b	.
 +
 +	.text
 +
 +/*
 + * Used by threads before entering deep idle states. Saves SPRs
 + * in interrupt stack frame
 + */
 +save_sprs_to_stack:
 +	/*
 +	 * Note all register i.e per-core, per-subcore or per-thread is saved
 +	 * here since any thread in the core might wake up first
 +	 */
 +BEGIN_FTR_SECTION
 +	mfspr	r3,SPRN_PTCR
 +	std	r3,_PTCR(r1)
 +	/*
 +	 * Note - SDR1 is dropped in Power ISA v3. Hence not restoring
 +	 * SDR1 here
 +	 */
 +FTR_SECTION_ELSE
 +	mfspr	r3,SPRN_SDR1
 +	std	r3,_SDR1(r1)
 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 +	mfspr	r3,SPRN_RPR
 +	std	r3,_RPR(r1)
 +	mfspr	r3,SPRN_SPURR
 +	std	r3,_SPURR(r1)
 +	mfspr	r3,SPRN_PURR
 +	std	r3,_PURR(r1)
 +	mfspr	r3,SPRN_TSCR
 +	std	r3,_TSCR(r1)
 +	mfspr	r3,SPRN_DSCR
 +	std	r3,_DSCR(r1)
 +	mfspr	r3,SPRN_AMOR
 +	std	r3,_AMOR(r1)
 +	mfspr	r3,SPRN_WORT
 +	std	r3,_WORT(r1)
 +	mfspr	r3,SPRN_WORC
 +	std	r3,_WORC(r1)
 +
 +	blr
 +
 +/*
 + * Used by threads when the lock bit of core_idle_state is set.
 + * Threads will spin in HMT_LOW until the lock bit is cleared.
 + * r14 - pointer to core_idle_state
 + * r15 - used to load contents of core_idle_state
 + */
 +
 +core_idle_lock_held:
 +	HMT_LOW
 +3:	lwz	r15,0(r14)
 +	andi.   r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +	bne	3b
 +	HMT_MEDIUM
 +	lwarx	r15,0,r14
 +	blr
 +
 +/*
 + * Pass requested state in r3:
 + *	r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8
 + *	   - Requested STOP state in POWER9
 + *
 + * To check IRQ_HAPPENED in r4
 + * 	0 - don't check
 + * 	1 - check
 + *
 + * Address to 'rfid' to in r5
 + */
 +_GLOBAL(pnv_powersave_common)
 +	/* Use r3 to pass state nap/sleep/winkle */
 +	/* NAP is a state loss, we create a regs frame on the
 +	 * stack, fill it up with the state we care about and
 +	 * stick a pointer to it in PACAR1. We really only
 +	 * need to save PC, some CR bits and the NV GPRs,
 +	 * but for now an interrupt frame will do.
 +	 */
 +	mflr	r0
 +	std	r0,16(r1)
 +	stdu	r1,-INT_FRAME_SIZE(r1)
 +	std	r0,_LINK(r1)
 +	std	r0,_NIP(r1)
 +
 +	/* Hard disable interrupts */
 +	mfmsr	r9
 +	rldicl	r9,r9,48,1
 +	rotldi	r9,r9,16
 +	mtmsrd	r9,1			/* hard-disable interrupts */
 +
 +	/* Check if something happened while soft-disabled */
 +	lbz	r0,PACAIRQHAPPENED(r13)
 +	andi.	r0,r0,~PACA_IRQ_HARD_DIS@l
 +	beq	1f
 +	cmpwi	cr0,r4,0
 +	beq	1f
 +	addi	r1,r1,INT_FRAME_SIZE
 +	ld	r0,16(r1)
 +	li	r3,0			/* Return 0 (no nap) */
 +	mtlr	r0
 +	blr
 +
 +1:	/* We mark irqs hard disabled as this is the state we'll
 +	 * be in when returning and we need to tell arch_local_irq_restore()
 +	 * about it
 +	 */
 +	li	r0,PACA_IRQ_HARD_DIS
 +	stb	r0,PACAIRQHAPPENED(r13)
 +
 +	/* We haven't lost state ... yet */
 +	li	r0,0
 +	stb	r0,PACA_NAPSTATELOST(r13)
 +
 +	/* Continue saving state */
 +	SAVE_GPR(2, r1)
 +	SAVE_NVGPRS(r1)
 +	mfcr	r4
 +	std	r4,_CCR(r1)
 +	std	r9,_MSR(r1)
 +	std	r1,PACAR1(r13)
 +
 +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 +	/* Tell KVM we're entering idle */
 +	li	r4,KVM_HWTHREAD_IN_IDLE
 +	stb	r4,HSTATE_HWTHREAD_STATE(r13)
 +#endif
 +
 +	/*
 +	 * Go to real mode to do the nap, as required by the architecture.
 +	 * Also, we need to be in real mode before setting hwthread_state,
 +	 * because as soon as we do that, another thread can switch
 +	 * the MMU context to the guest.
 +	 */
 +	LOAD_REG_IMMEDIATE(r7, MSR_IDLE)
 +	li	r6, MSR_RI
 +	andc	r6, r9, r6
 +	mtmsrd	r6, 1		/* clear RI before setting SRR0/1 */
 +	mtspr	SPRN_SRR0, r5
 +	mtspr	SPRN_SRR1, r7
 +	rfid
 +
 +	.globl pnv_enter_arch207_idle_mode
 +pnv_enter_arch207_idle_mode:
 +	stb	r3,PACA_THREAD_IDLE_STATE(r13)
 +	cmpwi	cr3,r3,PNV_THREAD_SLEEP
 +	bge	cr3,2f
 +	IDLE_STATE_ENTER_SEQ(PPC_NAP)
 +	/* No return */
 +2:
 +	/* Sleep or winkle */
 +	lbz	r7,PACA_THREAD_MASK(r13)
 +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +lwarx_loop1:
 +	lwarx	r15,0,r14
 +
 +	andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +	bnel	core_idle_lock_held
 +
 +	andc	r15,r15,r7			/* Clear thread bit */
 +
 +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
 +
 +/*
 + * If cr0 = 0, then current thread is the last thread of the core entering
 + * sleep. Last thread needs to execute the hardware bug workaround code if
 + * required by the platform.
 + * Make the workaround call unconditionally here. The below branch call is
 + * patched out when the idle states are discovered if the platform does not
 + * require it.
 + */
 +.global pnv_fastsleep_workaround_at_entry
 +pnv_fastsleep_workaround_at_entry:
 +	beq	fastsleep_workaround_at_entry
 +
 +	stwcx.	r15,0,r14
 +	bne-	lwarx_loop1
 +	isync
 +
 +common_enter: /* common code for all the threads entering sleep or winkle */
 +	bgt	cr3,enter_winkle
 +	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
 +
 +fastsleep_workaround_at_entry:
 +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +	stwcx.	r15,0,r14
 +	bne-	lwarx_loop1
 +	isync
 +
 +	/* Fast sleep workaround */
 +	li	r3,1
 +	li	r4,1
 +	bl	opal_rm_config_cpu_idle_state
 +
 +	/* Clear Lock bit */
 +	li	r0,0
 +	lwsync
 +	stw	r0,0(r14)
 +	b	common_enter
 +
 +enter_winkle:
 +	bl	save_sprs_to_stack
 +
 +	IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
 +
 +/*
 + * r3 - requested stop state
 + */
 +power_enter_stop:
 +/*
 + * Check if the requested state is a deep idle state.
 + */
 +	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 +	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 +	cmpd	r3,r4
 +	bge	2f
 +	IDLE_STATE_ENTER_SEQ(PPC_STOP)
 +2:
 +/*
 + * Entering deep idle state.
 + * Clear thread bit in PACA_CORE_IDLE_STATE, save SPRs to
 + * stack and enter stop
 + */
 +	lbz     r7,PACA_THREAD_MASK(r13)
 +	ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +
 +lwarx_loop_stop:
 +	lwarx   r15,0,r14
 +	andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +	bnel    core_idle_lock_held
 +	andc    r15,r15,r7                      /* Clear thread bit */
 +
 +	stwcx.  r15,0,r14
 +	bne-    lwarx_loop_stop
 +	isync
 +
 +	bl	save_sprs_to_stack
 +
 +	IDLE_STATE_ENTER_SEQ(PPC_STOP)
 +
 +_GLOBAL(power7_idle)
 +	/* Now check if user or arch enabled NAP mode */
 +	LOAD_REG_ADDRBASE(r3,powersave_nap)
 +	lwz	r4,ADDROFF(powersave_nap)(r3)
 +	cmpwi	0,r4,0
 +	beqlr
 +	li	r3, 1
 +	/* fall through */
 +
 +_GLOBAL(power7_nap)
 +	mr	r4,r3
 +	li	r3,PNV_THREAD_NAP
 +	LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +	b	pnv_powersave_common
 +	/* No return */
 +
 +_GLOBAL(power7_sleep)
 +	li	r3,PNV_THREAD_SLEEP
 +	li	r4,1
 +	LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +	b	pnv_powersave_common
 +	/* No return */
 +
 +_GLOBAL(power7_winkle)
 +	li	r3,PNV_THREAD_WINKLE
 +	li	r4,1
 +	LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
 +	b	pnv_powersave_common
 +	/* No return */
 +
 +#define CHECK_HMI_INTERRUPT						\
 +	mfspr	r0,SPRN_SRR1;						\
 +BEGIN_FTR_SECTION_NESTED(66);						\
 +	rlwinm	r0,r0,45-31,0xf;  /* extract wake reason field (P8) */	\
 +FTR_SECTION_ELSE_NESTED(66);						\
 +	rlwinm	r0,r0,45-31,0xe;  /* P7 wake reason field is 3 bits */	\
 +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
 +	cmpwi	r0,0xa;			/* Hypervisor maintenance ? */	\
 +	bne	20f;							\
 +	/* Invoke opal call to handle hmi */				\
 +	ld	r2,PACATOC(r13);					\
 +	ld	r1,PACAR1(r13);						\
 +	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
- 	bl	opal_rm_handle_hmi;					\
++	li	r3,0;			/* NULL argument */		\
++	bl	hmi_exception_realmode;					\
++	nop;								\
 +	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
 +20:	nop;
 +
 +
 +/*
 + * r3 - requested stop state
 + */
 +_GLOBAL(power9_idle_stop)
 +	LOAD_REG_IMMEDIATE(r4, PSSCR_HV_TEMPLATE)
 +	or	r4,r4,r3
 +	mtspr	SPRN_PSSCR, r4
 +	li	r4, 1
 +	LOAD_REG_ADDR(r5,power_enter_stop)
 +	b	pnv_powersave_common
 +	/* No return */
 +/*
 + * Called from reset vector. Check whether we have woken up with
 + * hypervisor state loss. If yes, restore hypervisor state and return
 + * back to reset vector.
 + *
 + * r13 - Contents of HSPRG0
 + * cr3 - set to gt if waking up with partial/complete hypervisor state loss
 + */
 +_GLOBAL(pnv_restore_hyp_resource)
 +	ld	r2,PACATOC(r13);
 +BEGIN_FTR_SECTION
 +	/*
 +	 * POWER ISA 3. Use PSSCR to determine if we
 +	 * are waking up from deep idle state
 +	 */
 +	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
 +	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
 +
 +	mfspr	r5,SPRN_PSSCR
 +	/*
 +	 * 0-3 bits correspond to Power-Saving Level Status
 +	 * which indicates the idle state we are waking up from
 +	 */
 +	rldicl  r5,r5,4,60
 +	cmpd	cr4,r5,r4
 +	bge	cr4,pnv_wakeup_tb_loss
 +	/*
 +	 * Waking up without hypervisor state loss. Return to
 +	 * reset vector
 +	 */
 +	blr
 +
 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 +
 +	/*
 +	 * POWER ISA 2.07 or less.
 +	 * Check if last bit of HSPGR0 is set. This indicates whether we are
 +	 * waking up from winkle.
 +	 */
 +	clrldi	r5,r13,63
 +	clrrdi	r13,r13,1
 +	cmpwi	cr4,r5,1
 +	mtspr	SPRN_HSPRG0,r13
 +
 +	lbz	r0,PACA_THREAD_IDLE_STATE(r13)
 +	cmpwi   cr2,r0,PNV_THREAD_NAP
 +	bgt     cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
 +
 +	/*
 +	 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
 +	 * up from nap. At this stage CR3 shouldn't contains 'gt' since that
 +	 * indicates we are waking with hypervisor state loss from nap.
 +	 */
 +	bgt	cr3,.
 +
 +	blr	/* Return back to System Reset vector from where
 +		   pnv_restore_hyp_resource was invoked */
 +
 +/*
 + * Called if waking up from idle state which can cause either partial or
 + * complete hyp state loss.
 + * In POWER8, called if waking up from fastsleep or winkle
 + * In POWER9, called if waking up from stop state >= pnv_first_deep_stop_state
 + *
 + * r13 - PACA
 + * cr3 - gt if waking up with partial/complete hypervisor state loss
 + * cr4 - eq if waking up from complete hypervisor state loss.
 + */
 +_GLOBAL(pnv_wakeup_tb_loss)
 +	ld	r1,PACAR1(r13)
 +	/*
 +	 * Before entering any idle state, the NVGPRs are saved in the stack
 +	 * and they are restored before switching to the process context. Hence
 +	 * until they are restored, they are free to be used.
 +	 *
 +	 * Save SRR1 and LR in NVGPRs as they might be clobbered in
 +	 * opal_call() (called in CHECK_HMI_INTERRUPT). SRR1 is required
 +	 * to determine the wakeup reason if we branch to kvm_start_guest. LR
 +	 * is required to return back to reset vector after hypervisor state
 +	 * restore is complete.
 +	 */
 +	mflr	r17
 +	mfspr	r16,SPRN_SRR1
 +BEGIN_FTR_SECTION
 +	CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +
 +	lbz	r7,PACA_THREAD_MASK(r13)
 +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
 +lwarx_loop2:
 +	lwarx	r15,0,r14
 +	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
 +	/*
 +	 * Lock bit is set in one of the 2 cases-
 +	 * a. In the sleep/winkle enter path, the last thread is executing
 +	 * fastsleep workaround code.
 +	 * b. In the wake up path, another thread is executing fastsleep
 +	 * workaround undo code or resyncing timebase or restoring context
 +	 * In either case loop until the lock bit is cleared.
 +	 */
 +	bnel	core_idle_lock_held
 +
 +	cmpwi	cr2,r15,0
 +
 +	/*
 +	 * At this stage
 +	 * cr2 - eq if first thread to wakeup in core
 +	 * cr3-  gt if waking up with partial/complete hypervisor state loss
 +	 * cr4 - eq if waking up from complete hypervisor state loss.
 +	 */
 +
 +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
 +	stwcx.	r15,0,r14
 +	bne-	lwarx_loop2
 +	isync
 +
 +BEGIN_FTR_SECTION
 +	lbz	r4,PACA_SUBCORE_SIBLING_MASK(r13)
 +	and	r4,r4,r15
 +	cmpwi	r4,0	/* Check if first in subcore */
 +
 +	or	r15,r15,r7		/* Set thread bit */
 +	beq	first_thread_in_subcore
 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 +
 +	or	r15,r15,r7		/* Set thread bit */
 +	beq	cr2,first_thread_in_core
 +
 +	/* Not first thread in core or subcore to wake up */
 +	b	clear_lock
 +
 +first_thread_in_subcore:
 +	/*
 +	 * If waking up from sleep, subcore state is not lost. Hence
 +	 * skip subcore state restore
 +	 */
 +	bne	cr4,subcore_state_restored
 +
 +	/* Restore per-subcore state */
 +	ld      r4,_SDR1(r1)
 +	mtspr   SPRN_SDR1,r4
 +
 +	ld      r4,_RPR(r1)
 +	mtspr   SPRN_RPR,r4
 +	ld	r4,_AMOR(r1)
 +	mtspr	SPRN_AMOR,r4
 +
 +subcore_state_restored:
 +	/*
 +	 * Check if the thread is also the first thread in the core. If not,
 +	 * skip to clear_lock.
 +	 */
 +	bne	cr2,clear_lock
 +
 +first_thread_in_core:
 +
 +	/*
 +	 * First thread in the core waking up from any state which can cause
 +	 * partial or complete hypervisor state loss. It needs to
 +	 * call the fastsleep workaround code if the platform requires it.
 +	 * Call it unconditionally here. The below branch instruction will
 +	 * be patched out if the platform does not have fastsleep or does not
 +	 * require the workaround. Patching will be performed during the
 +	 * discovery of idle-states.
 +	 */
 +.global pnv_fastsleep_workaround_at_exit
 +pnv_fastsleep_workaround_at_exit:
 +	b	fastsleep_workaround_at_exit
 +
 +timebase_resync:
 +	/*
 +	 * Use cr3 which indicates that we are waking up with atleast partial
 +	 * hypervisor state loss to determine if TIMEBASE RESYNC is needed.
 +	 */
 +	ble	cr3,clear_lock
 +	/* Time base re-sync */
 +	bl	opal_rm_resync_timebase;
 +	/*
 +	 * If waking up from sleep, per core state is not lost, skip to
 +	 * clear_lock.
 +	 */
 +	bne	cr4,clear_lock
 +
 +	/*
 +	 * First thread in the core to wake up and its waking up with
 +	 * complete hypervisor state loss. Restore per core hypervisor
 +	 * state.
 +	 */
 +BEGIN_FTR_SECTION
 +	ld	r4,_PTCR(r1)
 +	mtspr	SPRN_PTCR,r4
 +	ld	r4,_RPR(r1)
 +	mtspr	SPRN_RPR,r4
 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 +
 +	ld	r4,_TSCR(r1)
 +	mtspr	SPRN_TSCR,r4
 +	ld	r4,_WORC(r1)
 +	mtspr	SPRN_WORC,r4
 +
 +clear_lock:
 +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
 +	lwsync
 +	stw	r15,0(r14)
 +
 +common_exit:
 +	/*
 +	 * Common to all threads.
 +	 *
 +	 * If waking up from sleep, hypervisor state is not lost. Hence
 +	 * skip hypervisor state restore.
 +	 */
 +	bne	cr4,hypervisor_state_restored
 +
 +	/* Waking up from winkle */
 +
 +BEGIN_MMU_FTR_SECTION
 +	b	no_segments
 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX)
 +	/* Restore SLB  from PACA */
 +	ld	r8,PACA_SLBSHADOWPTR(r13)
 +
 +	.rept	SLB_NUM_BOLTED
 +	li	r3, SLBSHADOW_SAVEAREA
 +	LDX_BE	r5, r8, r3
 +	addi	r3, r3, 8
 +	LDX_BE	r6, r8, r3
 +	andis.	r7,r5,SLB_ESID_V@h
 +	beq	1f
 +	slbmte	r6,r5
 +1:	addi	r8,r8,16
 +	.endr
 +no_segments:
 +
 +	/* Restore per thread state */
 +
 +	ld	r4,_SPURR(r1)
 +	mtspr	SPRN_SPURR,r4
 +	ld	r4,_PURR(r1)
 +	mtspr	SPRN_PURR,r4
 +	ld	r4,_DSCR(r1)
 +	mtspr	SPRN_DSCR,r4
 +	ld	r4,_WORT(r1)
 +	mtspr	SPRN_WORT,r4
 +
 +	/* Call cur_cpu_spec->cpu_restore() */
 +	LOAD_REG_ADDR(r4, cur_cpu_spec)
 +	ld	r4,0(r4)
 +	ld	r12,CPU_SPEC_RESTORE(r4)
 +#ifdef PPC64_ELF_ABI_v1
 +	ld	r12,0(r12)
 +#endif
 +	mtctr	r12
 +	bctrl
 +
 +hypervisor_state_restored:
 +
 +	mtspr	SPRN_SRR1,r16
 +	mtlr	r17
 +	blr	/* Return back to System Reset vector from where
 +		   pnv_restore_hyp_resource was invoked */
 +
 +fastsleep_workaround_at_exit:
 +	li	r3,1
 +	li	r4,0
 +	bl	opal_rm_config_cpu_idle_state
 +	b	timebase_resync
 +
 +/*
 + * R3 here contains the value that will be returned to the caller
 + * of power7_nap.
 + */
 +_GLOBAL(pnv_wakeup_loss)
 +	ld	r1,PACAR1(r13)
 +BEGIN_FTR_SECTION
 +	CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +	REST_NVGPRS(r1)
 +	REST_GPR(2, r1)
 +	ld	r6,_CCR(r1)
 +	ld	r4,_MSR(r1)
 +	ld	r5,_NIP(r1)
 +	addi	r1,r1,INT_FRAME_SIZE
 +	mtcr	r6
 +	mtspr	SPRN_SRR1,r4
 +	mtspr	SPRN_SRR0,r5
 +	rfid
 +
 +/*
 + * R3 here contains the value that will be returned to the caller
 + * of power7_nap.
 + */
 +_GLOBAL(pnv_wakeup_noloss)
 +	lbz	r0,PACA_NAPSTATELOST(r13)
 +	cmpwi	r0,0
 +	bne	pnv_wakeup_loss
 +BEGIN_FTR_SECTION
 +	CHECK_HMI_INTERRUPT
 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 +	ld	r1,PACAR1(r13)
 +	ld	r6,_CCR(r1)
 +	ld	r4,_MSR(r1)
 +	ld	r5,_NIP(r1)
 +	addi	r1,r1,INT_FRAME_SIZE
 +	mtcr	r6
 +	mtspr	SPRN_SRR1,r4
 +	mtspr	SPRN_SRR0,r5
 +	rfid
diff --cc arch/powerpc/kernel/traps.c
index f7e2f2e318bd,9ec95daccad9..2cb589264cb7
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@@ -60,7 -60,7 +60,8 @@@
  #include <asm/switch_to.h>
  #include <asm/tm.h>
  #include <asm/debug.h>
 +#include <asm/asm-prototypes.h>
+ #include <asm/hmi.h>
  #include <sysdev/fsl_pci.h>
  
  #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
diff --cc arch/s390/include/asm/mmu.h
index 18226437a832,b941528cc49e..6d39329c894b
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@@ -6,10 -6,11 +6,11 @@@
  
  typedef struct {
  	cpumask_t cpu_attach_mask;
 -	atomic_t attach_count;
 +	atomic_t flush_count;
  	unsigned int flush_mm;
- 	spinlock_t list_lock;
+ 	spinlock_t pgtable_lock;
  	struct list_head pgtable_list;
+ 	spinlock_t gmap_lock;
  	struct list_head gmap_list;
  	unsigned long asce;
  	unsigned long asce_limit;
diff --cc arch/s390/include/asm/mmu_context.h
index f77c638bf397,3ce3854b7a41..c6a088c91aee
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@@ -15,11 -15,12 +15,12 @@@
  static inline int init_new_context(struct task_struct *tsk,
  				   struct mm_struct *mm)
  {
- 	spin_lock_init(&mm->context.list_lock);
+ 	spin_lock_init(&mm->context.pgtable_lock);
  	INIT_LIST_HEAD(&mm->context.pgtable_list);
+ 	spin_lock_init(&mm->context.gmap_lock);
  	INIT_LIST_HEAD(&mm->context.gmap_list);
  	cpumask_clear(&mm->context.cpu_attach_mask);
 -	atomic_set(&mm->context.attach_count, 0);
 +	atomic_set(&mm->context.flush_count, 0);
  	mm->context.flush_mm = 0;
  #ifdef CONFIG_PGSTE
  	mm->context.alloc_pgste = page_table_allocate_pgste;
diff --cc arch/s390/kernel/diag.c
index 48b37b8357e6,a44faf4a0454..a97354c8c667
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@@ -162,6 -162,28 +162,30 @@@ int diag14(unsigned long rx, unsigned l
  }
  EXPORT_SYMBOL(diag14);
  
 -static inline int __diag204(unsigned long subcode, unsigned long size, void *addr)
++static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
+ {
 -	register unsigned long _subcode asm("0") = subcode;
++	register unsigned long _subcode asm("0") = *subcode;
+ 	register unsigned long _size asm("1") = size;
+ 
+ 	asm volatile(
+ 		"	diag	%2,%0,0x204\n"
 -		"0:\n"
++		"0:	nopr	%%r7\n"
+ 		EX_TABLE(0b,0b)
+ 		: "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
 -	if (_subcode)
 -		return -1;
++	*subcode = _subcode;
+ 	return _size;
+ }
+ 
+ int diag204(unsigned long subcode, unsigned long size, void *addr)
+ {
+ 	diag_stat_inc(DIAG_STAT_X204);
 -	return __diag204(subcode, size, addr);
++	size = __diag204(&subcode, size, addr);
++	if (subcode)
++		return -1;
++	return size;
+ }
+ EXPORT_SYMBOL(diag204);
+ 
  /*
   * Diagnose 210: Get information about a virtual device
   */
diff --cc arch/s390/kvm/kvm-s390.c
index 6f5c344cd785,63ac7c1641a7..3f3ae4865d57
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@@ -26,15 -27,18 +27,18 @@@
  #include <linux/slab.h>
  #include <linux/timer.h>
  #include <linux/vmalloc.h>
+ #include <linux/bitmap.h>
  #include <asm/asm-offsets.h>
  #include <asm/lowcore.h>
 -#include <asm/etr.h>
 +#include <asm/stp.h>
  #include <asm/pgtable.h>
  #include <asm/gmap.h>
  #include <asm/nmi.h>
  #include <asm/switch_to.h>
  #include <asm/isc.h>
  #include <asm/sclp.h>
+ #include <asm/cpacf.h>
 -#include <asm/etr.h>
++#include <asm/timex.h>
  #include "kvm-s390.h"
  #include "gaccess.h"
  
@@@ -61,9 -65,9 +65,10 @@@ struct kvm_stats_debugfs_item debugfs_e
  	{ "exit_external_request", VCPU_STAT(exit_external_request) },
  	{ "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
  	{ "exit_instruction", VCPU_STAT(exit_instruction) },
 +	{ "exit_pei", VCPU_STAT(exit_pei) },
  	{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
  	{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+ 	{ "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
  	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
  	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
  	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@@ -188,6 -211,101 +212,103 @@@ void kvm_arch_hardware_unsetup(void
  					 &kvm_clock_notifier);
  }
  
+ static void allow_cpu_feat(unsigned long nr)
+ {
+ 	set_bit_inv(nr, kvm_s390_available_cpu_feat);
+ }
+ 
+ static inline int plo_test_bit(unsigned char nr)
+ {
+ 	register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+ 	int cc = 3; /* subfunction not available */
+ 
+ 	asm volatile(
+ 		/* Parameter registers are ignored for "test bit" */
+ 		"	plo	0,0,0,0(0)\n"
+ 		"	ipm	%0\n"
+ 		"	srl	%0,28\n"
+ 		: "=d" (cc)
+ 		: "d" (r0)
+ 		: "cc");
+ 	return cc == 0;
+ }
+ 
+ static void kvm_s390_cpu_feat_init(void)
+ {
+ 	int i;
+ 
+ 	for (i = 0; i < 256; ++i) {
+ 		if (plo_test_bit(i))
+ 			kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+ 	}
+ 
+ 	if (test_facility(28)) /* TOD-clock steering */
 -		etr_ptff(kvm_s390_available_subfunc.ptff, ETR_PTFF_QAF);
++		ptff(kvm_s390_available_subfunc.ptff,
++		     sizeof(kvm_s390_available_subfunc.ptff),
++		     PTFF_QAF);
+ 
+ 	if (test_facility(17)) { /* MSA */
+ 		__cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+ 		__cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+ 		__cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+ 		__cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+ 		__cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+ 	}
+ 	if (test_facility(76)) /* MSA3 */
+ 		__cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+ 	if (test_facility(77)) { /* MSA4 */
+ 		__cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+ 		__cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+ 		__cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+ 		__cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+ 	}
+ 	if (test_facility(57)) /* MSA5 */
+ 		__cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+ 
+ 	if (MACHINE_HAS_ESOP)
+ 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+ 	/*
+ 	 * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+ 	 * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+ 	 */
+ 	if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+ 	    !test_facility(3) || !nested)
+ 		return;
+ 	allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+ 	if (sclp.has_64bscao)
+ 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+ 	if (sclp.has_siif)
+ 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+ 	if (sclp.has_gpere)
+ 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+ 	if (sclp.has_gsls)
+ 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+ 	if (sclp.has_ib)
+ 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+ 	if (sclp.has_cei)
+ 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+ 	if (sclp.has_ibs)
+ 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+ 	/*
+ 	 * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+ 	 * all skey handling functions read/set the skey from the PGSTE
+ 	 * instead of the real storage key.
+ 	 *
+ 	 * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+ 	 * pages being detected as preserved although they are resident.
+ 	 *
+ 	 * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+ 	 * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+ 	 *
+ 	 * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+ 	 * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+ 	 * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+ 	 *
+ 	 * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+ 	 * cannot easily shadow the SCA because of the ipte lock.
+ 	 */
+ }
+ 
  int kvm_arch_init(void *opaque)
  {
  	kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
diff --cc arch/s390/mm/pgtable.c
index b98d1a152d46,293130b5aee7..5f092015aaa7
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@@ -456,9 -415,92 +459,93 @@@ void ptep_set_notify(struct mm_struct *
  	pgste = pgste_get_lock(ptep);
  	pgste_val(pgste) |= PGSTE_IN_BIT;
  	pgste_set_unlock(ptep, pgste);
 +	preempt_enable();
  }
  
+ /**
+  * ptep_force_prot - change access rights of a locked pte
+  * @mm: pointer to the process mm_struct
+  * @addr: virtual address in the guest address space
+  * @ptep: pointer to the page table entry
+  * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+  * @bit: pgste bit to set (e.g. for notification)
+  *
+  * Returns 0 if the access rights were changed and -EAGAIN if the current
+  * and requested access rights are incompatible.
+  */
+ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+ 		    pte_t *ptep, int prot, unsigned long bit)
+ {
+ 	pte_t entry;
+ 	pgste_t pgste;
+ 	int pte_i, pte_p;
+ 
+ 	pgste = pgste_get_lock(ptep);
+ 	entry = *ptep;
+ 	/* Check pte entry after all locks have been acquired */
+ 	pte_i = pte_val(entry) & _PAGE_INVALID;
+ 	pte_p = pte_val(entry) & _PAGE_PROTECT;
+ 	if ((pte_i && (prot != PROT_NONE)) ||
+ 	    (pte_p && (prot & PROT_WRITE))) {
+ 		pgste_set_unlock(ptep, pgste);
+ 		return -EAGAIN;
+ 	}
+ 	/* Change access rights and set pgste bit */
+ 	if (prot == PROT_NONE && !pte_i) {
+ 		ptep_flush_direct(mm, addr, ptep);
+ 		pgste = pgste_update_all(entry, pgste, mm);
+ 		pte_val(entry) |= _PAGE_INVALID;
+ 	}
+ 	if (prot == PROT_READ && !pte_p) {
+ 		ptep_flush_direct(mm, addr, ptep);
+ 		pte_val(entry) &= ~_PAGE_INVALID;
+ 		pte_val(entry) |= _PAGE_PROTECT;
+ 	}
+ 	pgste_val(pgste) |= bit;
+ 	pgste = pgste_set_pte(ptep, pgste, entry);
+ 	pgste_set_unlock(ptep, pgste);
+ 	return 0;
+ }
+ 
+ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ 		    pte_t *sptep, pte_t *tptep, pte_t pte)
+ {
+ 	pgste_t spgste, tpgste;
+ 	pte_t spte, tpte;
+ 	int rc = -EAGAIN;
+ 
+ 	if (!(pte_val(*tptep) & _PAGE_INVALID))
+ 		return 0;	/* already shadowed */
+ 	spgste = pgste_get_lock(sptep);
+ 	spte = *sptep;
+ 	if (!(pte_val(spte) & _PAGE_INVALID) &&
+ 	    !((pte_val(spte) & _PAGE_PROTECT) &&
+ 	      !(pte_val(pte) & _PAGE_PROTECT))) {
+ 		pgste_val(spgste) |= PGSTE_VSIE_BIT;
+ 		tpgste = pgste_get_lock(tptep);
+ 		pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+ 				(pte_val(pte) & _PAGE_PROTECT);
+ 		/* don't touch the storage key - it belongs to parent pgste */
+ 		tpgste = pgste_set_pte(tptep, tpgste, tpte);
+ 		pgste_set_unlock(tptep, tpgste);
+ 		rc = 1;
+ 	}
+ 	pgste_set_unlock(sptep, spgste);
+ 	return rc;
+ }
+ 
+ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+ {
+ 	pgste_t pgste;
+ 
+ 	pgste = pgste_get_lock(ptep);
+ 	/* notifier is called by the caller */
+ 	ptep_flush_direct(mm, saddr, ptep);
+ 	/* don't touch the storage key - it belongs to parent pgste */
+ 	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+ 	pgste_set_unlock(ptep, pgste);
+ }
+ 
  static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
  {
  	if (!non_swap_entry(entry))
diff --cc arch/x86/kvm/iommu.c
index 95e0e6481f07,4f2010c5feba..b181426f67b4
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@@ -25,12 -25,10 +25,10 @@@
  
  #include <linux/list.h>
  #include <linux/kvm_host.h>
 -#include <linux/module.h>
 +#include <linux/moduleparam.h>
  #include <linux/pci.h>
  #include <linux/stat.h>
- #include <linux/dmar.h>
  #include <linux/iommu.h>
- #include <linux/intel-iommu.h>
  #include "assigned-dev.h"
  
  static bool allow_unsafe_assigned_interrupts;
diff --cc arch/x86/kvm/lapic.c
index 57549ed47ca5,6895fd28aae9..730cf174090a
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@@ -1310,10 -1313,111 +1313,112 @@@ void wait_lapic_expire(struct kvm_vcpu 
  
  	/* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
  	if (guest_tsc < tsc_deadline)
 -		__delay(tsc_deadline - guest_tsc);
 +		__delay(min(tsc_deadline - guest_tsc,
 +			nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
  }
  
+ static void start_sw_tscdeadline(struct kvm_lapic *apic)
+ {	/* Arm the hrtimer for lapic_timer.tscdeadline, or call apic_timer_expired() if the deadline is not ahead of the guest TSC */
+ 	u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ 	u64 ns = 0;
+ 	ktime_t expire;
+ 	struct kvm_vcpu *vcpu = apic->vcpu;
+ 	unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ 	unsigned long flags;
+ 	ktime_t now;
+ 
+ 	if (unlikely(!tscdeadline || !this_tsc_khz))	/* nothing to arm; also keeps do_div() below from dividing by zero */
+ 		return;
+ 
+ 	local_irq_save(flags);	/* NOTE(review): presumably keeps the clock read and the TSC read below together - confirm */
+ 
+ 	now = apic->lapic_timer.timer.base->get_time();
+ 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ 	if (likely(tscdeadline > guest_tsc)) {
+ 		ns = (tscdeadline - guest_tsc) * 1000000ULL;	/* cycles * 10^6 / kHz = nanoseconds */
+ 		do_div(ns, this_tsc_khz);
+ 		expire = ktime_add_ns(now, ns);
+ 		expire = ktime_sub_ns(expire, lapic_timer_advance_ns);	/* expire lapic_timer_advance_ns earlier */
+ 		hrtimer_start(&apic->lapic_timer.timer,
+ 				expire, HRTIMER_MODE_ABS_PINNED);
+ 	} else
+ 		apic_timer_expired(apic);	/* deadline is not in the future */
+ 
+ 	local_irq_restore(flags);
+ }
+ 
+ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+ {	/* Report the hv_timer_in_use flag of this vcpu's lapic_timer */
+ 	return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+ 
+ static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+ {	/* Call kvm_x86_ops->cancel_hv_timer() for the vcpu, then clear hv_timer_in_use */
+ 	kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+ 	apic->lapic_timer.hv_timer_in_use = false;
+ }
+ 
+ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+ {	/* Cancel the hv timer bookkeeping, then call apic_timer_expired() */
+ 	struct kvm_lapic *apic = vcpu->arch.apic;
+ 
+ 	WARN_ON(!apic->lapic_timer.hv_timer_in_use);	/* warns if the hv timer was not marked in use */
+ 	WARN_ON(swait_active(&vcpu->wq));	/* NOTE(review): presumably the vcpu must not be blocked here - confirm */
+ 	cancel_hv_tscdeadline(apic);
+ 	apic_timer_expired(apic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+ 
+ static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+ {	/* Try to arm the hv timer for the current tscdeadline; returns the resulting hv_timer_in_use */
+ 	u64 tscdeadline = apic->lapic_timer.tscdeadline;
+ 
+ 	if (atomic_read(&apic->lapic_timer.pending) ||	/* an expiry is already pending, or ... */
+ 		kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {	/* ... set_hv_timer() returned nonzero */
+ 		if (apic->lapic_timer.hv_timer_in_use)
+ 			cancel_hv_tscdeadline(apic);
+ 	} else {
+ 		apic->lapic_timer.hv_timer_in_use = true;
+ 		hrtimer_cancel(&apic->lapic_timer.timer);	/* stop the hrtimer once the hv timer is armed */
+ 
+ 		/* In case the sw timer triggered in the window */
+ 		if (atomic_read(&apic->lapic_timer.pending))
+ 			cancel_hv_tscdeadline(apic);
+ 	}
+ 	trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+ 			apic->lapic_timer.hv_timer_in_use);
+ 	return apic->lapic_timer.hv_timer_in_use;
+ }
+ 
+ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+ {	/* Start the hv timer, but only when apic_lvtt_tscdeadline(apic) is true */
+ 	struct kvm_lapic *apic = vcpu->arch.apic;
+ 
+ 	WARN_ON(apic->lapic_timer.hv_timer_in_use);	/* warns if the hv timer is already marked in use */
+ 
+ 	if (apic_lvtt_tscdeadline(apic))
+ 		start_hv_tscdeadline(apic);	/* return value is not checked here */
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+ 
+ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+ {	/* Cancel an in-use hv timer and re-arm through start_sw_tscdeadline() unless an expiry is pending */
+ 	struct kvm_lapic *apic = vcpu->arch.apic;
+ 
+ 	/* Possibly the TSC deadline timer is not enabled yet */
+ 	if (!apic->lapic_timer.hv_timer_in_use)
+ 		return;
+ 
+ 	cancel_hv_tscdeadline(apic);
+ 
+ 	if (atomic_read(&apic->lapic_timer.pending))	/* expiry already pending: nothing to re-arm */
+ 		return;
+ 
+ 	start_sw_tscdeadline(apic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+ 
  static void start_apic_timer(struct kvm_lapic *apic)
  {
  	ktime_t now;
diff --cc arch/x86/kvm/vmx.c
index df07a0a4611f,b2f559159f3a..bc354f003ce1
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@@ -7560,6 -7708,19 +7711,12 @@@ static int handle_pml_full(struct kvm_v
  	return 1;
  }
  
 -static int handle_pcommit(struct kvm_vcpu *vcpu)
 -{
 -	/* we never catch pcommit instruct for L1 guest. */
 -	WARN_ON(1);
 -	return 1;
 -}
 -
+ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+ {	/* Exit handler registered for EXIT_REASON_PREEMPTION_TIMER */
+ 	kvm_lapic_expired_hv_timer(vcpu);
+ 	return 1;	/* 1 = exit handled fully, guest execution may resume */
+ }
+ 
  /*
   * The exit handlers return 1 if the exit was handled fully and guest execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@@ -7610,6 -7771,8 +7767,7 @@@ static int (*const kvm_vmx_exit_handler
  	[EXIT_REASON_XSAVES]                  = handle_xsaves,
  	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
  	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 -	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+ 	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
  };
  
  static const int kvm_vmx_max_exit_handlers =
@@@ -7918,6 -8081,10 +8076,8 @@@ static bool nested_vmx_exit_handled(str
  		 * the XSS exit bitmap in vmcs12.
  		 */
  		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 -	case EXIT_REASON_PCOMMIT:
 -		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
+ 	case EXIT_REASON_PREEMPTION_TIMER:
+ 		return false;
  	default:
  		return true;
  	}
@@@ -8940,6 -9115,20 +9120,8 @@@ static struct kvm_vcpu *vmx_create_vcpu
  	vmx->nested.current_vmptr = -1ull;
  	vmx->nested.current_vmcs12 = NULL;
  
 -	/*
 -	 * If PML is turned on, failure on enabling PML just results in failure
 -	 * of creating the vcpu, therefore we can simplify PML logic (by
 -	 * avoiding dealing with cases, such as enabling PML partially on vcpus
 -	 * for the guest, etc.
 -	 */
 -	if (enable_pml) {
 -		err = vmx_create_pml_buffer(vmx);
 -		if (err)
 -			goto free_vmcs;
 -	}
 -
+ 	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+ 
  	return &vmx->vcpu;
  
  free_vmcs:
@@@ -9080,6 -9267,22 +9262,13 @@@ static void vmx_cpuid_update(struct kvm
  
  	if (cpu_has_secondary_exec_ctrls())
  		vmcs_set_secondary_exec_control(secondary_exec_ctl);
+ 
 -	if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
 -		if (guest_cpuid_has_pcommit(vcpu))
 -			vmx->nested.nested_vmx_secondary_ctls_high |=
 -				SECONDARY_EXEC_PCOMMIT;
 -		else
 -			vmx->nested.nested_vmx_secondary_ctls_high &=
 -				~SECONDARY_EXEC_PCOMMIT;
 -	}
 -
+ 	if (nested_vmx_allowed(vcpu))
+ 		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ 			FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ 	else
+ 		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ 			~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
  }
  
  static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
diff --cc arch/x86/kvm/x86.c
index 9c496c7e8c00,a27b33033700..19f9f9e05c2a
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -66,12 -68,10 +66,13 @@@
  #include <asm/div64.h>
  #include <asm/irq_remapping.h>
  
 +#define CREATE_TRACE_POINTS
 +#include "trace.h"
 +
  #define MAX_IO_MSRS 256
  #define KVM_MAX_MCE_BANKS 32
- #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+ EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
  
  #define emul_to_vcpu(ctxt) \
  	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
diff --cc include/linux/irqchip/arm-gic-v3.h
index 107eed475b94,700b4216c87a..56b0b7ec66aa
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@@ -229,7 -300,7 +300,8 @@@
  #define GITS_BASER_PAGE_SIZE_64K	(2UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGE_SIZE_MASK	(3UL << GITS_BASER_PAGE_SIZE_SHIFT)
  #define GITS_BASER_PAGES_MAX		256
 +#define GITS_BASER_PAGES_SHIFT		(0)
+ #define GITS_BASER_NR_PAGES(r)		(((r) & 0xff) + 1)
  
  #define GITS_BASER_TYPE_NONE		0
  #define GITS_BASER_TYPE_DEVICE		1
diff --cc virt/kvm/kvm_main.c
index 2e791367c576,61b31a5f76c8..cc081ccfcaa3
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@@ -3487,6 -3545,34 +3543,30 @@@ int kvm_io_bus_unregister_dev(struct kv
  	return r;
  }
  
+ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ 					 gpa_t addr)
+ {	/* Look up the device on bus bus_idx at addr; returns NULL when kvm_io_bus_get_first_dev() finds none */
+ 	struct kvm_io_bus *bus;
+ 	int dev_idx, srcu_idx;
+ 	struct kvm_io_device *iodev = NULL;
+ 
+ 	srcu_idx = srcu_read_lock(&kvm->srcu);	/* kvm->buses[] is dereferenced under kvm->srcu */
+ 
+ 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+ 
+ 	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);	/* third argument is 1 - presumably the access length, confirm */
+ 	if (dev_idx < 0)
+ 		goto out_unlock;
+ 
+ 	iodev = bus->range[dev_idx].dev;
+ 
+ out_unlock:
+ 	srcu_read_unlock(&kvm->srcu, srcu_idx);	/* NOTE(review): iodev is returned after the unlock; its lifetime is not established here */
+ 
+ 	return iodev;
+ }
+ EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+ 
 -static struct notifier_block kvm_cpu_notifier = {
 -	.notifier_call = kvm_cpu_hotplug,
 -};
 -
  static int kvm_debugfs_open(struct inode *inode, struct file *file,
  			   int (*get)(void *, u64 *), int (*set)(void *, u64),
  			   const char *fmt)