Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[cascardo/linux.git] arch/x86/kvm/x86.c
index b7e5794..ca3d760 100644
@@ -87,6 +87,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static void process_nmi(struct kvm_vcpu *vcpu);
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -211,6 +212,7 @@ static void shared_msr_update(unsigned slot, u32 msr)
 
 void kvm_define_shared_msr(unsigned slot, u32 msr)
 {
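+       /* slot indexes the fixed shared_msrs_global.msrs[] array; catch overflow early */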
+       BUG_ON(slot >= KVM_NR_SHARED_MSRS);
        if (slot >= shared_msrs_global.nr)
                shared_msrs_global.nr = slot + 1;
        shared_msrs_global.msrs[slot] = msr;
@@ -310,6 +312,31 @@ static int exception_class(int vector)
        return EXCPT_BENIGN;
 }
 
+#define EXCPT_FAULT            0
+#define EXCPT_TRAP             1
+#define EXCPT_ABORT            2
+#define EXCPT_INTERRUPT        3
+
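+/*
+ * Classify a vector as fault, trap, or abort, following the event
+ * classification in the Intel SDM; inject_pending_event() uses this to
+ * decide whether RF should be set when delivering the exception.
+ */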
+static int exception_type(int vector)
+{
+       unsigned int mask;
+
+       if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
+               return EXCPT_INTERRUPT;
+
+       mask = 1 << vector;
+
+       /* #DB is a trap, as instruction watchpoints are handled elsewhere */
+       if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+               return EXCPT_TRAP;
+
+       if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
+               return EXCPT_ABORT;
+
+       /* Reserved exceptions will result in a fault */
+       return EXCPT_FAULT;
+}
+
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
                unsigned nr, bool has_error, u32 error_code,
                bool reinject)
@@ -758,6 +785,15 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
 }
 
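+/*
+ * Per the Intel SDM, DR6 bit 16 reads as 1 on processors without TSX;
+ * on RTM-capable processors it is cleared to report a #DB inside a
+ * transactional region, so treat it as fixed-1 only when RTM is absent.
+ */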
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+       u64 fixed = DR6_FIXED_1;
+
+       if (!guest_cpuid_has_rtm(vcpu))
+               fixed |= DR6_RTM;
+       return fixed;
+}
+
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
        switch (dr) {
@@ -773,7 +809,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
        case 6:
                if (val & 0xffffffff00000000ULL)
                        return -1; /* #GP */
-               vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+               vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
                kvm_update_dr6(vcpu);
                break;
        case 5:
@@ -1204,6 +1240,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        unsigned long flags;
        s64 usdiff;
        bool matched;
+       bool already_matched;
        u64 data = msr->data;
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -1268,6 +1305,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
                        pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
                }
                matched = true;
+               already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
        } else {
                /*
                 * We split periods of matched TSC writes into generations.
@@ -1283,7 +1321,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
                kvm->arch.cur_tsc_write = data;
                kvm->arch.cur_tsc_offset = offset;
                matched = false;
-               pr_debug("kvm: new tsc generation %u, clock %llu\n",
+               pr_debug("kvm: new tsc generation %llu, clock %llu\n",
                         kvm->arch.cur_tsc_generation, data);
        }
 
@@ -1308,10 +1346,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
        spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
-       if (matched)
-               kvm->arch.nr_vcpus_matched_tsc++;
-       else
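+       /*
+        * Bump nr_vcpus_matched_tsc only the first time this vCPU matches
+        * the current generation; repeated matched writes from the same
+        * vCPU must not inflate the count.
+        */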
+       if (!matched) {
                kvm->arch.nr_vcpus_matched_tsc = 0;
+       } else if (!already_matched) {
+               kvm->arch.nr_vcpus_matched_tsc++;
+       }
 
        kvm_track_tsc_matching(vcpu);
        spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -2012,6 +2051,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                data &= ~(u64)0x40;     /* ignore flush filter disable */
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
                data &= ~(u64)0x8;      /* ignore TLB cache disable */
+               data &= ~(u64)0x40000;  /* ignore McStatusWrEn (MC status write enable) */
                if (data != 0) {
                        vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
                                    data);
@@ -2954,9 +2994,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
        events->interrupt.nr = vcpu->arch.interrupt.nr;
        events->interrupt.soft = 0;
-       events->interrupt.shadow =
-               kvm_x86_ops->get_interrupt_shadow(vcpu,
-                       KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
+       events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
 
        events->nmi.injected = vcpu->arch.nmi_injected;
        events->nmi.pending = vcpu->arch.nmi_pending != 0;
@@ -4062,7 +4100,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 
                if (gpa == UNMAPPED_GVA)
                        return X86EMUL_PROPAGATE_FAULT;
-               ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
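+               /*
+                * toread is already clamped to the current page and gpa is
+                * only valid within it, so a single-page read is both
+                * sufficient and correct here.
+                */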
+               ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
+                                         offset, toread);
                if (ret < 0) {
                        r = X86EMUL_IO_NEEDED;
                        goto out;
@@ -4083,10 +4122,24 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       unsigned offset;
+       int ret;
 
-       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
-                                         access | PFERR_FETCH_MASK,
-                                         exception);
+       /* Inline kvm_read_guest_virt_helper for speed.  */
+       gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
+                                                   exception);
+       if (unlikely(gpa == UNMAPPED_GVA))
+               return X86EMUL_PROPAGATE_FAULT;
+
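+       /* fetches are assumed not to cross a page boundary; clamp if one would */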
+       offset = addr & (PAGE_SIZE-1);
+       if (WARN_ON(offset + bytes > PAGE_SIZE))
+               bytes = (unsigned)PAGE_SIZE - offset;
+       ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
+                                 offset, bytes);
+       if (unlikely(ret < 0))
+               return X86EMUL_IO_NEEDED;
+
+       return X86EMUL_CONTINUE;
 }
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
@@ -4710,7 +4763,6 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
        if (desc->g)
                var.limit = (var.limit << 12) | 0xfff;
        var.type = desc->type;
-       var.present = desc->p;
        var.dpl = desc->dpl;
        var.db = desc->d;
        var.s = desc->s;
@@ -4742,6 +4794,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
        return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
 }
 
+static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
+                             u32 pmc)
+{
+       return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc);
+}
+
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
                             u32 pmc, u64 *pdata)
 {
@@ -4818,6 +4876,7 @@ static const struct x86_emulate_ops emulate_ops = {
        .set_dr              = emulator_set_dr,
        .set_msr             = emulator_set_msr,
        .get_msr             = emulator_get_msr,
+       .check_pmc           = emulator_check_pmc,
        .read_pmc            = emulator_read_pmc,
        .halt                = emulator_halt,
        .wbinvd              = emulator_wbinvd,
@@ -4830,7 +4889,7 @@ static const struct x86_emulate_ops emulate_ops = {
 
 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 {
-       u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+       u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
        /*
         * an sti; sti; sequence only disables interrupts for the first
         * instruction. So, if the last instruction, be it emulated or
@@ -4838,8 +4897,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
         * means that the last instruction is an sti. We should not
         * leave the flag on in this case. The same goes for mov ss
         */
-       if (!(int_shadow & mask))
+       if (int_shadow & mask)
+               mask = 0;
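+       /* only call into vendor code when the shadow state actually changes */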
+       if (unlikely(int_shadow || mask)) {
                kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+               if (!mask)
+                       kvm_make_request(KVM_REQ_EVENT, vcpu);
+       }
 }
 
 static void inject_emulated_exception(struct kvm_vcpu *vcpu)
@@ -4854,19 +4918,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
                kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
-static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
-{
-       memset(&ctxt->opcode_len, 0,
-              (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
-
-       ctxt->fetch.start = 0;
-       ctxt->fetch.end = 0;
-       ctxt->io_read.pos = 0;
-       ctxt->io_read.end = 0;
-       ctxt->mem_read.pos = 0;
-       ctxt->mem_read.end = 0;
-}
-
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 {
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -5065,23 +5116,22 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
        return dr6;
 }
 
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
 {
        struct kvm_run *kvm_run = vcpu->run;
 
        /*
-        * Use the "raw" value to see if TF was passed to the processor.
-        * Note that the new value of the flags has not been saved yet.
+        * rflags is the old, "raw" value of the flags.  The new value has
+        * not been saved yet.
         *
         * This is correct even for TF set by the guest, because "the
         * processor will not generate this exception after the instruction
         * that sets the TF flag".
         */
-       unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
-
        if (unlikely(rflags & X86_EFLAGS_TF)) {
                if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-                       kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+                       kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
+                                                 DR6_RTM;
                        kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
                        kvm_run->debug.arch.exception = DB_VECTOR;
                        kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -5094,7 +5144,7 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
                         * cleared by the processor".
                         */
                        vcpu->arch.dr6 &= ~15;
-                       vcpu->arch.dr6 |= DR6_BS;
+                       vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
                        kvm_queue_exception(vcpu, DB_VECTOR);
                }
        }
@@ -5113,7 +5163,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
                                           vcpu->arch.eff_db);
 
                if (dr6 != 0) {
-                       kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+                       kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
                        kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
                                get_segment_base(vcpu, VCPU_SREG_CS);
 
@@ -5124,14 +5174,15 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
                }
        }
 
-       if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+       if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+           !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
                dr6 = kvm_vcpu_check_hw_bp(eip, 0,
                                           vcpu->arch.dr7,
                                           vcpu->arch.db);
 
                if (dr6 != 0) {
                        vcpu->arch.dr6 &= ~15;
-                       vcpu->arch.dr6 |= dr6;
+                       vcpu->arch.dr6 |= dr6 | DR6_RTM;
                        kvm_queue_exception(vcpu, DB_VECTOR);
                        *r = EMULATE_DONE;
                        return true;
@@ -5195,6 +5246,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 
        if (emulation_type & EMULTYPE_SKIP) {
                kvm_rip_write(vcpu, ctxt->_eip);
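+               /*
+                * RF does not survive a completed instruction, so clear it
+                * here just as hardware would after executing the skipped
+                * instruction.
+                */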
+               if (ctxt->eflags & X86_EFLAGS_RF)
+                       kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
                return EMULATE_DONE;
        }
 
@@ -5245,13 +5298,22 @@ restart:
                r = EMULATE_DONE;
 
        if (writeback) {
+               unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);
-               kvm_make_request(KVM_REQ_EVENT, vcpu);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
                kvm_rip_write(vcpu, ctxt->eip);
                if (r == EMULATE_DONE)
-                       kvm_vcpu_check_singlestep(vcpu, &r);
-               kvm_set_rflags(vcpu, ctxt->eflags);
+                       kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+               __kvm_set_rflags(vcpu, ctxt->eflags);
+
+               /*
+                * For STI, interrupts are shadowed, so KVM_REQ_EVENT will
+                * do nothing, and it will be requested again as soon as
+                * the shadow expires.  But we still need to check here,
+                * because POPF has no interrupt shadow.
+                */
+               if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+                       kvm_make_request(KVM_REQ_EVENT, vcpu);
        } else
                vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
 
@@ -5642,7 +5704,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
        u64 param, ingpa, outgpa, ret;
        uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
        bool fast, longmode;
-       int cs_db, cs_l;
 
        /*
         * hypercall generates a #UD from non-zero CPL or real mode
@@ -5653,8 +5714,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-       longmode = is_long_mode(vcpu) && cs_l == 1;
+       longmode = is_64_bit_mode(vcpu);
 
        if (!longmode) {
                param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
@@ -5719,7 +5779,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
        unsigned long nr, a0, a1, a2, a3, ret;
-       int r = 1;
+       int op_64_bit, r = 1;
 
        if (kvm_hv_hypercall_enabled(vcpu->kvm))
                return kvm_hv_hypercall(vcpu);
@@ -5732,7 +5792,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 
        trace_kvm_hypercall(nr, a0, a1, a2, a3);
 
-       if (!is_long_mode(vcpu)) {
+       op_64_bit = is_64_bit_mode(vcpu);
+       if (!op_64_bit) {
                nr &= 0xFFFFFFFF;
                a0 &= 0xFFFFFFFF;
                a1 &= 0xFFFFFFFF;
@@ -5758,6 +5819,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                break;
        }
 out:
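+       /* outside 64-bit mode the guest sees only EAX, so truncate the result */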
+       if (!op_64_bit)
+               ret = (u32)ret;
        kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
        ++vcpu->stat.hypercalls;
        return r;
@@ -5836,6 +5899,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
                trace_kvm_inj_exception(vcpu->arch.exception.nr,
                                        vcpu->arch.exception.has_error_code,
                                        vcpu->arch.exception.error_code);
+
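+               /*
+                * Per the SDM, delivering a fault sets RF in the saved RFLAGS
+                * image so that an instruction breakpoint does not retrigger
+                * when the faulting instruction restarts.
+                */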
+               if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+                       __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+                                            X86_EFLAGS_RF);
+
                kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
                                          vcpu->arch.exception.has_error_code,
                                          vcpu->arch.exception.error_code,
@@ -5867,6 +5935,18 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
                        kvm_x86_ops->set_nmi(vcpu);
                }
        } else if (kvm_cpu_has_injectable_intr(vcpu)) {
+               /*
+                * Because interrupts can be injected asynchronously, we are
+                * calling check_nested_events again here to avoid a race condition.
+                * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+                * proposal and current concerns.  Perhaps we should be setting
+                * KVM_REQ_EVENT only on certain events and not unconditionally?
+                */
+               if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+                       r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+                       if (r != 0)
+                               return r;
+               }
                if (kvm_x86_ops->interrupt_allowed(vcpu)) {
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
                                            false);
@@ -6815,9 +6895,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
        vcpu->arch.nmi_injected = false;
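+       /* a reset must not leave previously injected events pending */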
+       kvm_clear_interrupt_queue(vcpu);
+       kvm_clear_exception_queue(vcpu);
 
        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
-       vcpu->arch.dr6 = DR6_FIXED_1;
+       vcpu->arch.dr6 = DR6_INIT;
        kvm_update_dr6(vcpu);
        vcpu->arch.dr7 = DR7_FIXED_1;
        kvm_update_dr7(vcpu);
@@ -7373,12 +7455,17 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_rflags);
 
-void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
            kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
                rflags |= X86_EFLAGS_TF;
        kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
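+/*
+ * The exported variant also raises KVM_REQ_EVENT: changing IF or TF may
+ * unblock pending events.  Emulator paths call __kvm_set_rflags() and
+ * request the event themselves only when it is actually needed.
+ */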
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+       __kvm_set_rflags(vcpu, rflags);
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);