本文共 16148 字,大约阅读时间需要 53 分钟。
上一篇讲到了虚拟cpu的构建过程,接下来就要利用vmm接管vcpu的中断,异常等行为。
struct kvm_vcpu *vcpu_k;kvm_run->exit_reason/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */struct kvm_run { /* in */ __u8 request_interrupt_window; __u8 padding1[7]; /* out */ __u32 exit_reason;.......#define KVM_EXIT_UNKNOWN 0#define KVM_EXIT_EXCEPTION 1#define KVM_EXIT_IO 2#define KVM_EXIT_HYPERCALL 3#define KVM_EXIT_DEBUG 4#define KVM_EXIT_HLT 5#define KVM_EXIT_MMIO 6#define KVM_EXIT_IRQ_WINDOW_OPEN 7#define KVM_EXIT_SHUTDOWN 8#define KVM_EXIT_FAIL_ENTRY 9#define KVM_EXIT_INTR 10#define KVM_EXIT_SET_TPR 11#define KVM_EXIT_TPR_ACCESS 12
vcpu_init kvm_vm_create_vcpu vmx_create_vcpu kvm_vcpu_init_vtvoid vcpu_init(struct vm *vm, struct vcpu *vcpu){ int vcpu_mmap_size; long r = 0; if(vm->kvm) r= kvm_vm_create_vcpu(vm->kvm, 0); vcpu->vcpu_k = vm->kvm->vcpus[0]; vcpu->kvm_run = vcpu->vcpu_k->run;
kvm_run 记录了vcpu 运行的各种状态,并且在create_vcpu的时候进行了初始化,分配了page。
/*
 * run_vm - drive the vcpu's run loop until it halts or exits.
 *
 * Repeatedly issues KVM_RUN on the vcpu; on each VM-exit inspects
 * kvm_run->exit_reason (filled in by KVM in the shared kvm_run page).
 * Returns -1 on ioctl failure, 0 on an unexpected exit reason,
 * 1 when the guest executed HLT and its registers could be fetched.
 * The `sz` parameter is currently unused (kept for interface stability).
 */
int run_vm(struct vm *vm, struct vcpu *vcpu, size_t sz)
{
	struct kvm_vcpu *vcpu_k;
	struct kvm_regs regs;

	vcpu_k = vcpu->vcpu_k;
	for (;;) {
		if (kvm_vcpu_ioctl_s(vcpu_k, (unsigned int)KVM_RUN, 0) < 0) {
			printk("KVM_RUN error \n");
			return -1;
		}
		/* BUG FIX: original printed the kvm_run POINTER with %lx while
		 * labelling it "exit_reason"; print the u32 exit_reason field. */
		printk("exit_reason %u\n", vcpu->kvm_run->exit_reason);
		switch (vcpu->kvm_run->exit_reason) {
		case KVM_EXIT_HLT:
			/* Guest executed HLT: success path, verify registers. */
			goto check;
		case KVM_EXIT_IO:
			/* Port I/O handling not implemented yet. */
			/* fall through */
		default:
			return 0;
		}
	}
check:
	/* BUG FIX: "®s" was a mojibake of "&regs" (the '&' was mangled into
	 * the registered-trademark sign by the article encoding). */
	if (kvm_vcpu_ioctl_s(vcpu_k, (unsigned int)KVM_GET_REGS,
			     (unsigned long)&regs) < 0) {
		printk("KVM_GET_REGS error \n");
		return 0;
	}
	printk("excute success \n");
	return 1;
}
由于虚拟机和VMM共用同一份代码,vcpu运行的逻辑需要区别于VMM。我们需要标记vcpu的状态:处于vcpu状态时不运行VMM的vm_run逻辑,而根模式下则会进行中断检测处理。
/*
 * kvm_emulate_halt - emulate the guest's HLT instruction.
 *
 * With an in-kernel irqchip the vcpu is marked HALTED and blocked in
 * kvm_vcpu_block() until it becomes runnable again; returns -EINTR if
 * it woke up for another reason, 1 on normal resume. Without an
 * in-kernel irqchip the exit is forwarded to userspace by setting
 * exit_reason = KVM_EXIT_HLT and returning 0.
 */
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.halt_exits;
	if (irqchip_in_kernel(vcpu->kvm)) {
		vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
		kvm_vcpu_block(vcpu);
		if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
			return -EINTR;
		return 1;
	} else {
		/* let userspace (the VMM) see the HLT */
		vcpu->run->exit_reason = KVM_EXIT_HLT;
		return 0;
	}
}

/* VMX path: HLT VM-exit handler — skip past the HLT, then emulate it. */
static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);
	return kvm_emulate_halt(vcpu);
}

/* NOTE(review): stray fragment from the article, kept verbatim — it notes
 * that handle_exception also ends up calling kvm_emulate_halt(vcpu). */
handle_exception kvm_emulate_halt(vcpu);

/* SVM path: HLT intercept — HLT is one byte, so next_rip = rip + 1. */
static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	svm->next_rip = svm->vmcb->save.rip + 1;
	skip_emulated_instruction(&svm->vcpu);
	return kvm_emulate_halt(&svm->vcpu);
}
__vcpu_run kvm_x86_ops_vt->prepare_guest_switch(vcpu); kvm_x86_ops_vt->run(vcpu, kvm_run);
但是,中断这里还没反应。
首先我们的代码会运行在两种状态下:根模式状态,非根模式状态。
/*
 * vmx_set_cr0 - propagate a guest CR0 write into the VMCS.
 *
 * Handles the mode transitions implied by PE/PG bit changes
 * (real <-> protected mode emulation, and on x86_64 entering/leaving
 * long mode when EFER.LME is set), then writes both the CR0 read
 * shadow and the effective GUEST_CR0 (with the always-on bits forced).
 * The FPU is deactivated across the update and reactivated when the
 * new CR0 allows direct FPU use (TS clear or PE clear).
 */
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	vmx_fpu_deactivate(vcpu);
	/* leaving emulated real mode when the guest sets CR0.PE */
	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
		enter_pmode(vcpu);
	/* entering emulated real mode when the guest clears CR0.PE */
	if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
		enter_rmode(vcpu);
#ifdef CONFIG_X86_64
	if (vcpu->arch.shadow_efer & EFER_LME) {
		/* paging toggled with LME set => long-mode entry/exit */
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
			enter_lmode(vcpu);
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
			exit_lmode(vcpu);
	}
#endif
	/* shadow holds what the guest thinks CR0 is */
	vmcs_writel(CR0_READ_SHADOW, cr0);
	/* effective CR0: guest bits minus the host-owned mask, plus forced bits */
	vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
	vcpu->arch.cr0 = cr0;
	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
		vmx_fpu_activate(vcpu);
}
/*
 * hardware_enable - turn on VMX root operation on the current cpu.
 *
 * Runs per-cpu. If the BIOS has not already locked the feature-control
 * MSR with VMXON enabled, set and lock it ourselves. Then set CR4.VMXE
 * and execute VMXON with the physical address of this cpu's VMXON
 * region (per_cpu vmxarea). After this the cpu is in VMX root mode.
 */
static void hardware_enable(void *garbage)
{
	int cpu = raw_smp_processor_id();
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
	u64 old;

	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
	/* both LOCKED and VMXON_ENABLED must be set; if not, set them now */
	if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
		    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) !=
	    (MSR_IA32_FEATURE_CONTROL_LOCKED |
	     MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
		/* enable and lock */
		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
		       MSR_IA32_FEATURE_CONTROL_LOCKED |
		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
	/* VMXON takes the physical address of the VMXON region in memory (m),
	 * with rax pointing at it */
	asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
		      : "memory", "cc");
}
VMXON开启根模式
kvm 利用vmx_vcpu_run启动guest虚拟机,也就是进入非根模式。
/*
 * vmx_vcpu_run - the world switch: enter the guest and return on VM-exit.
 *
 * The inline asm saves the host's callee-context registers, writes the
 * current stack pointer into the VMCS HOST_RSP field
 * (ASM_VMX_VMWRITE_RSP_RDX, with %rdx = HOST_RSP), loads every guest
 * general-purpose register and CR2 from vcpu_vmx's register array, then
 * executes VMLAUNCH on the first entry or VMRESUME afterwards (selected
 * by the `launched` flag tested with cmpl). Control returns at
 * .Lkvm_vmx_return after a VM-exit, where the guest registers and CR2
 * are saved back and the host registers restored; `setbe` records a
 * failed VMLAUNCH/VMRESUME in the `fail` field (CF/ZF set per SDM).
 * Afterwards the exit-time IDT vectoring info and interrupt-window
 * state are read from the VMCS, %ds/%es are reloaded (clobbered by the
 * hardware on exit), and a pending NMI (intr type 0x200 in
 * VM_EXIT_INTR_INFO) is re-injected into the host with "int $2" before
 * interrupts are re-enabled.
 *
 * NOTE(review): statement order and register constraints here are
 * load-bearing; left byte-identical, comments only.
 */
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run){ struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; /* * Loading guest fpu may have cleared host cr0.ts */ vmcs_writel(HOST_CR0, read_cr0()); asm( /* Store host registers */#ifdef CONFIG_X86_64 "push %%rdx; push %%rbp;" "push %%rcx \n\t"#else "push %%edx; push %%ebp;" "push %%ecx \n\t"#endif ASM_VMX_VMWRITE_RSP_RDX "\n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */#ifdef CONFIG_X86_64 "mov %c[cr2](%0), %%rax \n\t" "mov %%rax, %%cr2 \n\t" "mov %c[rax](%0), %%rax \n\t" "mov %c[rbx](%0), %%rbx \n\t" "mov %c[rdx](%0), %%rdx \n\t" "mov %c[rsi](%0), %%rsi \n\t" "mov %c[rdi](%0), %%rdi \n\t" "mov %c[rbp](%0), %%rbp \n\t" "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" "mov %c[r10](%0), %%r10 \n\t" "mov %c[r11](%0), %%r11 \n\t" "mov %c[r12](%0), %%r12 \n\t" "mov %c[r13](%0), %%r13 \n\t" "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */#else "mov %c[cr2](%0), %%eax \n\t" "mov %%eax, %%cr2 \n\t" "mov %c[rax](%0), %%eax \n\t" "mov %c[rbx](%0), %%ebx \n\t" "mov %c[rdx](%0), %%edx \n\t" "mov %c[rsi](%0), %%esi \n\t" "mov %c[rdi](%0), %%edi \n\t" "mov %c[rbp](%0), %%ebp \n\t" "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */#endif /* Enter guest mode */ "jne .Llaunched \n\t" ASM_VMX_VMLAUNCH "\n\t" "jmp .Lkvm_vmx_return \n\t" ".Llaunched: " ASM_VMX_VMRESUME "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */#ifdef CONFIG_X86_64 "xchg %0, (%%rsp) \n\t" "mov %%rax, %c[rax](%0) \n\t" "mov %%rbx, %c[rbx](%0) \n\t" "pushq (%%rsp); popq %c[rcx](%0) \n\t" "mov %%rdx, %c[rdx](%0) \n\t" "mov %%rsi, %c[rsi](%0) \n\t" "mov %%rdi, %c[rdi](%0) \n\t" "mov %%rbp, %c[rbp](%0) \n\t" "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" "mov %%r10, %c[r10](%0) \n\t" "mov %%r11, %c[r11](%0) \n\t" "mov %%r12, %c[r12](%0) \n\t" "mov 
%%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" "mov %%cr2, %%rax \n\t" "mov %%rax, %c[cr2](%0) \n\t" "pop %%rbp; pop %%rbp; pop %%rdx \n\t"#else "xchg %0, (%%esp) \n\t" "mov %%eax, %c[rax](%0) \n\t" "mov %%ebx, %c[rbx](%0) \n\t" "pushl (%%esp); popl %c[rcx](%0) \n\t" "mov %%edx, %c[rdx](%0) \n\t" "mov %%esi, %c[rsi](%0) \n\t" "mov %%edi, %c[rdi](%0) \n\t" "mov %%ebp, %c[rbp](%0) \n\t" "mov %%cr2, %%eax \n\t" "mov %%eax, %c[cr2](%0) \n\t" "pop %%ebp; pop %%ebp; pop %%edx \n\t"#endif "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),#ifdef CONFIG_X86_64 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),#endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory"#ifdef CONFIG_X86_64 , "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"#else , "ebx", "edi", "rsi"#endif ); vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 
if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); vcpu->arch.interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* We need to handle NMIs before interrupts are enabled */ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ asm("int $2");}
在之后就是两种状态的切换。
cr4中有个vmxe的位,用来使能VMX。
读取 cr4 得到 0x3426e0,与 VMXE 位掩码 0x2000 相与,结果为 0x2000,说明 VMXE 位已置位:
0x3426e0 = 11 0100 0010 0110 1110 0000 (二进制)
0x002000 =        10 0000 0000 0000 (二进制)
/*
 * VMCS region header as seen by software: only the revision id and the
 * abort indicator are architecturally visible; the rest of the region
 * (`data`) has a hardware-defined, opaque layout accessed solely via
 * VMREAD/VMWRITE.
 */
struct vmcs {
	u32 revision_id;
	u32 abort;
	char data[0];
};

/* VMCS currently loaded (via VMPTRLD) on each physical cpu. */
static DEFINE_PER_CPU(struct vmcs *, current_vmcs_vt);

/*
 * vcpu_load - pin the vcpu to the current cpu and load its state.
 * Takes the vcpu mutex, then performs the arch load (VMPTRLD etc.)
 * with preemption disabled via get_cpu()/put_cpu().
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
	printk("put cpu \n");
}

/*
 * vcpu_put - counterpart of vcpu_load: restore host state and release
 * the vcpu mutex.
 */
void vcpu_put(struct kvm_vcpu *vcpu)
{
	/* BUG FIX: pointers were printed with %lx without a cast (wrong
	 * specifier; truncates/UB on 32-bit) — use %p for pointers.
	 * Also fixed the "muxte"/"onwer" typos in the message. */
	printk("vcpu_put %p mutex %p owner %p\n", vcpu, &vcpu->mutex,
	       vcpu->mutex.owner);
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	//preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}
/*
 * vmx_vcpu_load - make this vcpu's VMCS current on the given cpu.
 *
 * If the vcpu migrated from another cpu, its VMCS is first cleared
 * there (vcpu_clear) and the APIC timer migrated. If this cpu's
 * current VMCS differs, VMPTRLD is executed to switch; `setna`
 * captures a VMPTRLD failure (CF or ZF set) into `error`. On a cpu
 * change the per-cpu host fields (TR base, GDT base, SYSENTER_ESP)
 * are rewritten into the VMCS, and TSC_OFFSET is adjusted so the
 * guest-visible TSC stays monotonic across the migration.
 * (Section numbers in comments refer to the Intel SDM.)
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 phys_addr = __pa(vmx->vmcs);
	u64 tsc_this, delta;

	if (vcpu->cpu != cpu) {
		/* vcpu migrated: flush its VMCS off the old cpu */
		vcpu_clear(vmx);
		kvm_migrate_apic_timer(vcpu);
	}
	if (per_cpu(current_vmcs_vt, cpu) != vmx->vmcs) {
		u8 error;

		per_cpu(current_vmcs_vt, cpu) = vmx->vmcs;
		/* VMPTRLD with the VMCS physical address; setna records failure */
		asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
			      : "cc");
		if (error)
			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
			       vmx->vmcs, phys_addr);
	}
	//printk("VMx_VMPTRLD \n");
	if (vcpu->cpu != cpu) {
		//printk("vcpu->cpu != cpu \n");
		struct descriptor_table dt;
		unsigned long sysenter_esp;

		vcpu->cpu = cpu;
		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.
		 */
		vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
		get_gdt(&dt);
		vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
		/*
		 * Make sure the time stamp counter is monotonous.
		 */
		rdtscll(tsc_this);
		delta = vcpu->arch.host_tsc - tsc_this;
		vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
	}
}
/*
 * vmx_load_host_state - restore host segment/MSR state that was
 * deferred while the guest ran.
 *
 * No-op unless guest state is loaded. Reloads fs, and (if needed)
 * ldt/gs with interrupts disabled so the gs base (restored from the
 * VMCS HOST_GS_BASE on x86_64) is not clobbered by an interrupt in
 * between. Then reloads the TSS, swaps the guest/host MSR sets, and
 * restores the host EFER.
 */
static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	unsigned long flags;

	if (!vmx->host_state.loaded)
		return;

	++vmx->vcpu.stat.host_state_reload;
	vmx->host_state.loaded = 0;
	if (vmx->host_state.fs_reload_needed)
		load_fs(vmx->host_state.fs_sel);
	if (vmx->host_state.gs_ldt_reload_needed) {
		load_ldt(vmx->host_state.ldt_sel);
		/*
		 * If we have to reload gs, we must take care to
		 * preserve our gs base.
		 */
		local_irq_save(flags);
		load_gs(vmx->host_state.gs_sel);
#ifdef CONFIG_X86_64
		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
#endif
		local_irq_restore(flags);
	}
	reload_tss();
	/* save the guest's MSR values, then restore the host's */
	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
	load_msrs(vmx->host_msrs, vmx->save_nmsrs);
	reload_host_efer(vmx);
}

/* vmx_vcpu_put - arch hook called when the vcpu is descheduled. */
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_load_host_state(to_vmx(vcpu));
}
struct vmcs {
u32 revision_id; u32 abort; char data[0]; };这个结构有点怪,但其实是kvm的核心。
最后一个 data 字段包含了 VMCS 其余的内容(guest-state 区域、host-state 区域以及各控制域等,布局由硬件定义),而且每个cpu都会定义一个current_vmcs变量,用来保存需要切换的状态。
setup_vmcs_config 对vm_exit, vm_entry的控制域进行设置。
/*
 * setup_vmcs_config - negotiate VMX execution/exit/entry controls with
 * the hardware and fill in the global vmcs_config.
 *
 * For each control field, `min` holds the bits KVM requires and `opt`
 * the bits it would like; adjust_vmx_controls() reconciles them against
 * the corresponding MSR_IA32_VMX_* capability MSR, failing with -EIO if
 * a required bit is unavailable. CR8 load/store exiting is dropped when
 * a TPR shadow is available (64-bit), and on 32-bit the TPR shadow is
 * dropped without virtualized APIC accesses. Finally MSR_IA32_VMX_BASIC
 * supplies the VMCS size (<= 4 KiB), the revision id, the 64-bit
 * physical-address constraint (bit 48 of the MSR, i.e. bit 16 of
 * vmx_msr_high), and the required Write-Back memory type (value 6 in
 * bits 53:50).
 *
 * NOTE(review): `vmcs_conf->order = get_order(vmcs_config.size)` reads
 * the global rather than the parameter — harmless here because the
 * caller passes &vmcs_config, but worth confirming if ever reused.
 */
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf){ u32 vmx_msr_low, vmx_msr_high; u32 min, opt; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; opt = 0; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; min = CPU_BASED_HLT_EXITING |#ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |#endif CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO;#ifdef CONFIG_X86_64 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & ~CPU_BASED_CR8_STORE_EXITING;#endif if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min = 0; opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) return -EIO; }#ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;#endif min = 0;#ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;#endif opt = 0; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; min = opt = 0; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) return -EIO;#ifdef CONFIG_X86_64 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. 
*/ if (vmx_msr_high & (1u<<16)) return -EIO;#endif /* Require Write-Back (WB) memory type for VMCS accesses. */ if (((vmx_msr_high >> 18) & 15) != 6) return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; vmcs_conf->order = get_order(vmcs_config.size); vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; return 0;}
虽然零星的信息显示,这个vmExit的回调要在这个控制域里设置,但是,并没有在文档中找到办法。
继续读下bluepill
/*
 * HvmEventCallback - architecture-independent VM-exit dispatcher
 * (BluePill). Called from the VM-exit stub with the saved guest
 * registers; routes the event either to the nested-guest dispatcher or
 * to the ordinary dispatcher, syncing GUEST_RSP with the VMCS around
 * the call on VMX.
 */
VOID NTAPI HvmEventCallback (
  PCPU Cpu,
  PGUEST_REGS GuestRegs
)
{
  /* BUG FIX: removed the unused local "NTSTATUS Status;" — it was
   * declared but never assigned or read. */
  if (!Cpu || !GuestRegs)
    return;

  // FIXME: This should be moved away from the HVM to VMX-specific code!!!
  if (Hvm->Architecture == ARCH_VMX)
    GuestRegs->rsp = VmxRead (GUEST_RSP);

  if (Hvm->ArchIsNestedEvent (Cpu, GuestRegs)) {
    // it's an event of a nested guest
    Hvm->ArchDispatchNestedEvent (Cpu, GuestRegs);

    // FIXME: This should be moved away from the HVM to VMX-specific code!!!
    if (Hvm->Architecture == ARCH_VMX)
      VmxWrite (GUEST_RSP, GuestRegs->rsp);
    return;
  }

  // it's an original event
  Hvm->ArchDispatchEvent (Cpu, GuestRegs);

  // FIXME: This should be moved away from the HVM to VMX-specific code!!!
  if (Hvm->Architecture == ARCH_VMX)
    VmxWrite (GUEST_RSP, GuestRegs->rsp);
  return;
}
下面还有idt、gdt设置的函数,先忽略。
;HvmEventCallback(PCPU Cpu,PGUEST_REGS GuestRegs,ULONG64 Ticks1)
; VM-exit entry stub: the address of this routine is written into the
; VMCS HOST_RIP field, so the cpu jumps here on every VM-exit.
; It saves all guest GPRs on the stack, builds the Win64 call
; (rcx = PCPU fetched from [rsp+80h] above the saved frame,
;  rdx = pointer to the saved registers, r8 = 0), reserves the 28h-byte
; shadow/spill area required by the x64 ABI, calls the C dispatcher,
; restores the registers, and re-enters the guest via vmx_resume.
VmxVmexitHandler PROC

    HVM_SAVE_ALL_NOSEGREGS

    mov rcx, [rsp + 80h] ;PCPU
    mov rdx, rsp ;GuestRegs
    mov r8, 0 ;TSC
    sub rsp, 28h ;rdtsc

    call HvmEventCallback
    add rsp, 28h

    HVM_RESTORE_ALL_NOSEGREGS

    vmx_resume
    ret
VmxVmexitHandler ENDP
static NTSTATUS VmxSetupVMCS ( PCPU Cpu, PVOID GuestRip, PVOID GuestRsp){.......#ifdef _X86_ VmxWrite (HOST_RSP, g_HostStackBaseAddress + 0x0C00); //setup host sp at vmxLaunch(...)#else VmxWrite (HOST_RSP, (ULONG64) Cpu); //setup host sp at vmxLaunch(...)#endif VmxWrite (HOST_RIP, (ULONG64) VmxVmexitHandler); //setup host ip:CmSlipIntoMatrix _KdPrint (("VmxSetupVMCS(): Exit\n")); return STATUS_SUCCESS;}
貌似是这个。这里 HOST_RIP 的含义应该是:guest 发生 VM-Exit 退出时,处理器从 VMCS 中取出并赋给 rip 的值。需要注意这个字段写在 VMCS 的 host-state 区域中(而不是控制域)。整理下代码,跑一下。
转载地址:http://hnjvb.baihongyu.com/