/*
 *  plex86: run multiple x86 operating systems concurrently
 *  Copyright (C) 1999-2003 Kevin P. Lawton
 *
 *  monitor-host.c: This file contains the top-level monitor code,
 *    accessible from the host space.  (kernel independent code)
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "plex86.h"
#define IN_HOST_SPACE
#include "monitor.h"

/* =====================================================================
 * Plex86 module global variables.  This should be the _only_ place
 * where globals are declared.  Since plex86 supports multiple VMs, almost
 * all data is stored per-VM.  For the few variables which are global
 * to all VMs, we have to be careful to access them in SMP-friendly ways.
 * The ones which are written upon kernel module initialization are fine,
 * since they are only written once.
 * =====================================================================
 */

/* Info regarding the physical pages that comprise the kernel module.
 * This is written (once) at kernel module initialization time.
 * Thus there are no SMP access issues.
 */
kernelModulePages_t kernelModulePages;

/* Information on the host processor as returned by the CPUID
 * instruction.  This is written (once) at kernel module initialization
 * time.  Thus there are no SMP access issues.
 */
cpuid_info_t hostCpuIDInfo;

/* Some constants used by the VM logic.  Since they're "const", there are
 * no problems with SMP access.
 */
static const selector_t nullSelector = { raw: 0 };
static const descriptor_t nullDescriptor = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

static int  hostInitIDTSlot(vm_t *vm, unsigned vec, int type);
static void hostMapMonPages(vm_t *vm, Bit32u *, unsigned, Bit32u *,
                            page_t *, unsigned user, unsigned writable,
                            char *name);
#if ANAL_CHECKS
static void hostMapBlankPage(vm_t *vm, Bit32u *laddr_p, page_t *pageTable);
#endif

#define RW0 0
#define RW1 1
#define US0 0
#define US1 1

#define IDT_INTERRUPT          0
#define IDT_EXCEPTION_ERROR    1
#define IDT_EXCEPTION_NOERROR  2


unsigned
hostModuleInit(void)
{
  /* Kernel independent stuff to do at kernel module load time. */

  if (!hostGetCpuCapabilities()) {
    hostOSPrint("getCpuCapabilities returned error\n");
    return(0); /* Fail. */
  }
  else {
#if 0
    hostOSPrint("ptype:%u, family:%u, model:%u stepping:%u\n",
                hostCpuIDInfo.procSignature.fields.procType,
                hostCpuIDInfo.procSignature.fields.family,
                hostCpuIDInfo.procSignature.fields.model,
                hostCpuIDInfo.procSignature.fields.stepping);
#endif
  }

  /* xxx Should check that host CS.base is page aligned here. */

#if 1
  {
    Bit32u cr0;
    asm volatile ( "movl %%cr0, %0" : "=r" (cr0) );
    hostOSPrint("host CR0=0x%x\n", cr0);
  }
#endif

  return(1); /* Pass. */
}
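/* For readability of the raw CR0 debug print above, here is a minimal
 * disabled sketch of decoding the interesting bits.  The bit positions
 * are the architectural CR0 assignments; the helper itself is purely
 * illustrative and not part of the build.
 */
#if 0
static void
decodeCr0(Bit32u cr0)
{
  hostOSPrint("CR0: PG=%u CD=%u NW=%u AM=%u WP=%u NE=%u PE=%u\n",
              (cr0 >> 31) & 1,   /* PG: paging enable */
              (cr0 >> 30) & 1,   /* CD: cache disable */
              (cr0 >> 29) & 1,   /* NW: not write-through */
              (cr0 >> 18) & 1,   /* AM: alignment mask */
              (cr0 >> 16) & 1,   /* WP: supervisor write protect */
              (cr0 >>  5) & 1,   /* NE: numeric error */
              (cr0 >>  0) & 1);  /* PE: protection enable */
}
#endif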
void
hostDeviceOpen(vm_t *vm)
{
  /* Kernel independent stuff to do at device open time. */

  /* Zero out entire VM structure. */
  mon_memzero( vm, sizeof(vm_t) );

  vm->vmState = VMStateFDOpened;
}


int
hostInitMonitor(vm_t *vm)
{
  unsigned pdi, pti;
  unsigned int i;
  Bit32u nexus_size;
  page_t *pageTable;
  Bit32u laddr, base;
  int r;

  vm->kernel_offset = hostOSKernelOffset();

  vm->system.a20Enable    = 1;          /* Start with A20 line enabled. */
  vm->system.a20AddrMask  = 0xffffffff; /* All address lines contribute. */
  vm->system.a20IndexMask = 0x000fffff; /* All address lines contribute. */

  /* Initialize nexus */
  mon_memzero(vm->host.addr.nexus, 4096);

  /* Copy transition code (nexus) into code page allocated for this VM. */
  nexus_size = ((Bit32u) &__nexus_end) - ((Bit32u) &__nexus_start);
  if (nexus_size > 4096) goto error;
  mon_memcpy(vm->host.addr.nexus, &__nexus_start, nexus_size);

  /* Init the convenience pointers. */

  /* Pointer to host2mon routine inside nexus page */
  vm->host.__host2mon = (void (*)(void)) HOST_NEXUS_OFFSET(vm, __host2mon);

  /* Pointer to guest context on monitor stack */
  vm->host.addr.guest_context = (guest_context_t *)
    ( (Bit32u)vm->host.addr.nexus + PAGESIZE - sizeof(guest_context_t) );

  /* Zero out various monitor data structures */
  mon_memzero(vm->host.addr.log_buffer, 4096*LOG_BUFF_PAGES);
  mon_memzero(&vm->log_buffer_info, sizeof(vm->log_buffer_info));
  mon_memzero(vm->host.addr.page_dir, 4096);
  mon_memzero(vm->host.addr.guest_cpu, 4096);
  mon_memzero(vm->host.addr.idt, MON_IDT_PAGES*4096);
  mon_memzero(vm->host.addr.gdt, MON_GDT_PAGES*4096);
  mon_memzero(vm->host.addr.ldt, MON_LDT_PAGES*4096);
  mon_memzero(vm->host.addr.tss, MON_TSS_PAGES*4096);
  mon_memzero(vm->host.addr.idt_stubs, MON_IDT_STUBS_PAGES*4096);

  vm->guestPhyPagePinQueue.nEntries = 0;
  vm->guestPhyPagePinQueue.tail = 0;

  /*
   *  ================
   *  Nexus Page Table
   *  ================
   *
   *  All structures needed by the monitor inside the guest environment
   *  (code to perform the transition between host<-->guest, fault handler
   *  code, various processor data structures like page directory, GDT,
   *  IDT, TSS etc.) are mapped into a single Page Table.
   *
   *  This allows us to migrate the complete nexus to anywhere in the
   *  guest address space by just updating a single (unused) page directory
   *  entry in the monitor/guest page directory to point to this nexus
   *  page table.
   *
   *  To simplify nexus migration, we try to avoid storing guest linear
   *  addresses to nexus structures as far as possible.  Instead, we use
   *  offsets relative to the monitor code/data segments.  As we update
   *  the base of these segments whenever the monitor migrates, the net
   *  effect is that those *offsets* remain valid across nexus migration.
   */

  /* Fill in the PDE flags.  The US bit is set to 1 (user access).
   * All of the US bits in the monitor PTEs are set to 0 (system access).
   */
  vm->host.nexus_pde.fields.base  = vm->pages.nexus_page_tbl;
  vm->host.nexus_pde.fields.avail = 0;
  vm->host.nexus_pde.fields.G     = 0; /* not global */
  vm->host.nexus_pde.fields.PS    = 0; /* 4K pages */
  vm->host.nexus_pde.fields.D     = 0; /* (unused in pde) */
  vm->host.nexus_pde.fields.A     = 0; /* not accessed */
  vm->host.nexus_pde.fields.PCD   = 0; /* normal caching */
  vm->host.nexus_pde.fields.PWT   = 0; /* normal write-back */
  vm->host.nexus_pde.fields.US    = 1; /* user access (see above) */
  vm->host.nexus_pde.fields.RW    = 1; /* read or write */
  vm->host.nexus_pde.fields.P     = 1; /* present in memory */
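  /* As a cross-check on the bit-field assignments above: assuming the
   * fields struct mirrors the architectural 32-bit PDE layout (low bit
   * first), a PDE with base=N, US=1, RW=1, P=1 and all else clear packs
   * to the raw value (N << 12) | 0x007.  A disabled, illustrative sketch:
   */
#if 0
  {
    pageEntry_t checkPde;
    checkPde.raw = (vm->pages.nexus_page_tbl << 12)
                   | (1 << 2)   /* US: user access */
                   | (1 << 1)   /* RW: writable */
                   | (1 << 0);  /* P:  present */
    /* checkPde.raw should equal vm->host.nexus_pde.raw here. */
  }
#endif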
  /* Clear Page Table. */
  pageTable = vm->host.addr.nexus_page_tbl;
  mon_memzero(pageTable, 4096);

  /* xxx Comment here */
  laddr = 0;
  base = MON_BASE_FROM_LADDR(laddr);

  hostMapMonPages(vm, kernelModulePages.ppi, kernelModulePages.nPages,
                  &laddr, pageTable, US0, RW1, "Monitor code/data pages");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  vm->guest.addr.nexus = (nexus_t *) (laddr - base);
  hostMapMonPages(vm, &vm->pages.nexus, 1, &laddr, pageTable, US0, RW1,
                  "Nexus");
  vm->guest.addr.guest_context = (guest_context_t *)
    ( (Bit32u)vm->guest.addr.nexus + PAGESIZE - sizeof(guest_context_t) );
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  vm->host.addr.nexus->vm = (void *) (laddr - base);
  hostMapMonPages(vm, vm->pages.vm, BytesToPages(sizeof(*vm)), &laddr,
                  pageTable, US0, RW1, "VM structure");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  vm->guest.addr.idt = (gate_t *) (laddr - base);
  hostMapMonPages(vm, vm->pages.idt, MON_IDT_PAGES, &laddr, pageTable,
                  US0, RW1, "IDT");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  vm->guest.addr.gdt = (descriptor_t *) (laddr - base);
  hostMapMonPages(vm, vm->pages.gdt, MON_GDT_PAGES, &laddr, pageTable,
                  US0, RW1, "GDT");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  vm->guest.addr.ldt = (descriptor_t *) (laddr - base);
  hostMapMonPages(vm, vm->pages.ldt, MON_LDT_PAGES, &laddr, pageTable,
                  US0, RW1, "LDT");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  vm->guest.addr.tss = (tss_t *) (laddr - base);
  hostMapMonPages(vm, vm->pages.tss, MON_TSS_PAGES, &laddr, pageTable,
                  US0, RW1, "TSS");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  vm->guest.addr.idt_stubs = (idt_stub_t *) (laddr - base);
  hostMapMonPages(vm, vm->pages.idt_stubs, MON_IDT_STUBS_PAGES, &laddr,
                  pageTable, US0, RW1, "IDT stubs");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  /* Monitor Page Directory */
  vm->guest.addr.page_dir = (pageEntry_t *) (laddr - base);
  hostMapMonPages(vm, &vm->pages.page_dir, 1, &laddr, pageTable, US0, RW1,
                  "Monitor Page Directory");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  /* Nexus Page Table */
  vm->guest.addr.nexus_page_tbl = (page_t *) (laddr - base);
  hostMapMonPages(vm, &vm->pages.nexus_page_tbl, 1, &laddr, pageTable,
                  US0, RW1, "Nexus Page Table");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  /* Map virtualized guest page tables into monitor. */
  vm->guest.addr.page_tbl = (page_t *) (laddr - base);
  hostMapMonPages(vm, vm->pages.page_tbl, MON_PAGE_TABLES, &laddr,
                  pageTable, US0, RW1, "Guest Page Tables");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  /* Map of linear addresses of page tables mapped into monitor */
  vm->guest.addr.page_tbl_laddr_map = (unsigned *) (laddr - base);
  hostMapMonPages(vm, &vm->pages.page_tbl_laddr_map, 1, &laddr, pageTable,
                  US0, RW1, "Page Table Laddr Map");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  /* Guest CPU state (mapped RW into user space also). */
  vm->guest.addr.guest_cpu = (guest_cpu_t *) (laddr - base);
  hostMapMonPages(vm, &vm->pages.guest_cpu, 1, &laddr, pageTable, US0, RW1,
                  "Guest CPU State");
#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif
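  /* Note the pattern above: each vm->guest.addr.* pointer is stored as
   * (laddr - base), i.e. an offset relative to the monitor code/data
   * segment base rather than an absolute linear address, so it stays
   * valid when the nexus migrates (see the comment block earlier).
   * A disabled sketch of the invariant, purely illustrative:
   */
#if 0
  {
    /* For any mapped structure, segment base + stored offset should
     * reproduce the linear address it was mapped at.
     */
    Bit32u gdtLaddr = base + (Bit32u) vm->guest.addr.gdt;
    /* gdtLaddr is where the GDT sits in the monitor linear space. */
  }
#endif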
  /*
   *  We need a buffer to implement a debug print facility which
   *  can work in either host or monitor space.  Map the buffer
   *  into monitor/guest space.
   */
  vm->guest.addr.log_buffer = (unsigned char *) (laddr - base);
  hostMapMonPages(vm, vm->pages.log_buffer, LOG_BUFF_PAGES, &laddr,
                  pageTable, US0, RW1, "Log Buffer");

  {
    /* The physical addresses of the following pages are not */
    /* yet established.  Pass dummy info until they are mapped. */
    Bit32u tmp[1];
    tmp[0] = 0;

#if ANAL_CHECKS
    hostMapBlankPage(vm, &laddr, pageTable);
#endif

    /* Window into the guest's current physical code page */
    vm->guest.addr.code_phy_page = (unsigned char *) (laddr - base);
    hostMapMonPages(vm, tmp, 1, &laddr, pageTable, US0, RW1,
                    "Code Phy Page");

#if ANAL_CHECKS
    hostMapBlankPage(vm, &laddr, pageTable);
#endif

    /* Temporary window into a guest physical page, for accessing */
    /* guest GDT, IDT, etc info. */
    vm->guest.addr.tmp_phy_page0 = (unsigned char *) (laddr - base);
    hostMapMonPages(vm, tmp, 1, &laddr, pageTable, US0, RW1,
                    "Tmp Phy Page0");

    vm->guest.addr.tmp_phy_page1 = (unsigned char *) (laddr - base);
    hostMapMonPages(vm, tmp, 1, &laddr, pageTable, US0, RW1,
                    "Tmp Phy Page1");
  }

#if ANAL_CHECKS
  hostMapBlankPage(vm, &laddr, pageTable);
#endif

  hostOSPrint("Using %u/1024 PTE slots in 4Meg monitor range.\n",
              (laddr >> 12) & 0x3ff);

  /* Pointer to mon2host routine inside nexus page */
  vm->guest.__mon2host = (void (*)(void)) MON_NEXUS_OFFSET(vm, __mon2host);

  /*
   *  =====================
   *  Transition Page Table
   *  =====================
   *
   *  To aid in the transition between host<-->monitor/guest spaces,
   *  we need to have an address identity map situation for at least
   *  one page; the page containing the transition code.  As we do
   *  not know in advance whether this linear address range is in use
   *  by the guest as well, we set aside a complete additional Page
   *  Table, which contains only a single PTE pointing to the nexus page.
   *
   *  To create the identity map, we simply change the corresponding
   *  monitor page directory entry to point to this transition Page Table.
   *  This happens transparently inside the host<-->guest transition code;
   *  both the guest/monitor code and the host side code never see this
   *  transition page table entered into the page directory!
   *
   *  NOTE: We need to ensure that the nexus page table never spans the
   *        same 4Meg linear address space region as this page table!
   *        As we are free to choose the nexus linear address, this is
   *        not a problem.
   */

  /* Get full linear address of nexus code page, as seen in host space. */
  laddr = (Bit32u)vm->host.addr.nexus + vm->kernel_offset;
  pdi = laddr >> 22;
  pti = (laddr >> 12) & 0x3ff;
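  /* The split above is the standard 32-bit non-PAE decomposition of a
   * linear address: the top 10 bits index the page directory, the next
   * 10 bits index the page table, and the low 12 bits are the page
   * offset.  A worked example (illustrative value only): laddr
   * 0x70001000 gives pdi 0x1c0 (0x70001000 >> 22) and pti 0x001
   * ((0x70001000 >> 12) & 0x3ff).
   */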
  /*
   *  We need to be able to access the PDE in the monitor page directory
   *  that corresponds to this linear address from both host and monitor
   *  address spaces.
   */
  vm->host.addr.nexus->transition_pde_p_host =
      vm->host.addr.page_dir + pdi;
  vm->host.addr.nexus->transition_pde_p_mon = (pageEntry_t *)
      (((Bit32u)vm->guest.addr.page_dir) + (pdi << 2));
  vm->host.addr.nexus->transition_laddr = laddr;

  /* Fill in the PDE flags */
  vm->host.addr.nexus->transition_pde.fields.base  = vm->pages.transition_PT;
  vm->host.addr.nexus->transition_pde.fields.avail = 0;
  vm->host.addr.nexus->transition_pde.fields.G     = 0; /* not global */
  vm->host.addr.nexus->transition_pde.fields.PS    = 0; /* 4K pages */
  vm->host.addr.nexus->transition_pde.fields.D     = 0; /* (unused in pde) */
  vm->host.addr.nexus->transition_pde.fields.A     = 0; /* not accessed */
  vm->host.addr.nexus->transition_pde.fields.PCD   = 0; /* normal caching */
  vm->host.addr.nexus->transition_pde.fields.PWT   = 0; /* normal write-back */
  vm->host.addr.nexus->transition_pde.fields.US    = 0; /* no user access */
  vm->host.addr.nexus->transition_pde.fields.RW    = 1; /* read or write */
  vm->host.addr.nexus->transition_pde.fields.P     = 1; /* present in memory */

  /* Clear Page Table; only one PTE is used. */
  pageTable = vm->host.addr.transition_PT;
  mon_memzero(pageTable, 4096);

  /* Fill in the PTE for identity mapping the code page */
  pageTable->pte[pti].fields.base  = vm->pages.nexus;
  pageTable->pte[pti].fields.avail = 0;
  pageTable->pte[pti].fields.G     = 0; /* not global */
  pageTable->pte[pti].fields.PS    = 0; /* (unused in pte) */
  pageTable->pte[pti].fields.D     = 0; /* clean */
  pageTable->pte[pti].fields.A     = 0; /* not accessed */
  pageTable->pte[pti].fields.PCD   = 0; /* normal caching */
  pageTable->pte[pti].fields.PWT   = 0; /* normal write-back */
  pageTable->pte[pti].fields.US    = 0; /* user can not access */
  pageTable->pte[pti].fields.RW    = 1; /* read or write */
  pageTable->pte[pti].fields.P     = 1; /* present in memory */

  /*
   *  Setup the TSS for the monitor/guest environment.
   *
   *  We don't need to set the pagedir in the TSS, because we don't
   *  actually jump to it anyway.  The TSS is just used to set the kernel
   *  stack and in a later stage, perhaps the I/O permission bitmap.
   */

  /* No task chain. */
  vm->host.addr.tss->back = 0;

  /* No debugging or I/O, for now. */
  vm->host.addr.tss->trap = 0;
  vm->host.addr.tss->io   = sizeof(tss_t);

  /* Monitor stack offset. */
  vm->host.addr.tss->esp0 = ((Bit32u)vm->guest.addr.nexus) + PAGESIZE;

  /*
   *  Set up initial monitor code and stack offset.
   */
  vm->host.addr.nexus->mon_jmp_info.offset = MON_NEXUS_OFFSET(vm, __mon_cs);
  vm->host.addr.nexus->mon_stack_info.offset =
      vm->host.addr.tss->esp0 - (sizeof(guest_context_t) + 48);
  /* xxx 48 above should be calculated from code below which winds
   * xxx up monitor stack.
   */
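  /* Regarding the xxx note above: the wind-up code further down stores
   * twelve 32-bit values (return address, EFLAGS, the eight pushal-order
   * registers, FS, and GS), which is where the 48 comes from.  A disabled
   * sketch of how it could be derived instead of hardcoded (the constant
   * name is hypothetical, not part of the sources):
   */
#if 0
  #define MON_STACK_WINDUP_BYTES (12 * sizeof(Bit32u))  /* == 48 */
  vm->host.addr.nexus->mon_stack_info.offset =
      vm->host.addr.tss->esp0 -
      (sizeof(guest_context_t) + MON_STACK_WINDUP_BYTES);
#endif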
  /*
   *  Setup the IDT for the monitor/guest environment
   */
  r = 0;
  r |= hostInitIDTSlot(vm,  0, IDT_EXCEPTION_NOERROR); /* Divide error */
  r |= hostInitIDTSlot(vm,  1, IDT_EXCEPTION_NOERROR); /* Debug exceptions */
  r |= hostInitIDTSlot(vm,  2, IDT_INTERRUPT);         /* NMI */
  r |= hostInitIDTSlot(vm,  3, IDT_EXCEPTION_NOERROR); /* Breakpoint */
  r |= hostInitIDTSlot(vm,  4, IDT_EXCEPTION_NOERROR); /* Overflow */
  r |= hostInitIDTSlot(vm,  5, IDT_EXCEPTION_NOERROR); /* Bounds check */
  r |= hostInitIDTSlot(vm,  6, IDT_EXCEPTION_NOERROR); /* Invalid opcode */
  r |= hostInitIDTSlot(vm,  7, IDT_EXCEPTION_NOERROR); /* FPU not available */
  r |= hostInitIDTSlot(vm,  8, IDT_EXCEPTION_ERROR);   /* Double fault */
  r |= hostInitIDTSlot(vm,  9, IDT_EXCEPTION_NOERROR); /* FPU segment overrun */
  r |= hostInitIDTSlot(vm, 10, IDT_EXCEPTION_ERROR);   /* Invalid TSS */
  r |= hostInitIDTSlot(vm, 11, IDT_EXCEPTION_ERROR);   /* Segment not present */
  r |= hostInitIDTSlot(vm, 12, IDT_EXCEPTION_ERROR);   /* Stack exception */
  r |= hostInitIDTSlot(vm, 13, IDT_EXCEPTION_ERROR);   /* GP fault */
  r |= hostInitIDTSlot(vm, 14, IDT_EXCEPTION_ERROR);   /* Page fault */
  r |= hostInitIDTSlot(vm, 15, IDT_EXCEPTION_NOERROR); /* reserved */
  r |= hostInitIDTSlot(vm, 16, IDT_EXCEPTION_NOERROR); /* Coprocessor error */
  r |= hostInitIDTSlot(vm, 17, IDT_EXCEPTION_ERROR);   /* Alignment check */
  r |= hostInitIDTSlot(vm, 18, IDT_EXCEPTION_NOERROR); /* Machine check */

  /* Reserved exceptions */
  for (i = 19; i < 32; i++)
    r |= hostInitIDTSlot(vm, i, IDT_EXCEPTION_NOERROR);

  /* Hardware interrupts */
  for (i = 32; i < 256; i++)
    r |= hostInitIDTSlot(vm, i, IDT_INTERRUPT);

  if (r != 0)
    goto error;

  /*
   *  Setup the initial guest context
   */
  mon_memzero(vm->host.addr.guest_context, sizeof(guest_context_t));

  /* Wind up the monitor stack for the initial transition via
   * __host2mon.  At the tail end, monitor state is popped from the
   * stack and a RET is executed.
   */
  {
    Bit32u *ptr;

    ptr = (Bit32u *) (((unsigned char *) vm->host.addr.guest_context) - 4);
    *ptr-- = (Bit32u) &__ret_to_guest;
    *ptr-- = 0x02; /* eflags: only reserved bit on */
    *ptr-- = 0;    /* eax */
    *ptr-- = 0;    /* ecx */
    *ptr-- = 0;    /* edx */
    *ptr-- = 0;    /* ebx */
    *ptr-- = 0;    /* esp dummy */
    *ptr-- = 0;    /* ebp */
    *ptr-- = 0;    /* esi */
    *ptr-- = 0;    /* edi */
    *ptr-- = 0;    /* FS; start with null value. */
    *ptr-- = 0;    /* GS; start with null value. */
  }

  vm->vmState |= VMStateInitMonitor;
  vm->mon_request = MonReqNone;

  return(1); /* all OK */

error:
  return(0); /* error */
}


unsigned
hostInitGuestPhyMem(vm_t *vm)
{
  unsigned i;

  mon_memzero(vm->pageInfo, sizeof(vm->pageInfo));
  for (i=0; i<vm->pages.guest_n_pages; i++) {
    /* For now, we start out by preallocating physical pages */
    /* for the guest, though not necessarily mapped into linear */
    /* space. */
    vm->pageInfo[i].attr.raw = 0;
    vm->pageInfo[i].tsc = 0;
    vm->pageInfo[i].attr.fields.allocated = 1;
  }

  {
    Bit32u rom_page;
    unsigned npages;

    /* Mark BIOS ROM area as ReadOnly */
    rom_page = 0xf0000 >> 12;
    npages = (1 + 0xfffff - 0xf0000) / 4096;
    for (i=0; i<npages; i++)
      vm->pageInfo[rom_page + i].attr.fields.RO = 1;

    /* Mark VGA BIOS ROM area as ReadOnly */
    rom_page = 0xc0000 >> 12;
    npages = (1 + 0xc7fff - 0xc0000) / 4096;
    for (i=0; i<npages; i++)
      vm->pageInfo[rom_page + i].attr.fields.RO = 1;
  }

#if 1
  /* Mark VGA framebuffer area as Memory Mapped IO */
  {
    Bit32u vga_page;
    unsigned npages;

    vga_page = 0xa0000 >> 12;
    npages = (1 + 0xbffff - 0xa0000) / 4096;
    for (i=0; i<npages; i++)
      vm->pageInfo[vga_page + i].attr.fields.memMapIO = 1;
  }
#endif

  return(0);
}
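/* Sanity arithmetic for the ranges above: each region is an inclusive
 * physical address range divided into 4K pages.  0xf0000..0xfffff is
 * 0x10000 bytes == 16 pages, 0xc0000..0xc7fff is 0x8000 bytes == 8 pages,
 * and 0xa0000..0xbffff is 0x20000 bytes == 32 pages.
 */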
int
hostInitIDTSlot(vm_t *vm, unsigned vec, int type)
/*
 *  initIDTSlot(): Initialize a monitor IDT slot.
 */
{
  /* IDT slot stubs */
  idt_stub_t *stub = &vm->host.addr.idt_stubs[vec];
  Bit32u stub_mon = ((Bit32u) vm->guest.addr.idt_stubs) +
                    vec*sizeof(idt_stub_t);

  if (sizeof(idt_stub_t) != IDT_STUB_SIZE)
    return( -1 );

  switch (type) {
    case IDT_INTERRUPT:
      stub->m2.pushla = 0x68;
      stub->m2.dummy  = 0;
      stub->m2.pushlb = 0x68;
      stub->m2.vector = vec;
      stub->m2.jmp    = 0xe9;
      stub->m2.reloc  = ((Bit32u) &__handle_int) -
                        (stub_mon + sizeof(idt_method2_t));
      break;

    case IDT_EXCEPTION_ERROR:
      stub->m1.pushl  = 0x68;
      stub->m1.vector = vec;
      stub->m1.jmp    = 0xe9;
      stub->m1.reloc  = ((Bit32u) &__handle_fault) -
                        (stub_mon + sizeof(idt_method1_t));
      break;

    case IDT_EXCEPTION_NOERROR:
      stub->m2.pushla = 0x68;
      stub->m2.dummy  = 0;
      stub->m2.pushlb = 0x68;
      stub->m2.vector = vec;
      stub->m2.jmp    = 0xe9;
      stub->m2.reloc  = ((Bit32u) &__handle_fault) -
                        (stub_mon + sizeof(idt_method2_t));
      break;

    default:
      return -1;
  }

  /* Set the interrupt gate */
  SET_INT_GATE(vm->host.addr.idt[vec], nullSelector, stub_mon,
               D_PRESENT, D_DPL0, D_D32);

  return 0;
}

/*
 *  Map pages allocated by host, into the linear address space of
 *  the monitor/guest, given the Page Table supplied.
 */
void
hostMapMonPages(vm_t *vm, Bit32u *pages, unsigned n, Bit32u *laddr_p,
                page_t *pageTable, unsigned user, unsigned writable,
                char *name)
{
  unsigned i, pti;

#if 0
  hostOSPrint("hostMapMonPages: '%s' mapped at 0x%x .. 0x%x.\n",
              name,
              (*laddr_p) - MON_BASE_FROM_LADDR(0),
              ((*laddr_p) + (n*4096)) - MON_BASE_FROM_LADDR(0) );
#endif

  pti = (*laddr_p >> 12) & 0x3ff;
  for (i = 0; i < n; i++, pti++) {
    if (pti >= 1024) break; /* This should not happen!  (pte[] holds 1024
                             * entries, so index 1024 is already out of
                             * bounds; hence >= rather than >.) */

    /* Fill in the PTE flags */
    pageTable->pte[pti].fields.base  = pages[i];
    pageTable->pte[pti].fields.avail = 0;
    pageTable->pte[pti].fields.G     = 0; /* not global */
    pageTable->pte[pti].fields.PS    = 0; /* (unused in pte) */
    pageTable->pte[pti].fields.D     = 0; /* clean */
    pageTable->pte[pti].fields.A     = 0; /* not accessed */
    pageTable->pte[pti].fields.PCD   = 0; /* normal caching */
    pageTable->pte[pti].fields.PWT   = 0; /* normal write-back */
    pageTable->pte[pti].fields.US    = user;     /* 0=system, 1=user */
    pageTable->pte[pti].fields.RW    = writable; /* 0=RO, 1=RW */
    pageTable->pte[pti].fields.P     = 1; /* present in memory */
  }

  /*
   *  Advance linear address pointer, for the next set of pages
   *  to be mapped.
   */
  *laddr_p += 4096 * n;
}

#if ANAL_CHECKS
void
hostMapBlankPage(vm_t *vm, Bit32u *laddr_p, page_t *pageTable)
{
  unsigned pti;

  pti = (*laddr_p >> 12) & 0x3ff;
  if (pti >= 1024) return; /* This should not happen! */

  /* Fill in the PTE flags */
  pageTable->pte[pti].fields.base  = 0;
  pageTable->pte[pti].fields.avail = 0;
  pageTable->pte[pti].fields.G     = 0; /* not global */
  pageTable->pte[pti].fields.PS    = 0; /* (unused in pte) */
  pageTable->pte[pti].fields.D     = 0; /* clean */
  pageTable->pte[pti].fields.A     = 0; /* not accessed */
  pageTable->pte[pti].fields.PCD   = 0; /* normal caching */
  pageTable->pte[pti].fields.PWT   = 0; /* normal write-back */
  pageTable->pte[pti].fields.US    = 0;
  pageTable->pte[pti].fields.RW    = 0;
  pageTable->pte[pti].fields.P     = 0;

  /*
   *  Advance linear address pointer, for the next page
   *  to be mapped.
   */
  *laddr_p += 4096;
}
#endif
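/* The stubs built by hostInitIDTSlot() above are tiny code sequences:
 * 0x68 is the x86 opcode for PUSH imm32 and 0xe9 is JMP rel32.  Method 2
 * (used for vectors where the CPU pushes no error code) pushes a dummy
 * dword so that both stub flavors leave an identical stack layout, then
 * pushes the vector and jumps to the common handler.  The rel32
 * displacement is relative to the end of the JMP instruction, which is
 * why reloc subtracts (stub_mon + sizeof of the stub).  Byte picture of
 * a method-2 stub, for illustration:
 *
 *   68 00 00 00 00     push $0         ; dummy "error code"
 *   68 vv vv vv vv     push $vector
 *   e9 rr rr rr rr     jmp  __handle_fault (or __handle_int)
 */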
int
hostIoctlGeneric(vm_t *vm, void *inode, void *filp, unsigned int cmd,
                 unsigned long arg)
{
  switch (cmd) {

    /*
     *  Set the guest CPUID info.
     */
    case PLEX86_CPUID:
      {
      if ( vm->vmState & VMStateGuestCPUID ) {
        /* Can't change guest CPUID. */
        return -Plex86ErrnoEINVAL;
      }
      if ( hostOSCopyFromUser(&vm->guestCPUIDInfo, (void *)arg,
                              sizeof(vm->guestCPUIDInfo)) )
        return -Plex86ErrnoEFAULT;
      /* xxx Value checks here. */
      vm->vmState |= VMStateGuestCPUID;
      return 0;
      }

    case PLEX86_REGISTER_MEMORY:
      {
      plex86IoctlRegisterMem_t registerMemMsg;
      if ( hostOSCopyFromUser(&registerMemMsg, (void *)arg,
                              sizeof(registerMemMsg)) )
        return -Plex86ErrnoEFAULT;
      return( hostIoctlRegisterMem(vm, &registerMemMsg) );
      }

    /*
     *  Tear down the VM environment.
     */
    case PLEX86_TEARDOWN:
      if ( vm->vmState & VMStateRegisteredAll ) {
        hostOSPrint("plex86: guest memory is still registered!\n");
        /* Could effect the unpinning here and then do:
         *   vm->vmState &= ~VMStateRegisteredAll;
         */
        return -Plex86ErrnoEBUSY;
      }
      hostUnallocVmPages(vm); /* Fixme: deal with state better here. */
      /* Reset state to only FD opened. */
      vm->vmState = VMStateFDOpened;
      return 0;

    /*
     *  Execute the guest in the VM for a while.  The guest CPU state
     *  is specified in a memory window mmap()'d to user space.
     */
    case PLEX86_EXECUTE:
      {
      plex86IoctlExecute_t executeMsg;
      int ret;

      if ( hostOSCopyFromUser(&executeMsg, (void *)arg,
                              sizeof(executeMsg)) )
        return -Plex86ErrnoEFAULT;
      ret = hostIoctlExecute(vm, &executeMsg);
      if ( hostOSCopyToUser((void *)arg, &executeMsg,
                            sizeof(executeMsg)) )
        return -Plex86ErrnoEFAULT;
      return ret;
      }

#warning "PLEX86_RESET should only be conditionally compiled for debugging."
    /*
     *  For debugging, when the module gets hosed, this is a way
     *  to reset the in-use count, so we can rmmod it.
     */
    case PLEX86_RESET:
      hostOSModuleCountReset(vm, inode, filp);
      return 0;

    default:
      hostOSPrint("plex86: unknown ioctl(%d) called\n", cmd);
      return -Plex86ErrnoEINVAL;
  }
}


int
hostIoctlExecute(vm_t *vm, plex86IoctlExecute_t *executeMsg)
{
  guest_cpu_t     *guest_cpu;
  guest_context_t *guest_stack_context;
  nexus_t         *nexus;
  unsigned s;
  int retval;

  if ( (vm->vmState != VMStateReady) ||
       (vm->mon_request != MonReqNone) ) {
    retval = Plex86NoExecute_VMState; /* Fail. */
    goto handlePanic;
  }

  /* Only (virtualized) native execution is supported currently.
   * Later, it will be interesting to breakpoint one instruction
   * at-a-time using Plex86ExecuteMethodBreakpoint, for
   * cosimulation.
   */
  if (executeMsg->executeMethod != Plex86ExecuteMethodNative) {
    retval = Plex86NoExecute_Method; /* Fail. */
    goto handleFail;
  }

  /* A pointer to the guest CPU state as passed from host-user space.
   * This structure is memory mapped between user and kernel/monitor space.
   */
  guest_cpu = vm->host.addr.guest_cpu;

  /* A pointer to the guest CPU state saved on the monitor stack. */
  guest_stack_context = vm->host.addr.guest_context;

  /* =================================================================
   * Before executing the guest in the VM, we must check that
   * the guest conditions meet the requirements of the user-level-only
   * VM.
   * =================================================================
   */

  /* CR0:
   *   PG(31)==1
   *   CD(30)==? (look into this later)
   *   NW(29)==? (look into this later)
   *   AM(18)==pass-thru from guest
   *   WP(16)==Don't care.  Monitor always sets this to 1.
   *   NE( 5)==? (look into this later)
   *   ET( 4)==? (look into this later)
   *   TS( 3)==? (look into this later)
   *   EM( 2)==? (look into this later)
   *   MP( 1)==? (look into this later)
   *   PE( 0)==1
   */
  /* 0x8005003b */
  if ( (guest_cpu->cr0.raw & 0xe0000037) != 0x80000033 ) {
    hostOSPrint("plex86: guest CR0=0x%x\n", guest_cpu->cr0.raw);
    retval = Plex86NoExecute_CR0; /* Fail. */
    goto handleFail;
  }
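  /* Decoding the CR0 test above: the mask 0xe0000037 selects PG(31),
   * CD(30), NW(29), NE(5), ET(4), EM(2), MP(1) and PE(0); the required
   * value 0x80000033 therefore demands PG=1, CD=0, NW=0, NE=1, ET=1,
   * EM=0, MP=1, PE=1, and ignores every other bit (AM is passed through
   * later, WP is forced by the monitor).  A disabled sketch of the same
   * test written out by named bits; only the pg field name appears
   * elsewhere in this file, the rest are assumed:
   */
#if 0
  {
    unsigned ok;
    ok = guest_cpu->cr0.fields.pg
         && !guest_cpu->cr0.fields.cd && !guest_cpu->cr0.fields.nw
         && guest_cpu->cr0.fields.ne  && guest_cpu->cr0.fields.et
         && !guest_cpu->cr0.fields.em && guest_cpu->cr0.fields.mp
         && guest_cpu->cr0.fields.pe;
    /* ok should agree with the masked comparison above. */
  }
#endif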
  /* CR4:
   *   OSXMMEXCPT(10)==? (look into this later)
   *   OSFXSR(9)==? (look into this later)
   *   PCE(8)==? (look into this later)
   *   PGE(7)==? (look into this later)
   *   MCE(6)==? (look into this later)
   *   PAE(5)==? (look into this later)
   *   PSE(4)==? (look into this later)
   *   DE(3)==? (look into this later)
   *   TSD(2)==? (look into this later)
   *   PVI(1)==? (look into this later)
   *   VME(0)==? (look into this later)
   */
  if ( (guest_cpu->cr4.raw & 0x000007ff) != 0x00000000 ) {
    hostOSPrint("plex86: guest CR4=0x%x\n", guest_cpu->cr4.raw);
    retval = Plex86NoExecute_CR4; /* Fail. */
    goto handleFail;
  }

  /* Guest CPL must be 3 (user-level).
   * CS selector must not be NULL.
   */
  if ( (guest_cpu->sreg[SRegCS].sel.fields.rpl != 3) ||
       (guest_cpu->sreg[SRegCS].sel.fields.index == 0) ||
       (guest_cpu->sreg[SRegCS].des.dpl != 3) ) {
    retval = Plex86NoExecute_CS; /* Fail. */
    goto handleFail;
  }

  /* A20 line must be enabled. */
  if ( guest_cpu->a20Enable != 1 ) {
    retval = Plex86NoExecute_A20; /* Fail. */
    goto handleFail;
  }

  /* Some code not really used now, since we only support A20 being
   * enabled.
   */
  {
    unsigned newA20Enable;

    newA20Enable = guest_cpu->a20Enable > 0; /* Make 0 or 1. */
    if ( newA20Enable != vm->system.a20Enable ) {
      if ( (!newA20Enable) && guest_cpu->cr0.fields.pg ) {
        /* A20 disabled while paging is on is not supported.  Well,
         * really I have to see if it matters.  This check was in old
         * plex86 code.
         */
        retval = Plex86NoExecute_A20; /* Fail. */
        goto handleFail;
      }
      vm->system.a20Enable    = newA20Enable;
      vm->system.a20AddrMask  = 0xffefffff | (newA20Enable << 20);
      vm->system.a20IndexMask = 0x000ffeff | (newA20Enable << 8);
    }
  }

  /* LDT not supported.
   * Monitor uses GDT slots 1,2,3, so guest segments can not.
   * Segment descriptor cache DPL should equal 3.
   */
  for (s=0; s<6; s++) {
    unsigned selector = guest_cpu->sreg[s].sel.raw;
    unsigned index;

    /* Only care if selector is not NULL. */
    if ( selector & 0xfffc ) {
      if ( (selector & 0x0007) != 3 ) {
        /* Either TI=1 (LDT usage) or RPL!=3. */
        retval = Plex86NoExecute_Selector; /* Fail. */
        goto handleFail;
      }
      index = selector >> 3;
      if ( index <= 3 ) {
        /* Selector index field uses one of the monitor GDT slots. */
        retval = Plex86NoExecute_Selector; /* Fail. */
        goto handleFail;
      }
      if ( index >= (MON_GDT_SIZE/8) ) {
        /* Selector index field uses a slot beyond the monitor GDT size. */
        retval = Plex86NoExecute_Selector; /* Fail. */
        goto handleFail;
      }
      if ( guest_cpu->sreg[s].des.dpl != 3 ) {
        retval = Plex86NoExecute_DPL; /* Fail. */
        goto handleFail;
      }
    }
  }

  /* EFlags constraints:
   *   VIP/VIF==0
   *   VM==0
   *   RF==0
   *   NT==0
   *   IOPL==0 (We may be able to allow this to be 0..2)
   *   IF==1
   *   TF==0
   *   bit1==1
   */
  if ( (guest_cpu->eflags & (0x001b7302)) != (0x00000202) ) {
    retval = Plex86NoExecute_EFlags; /* Fail. */
    goto handleFail;
  }

  /* Notes on other stuff:
   *   - CPUID emulation vs virtualization match.
   */

  /* NOTE: We should commit to executing the guest at this point.
   * We must not leave stray entries in the GDT.
   */

  /* Install virtualized guest descriptors in GDT.
   * Either use descriptor caches from guest space, or we have
   * to chase down the GDT entries using the guest's paging
   * system.  Might be a cheaper/safer bet to just use the
   * descriptor caches.  If the guest reloads a descriptor,
   * just let the user space deal with it.
   */
  for (s=0; s<6; s++) {
    if ( (guest_cpu->sreg[s].sel.raw & 0xfffc) != 0) {
      vm->host.addr.gdt[ guest_cpu->sreg[s].sel.fields.index ] =
          guest_cpu->sreg[s].des;
    }
  }
#warning "Have to clear out GDT"

  guest_stack_context->gs = guest_cpu->sreg[SRegGS].sel.raw;
  guest_stack_context->fs = guest_cpu->sreg[SRegFS].sel.raw;
  guest_stack_context->ds = guest_cpu->sreg[SRegDS].sel.raw;
  guest_stack_context->es = guest_cpu->sreg[SRegES].sel.raw;
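  /* Regarding the EFlags constraint check earlier in this function: the
   * mask 0x001b7302 is simply the sum of the tested bit positions,
   * VIP(20)+VIF(19) = 0x00180000, VM(17) = 0x00020000, RF(16) =
   * 0x00010000, NT(14) = 0x00004000, IOPL(13:12) = 0x00003000, IF(9) =
   * 0x00000200, TF(8) = 0x00000100 and bit1 = 0x00000002.  The required
   * value 0x00000202 means IF and bit1 must be set and every other
   * tested bit must be clear.
   */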
  /* Could use memcpy(); both are in order.  Pack both structs. */
  guest_stack_context->edi = guest_cpu->edi;
  guest_stack_context->esi = guest_cpu->esi;
  guest_stack_context->ebp = guest_cpu->ebp;
  guest_stack_context->dummy_esp = 0; /* Not needed. */
  guest_stack_context->ebx = guest_cpu->ebx;
  guest_stack_context->edx = guest_cpu->edx;
  guest_stack_context->ecx = guest_cpu->ecx;
  guest_stack_context->eax = guest_cpu->eax;

  /* Fields vector/error are ignored for return to guest. */

  /* CS:EIP */
  guest_stack_context->eip = guest_cpu->eip;
  guest_stack_context->cs  = guest_cpu->sreg[SRegCS].sel.raw;

  guest_stack_context->eflags.raw = guest_cpu->eflags;
  vm->veflags.raw = 0; /* Virtualized EFLAGS - implement later. */

  guest_stack_context->esp = guest_cpu->esp;
  guest_stack_context->ss  = guest_cpu->sreg[SRegSS].sel.raw;

  /* Pointer to the fields in the nexus.S assembly code. */
  nexus = vm->host.addr.nexus;

#warning "Monitor CRx hacks"
  nexus->mon_cr0 = 0x8001003b |  /* PG/WP/NE/ET/TS/MP/PE */
                   (guest_cpu->cr0.raw & 0x00040000); /* Pass-thru AM from guest. */
  /* Could move mon_cr3 load to mapMonitor. */
  nexus->mon_cr3 = vm->pages.page_dir << 12;
  nexus->mon_cr4 = 0x00000004; /* TSD=1 */

  /* vm->guest_cpu.cr0.raw = guest_cpu->cr0 | 0x32; */ /* +++ hack for now */

  // Notes:
  //   - Implement some of monPagingRemap from old code, since that
  //     was intended to be run/triggered by an initial mode change.
  //   - After execution of 1st timeslice, need to copy dynamic state
  //     from VM to guest_cpu area.
  //   - Deal with cycle counts etc.

  hostInitShadowPaging(vm);

  for (;;) {
    unsigned long eflags;

#if 0
    /* If print buffer has contents, return to user space to print. */
    if (vm->log_buffer_info.offset) {
      vm->mon_msgs.header.msg_type = VMMessagePrintBuf;
      vm->mon_msgs.header.msg_len  = 0;
      vm->mon_request = MonReqNone; /* Request satisfied */
      resetPrintBuf(vm); /* xxx Fix print mess */
      retval = 100;
      goto handleFail;
    }
#endif

    vm_save_flags(eflags);
    vm_restore_flags(eflags & ~0x00004300); /* clear NT/IF/TF */

#if ANAL_CHECKS
    if (!(eflags & 0x200)) {
      vm_restore_flags(eflags);
      hostOSPrint("ioctlExecute: EFLAGS.IF==0\n");
      retval = 101; /* Fail. */
      goto handlePanic;
    }
#endif

    /* Call assembly routine to effect transition. */
    vm->host.__host2mon();

    /* First check for an asynchronous event (interrupt redirection) */
    if ( vm->mon_request == MonReqRedirect ) {
      vm_restore_flags(eflags & ~0x00000200); /* restore all but IF */
      soft_int(vm->redirect_vector); /* sets IF to 1 */
      hostOSInstrumentIntRedirCount(vm->redirect_vector);
      vm->mon_request = MonReqNone; /* Request satisfied */
    }

    /* Event was synchronous; monitor requested a switch back to host. */
    else {
      vm_restore_flags(eflags);

      /* Perform action requested by monitor. */
      switch ( vm->mon_request ) {

        case MonReqRemapMonitor:
#if 0
          if ( mapMonitor(vm) ) {
            vm->mon_request = MonReqNone; /* Request satisfied */
            break;
          }
          else {
            hostOSPrint("mapMonitor failed.\n");
            hostOSPrint("Panic w/ abort_code=%u\n", vm->abort_code);
            retval = 102;
            goto handlePanic;
          }
#endif
          hostOSPrint("ioctlExecute: case MonReqRemapMonitor.\n");
          retval = 103;
          goto handlePanic;

        case MonReqFlushPrintBuf:
          hostOSPrint("ioctlExecute: case MonReqFlushPrintBuf.\n");
          retval = 104;
          goto handlePanic;

        case MonReqGuestFault:
          /* Encountered a guest fault. */
          hostCopyGuestStateToUserSpace(vm);
          executeMsg->cyclesExecuted       = 0; /* Handle later. */
          executeMsg->instructionsExecuted = 0; /* Handle later. */
          executeMsg->monitorState.state   = vm->vmState;
          executeMsg->monitorState.request = vm->mon_request;
          executeMsg->monitorState.guestFaultNo = vm->guestFaultNo;
          vm->mon_request = MonReqNone;
          return 0;

        case MonReqPanic:
          if (vm->abort_code)
            hostOSPrint("Panic w/ abort_code=%u\n", vm->abort_code);
          hostOSPrint("ioctlExecute: case MonReqPanic.\n");
          retval = 106;
          goto handlePanic;

        case MonReqPinUserPage:
          if ( !hostHandlePagePinRequest(vm, vm->pinReqPPI) ) {
            retval = 108;
            goto handlePanic;
          }
          continue; /* Back to VM monitor. */

        default:
          hostOSPrint("ioctlExecute: default case (%u).\n",
                      vm->mon_request);
          retval = 107;
          goto handlePanic;
      }
    }

    /* Let host decide whether we are allowed another timeslice */
    if ( !hostOSIdle() ) {
      /* We are returning only because the host wants to
       * schedule other work.
       */
      executeMsg->monitorState.state   = vm->vmState;
      executeMsg->monitorState.request = MonReqNone;
      return 0;
    }
  }

  /* Should not get here. */
  retval = 109;
  goto handlePanic;

handleFail:
  /* Handle inability to execute the guest due to certain state. */
  executeMsg->monitorState.state   = vm->vmState;
  executeMsg->monitorState.request = vm->mon_request;
  return(retval);

handlePanic:
  vm->vmState |= VMStatePanic;
  vm->mon_request = MonReqPanic;
  executeMsg->monitorState.state   = vm->vmState;
  executeMsg->monitorState.request = vm->mon_request;
  return(retval);
}


void
hostCopyGuestStateToUserSpace(vm_t *vm)
{
  guest_cpu_t     *guest_cpu;
  guest_context_t *guest_stack_context;

  /* A pointer to the guest CPU state as passed from host-user space.
   * This structure is memory mapped between user and kernel/monitor space.
   */
  guest_cpu = vm->host.addr.guest_cpu;

  /* A pointer to the guest CPU state saved on the monitor stack. */
  guest_stack_context = vm->host.addr.guest_context;

  guest_cpu->sreg[SRegES].sel.raw = guest_stack_context->es;
  if ( (guest_stack_context->es & 0xfffc) == 0 ) {
    guest_cpu->sreg[SRegES].des = nullDescriptor;
    guest_cpu->sreg[SRegES].valid = 0;
  }
  else {
    guest_cpu->sreg[SRegES].des =
        vm->host.addr.gdt[ guest_cpu->sreg[SRegES].sel.fields.index ];
    guest_cpu->sreg[SRegES].valid = 1;
  }

  guest_cpu->sreg[SRegCS].sel.raw = guest_stack_context->cs;
  if ( (guest_stack_context->cs & 0xfffc) == 0 ) {
    guest_cpu->sreg[SRegCS].des = nullDescriptor;
    guest_cpu->sreg[SRegCS].valid = 0;
  }
  else {
    guest_cpu->sreg[SRegCS].des =
        vm->host.addr.gdt[ guest_cpu->sreg[SRegCS].sel.fields.index ];
    guest_cpu->sreg[SRegCS].valid = 1;
  }

  guest_cpu->sreg[SRegSS].sel.raw = guest_stack_context->ss;
  if ( (guest_stack_context->ss & 0xfffc) == 0 ) {
    guest_cpu->sreg[SRegSS].des = nullDescriptor;
    guest_cpu->sreg[SRegSS].valid = 0;
  }
  else {
    guest_cpu->sreg[SRegSS].des =
        vm->host.addr.gdt[ guest_cpu->sreg[SRegSS].sel.fields.index ];
    guest_cpu->sreg[SRegSS].valid = 1;
  }

  guest_cpu->sreg[SRegDS].sel.raw = guest_stack_context->ds;
  if ( (guest_stack_context->ds & 0xfffc) == 0 ) {
    guest_cpu->sreg[SRegDS].des = nullDescriptor;
    guest_cpu->sreg[SRegDS].valid = 0;
  }
  else {
    guest_cpu->sreg[SRegDS].des =
        vm->host.addr.gdt[ guest_cpu->sreg[SRegDS].sel.fields.index ];
    guest_cpu->sreg[SRegDS].valid = 1;
  }

  guest_cpu->sreg[SRegFS].sel.raw = guest_stack_context->fs;
  if ( (guest_stack_context->fs & 0xfffc) == 0 ) {
    guest_cpu->sreg[SRegFS].des = nullDescriptor;
    guest_cpu->sreg[SRegFS].valid = 0;
  }
  else {
    guest_cpu->sreg[SRegFS].des =
        vm->host.addr.gdt[ guest_cpu->sreg[SRegFS].sel.fields.index ];
    guest_cpu->sreg[SRegFS].valid = 1;
  }

  guest_cpu->sreg[SRegGS].sel.raw = guest_stack_context->gs;
  if ( (guest_stack_context->gs & 0xfffc) == 0 ) {
    guest_cpu->sreg[SRegGS].des = nullDescriptor;
    guest_cpu->sreg[SRegGS].valid = 0;
  }
  else {
    guest_cpu->sreg[SRegGS].des =
        vm->host.addr.gdt[ guest_cpu->sreg[SRegGS].sel.fields.index ];
    guest_cpu->sreg[SRegGS].valid = 1;
  }

  /* Could use memcpy(); both are in order.  Pack both structs. */
  guest_cpu->edi = guest_stack_context->edi;
  guest_cpu->esi = guest_stack_context->esi;
  guest_cpu->ebp = guest_stack_context->ebp;
  guest_cpu->esp = guest_stack_context->esp;
  guest_cpu->ebx = guest_stack_context->ebx;
  guest_cpu->edx = guest_stack_context->edx;
  guest_cpu->ecx = guest_stack_context->ecx;
  guest_cpu->eax = guest_stack_context->eax;

  /* CS:EIP */
  guest_cpu->eip = guest_stack_context->eip;

  guest_cpu->eflags = guest_stack_context->eflags.raw;
  /* vm->veflags.raw = 0; */ /* Virtualized EFLAGS - implement later. */
}


int
hostIoctlRegisterMem(vm_t *vm, plex86IoctlRegisterMem_t *registerMemMsg)
{
  unsigned error;

  /* Do not allow duplicate allocation.  The file descriptor must be
   * opened.  The guest CPUID info can be filled in later.
   */
  if ( (vm->vmState & ~VMStateGuestCPUID) != VMStateFDOpened )
    return -Plex86ErrnoEBUSY;
  if (vm->pages.guest_n_megs != 0)
    return -Plex86ErrnoEBUSY;

  /* Check that the amount of memory is reasonable. */
  if ( (registerMemMsg->nMegs > PLEX86_MAX_PHY_MEGS) ||
       (registerMemMsg->nMegs < 4) ||
       (registerMemMsg->nMegs & 0x3) )
    return -Plex86ErrnoEINVAL;

  /* Check that the guest memory vector is page aligned. */
  if ( registerMemMsg->guestPhyMemVector & 0xfff )
    return -Plex86ErrnoEINVAL;

  /* Check that the log buffer area is page aligned. */
  if ( registerMemMsg->logBufferWindow & 0xfff )
    return -Plex86ErrnoEINVAL;

  /* Check that the guest CPU area is page aligned. */
  if ( registerMemMsg->guestCPUWindow & 0xfff )
    return -Plex86ErrnoEINVAL;

  /* Check that none of the user areas overlap.  In case we have a
   * number of regions, use some generic code to handle N regions.
   */
  {
#define NumUserRegions 3
    struct {
      Bit32u min, max;
    } userRegion[NumUserRegions];
    unsigned i,j;

    userRegion[0].min = registerMemMsg->guestPhyMemVector;
    userRegion[0].max = userRegion[0].min + (registerMemMsg->nMegs<<20) - 1;
    userRegion[1].min = registerMemMsg->logBufferWindow;
    userRegion[1].max = userRegion[1].min + LOG_BUFF_SIZE - 1;
    userRegion[2].min = registerMemMsg->guestCPUWindow;
    userRegion[2].max = userRegion[2].min + (4096) - 1;

    for (i=1; i<NumUserRegions; i++) {
      for (j=0; j<i; j++) {
        /* Check for min(j) contained in region(i). */
        if ( (userRegion[j].min >= userRegion[i].min) &&
             (userRegion[j].min <= userRegion[i].max) )
          return -Plex86ErrnoEINVAL;
        /* Check for max(j) contained in region(i). */
        if ( (userRegion[j].max >= userRegion[i].min) &&
             (userRegion[j].max <= userRegion[i].max) )
          return -Plex86ErrnoEINVAL;
      }
    }
  }

  /* Allocate memory */
  if ( (error = hostAllocVmPages(vm, registerMemMsg)) != 0 ) {
    hostOSPrint("plex86: allocVmPages failed at %u\n", error);
    return -Plex86ErrnoENOMEM;
  }

  /* Initialize the guest's physical memory. */
  if ( hostInitGuestPhyMem(vm) ) {
    hostUnallocVmPages(vm);
    return -Plex86ErrnoEFAULT;
  }

  /* Initialize the monitor. */
  if ( !hostInitMonitor(vm) || !hostMapMonitor(vm) ) {
    hostUnallocVmPages(vm);
    return -Plex86ErrnoEFAULT;
  }

  return 0;
}
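/* A note on the overlap test above: checking only whether region j's
 * endpoints fall inside region i misses the case where region i lies
 * strictly inside region j.  The standard closed-interval test catches
 * every overlap, including containment, in one comparison.  A sketch
 * (not wired in; the helper name is ours):
 */
#if 0
static unsigned
regionsOverlap(Bit32u minA, Bit32u maxA, Bit32u minB, Bit32u maxB)
{
  /* Two inclusive ranges [minA,maxA] and [minB,maxB] overlap iff
   * each one starts no later than the other ends.
   */
  return (minA <= maxB) && (minB <= maxA);
}
#endif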
/*
 *  Allocate various pages/memory needed by monitor.
 */
int
hostAllocVmPages(vm_t *vm, plex86IoctlRegisterMem_t *registerMemMsg)
{
  vm_pages_t *pg = &vm->pages;
  vm_addr_t  *ad = &vm->host.addr;
#warning "Fix these shortcuts"
  unsigned where = 1;

  /* clear out allocated pages lists */
  mon_memzero(pg, sizeof(*pg));
  mon_memzero(ad, sizeof(*ad));

  /* Guest physical memory pages */
  pg->guest_n_megs  = registerMemMsg->nMegs;
  pg->guest_n_pages = registerMemMsg->nMegs * 256;
  pg->guest_n_bytes = registerMemMsg->nMegs * 1024 * 1024;
  if ( pg->guest_n_pages > MAX_MON_GUEST_PAGES) {
    /* The size of the user-space allocated guest physical memory must
     * fit within the maximum number of guest pages that the VM monitor
     * supports.
     */
    goto error;
  }
  where++;

  vm->guestPhyMemAddr = registerMemMsg->guestPhyMemVector;
  vm->vmState |= VMStateRegisteredPhyMem; /* Bogus for now. */
  where++;

  {
    Bit32u hostPPI, kernelAddr;

    /* Guest CPU state (malloc()'d in user space). */
    if ( !hostOSGetAndPinUserPage(vm, registerMemMsg->guestCPUWindow,
                                  &pg->guest_cpu_hostOSPtr, &hostPPI,
                                  &kernelAddr) ) {
      goto error;
    }
    ad->guest_cpu = (guest_cpu_t *) kernelAddr;
    pg->guest_cpu = hostPPI;
    vm->vmState |= VMStateRegisteredGuestCPU; /* For now. */
    where++;

    /* Log buffer area (malloc()'d in user space). */
    /* LOG_BUFF_PAGES */
    if ( !hostOSGetAndPinUserPage(vm, registerMemMsg->logBufferWindow,
                                  &pg->log_buffer_hostOSPtr[0], &hostPPI,
                                  &kernelAddr) ) {
      goto error;
    }
    ad->log_buffer    = (Bit8u *) kernelAddr;
    pg->log_buffer[0] = hostPPI;
    where++;
    vm->vmState |= VMStateRegisteredPrintBuffer; /* For now. */
  }

  /* Monitor page directory */
  if ( !(ad->page_dir = (pageEntry_t *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if (!(pg->page_dir = hostOSGetAllocedPagePhyPage(ad->page_dir))) {
    goto error;
  }
  where++;

  /* Monitor page tables */
  if ( !(ad->page_tbl = hostOSAllocZeroedMem(4096 * MON_PAGE_TABLES)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->page_tbl, MON_PAGE_TABLES,
                                   ad->page_tbl, 4096 * MON_PAGE_TABLES)) {
    goto error;
  }
  where++;

  /* Map of the linear addresses of page tables currently */
  /* mapped into the monitor space. */
  if ( !(ad->page_tbl_laddr_map = (unsigned *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if ( !(pg->page_tbl_laddr_map =
         hostOSGetAllocedPagePhyPage(ad->page_tbl_laddr_map)) ) {
    goto error;
  }
  where++;

  /* Nexus page table */
  if ( !(ad->nexus_page_tbl = (page_t *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if ( !(pg->nexus_page_tbl =
         hostOSGetAllocedPagePhyPage(ad->nexus_page_tbl)) ) {
    goto error;
  }
  where++;

  /* Transition page table */
  if ( !(ad->transition_PT = (page_t *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if ( !(pg->transition_PT =
         hostOSGetAllocedPagePhyPage(ad->transition_PT)) ) {
    goto error;
  }
  where++;

  /* Nexus page */
  if ( !(ad->nexus = (nexus_t *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if ( !(pg->nexus = hostOSGetAllocedPagePhyPage(ad->nexus)) ) {
    goto error;
  }
  where++;

  /* Monitor IDT */
  if ( !(ad->idt = hostOSAllocZeroedMem(MON_IDT_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->idt, MON_IDT_PAGES,
                                   ad->idt, MON_IDT_SIZE)) {
    goto error;
  }
  where++;

  /* Monitor GDT */
  if ( !(ad->gdt = hostOSAllocZeroedMem(MON_GDT_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->gdt, MON_GDT_PAGES,
                                   ad->gdt, MON_GDT_SIZE)) {
    goto error;
  }
  where++;

  /* Monitor LDT */
  if ( !(ad->ldt = hostOSAllocZeroedMem(MON_LDT_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->ldt, MON_LDT_PAGES,
                                   ad->ldt, MON_LDT_SIZE)) {
    goto error;
  }
  where++;

  /* Monitor TSS */
  if ( !(ad->tss = hostOSAllocZeroedMem(MON_TSS_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->tss, MON_TSS_PAGES,
                                   ad->tss, MON_TSS_SIZE)) {
    goto error;
  }
  where++;

  /* Monitor IDT stubs */
  if ( !(ad->idt_stubs = hostOSAllocZeroedMem(MON_IDT_STUBS_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->idt_stubs, MON_IDT_STUBS_PAGES,
                                   ad->idt_stubs, MON_IDT_STUBS_SIZE)) {
    goto error;
  }
  where++;

  /* Get the physical pages associated with the vm_t structure. */
  if (!hostOSGetAllocedMemPhyPages(pg->vm, MAX_VM_STRUCT_PAGES,
                                   vm, sizeof(*vm))) {
    goto error;
  }
  where++;

  vm->vmState |= VMStateMemAllocated;
  return 0; /* OK. */

error:
  hostUnallocVmPages( vm );
  return( where );
}


/*
 *  Unallocate pages/memory used by monitor.
 */
void
hostUnallocVmPages( vm_t *vm )
{
  vm_pages_t *pg = &vm->pages;
  vm_addr_t  *ad = &vm->host.addr;

  /* Guest physical memory pages */
  if (vm->guestPhyMemAddr) {
    hostReleasePinnedUserPages(vm);
    vm->guestPhyMemAddr = 0;
  }
  vm->vmState &= ~VMStateRegisteredPhyMem; /* Bogus for now. */

  /* Monitor page directory */
  if (ad->page_dir) hostOSFreePage(ad->page_dir);

  /* Monitor page tables */
  if (ad->page_tbl) hostOSFreeMem(ad->page_tbl);

  /* Map of linear addresses of page tables mapped into monitor. */
  if (ad->page_tbl_laddr_map) hostOSFreePage(ad->page_tbl_laddr_map);

  /* Nexus page table */
  if (ad->nexus_page_tbl) hostOSFreePage(ad->nexus_page_tbl);

  /* Guest CPU state. */
  if (ad->guest_cpu) hostOSFreePage(ad->guest_cpu);

  /* Transition page table */
  if (ad->transition_PT) hostOSFreePage(ad->transition_PT);

  if (ad->log_buffer) hostOSFreeMem(ad->log_buffer);

  /* Nexus page */
  if (ad->nexus) hostOSFreePage(ad->nexus);

  /* Monitor IDT */
  if (ad->idt) hostOSFreeMem(ad->idt);

  /* Monitor GDT */
  if (ad->gdt) hostOSFreeMem(ad->gdt);

  /* Monitor LDT */
  if (ad->ldt) hostOSFreeMem(ad->ldt);

  /* Monitor TSS */
  if (ad->tss) hostOSFreeMem(ad->tss);

  /* Monitor IDT stubs */
  if (ad->idt_stubs) hostOSFreeMem(ad->idt_stubs);

  /* clear out allocated pages lists */
  mon_memzero(pg, sizeof(*pg));
  mon_memzero(ad, sizeof(*ad));
}


unsigned
hostGetCpuCapabilities(void)
{
  Bit32u eax, ebx, ecx, edx;

  /* Get the highest allowed cpuid level */
  asm volatile (
    "xorl %%eax,%%eax\n\t"
    "cpuid"
    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
    :
    : "cc"
    );
  if (eax < 1)
    return(0); /* not enough capabilities */

  /* Copy vendor string. */
  hostCpuIDInfo.vendorDWord0 = ebx;
  hostCpuIDInfo.vendorDWord1 = edx;
  hostCpuIDInfo.vendorDWord2 = ecx;

  /* CPUID w/ EAX==1: Processor Signature & Feature Flags */
  asm volatile (
    "movl $1,%%eax\n\t"
    "cpuid"
    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
    :
    : "cc"
    );
  hostCpuIDInfo.procSignature.raw = eax;
  hostCpuIDInfo.featureFlags.raw  = edx;

  /* Plex86 needs TSC */
  if (hostCpuIDInfo.featureFlags.fields.tsc==0)
    return(0);

  return(1);
}


/* Map the monitor and guest into the VM. */
unsigned
hostMapMonitor(vm_t *vm)
{
  selector_t monCsSel, monSsSel, monTssSel;
  Bit32u laddr, base;
  unsigned slot;
  guest_context_t *guestContext;
  nexus_t *nexus;
  descriptor_t *gdt;

  /* For convenience, some pointers. */
  guestContext = vm->host.addr.guest_context;
  nexus = vm->host.addr.nexus;
  gdt   = vm->host.addr.gdt;

#warning "Is the GDT being cleared of old values?"
  /* +++ should zero out GDT, so prev entries do not remain */

  /* =========================
   * Map in Monitor structures
   * =========================
   */

  /* CS/SS/TSS selectors:
   * For now, hardcode in monitor descriptors at slots 1,2,3.  As we
   * are only running user code in the VM, these are likely safe slots,
   * as they are often used by guest OSes for kernel descriptors.
   */
  monCsSel.raw  = Selector(1, 0, RPL0);
  monSsSel.raw  = Selector(2, 0, RPL0);
  monTssSel.raw = Selector(3, 0, RPL0);

  /* Search for unused PDE for nexus PT (fixed for now) */
  laddr = 0x70000000;
  vm->mon_pde_mask = laddr & 0xffc00000;
  vm->mon_pdi      = laddr >> 22;
  base = MON_BASE_FROM_LADDR(laddr);

  /* Map nexus into monitor/guest address space */
  vm->host.addr.page_dir[laddr >> 22] = vm->host.nexus_pde;

  /* CS/SS/TSS descriptors: Put at fixed GDT location for now. */
  SET_DESCRIPTOR(gdt[monCsSel.fields.index], base, 0xfffff,
                 D_PG, D_D32, D_AVL0, D_PRESENT, D_DPL0, D_CODE | D_READ)
  SET_DESCRIPTOR(gdt[monSsSel.fields.index], base, 0xfffff,
                 D_PG, D_D32, D_AVL0, D_PRESENT, D_DPL0, D_DATA | D_WRITE)
  SET_DESCRIPTOR(gdt[monTssSel.fields.index],
                 base + (Bit32u) vm->guest.addr.tss, sizeof(tss_t)-1,
                 D_BG, 0, D_AVL0, D_PRESENT, D_DPL0, D_TSS)

  /* Fix up the selectors of all IDT entries. */
  for ( slot = 0; slot < 256; slot++ )
    vm->host.addr.idt[slot].selector = monCsSel;

  /* The monitor GDT/IDT loading info. */
  nexus->mon_gdt_info.base  = base + (Bit32u) vm->guest.addr.gdt;
  nexus->mon_gdt_info.limit = MON_GDT_SIZE;
  nexus->mon_idt_info.base  = base + (Bit32u) vm->guest.addr.idt;
  nexus->mon_idt_info.limit = MON_IDT_SIZE;

  /* We don't have a monitor LDT for now. */
  nexus->mon_ldt_sel = 0;
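  /* The Selector() macro used above builds a raw 16-bit selector from
   * (index, TI, RPL); assuming it follows the architectural layout,
   * selector = (index << 3) | (TI << 2) | RPL.  So Selector(1,0,RPL0)
   * == 0x0008, Selector(2,0,RPL0) == 0x0010 and Selector(3,0,RPL0) ==
   * 0x0018, matching GDT slots 1..3 that the monitor reserves.
   */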
  /* The monitor TSS. */
  nexus->mon_tss_sel = monTssSel.raw;
  vm->host.addr.tss->esp0 = ((Bit32u)vm->guest.addr.nexus) + PAGESIZE;
  vm->host.addr.tss->ss0  = monSsSel.raw;

  /* Monitor code and stack segments. */
  nexus->mon_jmp_info.selector   = monCsSel.raw;
  nexus->mon_stack_info.selector = monSsSel.raw;

  /* Monitor code/data segment base. */
  nexus->mon_base = base;

  vm->vmState |= VMStateMapMonitor;
  return(1);
}


void
hostInitShadowPaging(vm_t *vm)
{
  pageEntry_t *monPDir;
  Bit32u pdi;
  /*Bit32u cr3_page_index;*/
  /*phy_page_usage_t *pusage;*/

#if 0
  cr3_page_index = A20Addr(vm, vm->guest_cpu.cr3) >> 12;
  if ( cr3_page_index >= vm->pages.guest_n_pages)
    xxxpanic(vm, "monPagingRemap: CR3 conflicts with monitor space\n");
#endif

  /* Reset page table heap */
  vm->ptbl_laddr_map_i = 0;

  /* Clear monitor PD except 4Meg range used by monitor */
  monPDir = vm->host.addr.page_dir;
  for (pdi=0; pdi<1024; pdi++) {
#if ANAL_CHECKS
    vm->host.addr.page_tbl_laddr_map[pdi] = -1; /* max unsigned */
#endif
    if (pdi != vm->mon_pdi)
      monPDir[pdi].raw = 0;
  }

  /* Update vpaging timestamp. */
  vm->vpaging_tsc = vm_rdtsc();

#if 0
  /* When we remap the monitor page tables, IF guest paging is
   * enabled, then mark the page containing the guest page directory
   * as such.  In non-paged mode, there is no page directory.
   */
  if (vm->guest_cpu.cr0.fields.pg) {
    pusage = &vm->pageInfo[cr3_page_index];
    pusage->tsc = vm->vpaging_tsc;
    pusage->attr.raw &= PageUsageSticky;
    pusage->attr.raw |= PageUsagePDir;
    pusage->attr.fields.access_perm = PagePermNA;
    if (pusage->attr.raw & PageBadUsage4PDir)
      xxxpanic(vm, "monPagingRemap: BadUsage4PDir\n");
  }
#endif
}


void
hostReleasePinnedUserPages(vm_t *vm)
{
  unsigned ppi;
  unsigned dirty;
  unsigned nPages;
  Bit32u kernelAddr;

  /* Unpin the pages associated with the guest physical memory. */
  nPages = vm->pages.guest_n_pages;
  for (ppi=0; ppi<nPages; ppi++) {
    if ( vm->pageInfo[ppi].attr.fields.pinned ) {
      void *osSpecificPtr;

      osSpecificPtr = (void *) vm->hostStructPagePtr[ppi];
#warning "Conditionalize page dirtying before page release."
      dirty = 1; /* FIXME: 1 for now. */
      hostOSUnpinUserPage(vm,
          vm->guestPhyMemAddr + (ppi<<12),
          osSpecificPtr,
          ppi,
          0 /* There was no host kernel addr mapped for this page. */,
          dirty);
      vm->pageInfo[ppi].attr.fields.pinned = 0;
    }
  }

  /* Unpin the pages associated with the guest_cpu area. */
  kernelAddr = (Bit32u) vm->host.addr.guest_cpu;
  hostOSUnpinUserPage(vm,
      0, /* User space address. */
      vm->pages.guest_cpu_hostOSPtr,
      vm->pages.guest_cpu,
      &kernelAddr,
      1 /* Dirty. */);

  /* Unpin the pages associated with the log buffer area. */
  kernelAddr = (Bit32u) vm->host.addr.log_buffer;
  hostOSUnpinUserPage(vm,
      0, /* User space address. */
      vm->pages.log_buffer_hostOSPtr[0],
      vm->pages.log_buffer[0],
      &kernelAddr,
      1 /* Dirty. */);
#warning "User space address is passed as 0 for now..."
}


unsigned
hostHandlePagePinRequest(vm_t *vm, Bit32u reqGuestPPI)
{
  Bit32u hostPPI;
  unsigned qIndex;

#warning "We must not unpin open pages (for page walking) here."

  if (vm->guestPhyPagePinQueue.nEntries < MaxPhyPagesPinned) {
    /* There is room in the Q for another entry - we have not reached
     * the upper limit of allowable number of pinned pages.
     */
    qIndex = vm->guestPhyPagePinQueue.nEntries;
  }
  else {
    unsigned dirty;
    Bit32u unpinGuestPPI;

    /* There is no room in the Q for another entry - we have reached
     * the upper limit of allowable number of pinned pages.  We must
     * first unpin a page to free up the limit, then we can pin the
     * requested page.  This keeps plex86 from pinning an unconstrained
     * number of pages at one time.
     */
    qIndex = vm->guestPhyPagePinQueue.tail;
    dirty = 1; /* FIXME: 1 for now. */
    unpinGuestPPI = vm->guestPhyPagePinQueue.ppi[qIndex];
    hostOSUnpinUserPage(vm,
        vm->guestPhyMemAddr + (unpinGuestPPI<<12),
        vm->hostStructPagePtr[unpinGuestPPI],
        unpinGuestPPI,
        0 /* There was no host kernel addr mapped for this page. */,
        dirty);
    vm->pageInfo[unpinGuestPPI].attr.fields.pinned = 0;
  }

  /* Pin the requested guest physical page in the host OS. */
  if ( !hostOSGetAndPinUserPage(vm,
            vm->guestPhyMemAddr + (reqGuestPPI<<12),
            &vm->hostStructPagePtr[reqGuestPPI],
            &hostPPI,
            0 /* Don't need a host kernel address. */
            ) ) {
    hostOSPrint("handlePagePinReq: request to pin failed.\n");
    return(0); /* Fail. */
  }

  /* Pinning activities have succeeded.  Mark this physical page as
   * being pinned, and store its physical address.
   */
  vm->pageInfo[reqGuestPPI].attr.fields.pinned = 1;
  vm->pageInfo[reqGuestPPI].hostPPI = hostPPI;

  /* Now add this entry to the Q. */
  vm->guestPhyPagePinQueue.ppi[qIndex] = reqGuestPPI;
  if (vm->guestPhyPagePinQueue.nEntries < MaxPhyPagesPinned) {
    vm->guestPhyPagePinQueue.nEntries++;
    vm->guestPhyPagePinQueue.tail =
        vm->guestPhyPagePinQueue.nEntries % MaxPhyPagesPinned;
  }
  else {
    /* Leave .nEntries at the maximum value - Q is full. */
    vm->guestPhyPagePinQueue.tail =
        (vm->guestPhyPagePinQueue.tail + 1) % MaxPhyPagesPinned;
  }

  return(1); /* OK. */
}
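/* The pin queue above behaves as a fixed-size FIFO over guest physical
 * page indices: while the queue is filling, new entries append at
 * nEntries; once full, tail names the oldest entry, which is unpinned
 * and overwritten in place, and tail then advances modulo
 * MaxPhyPagesPinned.  A worked example, assuming for illustration that
 * MaxPhyPagesPinned == 4: pins of pages A,B,C,D fill slots 0..3 (tail
 * ends at 0); a fifth pin of E evicts A from slot 0, stores E there,
 * and moves tail to 1, so B becomes the next eviction candidate.
 */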