Bochs/bochs/plex86/kernel/monitor-host.c
2008-02-05 22:57:43 +00:00

1887 lines
58 KiB
C

/*
* plex86: run multiple x86 operating systems concurrently
* Copyright (C) 1999-2003 Kevin P. Lawton
*
* monitor-host.c: This file contains the top-level monitor code,
* accessible from the host space. (kernel independent code)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "plex86.h"
#define IN_HOST_SPACE
#include "monitor.h"
/* =====================================================================
* Plex86 module global variables. This should be the _only_ place
* where globals are declared. Since plex86 supports multiple VMs, almost
* all data is stored per-VM. For the few variables which are global
* to all VMs, we have to be careful to access them in SMP friendly ways.
* The ones which are written upon kernel module initialization are fine,
* since they are only written once.
* =====================================================================
*/
/* Info regarding the pages that comprise the kernel module, including
* their physical page information. This is written (once) at
* kernel module initialization time. Thus there are no SMP access issues.
*/
kernelModulePages_t kernelModulePages;
/* Information about the host processor as returned by the CPUID
* instruction. This is written (once) at kernel module initialization time.
* Thus there are no SMP access issues.
*/
cpuid_info_t hostCpuIDInfo;
/* Some constants used by the VM logic. Since they're "const", there are
 * no problems with SMP access.
 *
 * Both are all-zero values. The selector uses a standard C99 designated
 * initializer (rather than the obsolete GNU "field:" spelling), and the
 * descriptor uses the "{ 0 }" idiom, which zero-initializes every member
 * regardless of how many fields descriptor_t has.
 */
static const selector_t nullSelector = { .raw = 0 };
static const descriptor_t nullDescriptor = { 0 };
/* Forward declarations of helper routines defined later in this file. */
static int hostInitIDTSlot(vm_t *vm, unsigned vec, int type);
static void hostMapMonPages(vm_t *vm, Bit32u *, unsigned, Bit32u *, page_t *,
unsigned user, unsigned writable, char *name);
#if ANAL_CHECKS
static void hostMapBlankPage(vm_t *vm, Bit32u *laddr_p, page_t *pageTable);
#endif
/* Values for the 'writable' argument of hostMapMonPages(), which is stored
* in the PTE RW field: 0=read-only, 1=read/write.
*/
#define RW0 0
#define RW1 1
/* Values for the 'user' argument of hostMapMonPages(), which is stored
* in the PTE US field: 0=system, 1=user.
*/
#define US0 0
#define US1 1
/* Slot types accepted by the 'type' argument of hostInitIDTSlot(). */
#define IDT_INTERRUPT 0
#define IDT_EXCEPTION_ERROR 1
#define IDT_EXCEPTION_NOERROR 2
/*
 * hostModuleInit(): Kernel independent work to do once, at kernel module
 * load time.
 *
 * Queries the host CPU capabilities and logs the current host CR0 value.
 *
 * Returns 1 on success, or 0 if hostGetCpuCapabilities() reported failure.
 */
unsigned
hostModuleInit(void)
{
  Bit32u hostCr0;

  /* Nothing else can proceed without knowing the host CPU capabilities. */
  if (hostGetCpuCapabilities() == 0) {
    hostOSPrint("getCpuCapabilities returned error\n");
    return(0); /* Fail. */
  }

#if 0
  /* Disabled debug output of the host processor signature. */
  hostOSPrint("ptype:%u, family:%u, model:%u stepping:%u\n",
              hostCpuIDInfo.procSignature.fields.procType,
              hostCpuIDInfo.procSignature.fields.family,
              hostCpuIDInfo.procSignature.fields.model,
              hostCpuIDInfo.procSignature.fields.stepping);
#endif

  /* xxx Should check that host CS.base is page aligned here. */

#if 1
  /* Read the host's CR0 register and report it in the log. */
  asm volatile ( "movl %%cr0, %0" : "=r" (hostCr0) );
  hostOSPrint("host CR0=0x%x\n", hostCr0);
#endif

  return(1); /* Pass. */
}
/*
 * hostDeviceOpen(): Kernel independent work to do at device open time.
 *
 * The whole VM structure is wiped to zero, after which the VM state is
 * set to VMStateFDOpened.
 */
void
hostDeviceOpen(vm_t *vm)
{
  mon_memzero(vm, sizeof(*vm));
  vm->vmState = VMStateFDOpened;
}
/*
* hostInitMonitor(): Build the per-VM monitor environment from host space.
*
* In order, this routine:
*   - records the host kernel offset and the initial A20 state,
*   - copies the nexus (transition) code into this VM's nexus page,
*   - zeroes the monitor data structures (log buffer, page directory,
*     guest CPU page, IDT, GDT, LDT, TSS, IDT stubs),
*   - fills the nexus page table with mappings for all of the monitor
*     structures, recording where each one lands in vm->guest.addr.*,
*   - sets up the transition page table, which maps the nexus code page,
*   - initializes the TSS, the monitor entry/stack offsets, all 256 IDT
*     slots, the guest context and the initial monitor stack frame.
*
* Returns 1 on success. Returns 0 if the nexus code does not fit in one
* 4096 byte page, or if any IDT slot failed to initialize.
*/
int
hostInitMonitor(vm_t *vm)
{
unsigned pdi, pti;
unsigned int i;
Bit32u nexus_size;
page_t *pageTable;
Bit32u laddr, base;
int r;
vm->kernel_offset = hostOSKernelOffset();
vm->system.a20Enable = 1; /* Start with A20 line enabled. */
vm->system.a20AddrMask = 0xffffffff; /* All address lines contribute. */
vm->system.a20IndexMask = 0x000fffff; /* All address lines contribute. */
/* Initialize nexus */
mon_memzero(vm->host.addr.nexus, 4096);
/* Copy transition code (nexus) into code page allocated for this VM. */
nexus_size = ((Bit32u) &__nexus_end) - ((Bit32u) &__nexus_start);
/* The nexus code has to fit within the single 4096 byte nexus page. */
if (nexus_size > 4096)
goto error;
mon_memcpy(vm->host.addr.nexus, &__nexus_start, nexus_size);
/* Init the convenience pointers. */
/* Pointer to host2mon routine inside nexus page */
vm->host.__host2mon = (void (*)(void)) HOST_NEXUS_OFFSET(vm, __host2mon);
/* Pointer to guest context on monitor stack. It occupies the last
* sizeof(guest_context_t) bytes of the nexus page.
*/
vm->host.addr.guest_context = (guest_context_t *)
( (Bit32u)vm->host.addr.nexus + PAGESIZE - sizeof(guest_context_t) );
/* Zero out various monitor data structures */
mon_memzero(vm->host.addr.log_buffer, 4096*LOG_BUFF_PAGES);
mon_memzero(&vm->log_buffer_info,
sizeof(vm->log_buffer_info));
mon_memzero(vm->host.addr.page_dir, 4096);
mon_memzero(vm->host.addr.guest_cpu, 4096);
mon_memzero(vm->host.addr.idt, MON_IDT_PAGES*4096);
mon_memzero(vm->host.addr.gdt, MON_GDT_PAGES*4096);
mon_memzero(vm->host.addr.ldt, MON_LDT_PAGES*4096);
mon_memzero(vm->host.addr.tss, MON_TSS_PAGES*4096);
mon_memzero(vm->host.addr.idt_stubs, MON_IDT_STUBS_PAGES*4096);
/* Start with an empty guest physical page pin queue. */
vm->guestPhyPagePinQueue.nEntries = 0;
vm->guestPhyPagePinQueue.tail = 0;
/*
* ================
* Nexus Page Table
* ================
*
* All structures needed by the monitor inside the guest environment
* (code to perform the transition between host<-->guest, fault handler
* code, various processor data structures like page directory, GDT,
* IDT, TSS etc.) are mapped into a single Page Table.
*
* This allows us to migrate the complete nexus to anywhere in the
* guest address space by just updating a single (unused) page directory
* entry in the monitor/guest page directory to point to this nexus
* page table.
*
* To simplify nexus migration, we try to avoid storing guest linear
* addresses to nexus structures as far as possible. Instead, we use
* offsets relative to the monitor code/data segments. As we update
* the base of these segments whenever the monitor migrates, the net
* effect is that those *offsets* remain valid across nexus migration.
*/
/* Fill in the PDE flags. The US bit is set to 1 (user access).
* All of the US bits in the monitor PTEs are set to 0 (system access).
*/
vm->host.nexus_pde.fields.base = vm->pages.nexus_page_tbl;
vm->host.nexus_pde.fields.avail = 0;
vm->host.nexus_pde.fields.G = 0; /* not global */
vm->host.nexus_pde.fields.PS = 0; /* 4K pages */
vm->host.nexus_pde.fields.D = 0; /* (unused in pde) */
vm->host.nexus_pde.fields.A = 0; /* not accessed */
vm->host.nexus_pde.fields.PCD = 0; /* normal caching */
vm->host.nexus_pde.fields.PWT = 0; /* normal write-back */
vm->host.nexus_pde.fields.US = 1; /* user access (see above) */
vm->host.nexus_pde.fields.RW = 1; /* read or write */
vm->host.nexus_pde.fields.P = 1; /* present in memory */
/* Clear Page Table. */
pageTable = vm->host.addr.nexus_page_tbl;
mon_memzero(pageTable, 4096);
/* Lay out the monitor structures starting at linear address 0.
* Each hostMapMonPages() call below fills in PTEs for one structure and
* advances laddr past the pages it mapped. Before each call, the
* structure's location is recorded as (laddr - base), where base is
* MON_BASE_FROM_LADDR() of the starting linear address. With
* ANAL_CHECKS enabled, a not-present page is placed between structures.
*/
laddr = 0;
base = MON_BASE_FROM_LADDR(laddr);
hostMapMonPages(vm, kernelModulePages.ppi, kernelModulePages.nPages, &laddr,
pageTable, US0, RW1, "Monitor code/data pages");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
vm->guest.addr.nexus = (nexus_t *) (laddr - base);
hostMapMonPages(vm, &vm->pages.nexus, 1, &laddr, pageTable, US0, RW1, "Nexus");
/* Guest context sits at the end of the nexus page, as on the host side. */
vm->guest.addr.guest_context = (guest_context_t *)
( (Bit32u)vm->guest.addr.nexus + PAGESIZE - sizeof(guest_context_t) );
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Location of the VM structure, stored in the nexus page itself. */
vm->host.addr.nexus->vm = (void *) (laddr - base);
hostMapMonPages(vm, vm->pages.vm, BytesToPages(sizeof(*vm)),
&laddr, pageTable, US0, RW1, "VM structure");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
vm->guest.addr.idt = (gate_t *) (laddr - base);
hostMapMonPages(vm, vm->pages.idt, MON_IDT_PAGES, &laddr, pageTable, US0, RW1,
"IDT");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
vm->guest.addr.gdt = (descriptor_t *) (laddr - base);
hostMapMonPages(vm, vm->pages.gdt, MON_GDT_PAGES, &laddr, pageTable, US0, RW1,
"GDT");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
vm->guest.addr.ldt = (descriptor_t *) (laddr - base);
hostMapMonPages(vm, vm->pages.ldt, MON_LDT_PAGES, &laddr, pageTable, US0, RW1,
"LDT");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
vm->guest.addr.tss = (tss_t *) (laddr - base);
hostMapMonPages(vm, vm->pages.tss, MON_TSS_PAGES, &laddr, pageTable, US0, RW1,
"TSS");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
vm->guest.addr.idt_stubs = (idt_stub_t *) (laddr - base);
hostMapMonPages(vm, vm->pages.idt_stubs, MON_IDT_STUBS_PAGES, &laddr,
pageTable, US0, RW1, "IDT stubs");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Monitor Page Directory */
vm->guest.addr.page_dir = (pageEntry_t *) (laddr - base);
hostMapMonPages(vm, &vm->pages.page_dir, 1, &laddr, pageTable, US0, RW1,
"Monitor Page Directory");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Nexus Page Table */
vm->guest.addr.nexus_page_tbl = (page_t *) (laddr - base);
hostMapMonPages(vm, &vm->pages.nexus_page_tbl, 1, &laddr, pageTable, US0, RW1,
"Nexus Page Table");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Map virtualized guest page tables into monitor. */
vm->guest.addr.page_tbl = (page_t *) (laddr - base);
hostMapMonPages(vm, vm->pages.page_tbl, MON_PAGE_TABLES,
&laddr, pageTable, US0, RW1, "Guest Page Tables");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Map of linear addresses of page tables mapped into monitor */
vm->guest.addr.page_tbl_laddr_map = (unsigned *) (laddr - base);
hostMapMonPages(vm, &vm->pages.page_tbl_laddr_map, 1, &laddr, pageTable,
US0, RW1, "Page Table Laddr Map");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Guest CPU state (mapped RW into user space also). */
vm->guest.addr.guest_cpu = (guest_cpu_t *) (laddr - base);
hostMapMonPages(vm, &vm->pages.guest_cpu, 1, &laddr,
pageTable, US0, RW1, "Guest CPU State");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/*
* We need a buffer to implement a debug print facility which
* can work in either host or monitor space. Map the buffer
* into monitor/guest space.
*/
vm->guest.addr.log_buffer = (unsigned char *) (laddr - base);
hostMapMonPages(vm, vm->pages.log_buffer, LOG_BUFF_PAGES, &laddr,
pageTable, US0, RW1, "Log Buffer");
{
/* The physical addresses of the following pages are not */
/* yet established. Pass dummy info (physical page 0) until they */
/* are mapped. */
Bit32u tmp[1];
tmp[0] = 0;
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Window into the guest's current physical code page */
vm->guest.addr.code_phy_page = (unsigned char *) (laddr - base);
hostMapMonPages(vm, tmp, 1, &laddr, pageTable, US0, RW1, "Code Phy Page");
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Temporary windows into guest physical pages, for accessing */
/* guest GDT, IDT, etc info. Note that no blank page is placed */
/* between the two windows. */
vm->guest.addr.tmp_phy_page0 = (unsigned char *) (laddr - base);
hostMapMonPages(vm, tmp, 1, &laddr, pageTable, US0, RW1, "Tmp Phy Page0");
vm->guest.addr.tmp_phy_page1 = (unsigned char *) (laddr - base);
hostMapMonPages(vm, tmp, 1, &laddr, pageTable, US0, RW1, "Tmp Phy Page1");
}
#if ANAL_CHECKS
hostMapBlankPage(vm, &laddr, pageTable);
#endif
/* Report how many of the page table's 1024 PTE slots have been used. */
hostOSPrint("Using %u/1024 PTE slots in 4Meg monitor range.\n",
(laddr >> 12) & 0x3ff);
/* Pointer to mon2host routine inside nexus page */
vm->guest.__mon2host = (void (*)(void)) MON_NEXUS_OFFSET(vm, __mon2host);
/*
* =====================
* Transition Page Table
* =====================
*
* To aid in the transition between host<-->monitor/guest spaces,
* we need to have an address identity map situation for at least
* one page; the page containing the transition code. As we do
* not know in advance whether this linear address range is in use
* by the guest as well, we set aside a complete additional Page
* Table, which contains only a single PTE pointing to the nexus page.
*
* To create the identity map, we simply change the corresponding
* monitor page directory entry to point to this transition Page Table.
* This happens transparently inside the host<-->guest transition code;
* both the guest/monitor code and the host side code never see this
* transition page table entered into the page directory!
*
* NOTE: We need to ensure that the nexus page table never spans the
* same 4Meg linear address space region as this page table!
* As we are free to choose the nexus linear address, this is
* not a problem.
*/
/* Get full linear address of nexus code page, as seen in host space. */
laddr = (Bit32u)vm->host.addr.nexus + vm->kernel_offset;
pdi = laddr >> 22; /* page directory index: top 10 bits */
pti = (laddr >> 12) & 0x3ff; /* page table index: next 10 bits */
/*
* We need to be able to access the PDE in the monitor page directory
* that corresponds to this linear address from both host and monitor
* address spaces.
*/
vm->host.addr.nexus->transition_pde_p_host = vm->host.addr.page_dir + pdi;
vm->host.addr.nexus->transition_pde_p_mon = (pageEntry_t *)
(((Bit32u)vm->guest.addr.page_dir) + (pdi << 2));
vm->host.addr.nexus->transition_laddr = laddr;
/* Fill in the PDE flags */
vm->host.addr.nexus->transition_pde.fields.base = vm->pages.transition_PT;
vm->host.addr.nexus->transition_pde.fields.avail = 0;
vm->host.addr.nexus->transition_pde.fields.G = 0; /* not global */
vm->host.addr.nexus->transition_pde.fields.PS = 0; /* 4K pages */
vm->host.addr.nexus->transition_pde.fields.D = 0; /* (unused in pde) */
vm->host.addr.nexus->transition_pde.fields.A = 0; /* not accessed */
vm->host.addr.nexus->transition_pde.fields.PCD = 0; /* normal caching */
vm->host.addr.nexus->transition_pde.fields.PWT = 0; /* normal write-back */
vm->host.addr.nexus->transition_pde.fields.US = 0; /* no user access */
vm->host.addr.nexus->transition_pde.fields.RW = 1; /* read or write */
vm->host.addr.nexus->transition_pde.fields.P = 1; /* present in memory */
/* Clear Page Table; only one PTE is used. */
pageTable = vm->host.addr.transition_PT;
mon_memzero(pageTable, 4096);
/* Fill in the PTE for identity mapping the code page */
pageTable->pte[pti].fields.base = vm->pages.nexus;
pageTable->pte[pti].fields.avail = 0;
pageTable->pte[pti].fields.G = 0; /* not global */
pageTable->pte[pti].fields.PS = 0; /* (unused in pte) */
pageTable->pte[pti].fields.D = 0; /* clean */
pageTable->pte[pti].fields.A = 0; /* not accessed */
pageTable->pte[pti].fields.PCD = 0; /* normal caching */
pageTable->pte[pti].fields.PWT = 0; /* normal write-back */
pageTable->pte[pti].fields.US = 0; /* user can not access */
pageTable->pte[pti].fields.RW = 1; /* read or write */
pageTable->pte[pti].fields.P = 1; /* present in memory */
/*
* Setup the TSS for the monitor/guest environment.
*
* We don't need to set the pagedir in the TSS, because we don't
* actually jump to it anyway. The TSS is just used to set the kernel
* stack and in a later stage, perhaps the I/O permission bitmap.
*/
/* No task chain. */
vm->host.addr.tss->back = 0;
/* No debugging or I/O, for now. */
vm->host.addr.tss->trap = 0;
vm->host.addr.tss->io = sizeof(tss_t);
/* Monitor stack offset: the end of the nexus page, as seen by the monitor. */
vm->host.addr.tss->esp0 =
((Bit32u)vm->guest.addr.nexus) + PAGESIZE;
/*
* Set up initial monitor code and stack offset.
*/
vm->host.addr.nexus->mon_jmp_info.offset = MON_NEXUS_OFFSET(vm, __mon_cs);
vm->host.addr.nexus->mon_stack_info.offset =
vm->host.addr.tss->esp0 - (sizeof(guest_context_t) + 48);
/* xxx 48 above should be calculated from code below which winds
* xxx up monitor stack. (That code stores 12 words of 4 bytes each.)
*/
/*
* Setup the IDT for the monitor/guest environment
*/
/* Accumulate the results; any failing slot leaves r non-zero. */
r = 0;
r |= hostInitIDTSlot(vm, 0, IDT_EXCEPTION_NOERROR); /* Divide error */
r |= hostInitIDTSlot(vm, 1, IDT_EXCEPTION_NOERROR); /* Debug exceptions */
r |= hostInitIDTSlot(vm, 2, IDT_INTERRUPT); /* NMI */
r |= hostInitIDTSlot(vm, 3, IDT_EXCEPTION_NOERROR); /* Breakpoint */
r |= hostInitIDTSlot(vm, 4, IDT_EXCEPTION_NOERROR); /* Overflow */
r |= hostInitIDTSlot(vm, 5, IDT_EXCEPTION_NOERROR); /* Bounds check */
r |= hostInitIDTSlot(vm, 6, IDT_EXCEPTION_NOERROR); /* Invalid opcode */
r |= hostInitIDTSlot(vm, 7, IDT_EXCEPTION_NOERROR); /* FPU not available */
r |= hostInitIDTSlot(vm, 8, IDT_EXCEPTION_ERROR); /* Double fault */
r |= hostInitIDTSlot(vm, 9, IDT_EXCEPTION_NOERROR); /* FPU segment overrun */
r |= hostInitIDTSlot(vm, 10, IDT_EXCEPTION_ERROR); /* Invalid TSS */
r |= hostInitIDTSlot(vm, 11, IDT_EXCEPTION_ERROR); /* Segment not present */
r |= hostInitIDTSlot(vm, 12, IDT_EXCEPTION_ERROR); /* Stack exception */
r |= hostInitIDTSlot(vm, 13, IDT_EXCEPTION_ERROR); /* GP fault */
r |= hostInitIDTSlot(vm, 14, IDT_EXCEPTION_ERROR); /* Page fault */
r |= hostInitIDTSlot(vm, 15, IDT_EXCEPTION_NOERROR); /* reserved */
r |= hostInitIDTSlot(vm, 16, IDT_EXCEPTION_NOERROR); /* Coprocessor error */
r |= hostInitIDTSlot(vm, 17, IDT_EXCEPTION_ERROR); /* Alignment check */
r |= hostInitIDTSlot(vm, 18, IDT_EXCEPTION_NOERROR); /* Machine check */
/* Reserved exceptions */
for (i = 19; i < 32; i++)
r |= hostInitIDTSlot(vm, i, IDT_EXCEPTION_NOERROR);
/* Hardware interrupts */
for (i = 32; i < 256; i++)
r |= hostInitIDTSlot(vm, i, IDT_INTERRUPT);
if (r!=0)
goto error;
/*
* Setup the initial guest context
*/
mon_memzero(vm->host.addr.guest_context, sizeof(guest_context_t));
/* Wind up the monitor stack for the initial transition via
* __host2mon. At the tail end, monitor state is popped from the
* stack and a RET is executed.
*
* The words are stored downward, starting with the one immediately
* below the guest context. Twelve words are written in total.
*/
{
Bit32u *ptr;
ptr = (Bit32u *) (((unsigned char *) vm->host.addr.guest_context) - 4);
*ptr-- = (Bit32u) &__ret_to_guest;
*ptr-- = 0x02; /* eflags: only reserved bit on */
*ptr-- = 0; /* eax */
*ptr-- = 0; /* ecx */
*ptr-- = 0; /* edx */
*ptr-- = 0; /* ebx */
*ptr-- = 0; /* esp dummy */
*ptr-- = 0; /* ebp */
*ptr-- = 0; /* esi */
*ptr-- = 0; /* edi */
*ptr-- = 0; /* FS; start with null value. */
*ptr-- = 0; /* GS; start with null value. */
}
/* Record that monitor initialization is done; no request is pending. */
vm->vmState |= VMStateInitMonitor;
vm->mon_request = MonReqNone;
return(1); /* all OK */
error:
return(0); /* error */
}
/*
 * hostInitGuestPhyMem(): Reset the per-page bookkeeping kept for the
 * guest's physical memory.
 *
 * All of vm->pageInfo is cleared, the first vm->pages.guest_n_pages
 * entries are flagged as allocated, and some fixed address ranges are
 * given special attributes:
 *
 *   0xf0000 .. 0xfffff   BIOS ROM         read-only
 *   0xc0000 .. 0xc7fff   VGA BIOS ROM     read-only
 *   0xa0000 .. 0xbffff   VGA framebuffer  memory mapped IO
 *
 * Always returns 0.
 */
unsigned
hostInitGuestPhyMem(vm_t *vm)
{
  unsigned ppi;

  mon_memzero(vm->pageInfo, sizeof(vm->pageInfo));

  /* For now, we start out by preallocating physical pages for the
   * guest, though not necessarily mapped into linear space.
   */
  for (ppi = 0; ppi < vm->pages.guest_n_pages; ppi++) {
    vm->pageInfo[ppi].attr.raw = 0;
    vm->pageInfo[ppi].tsc = 0;
    vm->pageInfo[ppi].attr.fields.allocated = 1;
  }

  /* Mark BIOS ROM area as ReadOnly */
  for (ppi = (0xf0000 >> 12); ppi <= (0xfffff >> 12); ppi++)
    vm->pageInfo[ppi].attr.fields.RO = 1;

  /* Mark VGA BIOS ROM area as ReadOnly */
  for (ppi = (0xc0000 >> 12); ppi <= (0xc7fff >> 12); ppi++)
    vm->pageInfo[ppi].attr.fields.RO = 1;

#if 1
  /* Mark VGA framebuffer area as Memory Mapped IO */
  for (ppi = (0xa0000 >> 12); ppi <= (0xbffff >> 12); ppi++)
    vm->pageInfo[ppi].attr.fields.memMapIO = 1;
#endif

  return(0);
}
/*
 * hostInitIDTSlot(): Initialize one slot of the monitor IDT.
 *
 * A small code stub is written for the vector, and an interrupt gate
 * pointing at that stub (by its monitor-space address) is installed
 * in the IDT.
 *
 *   vm    the VM whose IDT and IDT stubs are filled in
 *   vec   the vector number; indexes both the stub array and the IDT
 *   type  IDT_INTERRUPT, IDT_EXCEPTION_ERROR or IDT_EXCEPTION_NOERROR
 *
 * Returns 0 on success. Returns -1 if 'type' is not one of the above,
 * or if sizeof(idt_stub_t) does not equal IDT_STUB_SIZE.
 */
int
hostInitIDTSlot(vm_t *vm, unsigned vec, int type)
{
  idt_stub_t *hostStub;    /* The stub, as addressed from host space. */
  Bit32u      monStubAddr; /* The same stub, as addressed by the monitor. */
  Bit32u      handler;

  /* Refuse to do anything if the stub layout is not the expected size. */
  if (sizeof(idt_stub_t) != IDT_STUB_SIZE)
    return( -1 );

  hostStub    = vm->host.addr.idt_stubs + vec;
  monStubAddr = ((Bit32u) vm->guest.addr.idt_stubs) +
                vec*sizeof(idt_stub_t);

  if (type == IDT_EXCEPTION_ERROR) {
    /* Method 1: push the vector, then jump to the fault handler. */
    hostStub->m1.pushl  = 0x68;
    hostStub->m1.vector = vec;
    hostStub->m1.jmp    = 0xe9;
    hostStub->m1.reloc  = ((Bit32u) &__handle_fault) -
                          (monStubAddr + sizeof(idt_method1_t));
  }
  else if ( (type == IDT_INTERRUPT) || (type == IDT_EXCEPTION_NOERROR) ) {
    /* Method 2: push a dummy value and the vector, then jump. The two
     * types differ only in which handler is the jump target.
     */
    if (type == IDT_INTERRUPT)
      handler = (Bit32u) &__handle_int;
    else
      handler = (Bit32u) &__handle_fault;

    hostStub->m2.pushla = 0x68;
    hostStub->m2.dummy  = 0;
    hostStub->m2.pushlb = 0x68;
    hostStub->m2.vector = vec;
    hostStub->m2.jmp    = 0xe9;
    /* Displacement is taken from the end of the method 2 code. */
    hostStub->m2.reloc  = handler -
                          (monStubAddr + sizeof(idt_method2_t));
  }
  else {
    return -1;
  }

  /* Set the interrupt gate */
  SET_INT_GATE(vm->host.addr.idt[vec],
               nullSelector, monStubAddr, D_PRESENT, D_DPL0, D_D32);
  return 0;
}
/*
 * hostMapMonPages(): Map pages allocated by the host into the linear
 * address space of the monitor/guest, using the Page Table supplied.
 *
 *   vm         the VM being set up (not used by the mapping itself)
 *   pages      array of 'n' values, each stored in a PTE base field
 *   n          number of consecutive pages to map
 *   laddr_p    in: linear address of the first page to map;
 *              out: advanced by n pages, ready for the next mapping
 *   pageTable  page table whose PTEs are filled in
 *   user       value for the PTE US field (0=system, 1=user)
 *   writable   value for the PTE RW field (0=RO, 1=RW)
 *   name       description of the mapping; only used by the disabled
 *              debug print
 *
 * A page table holds 1024 PTEs (indices 0..1023). Mapping stops at the
 * end of the table rather than writing beyond it; *laddr_p is still
 * advanced by the full n pages in that case.
 */
void
hostMapMonPages(vm_t *vm, Bit32u *pages, unsigned n, Bit32u *laddr_p,
                page_t *pageTable, unsigned user, unsigned writable, char *name)
{
  unsigned i, pti;

#if 0
  hostOSPrint("hostMapMonPages: '%s' mapped at 0x%x .. 0x%x.\n",
              name,
              (*laddr_p) - MON_BASE_FROM_LADDR(0),
              ((*laddr_p) + (n*4096)) - MON_BASE_FROM_LADDR(0) );
#endif

  /* Index of the first PTE: bits 12..21 of the linear address. */
  pti = (*laddr_p >> 12) & 0x3ff;

  for (i = 0; i < n; i++, pti++) {
    /* pti is incremented by the loop and can run off the end of the
     * table; index 1024 is already one past the last valid PTE.
     */
    if (pti >= 1024)
      break; /* This should not happen! */

    /* Fill in the PTE flags */
    pageTable->pte[pti].fields.base = pages[i];
    pageTable->pte[pti].fields.avail = 0;
    pageTable->pte[pti].fields.G = 0;          /* not global */
    pageTable->pte[pti].fields.PS = 0;         /* (unused in pte) */
    pageTable->pte[pti].fields.D = 0;          /* clean */
    pageTable->pte[pti].fields.A = 0;          /* not accessed */
    pageTable->pte[pti].fields.PCD = 0;        /* normal caching */
    pageTable->pte[pti].fields.PWT = 0;        /* normal write-back */
    pageTable->pte[pti].fields.US = user;      /* 0=system, 1=user */
    pageTable->pte[pti].fields.RW = writable;  /* 0=RO, 1=RW */
    pageTable->pte[pti].fields.P = 1;          /* present in memory */
  }

  /*
   * Advance linear address pointer, for the next set of pages
   * to be mapped.
   */
  *laddr_p += 4096 * n;
}
#if ANAL_CHECKS
/*
 * hostMapBlankPage(): Place a single not-present page at the current
 * linear address in the Page Table supplied.
 *
 *   vm         the VM being set up (not used here)
 *   laddr_p    in: linear address of the page to blank;
 *              out: advanced by one page
 *   pageTable  page table whose PTE is cleared
 *
 * Every field of the PTE, including P (present), is set to 0.
 */
void
hostMapBlankPage(vm_t *vm, Bit32u *laddr_p, page_t *pageTable)
{
  unsigned pti;

  /* Index of the PTE: bits 12..21 of the linear address. */
  pti = (*laddr_p >> 12) & 0x3ff;

  /* Valid PTE indices are 0..1023. The mask above already keeps pti in
   * range, so this is purely a defensive check, but it has to use the
   * correct bound to be of any use.
   */
  if (pti >= 1024)
    return; /* This should not happen! */

  /* Fill in the PTE flags */
  pageTable->pte[pti].fields.base = 0;
  pageTable->pte[pti].fields.avail = 0;
  pageTable->pte[pti].fields.G = 0;    /* not global */
  pageTable->pte[pti].fields.PS = 0;   /* (unused in pte) */
  pageTable->pte[pti].fields.D = 0;    /* clean */
  pageTable->pte[pti].fields.A = 0;    /* not accessed */
  pageTable->pte[pti].fields.PCD = 0;  /* normal caching */
  pageTable->pte[pti].fields.PWT = 0;  /* normal write-back */
  pageTable->pte[pti].fields.US = 0;
  pageTable->pte[pti].fields.RW = 0;
  pageTable->pte[pti].fields.P = 0;    /* not present */

  /*
   * Advance linear address pointer, for the next set of pages
   * to be mapped.
   */
  *laddr_p += 4096;
}
#endif
/*
 * hostIoctlGeneric(): Kernel independent dispatcher for plex86 ioctl()
 * requests.
 *
 *   vm     the VM the request applies to
 *   inode  passed through to hostOSModuleCountReset() (PLEX86_RESET only)
 *   filp   passed through to hostOSModuleCountReset() (PLEX86_RESET only)
 *   cmd    the ioctl command
 *   arg    the ioctl argument; for the commands which take a message,
 *          this is a user space pointer to it
 *
 * Returns 0 or the result of the command's handler on success, and one
 * of these on failure:
 *   -Plex86ErrnoEFAULT  a copy from/to user space failed
 *   -Plex86ErrnoEINVAL  unknown command, or guest CPUID already set
 *   -Plex86ErrnoEBUSY   teardown requested while guest memory is still
 *                       registered
 */
int
hostIoctlGeneric(vm_t *vm, void *inode, void *filp,
                 unsigned int cmd, unsigned long arg)
{
  switch (cmd) {
    /*
     * Set the guest CPUID info.
     */
    case PLEX86_CPUID:
      {
      if ( vm->vmState & VMStateGuestCPUID ) {
        /* Can't change guest CPUID. */
        return -Plex86ErrnoEINVAL;
        }
      if ( hostOSCopyFromUser(&vm->guestCPUIDInfo, (void *)arg,
                              sizeof(vm->guestCPUIDInfo)) )
        return -Plex86ErrnoEFAULT;
      /* xxx Value checks here. */
      vm->vmState |= VMStateGuestCPUID;
      return 0;
      }

    /*
     * Register guest memory, as described by the message passed in
     * from user space.
     */
    case PLEX86_REGISTER_MEMORY:
      {
      plex86IoctlRegisterMem_t registerMemMsg;
      if ( hostOSCopyFromUser(&registerMemMsg, (void *)arg,
                              sizeof(registerMemMsg)) )
        return -Plex86ErrnoEFAULT;
      return( hostIoctlRegisterMem(vm, &registerMemMsg) );
      }

    /*
     * Tear down the VM environment.
     */
    case PLEX86_TEARDOWN:
      if ( vm->vmState & VMStateRegisteredAll ) {
        hostOSPrint("plex86: guest memory is still registered!\n");
        /* Could effect the unpinning here and then do:
         *   vm->vmState &= ~VMStateRegisteredAll;
         */
        return -Plex86ErrnoEBUSY;
        }
      hostUnallocVmPages(vm);
      /* Fixme: deal with state better here. */
      /* Reset state to only FD opened. */
      vm->vmState = VMStateFDOpened;
      return 0;

    /*
     * Execute the guest in the VM for a while. The guest CPU state
     * is specified in a memory window mmap()'d to user space.
     */
    case PLEX86_EXECUTE:
      {
      plex86IoctlExecute_t executeMsg;
      int ret;
      if ( hostOSCopyFromUser(&executeMsg, (void *)arg, sizeof(executeMsg)) )
        return -Plex86ErrnoEFAULT;
      ret = hostIoctlExecute(vm, &executeMsg);
      /* The message is copied back to user space whatever the result of
       * the execute call was; a failed copy takes precedence over 'ret'.
       */
      if ( hostOSCopyToUser((void *)arg, &executeMsg, sizeof(executeMsg)) )
        return -Plex86ErrnoEFAULT;
      return ret;
      }

#warning "PLEX86_RESET should only conditionally compiled for debugging."
    /*
     * For debugging, when the module gets hosed, this is a way
     * to reset the in-use count, so we can rmmod it.
     */
    case PLEX86_RESET:
      hostOSModuleCountReset(vm, inode, filp);
      return 0;

    default:
      /* cmd is unsigned, so it is printed with %u rather than %d. */
      hostOSPrint("plex86: unknown ioctl(%u) called\n", cmd);
      return -Plex86ErrnoEINVAL;
    }
}
int
hostIoctlExecute(vm_t *vm, plex86IoctlExecute_t *executeMsg)
{
guest_cpu_t *guest_cpu;
guest_context_t *guest_stack_context;
nexus_t *nexus;
unsigned s;
int retval;
if ( (vm->vmState != VMStateReady) ||
(vm->mon_request != MonReqNone) ) {
retval = Plex86NoExecute_VMState; /* Fail. */
goto handlePanic;
}
/* Only (virtualized) native execution is supported currently.
* Later, it will be interesting to breakpoint one instruction
* at-a-time using Plex86ExecuteMethodBreakpoint, for
* cosimulation.
*/
if (executeMsg->executeMethod != Plex86ExecuteMethodNative) {
retval = Plex86NoExecute_Method; /* Fail. */
goto handleFail;
}
/* A pointer to the guest CPU state as passed from host-user space.
* This structure is memory mapped between user and kernel/monitor space.
*/
guest_cpu = vm->host.addr.guest_cpu;
/* A pointer to the guest CPU state saved on the monitor stack. */
guest_stack_context = vm->host.addr.guest_context;
/* =================================================================
* Before executing the guest in the VM, we must check that
* the guest conditions meet the requirements of the user-level-only
* VM.
* =================================================================
*/
/* CR0:
* PG(31)==1
* CD(30)==? (look into this later)
* NW(29)==? (look into this later)
* AM(18)==pass-thru from guest
* WP(16)==Don't care. Monitor always sets this to 1.
* NE( 5)==? (look into this later)
* ET( 4)==? (look into this later)
* TS( 3)==? (look into this later)
* EM( 2)==? (look into this later)
* MP( 1)==? (look into this later)
* PE( 0)==1
*/
/* 0x8005003b */
if ( (guest_cpu->cr0.raw & 0xe0000037) != 0x80000033 ) {
hostOSPrint("plex86: guest CR0=0x%x\n", guest_cpu->cr0.raw);
retval = Plex86NoExecute_CR0; /* Fail. */
goto handleFail;
}
/* CR4:
* OSXMMEXCPT(10)==? (look into this later)
* OSFXSR(9)==? (look into this later)
* PCE(8)==? (look into this later)
* PGE(7)==? (look into this later)
* MCE(6)==? (look into this later)
* PAE(5)==? (look into this later)
* PSE(4)==? (look into this later)
* DE(3)==? (look into this later)
* TSD(2)==? (look into this later)
* PVI(1)==? (look into this later)
* VME(0)==? (look into this later)
*/
if ( (guest_cpu->cr4.raw & 0x000007ff) != 0x00000000 ) {
hostOSPrint("plex86: guest CR4=0x%x\n", guest_cpu->cr4.raw);
retval = Plex86NoExecute_CR4; /* Fail. */
goto handleFail;
}
/* Guest CPL must be 3 (user-level).
* CS selector must not be NULL.
*/
if ( (guest_cpu->sreg[SRegCS].sel.fields.rpl != 3) ||
(guest_cpu->sreg[SRegCS].sel.fields.index == 0) ||
(guest_cpu->sreg[SRegCS].des.dpl != 3) ) {
retval = Plex86NoExecute_CS; /* Fail. */
goto handleFail;
}
/* A20 line must be enabled. */
if ( guest_cpu->a20Enable != 1 ) {
retval = Plex86NoExecute_A20; /* Fail. */
goto handleFail;
}
/* Some code not really used now, since we only support A20 being enabled. */
{
unsigned newA20Enable;
newA20Enable = guest_cpu->a20Enable > 0; /* Make 0 or 1. */
if ( newA20Enable != vm->system.a20Enable ) {
if ( (!newA20Enable) && guest_cpu->cr0.fields.pg ) {
/* A20 disabled, paging on not supported. Well, really I have to
* see if it matters. This check was in old plex86 code.
*/
retval = Plex86NoExecute_A20; /* Fail. */
goto handleFail;
}
vm->system.a20Enable = newA20Enable;
vm->system.a20AddrMask = 0xffefffff | (newA20Enable << 20);
vm->system.a20IndexMask = 0x000ffeff | (newA20Enable << 8);
}
}
/* LDT not supported.
* Monitor uses GDT slots 1,2,3, so guest segments can not.
* Segment descriptor cache DPL should equal 3.
*/
for (s=0; s<6; s++) {
unsigned selector = guest_cpu->sreg[s].sel.raw;
unsigned index;
/* Only care if selector is not NULL. */
if ( selector & 0xfffc ) {
if ( (selector & 0x0007) != 3 ) {
/* Either TI=1 (LDT usage) or RPL!=3. */
retval = Plex86NoExecute_Selector; /* Fail. */
goto handleFail;
}
index = selector >> 3;
if ( index <= 3 ) {
/* Selector index field uses one of the monitor GDT slots. */
retval = Plex86NoExecute_Selector; /* Fail. */
goto handleFail;
}
if ( index >= (MON_GDT_SIZE/8) ) {
/* Selector index field uses a slot beyond the monitor GDT size. */
retval = Plex86NoExecute_Selector; /* Fail. */
goto handleFail;
}
if ( guest_cpu->sreg[s].des.dpl != 3 ) {
retval = Plex86NoExecute_DPL; /* Fail. */
goto handleFail;
}
}
}
/* EFlags constraints:
* VIP/VIF==0
* VM==0
* RF==0
* NT==0
* IOPL==0 (We may be able to allow this to be 0..2)
* IF==1
* TF==0
* bit1==1
*/
if ( (guest_cpu->eflags & (0x001b7302)) !=
(0x00000202) ) {
retval = Plex86NoExecute_EFlags; /* Fail. */
goto handleFail;
}
/* Notes on other stuff:
* - CPUID emulation vs virtualization match.
*/
/* NOTE: We should commit to executing the guest at this point.
* We must not leave stray entries in the GDT.
*/
/* Install virtualized guest descriptors in GDT.
* Either use descriptor caches from guest space, or we have
* to chase down the GDT entries using the guest's paging
* system. Might be a cheaper/safe bet to just use the
* descriptor caches. If the guest reloads a descriptor,
* just let the user space deal with it.
*/
for (s=0; s<6; s++) {
if ( (guest_cpu->sreg[s].sel.raw & 0xfffc) != 0) {
vm->host.addr.gdt[ guest_cpu->sreg[s].sel.fields.index ] =
guest_cpu->sreg[s].des;
}
}
#warning "Have to clear out GDT"
guest_stack_context->gs = guest_cpu->sreg[SRegGS].sel.raw;
guest_stack_context->fs = guest_cpu->sreg[SRegFS].sel.raw;
guest_stack_context->ds = guest_cpu->sreg[SRegDS].sel.raw;
guest_stack_context->es = guest_cpu->sreg[SRegES].sel.raw;
/* Could use memcpy(); both are in order. Pack both structs. */
guest_stack_context->edi = guest_cpu->edi;
guest_stack_context->esi = guest_cpu->esi;
guest_stack_context->ebp = guest_cpu->ebp;
guest_stack_context->dummy_esp = 0; /* Not needed. */
guest_stack_context->ebx = guest_cpu->ebx;
guest_stack_context->edx = guest_cpu->edx;
guest_stack_context->ecx = guest_cpu->ecx;
guest_stack_context->eax = guest_cpu->eax;
/* Fields vector/error are ignored for return to guest. */
/* CS:EIP */
guest_stack_context->eip = guest_cpu->eip;
guest_stack_context->cs = guest_cpu->sreg[SRegCS].sel.raw;
guest_stack_context->eflags.raw = guest_cpu->eflags;
vm->veflags.raw = 0; /* Virtualized EFLAGS - implement later. */
guest_stack_context->esp = guest_cpu->esp;
guest_stack_context->ss = guest_cpu->sreg[SRegSS].sel.raw;
/* Pointer to the fields in the nexus.S assembly code. */
nexus = vm->host.addr.nexus;
#warning "Monitor CRx hacks"
nexus->mon_cr0 = 0x8001003b | /* PG/WP/NE/ET/TS/MP/PE */
(guest_cpu->cr0.raw & 0x00040000); /* Pass-thru AM from guest. */
/* Could move mon_cr3 load to mapMonitor. */
nexus->mon_cr3 = vm->pages.page_dir << 12;
nexus->mon_cr4 = 0x00000004; /* TSD=1 */
/* vm->guest_cpu.cr0.raw = guest_cpu->cr0 | 0x32; */ /* +++ hack for now */
// Notes:
// - Implement some of monPagingRemap from old code, since that
// was intended to be run/triggered by an initial mode change.
// - After execution of 1st timeslice, need to copy dynamic state
// from VM to guest_cpu area.
// - Deal with cycle counts etc.
hostInitShadowPaging(vm);
for (;;) {
unsigned long eflags;
#if 0
/* If print buffer has contents, return to user space to print. */
if (vm->log_buffer_info.offset) {
vm->mon_msgs.header.msg_type = VMMessagePrintBuf;
vm->mon_msgs.header.msg_len = 0;
vm->mon_request = MonReqNone; /* Request satisfied */
resetPrintBuf(vm); /* xxx Fix print mess */
retval = 100;
goto handleFail;
}
#endif
vm_save_flags(eflags);
vm_restore_flags(eflags & ~0x00004300); /* clear NT/IF/TF */
#if ANAL_CHECKS
if (!(eflags & 0x200)) {
vm_restore_flags(eflags);
hostOSPrint("ioctlExecute: EFLAGS.IF==0\n");
retval = 101; /* Fail. */
goto handlePanic;
}
#endif
/* Call assembly routine to effect transition. */
vm->host.__host2mon();
/* First check for an asynchronous event (interrupt redirection) */
if ( vm->mon_request == MonReqRedirect ) {
vm_restore_flags(eflags & ~0x00000200); /* restore all but IF */
soft_int(vm->redirect_vector); /* sets IF to 1 */
hostOSInstrumentIntRedirCount(vm->redirect_vector);
vm->mon_request = MonReqNone; /* Request satisfied */
}
/* Event was synchronous; monitor requested a switch back to host. */
else {
vm_restore_flags(eflags);
/* Perform action requested by monitor. */
switch ( vm->mon_request ) {
case MonReqRemapMonitor:
#if 0
if ( mapMonitor(vm) ) {
vm->mon_request = MonReqNone; /* Request satisfied */
break;
}
else {
hostOSPrint("mapMonitor failed.\n");
hostOSPrint("Panic w/ abort_code=%u\n", vm->abort_code);
retval = 102;
goto handlePanic;
}
#endif
hostOSPrint("ioctlExecute: case MonReqRemapMonitor.\n");
retval = 103;
goto handlePanic;
case MonReqFlushPrintBuf:
hostOSPrint("ioctlExecute: case MonReqFlushPrintBuf.\n");
retval = 104;
goto handlePanic;
case MonReqGuestFault:
/* Encountered a guest fault. */
hostCopyGuestStateToUserSpace(vm);
executeMsg->cyclesExecuted = 0; /* Handle later. */
executeMsg->instructionsExecuted = 0; /* Handle later. */
executeMsg->monitorState.state = vm->vmState;
executeMsg->monitorState.request = vm->mon_request;
executeMsg->monitorState.guestFaultNo = vm->guestFaultNo;
vm->mon_request = MonReqNone;
return 0;
case MonReqPanic:
if (vm->abort_code)
hostOSPrint("Panic w/ abort_code=%u\n", vm->abort_code);
hostOSPrint("ioctlExecute: case MonReqPanic.\n");
retval = 106;
goto handlePanic;
case MonReqPinUserPage:
if ( !hostHandlePagePinRequest(vm, vm->pinReqPPI) ) {
retval = 108;
goto handlePanic;
}
continue; /* Back to VM monitor. */
default:
hostOSPrint("ioctlExecute: default case (%u).\n", vm->mon_request);
retval = 107;
goto handlePanic;
}
}
/* Let host decide whether we are allowed another timeslice */
if ( !hostOSIdle() ) {
/* We are returning only because the host wants to
* schedule other work.
*/
executeMsg->monitorState.state = vm->vmState;
executeMsg->monitorState.request = MonReqNone;
return 0;
}
}
/* Should not get here. */
retval = 109;
goto handlePanic;
handleFail:
/* Handle inability to execute the guest due to certain state. */
executeMsg->monitorState.state = vm->vmState;
executeMsg->monitorState.request = vm->mon_request;
return(retval);
handlePanic:
vm->vmState |= VMStatePanic;
vm->mon_request = MonReqPanic;
executeMsg->monitorState.state = vm->vmState;
executeMsg->monitorState.request = vm->mon_request;
return(retval);
}
void
hostCopyGuestStateToUserSpace(vm_t *vm)
{
guest_cpu_t *guest_cpu;
guest_context_t *guest_stack_context;
/* A pointer to the guest CPU state as passed from host-user space.
* This structure is memory mapped between user and kernel/monitor space.
*/
guest_cpu = vm->host.addr.guest_cpu;
/* A pointer to the guest CPU state saved on the monitor stack. */
guest_stack_context = vm->host.addr.guest_context;
guest_cpu->sreg[SRegES].sel.raw = guest_stack_context->es;
if ( (guest_stack_context->es & 0xfffc) == 0 ) {
guest_cpu->sreg[SRegES].des = nullDescriptor;
guest_cpu->sreg[SRegES].valid = 0;
}
else {
guest_cpu->sreg[SRegES].des =
vm->host.addr.gdt[ guest_cpu->sreg[SRegES].sel.fields.index ];
guest_cpu->sreg[SRegES].valid = 1;
}
guest_cpu->sreg[SRegCS].sel.raw = guest_stack_context->cs;
if ( (guest_stack_context->cs & 0xfffc) == 0 ) {
guest_cpu->sreg[SRegCS].des = nullDescriptor;
guest_cpu->sreg[SRegCS].valid = 0;
}
else {
guest_cpu->sreg[SRegCS].des =
vm->host.addr.gdt[ guest_cpu->sreg[SRegCS].sel.fields.index ];
guest_cpu->sreg[SRegCS].valid = 1;
}
guest_cpu->sreg[SRegSS].sel.raw = guest_stack_context->ss;
if ( (guest_stack_context->ss & 0xfffc) == 0 ) {
guest_cpu->sreg[SRegSS].des = nullDescriptor;
guest_cpu->sreg[SRegSS].valid = 0;
}
else {
guest_cpu->sreg[SRegSS].des =
vm->host.addr.gdt[ guest_cpu->sreg[SRegSS].sel.fields.index ];
guest_cpu->sreg[SRegSS].valid = 1;
}
guest_cpu->sreg[SRegDS].sel.raw = guest_stack_context->ds;
if ( (guest_stack_context->ds & 0xfffc) == 0 ) {
guest_cpu->sreg[SRegDS].des = nullDescriptor;
guest_cpu->sreg[SRegDS].valid = 0;
}
else {
guest_cpu->sreg[SRegDS].des =
vm->host.addr.gdt[ guest_cpu->sreg[SRegDS].sel.fields.index ];
guest_cpu->sreg[SRegDS].valid = 1;
}
guest_cpu->sreg[SRegFS].sel.raw = guest_stack_context->fs;
if ( (guest_stack_context->fs & 0xfffc) == 0 ) {
guest_cpu->sreg[SRegFS].des = nullDescriptor;
guest_cpu->sreg[SRegFS].valid = 0;
}
else {
guest_cpu->sreg[SRegFS].des =
vm->host.addr.gdt[ guest_cpu->sreg[SRegFS].sel.fields.index ];
guest_cpu->sreg[SRegFS].valid = 1;
}
guest_cpu->sreg[SRegGS].sel.raw = guest_stack_context->gs;
if ( (guest_stack_context->gs & 0xfffc) == 0 ) {
guest_cpu->sreg[SRegGS].des = nullDescriptor;
guest_cpu->sreg[SRegGS].valid = 0;
}
else {
guest_cpu->sreg[SRegGS].des =
vm->host.addr.gdt[ guest_cpu->sreg[SRegGS].sel.fields.index ];
guest_cpu->sreg[SRegGS].valid = 1;
}
/* Could use memcpy(); both are in order. Pack both structs. */
guest_cpu->edi = guest_stack_context->edi;
guest_cpu->esi = guest_stack_context->esi;
guest_cpu->ebp = guest_stack_context->ebp;
guest_cpu->esp = guest_stack_context->esp;
guest_cpu->ebx = guest_stack_context->ebx;
guest_cpu->edx = guest_stack_context->edx;
guest_cpu->ecx = guest_stack_context->ecx;
guest_cpu->eax = guest_stack_context->eax;
/* CS:EIP */
guest_cpu->eip = guest_stack_context->eip;
guest_cpu->eflags = guest_stack_context->eflags.raw;
/* vm->veflags.raw = 0; */ /* Virtualized EFLAGS - implement later. */
}
/*
 * Handle the register-memory request: validate the user-supplied guest
 * memory description, allocate/pin everything the monitor needs and
 * initialize the guest physical memory and the monitor.
 *
 *   vm             - the VM being configured.
 *   registerMemMsg - user request: guest memory size in megabytes (nMegs)
 *                    and the user-space addresses of the guest physical
 *                    memory, the log buffer window and the guest CPU window.
 *
 * Returns 0 on success, or a negated Plex86Errno value:
 *   -EBUSY   memory was already registered or the VM is in the wrong state,
 *   -EINVAL  bad size, misaligned address, or wrapping/overlapping regions,
 *   -ENOMEM  hostAllocVmPages() failed,
 *   -EFAULT  guest memory or monitor initialization failed.
 *
 * On any failure after allocation was attempted, the VM state bits are
 * rolled back to what they were on entry, so that a later attempt is not
 * rejected with -EBUSY by the state check below.
 */
int
hostIoctlRegisterMem(vm_t *vm, plex86IoctlRegisterMem_t *registerMemMsg)
{
  unsigned error;
  int      retval;

  /* Do not allow duplicate allocation.  The file descriptor must be
   * opened.  The guest CPUID info can be filled in later.
   */
  if ( (vm->vmState & ~VMStateGuestCPUID) != VMStateFDOpened )
    return -Plex86ErrnoEBUSY;
  if (vm->pages.guest_n_megs != 0)
    return -Plex86ErrnoEBUSY;

  /* Check that the amount of memory is reasonable: within the supported
   * maximum, at least 4 megabytes, and a multiple of 4 megabytes.
   */
  if ( (registerMemMsg->nMegs > PLEX86_MAX_PHY_MEGS) ||
       (registerMemMsg->nMegs < 4) ||
       (registerMemMsg->nMegs & 0x3) )
    return -Plex86ErrnoEINVAL;

  /* Check that the guest memory vector is page aligned. */
  if ( registerMemMsg->guestPhyMemVector & 0xfff )
    return -Plex86ErrnoEINVAL;

  /* Check that the log buffer area is page aligned. */
  if ( registerMemMsg->logBufferWindow & 0xfff )
    return -Plex86ErrnoEINVAL;

  /* Check that the guest CPU area is page aligned. */
  if ( registerMemMsg->guestCPUWindow & 0xfff )
    return -Plex86ErrnoEINVAL;

  /* Check that none of the user areas overlap.  In case we have a
   * number of regions, use some generic code to handle N regions.
   */
  {
#define NumUserRegions 3
    struct {
      Bit32u min, max;   /* Inclusive first/last byte address. */
    } userRegion[NumUserRegions];
    unsigned i,j;

    userRegion[0].min = registerMemMsg->guestPhyMemVector;
    userRegion[0].max = userRegion[0].min + (registerMemMsg->nMegs<<20) - 1;
    userRegion[1].min = registerMemMsg->logBufferWindow;
    userRegion[1].max = userRegion[1].min + LOG_BUFF_SIZE - 1;
    userRegion[2].min = registerMemMsg->guestCPUWindow;
    userRegion[2].max = userRegion[2].min + (4096) - 1;

    /* A region whose end wrapped around the 32-bit address space is
     * bogus, and would also defeat the range comparisons below.
     */
    for (i=0; i<NumUserRegions; i++) {
      if ( userRegion[i].max < userRegion[i].min )
        return -Plex86ErrnoEINVAL;
    }

    /* Compare every ordered pair of distinct regions, including region 0
     * (the guest physical memory).  Testing both end points of region(j)
     * against region(i) for all (i,j) also catches the case where one
     * region entirely contains the other, since the roles get swapped.
     */
    for (i=0; i<NumUserRegions; i++) {
      for (j=0; j<NumUserRegions; j++) {
        if (j == i)
          continue; /* Don't compare at the same region. */
        /* Check for min(j) contained in region(i). */
        if ( (userRegion[j].min >= userRegion[i].min) &&
             (userRegion[j].min <= userRegion[i].max) )
          return -Plex86ErrnoEINVAL;
        /* Check for max(j) contained in region(i). */
        if ( (userRegion[j].max >= userRegion[i].min) &&
             (userRegion[j].max <= userRegion[i].max) )
          return -Plex86ErrnoEINVAL;
      }
    }
  }

  /* Allocate memory.  On failure hostAllocVmPages() has already released
   * whatever it acquired; only the state bits remain to be rolled back.
   */
  if ( (error = hostAllocVmPages(vm, registerMemMsg)) != 0 ) {
    hostOSPrint("plex86: allocVmPages failed at %u\n",
      error);
    retval = -Plex86ErrnoENOMEM;
    goto rollbackState;
  }

  /* Initialize the guests physical memory. */
  if ( hostInitGuestPhyMem(vm) ) {
    retval = -Plex86ErrnoEFAULT;
    goto unallocPages;
  }

  /* Initialize the monitor. */
  if ( !hostInitMonitor(vm) ||
       !hostMapMonitor(vm) ) {
    retval = -Plex86ErrnoEFAULT;
    goto unallocPages;
  }
  return 0;

unallocPages:
  hostUnallocVmPages(vm);
rollbackState:
  /* The entry check guarantees that only VMStateFDOpened (and possibly
   * VMStateGuestCPUID) were set when we started; drop every bit that was
   * added since, as the resources they describe no longer exist.
   */
  vm->vmState &= (VMStateFDOpened | VMStateGuestCPUID);
  return retval;
}
/*
 * Allocate various pages/memory needed by monitor.
 *
 * Records the guest physical memory geometry from registerMemMsg, pins the
 * user-space guest CPU and log buffer pages, allocates the monitor's own
 * structures (page directory, page tables, page table linear address map,
 * nexus page table, transition page table, nexus page, IDT, GDT, LDT, TSS
 * and IDT stubs) and records the physical page(s) of each of them and of
 * the vm_t structure itself.
 *
 * Returns 0 on success.  On failure, everything acquired so far is released
 * with hostUnallocVmPages(), the state bits set by this function are
 * cleared again, and the (non-zero) value of the `where' step counter at
 * the failing step is returned so the caller can report it.
 */
int
hostAllocVmPages(vm_t *vm, plex86IoctlRegisterMem_t *registerMemMsg)
{
  vm_pages_t *pg = &vm->pages;
  vm_addr_t  *ad = &vm->host.addr;
#warning "Fix these shortcuts"
  /* Step counter: incremented after each step that succeeds, so its value
   * identifies the step that failed.
   */
  unsigned where = 1;

  /* clear out allocated pages lists */
  mon_memzero(pg, sizeof(*pg));
  mon_memzero(ad, sizeof(*ad));

  /* Guest physical memory pages */
  pg->guest_n_megs  = registerMemMsg->nMegs;
  pg->guest_n_pages = registerMemMsg->nMegs * 256;
  pg->guest_n_bytes = registerMemMsg->nMegs * 1024 * 1024;
  if ( pg->guest_n_pages > MAX_MON_GUEST_PAGES) {
    /* The size of the user-space allocated guest physical memory must
     * fit within the maximum number of guest pages that the VM monitor
     * supports.
     */
    goto error;
  }
  where++;
  vm->guestPhyMemAddr = registerMemMsg->guestPhyMemVector;
  vm->vmState |= VMStateRegisteredPhyMem; /* Bogus for now. */
  where++;

  {
    Bit32u hostPPI, kernelAddr;

    /* Guest CPU state (malloc()'d in user space). */
    if ( !hostOSGetAndPinUserPage(vm, registerMemMsg->guestCPUWindow,
              &pg->guest_cpu_hostOSPtr, &hostPPI, &kernelAddr) ) {
      goto error;
    }
    ad->guest_cpu = (guest_cpu_t *) kernelAddr;
    pg->guest_cpu = hostPPI;
    vm->vmState |= VMStateRegisteredGuestCPU; /* For now. */
    where++;

    /* Log buffer area (malloc()'d in user space). */
    /* LOG_BUFF_PAGES */
    if ( !hostOSGetAndPinUserPage(vm, registerMemMsg->logBufferWindow,
              &pg->log_buffer_hostOSPtr[0], &hostPPI, &kernelAddr) ) {
      goto error;
    }
    ad->log_buffer = (Bit8u *) kernelAddr;
    pg->log_buffer[0] = hostPPI;
    where++;
    vm->vmState |= VMStateRegisteredPrintBuffer; /* For now. */
  }

  /* Monitor page directory */
  if ( !(ad->page_dir = (pageEntry_t *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if (!(pg->page_dir = hostOSGetAllocedPagePhyPage(ad->page_dir))) {
    goto error;
  }
  where++;

  /* Monitor page tables */
  if ( !(ad->page_tbl = hostOSAllocZeroedMem(4096 * MON_PAGE_TABLES)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->page_tbl, MON_PAGE_TABLES,
           ad->page_tbl, 4096 * MON_PAGE_TABLES)) {
    goto error;
  }
  where++;

  /* Map of the linear addresses of page tables currently */
  /* mapped into the monitor space. */
  if ( !(ad->page_tbl_laddr_map = (unsigned *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if ( !(pg->page_tbl_laddr_map =
         hostOSGetAllocedPagePhyPage(ad->page_tbl_laddr_map)) ) {
    goto error;
  }
  where++;

  /* Nexus page table */
  if ( !(ad->nexus_page_tbl = (page_t *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if ( !(pg->nexus_page_tbl = hostOSGetAllocedPagePhyPage(ad->nexus_page_tbl)) ) {
    goto error;
  }
  where++;

  /* Transition page table */
  if ( !(ad->transition_PT = (page_t *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if ( !(pg->transition_PT = hostOSGetAllocedPagePhyPage(ad->transition_PT)) ) {
    goto error;
  }
  where++;

  /* Nexus page */
  if ( !(ad->nexus = (nexus_t *) hostOSAllocZeroedPage()) ) {
    goto error;
  }
  where++;
  if ( !(pg->nexus = hostOSGetAllocedPagePhyPage(ad->nexus)) ) {
    goto error;
  }
  where++;

  /* Monitor IDT */
  if ( !(ad->idt = hostOSAllocZeroedMem(MON_IDT_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->idt, MON_IDT_PAGES, ad->idt, MON_IDT_SIZE)) {
    goto error;
  }
  where++;

  /* Monitor GDT */
  if ( !(ad->gdt = hostOSAllocZeroedMem(MON_GDT_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->gdt, MON_GDT_PAGES, ad->gdt, MON_GDT_SIZE)) {
    goto error;
  }
  where++;

  /* Monitor LDT */
  if ( !(ad->ldt = hostOSAllocZeroedMem(MON_LDT_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->ldt, MON_LDT_PAGES, ad->ldt, MON_LDT_SIZE)) {
    goto error;
  }
  where++;

  /* Monitor TSS */
  if ( !(ad->tss = hostOSAllocZeroedMem(MON_TSS_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->tss, MON_TSS_PAGES, ad->tss, MON_TSS_SIZE)) {
    goto error;
  }
  where++;

  /* Monitor IDT stubs */
  if ( !(ad->idt_stubs = hostOSAllocZeroedMem(MON_IDT_STUBS_PAGES*4096)) ) {
    goto error;
  }
  where++;
  if (!hostOSGetAllocedMemPhyPages(pg->idt_stubs, MON_IDT_STUBS_PAGES,
           ad->idt_stubs, MON_IDT_STUBS_SIZE)) {
    goto error;
  }
  where++;

  /* Get the physical pages associated with the vm_t structure. */
  if (!hostOSGetAllocedMemPhyPages(pg->vm, MAX_VM_STRUCT_PAGES, vm, sizeof(*vm))) {
    goto error;
  }
  where++;

  vm->vmState |= VMStateMemAllocated;
  return 0; /* OK. */

 error:
  hostUnallocVmPages( vm );
  /* hostUnallocVmPages() clears VMStateRegisteredPhyMem only.  The guest CPU
   * and log buffer registrations made above are gone as well, so do not
   * leave the VM claiming they still exist.  (VMStateMemAllocated is never
   * set on this path.)
   */
  vm->vmState &= ~(VMStateRegisteredGuestCPU | VMStateRegisteredPrintBuffer);
  return( where );
}
/*
 * Unallocate pages/memory used by monitor.
 *
 * Releases the pinned user pages (when guest physical memory has been
 * registered), frees every monitor structure that hostAllocVmPages()
 * allocated, and finally zeroes the page/address bookkeeping in the vm_t.
 * Each structure is freed only if its pointer is set, so this may be
 * called after a partially completed hostAllocVmPages().
 */
void
hostUnallocVmPages( vm_t *vm )
{
  vm_pages_t *pg = &vm->pages;
  vm_addr_t  *ad = &vm->host.addr;

  /* Guest physical memory pages */
  if (vm->guestPhyMemAddr) {
    hostReleasePinnedUserPages(vm);
    vm->guestPhyMemAddr = 0;
  }
  vm->vmState &= ~VMStateRegisteredPhyMem; /* Bogus for now. */

  /* Note: ad->guest_cpu and ad->log_buffer are deliberately NOT freed
   * here.  They are kernel addresses of user-space pages which
   * hostAllocVmPages() obtained with hostOSGetAndPinUserPage(), not with
   * hostOSAllocZeroedPage()/hostOSAllocZeroedMem().  They are given back by
   * the unpin calls in hostReleasePinnedUserPages() above; passing them to
   * hostOSFreePage()/hostOSFreeMem() as well would release memory which
   * this module never allocated.
   */

  /* Monitor page directory */
  if (ad->page_dir) hostOSFreePage(ad->page_dir);

  /* Monitor page tables */
  if (ad->page_tbl) hostOSFreeMem(ad->page_tbl);

  /* Map of linear addresses of page tables mapped into monitor. */
  if (ad->page_tbl_laddr_map) hostOSFreePage(ad->page_tbl_laddr_map);

  /* Nexus page table */
  if (ad->nexus_page_tbl) hostOSFreePage(ad->nexus_page_tbl);

  /* Transition page table */
  if (ad->transition_PT) hostOSFreePage(ad->transition_PT);

  /* Nexus page */
  if (ad->nexus) hostOSFreePage(ad->nexus);

  /* Monitor IDT */
  if (ad->idt) hostOSFreeMem(ad->idt);

  /* Monitor GDT */
  if (ad->gdt) hostOSFreeMem(ad->gdt);

  /* Monitor LDT */
  if (ad->ldt) hostOSFreeMem(ad->ldt);

  /* Monitor TSS */
  if (ad->tss) hostOSFreeMem(ad->tss);

  /* Monitor IDT stubs */
  if (ad->idt_stubs) hostOSFreeMem(ad->idt_stubs);

  /* clear out allocated pages lists */
  mon_memzero(pg, sizeof(*pg));
  mon_memzero(ad, sizeof(*ad));
}
/* Execute the CPUID instruction with EAX preloaded with `leaf' and hand
 * back the four result registers.
 */
static void
hostCpuidLeaf(Bit32u leaf, Bit32u *eax, Bit32u *ebx, Bit32u *ecx, Bit32u *edx)
{
  asm volatile (
    "cpuid"
    : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
    : "0" (leaf)
    : "cc"
    );
}

/* Query the host processor with CPUID and record the results in
 * hostCpuIDInfo: the vendor string dwords (leaf 0), and the processor
 * signature and feature flags (leaf 1).
 *
 * Returns 1 if the processor is usable by plex86, or 0 if it either does
 * not support CPUID leaf 1 or lacks the time stamp counter.  When leaf 1
 * is not supported, hostCpuIDInfo is left untouched.
 */
unsigned
hostGetCpuCapabilities(void)
{
  Bit32u regA, regB, regC, regD;

  /* Leaf 0: highest supported leaf in EAX, vendor string in EBX/EDX/ECX. */
  hostCpuidLeaf(0, &regA, &regB, &regC, &regD);
  if (regA < 1)
    return(0); /* not enough capabilities */

  /* Copy vendor string. */
  hostCpuIDInfo.vendorDWord0 = regB;
  hostCpuIDInfo.vendorDWord1 = regD;
  hostCpuIDInfo.vendorDWord2 = regC;

  /* Leaf 1: Processor Signature & Feature Flags */
  hostCpuidLeaf(1, &regA, &regB, &regC, &regD);
  hostCpuIDInfo.procSignature.raw = regA;
  hostCpuIDInfo.featureFlags.raw  = regD;

  /* Plex86 needs TSC */
  return( (hostCpuIDInfo.featureFlags.fields.tsc == 0) ? 0 : 1 );
}
/* Map the monitor and guest into the VM.
 *
 * Installs the nexus page table into the monitor page directory at a fixed
 * linear address, builds the monitor's code, stack and TSS descriptors in
 * fixed GDT slots, points every IDT entry at the monitor code segment and
 * fills in the nexus fields describing the monitor GDT/IDT/LDT/TSS, its
 * code and stack selectors and its segment base.
 *
 * Sets VMStateMapMonitor in vm->vmState and always returns 1.
 */
unsigned
hostMapMonitor(vm_t *vm)
{
  selector_t    codeSel, stackSel, taskSel;
  Bit32u        monLaddr, monBase, monPdi;
  unsigned      vector;
  /* For convenience, some pointers. */
  nexus_t      *nexus = vm->host.addr.nexus;
  descriptor_t *gdt   = vm->host.addr.gdt;

#warning "Is the GDT being cleared of old values?"
  /* +++ should zero out GDT, so prev entries do not remain */

  /* =========================
   * Map in Monitor structures
   * =========================
   */

  /* CS/SS/TSS selectors:
   * For now, hardcode in monitor descriptors at slots 1,2,3.  As we
   * are only running user code in the VM, these are likely safe slots
   * as they are often used by guest OSes for kernel descriptors.
   */
  codeSel.raw  = Selector(1, 0, RPL0);
  stackSel.raw = Selector(2, 0, RPL0);
  taskSel.raw  = Selector(3, 0, RPL0);

  /* Search for unused PDE for nexus PT (fixed for now) */
  monLaddr = 0x70000000;
  monPdi   = monLaddr >> 22;              /* Page directory index. */
  vm->mon_pde_mask = monLaddr & 0xffc00000;
  vm->mon_pdi      = monPdi;
  monBase = MON_BASE_FROM_LADDR(monLaddr);

  /* Map nexus into monitor/guest address space */
  vm->host.addr.page_dir[monPdi] = vm->host.nexus_pde;

  /* CS/SS/TSS descriptors: Put at fixed GDT location for now. */
  SET_DESCRIPTOR(gdt[codeSel.fields.index], monBase, 0xfffff,
                 D_PG, D_D32, D_AVL0, D_PRESENT, D_DPL0, D_CODE | D_READ)
  SET_DESCRIPTOR(gdt[stackSel.fields.index], monBase, 0xfffff,
                 D_PG, D_D32, D_AVL0, D_PRESENT, D_DPL0, D_DATA | D_WRITE)
  SET_DESCRIPTOR(gdt[taskSel.fields.index],
                 monBase + (Bit32u) vm->guest.addr.tss,
                 sizeof(tss_t)-1,
                 D_BG, 0, D_AVL0, D_PRESENT, D_DPL0, D_TSS)

  /* Fix up the selectors of all IDT entries. */
  for ( vector = 0; vector < 256; vector++ )
    vm->host.addr.idt[vector].selector = codeSel;

  /* The monitor GDT/IDT loading info.
   * NOTE(review): descriptor table limits are conventionally the table
   * size minus one; confirm whether MON_GDT_SIZE/MON_IDT_SIZE are meant to
   * be used unadjusted here.
   */
  nexus->mon_gdt_info.base  = monBase + (Bit32u) vm->guest.addr.gdt;
  nexus->mon_gdt_info.limit = MON_GDT_SIZE;
  nexus->mon_idt_info.base  = monBase + (Bit32u) vm->guest.addr.idt;
  nexus->mon_idt_info.limit = MON_IDT_SIZE;

  /* We don't have a monitor LDT for now. */
  nexus->mon_ldt_sel = 0;

  /* The monitor TSS: ring 0 stack is the top of the nexus page. */
  nexus->mon_tss_sel = taskSel.raw;
  vm->host.addr.tss->esp0 = ((Bit32u)vm->guest.addr.nexus) + PAGESIZE;
  vm->host.addr.tss->ss0  = stackSel.raw;

  /* Monitor code and stack segments. */
  nexus->mon_jmp_info.selector   = codeSel.raw;
  nexus->mon_stack_info.selector = stackSel.raw;

  /* Monitor code/data segment base. */
  nexus->mon_base = monBase;

  vm->vmState |= VMStateMapMonitor;
  return(1);
}
/* Reset the monitor's shadow paging state for the VM.
 *
 * Rewinds the page table heap index, clears every monitor page directory
 * entry except the one covering the monitor itself (vm->mon_pdi), and
 * stamps vm->vpaging_tsc with the current time stamp counter.
 */
void
hostInitShadowPaging(vm_t *vm)
{
  pageEntry_t *pageDir;
  Bit32u       entry;
  /*Bit32u cr3_page_index;*/
  /*phy_page_usage_t *pusage;*/

#if 0
  cr3_page_index = A20Addr(vm, vm->guest_cpu.cr3) >> 12;
  if ( cr3_page_index >= vm->pages.guest_n_pages)
    xxxpanic(vm, "monPagingRemap: CR3 conflicts with monitor space\n");
#endif

  /* Reset page table heap */
  vm->ptbl_laddr_map_i = 0;

  /* Clear monitor PD except 4Meg range used by monitor */
  pageDir = vm->host.addr.page_dir;
  for (entry = 0; entry < 1024; entry++) {
#if ANAL_CHECKS
    vm->host.addr.page_tbl_laddr_map[entry] = -1; /* max unsigned */
#endif
    if (entry == vm->mon_pdi)
      continue; /* Leave the monitor's own mapping alone. */
    pageDir[entry].raw = 0;
  }

  /* Update vpaging timestamp. */
  vm->vpaging_tsc = vm_rdtsc();

#if 0
  /* When we remap the monitor page tables, IF guest paging is
   * enabled, then mark the page containing the guest page directory
   * as such.  In non-paged mode, there is no page directory.
   */
  if (vm->guest_cpu.cr0.fields.pg) {
    pusage = &vm->pageInfo[cr3_page_index];
    pusage->tsc = vm->vpaging_tsc;
    pusage->attr.raw &= PageUsageSticky;
    pusage->attr.raw |= PageUsagePDir;
    pusage->attr.fields.access_perm = PagePermNA;
    if (pusage->attr.raw & PageBadUsage4PDir)
      xxxpanic(vm, "monPagingRemap: BadUsage4PDir\n");
  }
#endif
}
/* Unpin every user-space page this VM currently holds pinned in the host:
 * the guest physical memory pages marked as pinned in vm->pageInfo[], the
 * guest CPU page and the log buffer page.
 *
 * The guest CPU and log buffer pages are only unpinned if they were
 * actually pinned (their kernel address is set).  hostAllocVmPages() may
 * fail before pinning one or both of them and still end up here through
 * hostUnallocVmPages(), in which case there is nothing to hand to
 * hostOSUnpinUserPage().  After unpinning, the kernel addresses are cleared
 * so that they can neither be used nor released a second time.
 */
void
hostReleasePinnedUserPages(vm_t *vm)
{
  unsigned ppi;
  unsigned dirty;
  unsigned nPages;
  Bit32u   kernelAddr;

  /* Unpin the pages associated with the guest physical memory. */
  nPages = vm->pages.guest_n_pages;
  for (ppi=0; ppi<nPages; ppi++) {
    if ( vm->pageInfo[ppi].attr.fields.pinned ) {
      void *osSpecificPtr;

      osSpecificPtr = (void *) vm->hostStructPagePtr[ppi];
#warning "Conditionalize page dirtying before page release."
      dirty = 1; /* FIXME: 1 for now. */
      hostOSUnpinUserPage(vm,
          vm->guestPhyMemAddr + (ppi<<12),
          osSpecificPtr,
          ppi,
          0 /* There was no host kernel addr mapped for this page. */,
          dirty);
      vm->pageInfo[ppi].attr.fields.pinned = 0;
    }
  }

  /* Unpin the pages associated with the guest_cpu area. */
  if ( vm->host.addr.guest_cpu ) {
    kernelAddr = (Bit32u) vm->host.addr.guest_cpu;
    hostOSUnpinUserPage(vm,
        0, /* User space address. */
        vm->pages.guest_cpu_hostOSPtr,
        vm->pages.guest_cpu,
        &kernelAddr,
        1 /* Dirty. */);
    vm->host.addr.guest_cpu = 0; /* Kernel address is no longer ours. */
  }

  /* Unpin the pages associated with the log buffer area. */
  if ( vm->host.addr.log_buffer ) {
    kernelAddr = (Bit32u) vm->host.addr.log_buffer;
    hostOSUnpinUserPage(vm,
        0, /* User space address. */
        vm->pages.log_buffer_hostOSPtr[0],
        vm->pages.log_buffer[0],
        &kernelAddr,
        1 /* Dirty. */);
    vm->host.addr.log_buffer = 0; /* Kernel address is no longer ours. */
  }

#warning "User space address is passed as 0 for now..."
}
/* Service a request from the monitor to pin one guest physical page.
 *
 *   vm          - the VM making the request.
 *   reqGuestPPI - guest physical page index of the page to pin.
 *
 * At most MaxPhyPagesPinned guest pages are kept pinned at a time; they
 * are tracked in vm->guestPhyPagePinQueue.  While the queue has room the
 * new page simply takes the next free slot.  Once it is full, the page in
 * the slot at `tail' is unpinned first and its slot is reused.
 *
 * Returns 1 on success, 0 if the host OS could not pin the page.
 */
unsigned
hostHandlePagePinRequest(vm_t *vm, Bit32u reqGuestPPI)
{
  Bit32u   pinnedHostPPI;
  unsigned slot;

#warning "We must not unpin open pages (for page walking) here."

  if (vm->guestPhyPagePinQueue.nEntries >= MaxPhyPagesPinned) {
    /* There is no room in the Q for another entry - we have reached
     * the upper limit of allowable number of pinned pages.  We must
     * first unpin a page to free up the limit, then we can pin the
     * requested page.  This keeps plex86 from pinning an unconstrained
     * number of pages at one time.
     */
    Bit32u victimPPI;

    slot      = vm->guestPhyPagePinQueue.tail;
    victimPPI = vm->guestPhyPagePinQueue.ppi[slot];
    hostOSUnpinUserPage(vm,
        vm->guestPhyMemAddr + (victimPPI<<12),
        vm->hostStructPagePtr[victimPPI],
        victimPPI,
        0 /* There was no host kernel addr mapped for this page. */,
        1 /* Dirty.  FIXME: 1 for now. */);
    vm->pageInfo[victimPPI].attr.fields.pinned = 0;
  }
  else {
    /* There is room in the Q for another entry - we have not reached
     * the upper limit of allowable number of pinned pages.
     */
    slot = vm->guestPhyPagePinQueue.nEntries;
  }

  /* Pin the requested guest physical page in the host OS. */
  if ( !hostOSGetAndPinUserPage(vm,
            vm->guestPhyMemAddr + (reqGuestPPI<<12),
            &vm->hostStructPagePtr[reqGuestPPI],
            &pinnedHostPPI,
            0 /* Don't need a host kernel address. */
            ) ) {
    hostOSPrint("handlePagePinReq: request to pin failed.\n");
    return(0); /* Fail. */
  }

  /* Pinning activities have succeeded.  Mark this physical page as being
   * pinned, and store its physical address.
   */
  vm->pageInfo[reqGuestPPI].hostPPI = pinnedHostPPI;
  vm->pageInfo[reqGuestPPI].attr.fields.pinned = 1;

  /* Now add this entry to the Q. */
  vm->guestPhyPagePinQueue.ppi[slot] = reqGuestPPI;
  if (vm->guestPhyPagePinQueue.nEntries >= MaxPhyPagesPinned) {
    /* Leave .nEntries at the maximum value - Q is full.  Just advance the
     * tail to the next slot to be recycled.
     */
    vm->guestPhyPagePinQueue.tail =
      (vm->guestPhyPagePinQueue.tail + 1) % MaxPhyPagesPinned;
  }
  else {
    /* Q grew by one; the tail wraps to slot 0 once the Q becomes full. */
    vm->guestPhyPagePinQueue.nEntries++;
    vm->guestPhyPagePinQueue.tail =
      vm->guestPhyPagePinQueue.nEntries % MaxPhyPagesPinned;
  }

  return(1); /* OK. */
}