Bochs/bochs/plex86/kernel/host-linux.c
2008-02-05 22:57:43 +00:00

801 lines
21 KiB
C

/*
* plex86: run multiple x86 operating systems concurrently
* Copyright (C) 1999-2003 Kevin P. Lawton
*
* host-linux.c: Linux specific VM host driver functionality
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "plex86.h"
#define IN_HOST_SPACE
#include "monitor.h"
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/wrapper.h>
#include <linux/version.h>
#include <asm/irq.h>
#include <asm/atomic.h>
#ifndef VERSION_CODE
# define VERSION_CODE(vers,rel,seq) ( ((vers)<<16) | ((rel)<<8) | (seq) )
#endif
#if LINUX_VERSION_CODE < VERSION_CODE(2,4,20)
/* I use get_user_pages() to find and pin physical pages of memory
* underlying the guest physical memory malloc()'d from user space.
* This became an exported symbol available for kernel modules
* as of 2.4.20. You will have to recode some functions for
* lesser kernels.
*/
# error "Currently, you need Linux kernel 2.4.20 or above."
#endif
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,0)
# include <asm/uaccess.h>
#endif
#include <asm/io.h>
/************************************************************************/
/* Compatibility macros & convenience functions for older kernels */
/************************************************************************/
#ifndef EXPORT_NO_SYMBOLS
# define EXPORT_NO_SYMBOLS register_symtab(NULL)
#endif
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,29)
# define proc_register_dynamic proc_register
#endif
#if LINUX_VERSION_CODE < VERSION_CODE(2,2,0)
#define NEED_RESCHED need_resched
#else
#define NEED_RESCHED current->need_resched
#endif
/* Instrumentation of how many hardware interrupts were redirected
* to the host, while the VM monitor/guest was running. This can be
* written to by multiple contexts, so it needs SMP protection.
*/
static atomic_t interruptRedirCount[256];
#if LINUX_VERSION_CODE < VERSION_CODE(2,1,0)
static inline unsigned long
copy_from_user(void *to, const void *from, unsigned long n)
{
int i;
if ( (i = verify_area(VERIFY_READ, from, n)) != 0 )
return i;
memcpy_fromfs(to, from, n);
return 0;
}
static inline unsigned long
copy_to_user(void *to, const void *from, unsigned long n)
{
int i;
if ( (i = verify_area(VERIFY_WRITE, to, n)) != 0 )
return i;
memcpy_tofs(to, from, n);
return 0;
}
#endif
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,18) && !defined(THIS_MODULE)
/* Starting with version 2.1.18, the __this_module symbol is present,
* but the THIS_MODULE #define was introduced much later ...
*/
#define THIS_MODULE (&__this_module)
#endif
/************************************************************************/
/* Declarations */
/************************************************************************/
/* Use dynamic major number allocation. (Set non-zero for static allocation) */
#define PLEX86_MAJOR 0
static int plex_major = PLEX86_MAJOR;
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,18)
MODULE_PARM(plex_major, "i");
MODULE_PARM_DESC(plex_major, "major number (default " __MODULE_STRING(PLEX86_MAJOR) ")");
#endif
/* The kernel segment base. */
#if LINUX_VERSION_CODE < VERSION_CODE(2,1,0)
# define KERNEL_OFFSET 0xc0000000
#else
# define KERNEL_OFFSET 0x00000000
#endif
/* File operations. */
static int plex86_ioctl(struct inode *, struct file *, unsigned int,
unsigned long);
static int plex86_open(struct inode *, struct file *);
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,31)
static int plex86_release(struct inode *, struct file *);
#else
static void plex86_release(struct inode *, struct file *);
#endif
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,0)
static int plex86_mmap(struct file * file, struct vm_area_struct * vma);
#else
static int plex86_mmap(struct inode * inode, struct file * file,
struct vm_area_struct * vma);
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,9)
/* New License scheme. */
#ifdef MODULE_LICENSE
MODULE_LICENSE("GPL"); /* Close enough. Keeps kernel from complaining. */
#endif
#endif
/************************************************************************/
/* Structures / Variables */
/************************************************************************/
static int retrieveKernelModulePages(void);
static unsigned retrievePhyPages(Bit32u *page, int max_pages, void *addr,
unsigned size);
static struct file_operations plex86_fops = {
#if LINUX_VERSION_CODE >= VERSION_CODE(2,4,0)
owner: THIS_MODULE,
#endif
mmap: plex86_mmap,
ioctl: plex86_ioctl,
open: plex86_open,
release: plex86_release,
};
#ifdef CONFIG_DEVFS_FS
#include <linux/devfs_fs_kernel.h>
devfs_handle_t my_devfs_entry;
#endif
/* For the /proc/driver/plex86 entry. */
#if LINUX_VERSION_CODE >= VERSION_CODE(2,4,0) /* XXX - How far back? */
int plex86_read_procmem(char *, char **, off_t, int);
#else
int plex86_read_procmem(char *, char **, off_t, int, int);
#endif
#if LINUX_VERSION_CODE < VERSION_CODE(2,3,25)
static struct proc_dir_entry plex86_proc_entry = {
0, /* dynamic inode */
6, "driver/plex86", /* len, name */
S_IFREG | S_IRUGO, /* mode */
1, 0, 0,
0,
NULL,
&plex86_read_procmem, /* read function */
};
#endif
#if CONFIG_X86_PAE
# error "CONFIG_X86_PAE defined for this kernel, but unhandled in plex86"
#endif
/************************************************************************/
/* Main kernel module code */
/************************************************************************/
int
init_module(void)
{
int err;
/* Initialize structures which are not specific to each VM. These
* are things which are set only once upon kernel module initialization.
*/
memset(&kernelModulePages, 0, sizeof(kernelModulePages));
memset(&interruptRedirCount, 0, sizeof(interruptRedirCount));
/* Register the device with the kernel. */
err = register_chrdev(plex_major, "plex86", &plex86_fops);
if (err < 0) {
printk(KERN_WARNING "plex86: can't get major %d\n", plex_major);
return(err);
}
/* If this was a dynamic allocation, save the major for
* the release code
*/
if(!plex_major)
plex_major = err;
/* Register the /proc entry. */
#ifdef CONFIG_PROC_FS
#if LINUX_VERSION_CODE >= VERSION_CODE(2,3,25)
if (!create_proc_info_entry("driver/plex86", 0, NULL, plex86_read_procmem))
printk(KERN_ERR "plex86: registering /proc/driver/plex86 failed\n");
#else
proc_register_dynamic(&proc_root, &plex86_proc_entry);
#endif
#endif
/* Register /dev/misc/plex86 with devfs. */
#ifdef CONFIG_DEVFS_FS
my_devfs_entry = devfs_register(NULL, "misc/plex86",
DEVFS_FL_DEFAULT,
plex_major, 0 /* minor mode*/,
S_IFCHR | 0666, &plex86_fops,
NULL /* "info" */);
if (!my_devfs_entry)
printk(KERN_ERR "plex86: registering misc/plex86 devfs entry failed\n");
#endif
/* Retrieve the monitor physical pages. */
if ( !retrieveKernelModulePages() ) {
printk(KERN_ERR "plex86: retrieveKernelModulePages returned error\n");
err = -EINVAL;
goto fail_retrieve_pages;
}
/* Kernel independent code to be run when kernel module is loaded. */
if ( !hostModuleInit() ) {
printk(KERN_ERR "plex86: genericModuleInit returned error\n");
err = -EINVAL;
goto fail_cpu_capabilities;
}
/* Success. */
EXPORT_NO_SYMBOLS;
return(0);
fail_cpu_capabilities:
fail_retrieve_pages:
/* Unregister /proc entry. */
#ifdef CONFIG_PROC_FS
#if LINUX_VERSION_CODE >= VERSION_CODE(2,3,25)
remove_proc_entry("driver/plex86", NULL);
#else
proc_unregister(&proc_root, plex86_proc_entry.low_ino);
#endif
#endif
/* Unregister device. */
unregister_chrdev(plex_major, "plex86");
return err;
}
void
cleanup_module(void)
{
/* Unregister device. */
unregister_chrdev(plex_major, "plex86");
/* Unregister /proc entry. */
#ifdef CONFIG_PROC_FS
#if LINUX_VERSION_CODE >= VERSION_CODE(2,3,25)
remove_proc_entry("driver/plex86", NULL);
#else
proc_unregister(&proc_root, plex86_proc_entry.low_ino);
#endif
#endif
#ifdef CONFIG_DEVFS_FS
devfs_unregister(my_devfs_entry);
#endif
}
/************************************************************************/
/* Open / Release a VM */
/************************************************************************/
int
plex86_open(struct inode *inode, struct file *filp)
{
vm_t *vm;
#if LINUX_VERSION_CODE < VERSION_CODE(2,4,0)
MOD_INC_USE_COUNT;
#endif
/* Allocate a VM structure. */
if ( (vm = hostOSAllocZeroedMem(sizeof(vm_t))) == NULL )
return -ENOMEM;
filp->private_data = vm;
/* Kernel independent device open code. */
hostDeviceOpen(vm);
return(0);
}
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,31)
int
#else
void
#endif
plex86_release(struct inode *inode, struct file *filp)
{
vm_t *vm = (vm_t *)filp->private_data;
filp->private_data = NULL;
/* Free the virtual memory. */
hostUnallocVmPages( vm );
/* Free the VM structure. */
memset( vm, 0, sizeof(*vm) );
vfree( vm );
#if LINUX_VERSION_CODE < VERSION_CODE(2,4,0)
MOD_DEC_USE_COUNT;
#endif
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,31)
return(0);
#endif
}
int
plex86_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg)
{
vm_t *vm = (vm_t *)filp->private_data;
int ret;
/* Call non host-specific ioctl() code which calls back to this
* module only when it needs host-specific features.
*/
ret = hostIoctlGeneric(vm, inode, filp, cmd, arg);
/* Convert from plex86 errno codes to host-specific errno codes. Not
* very exciting.
*/
if ( ret < 0 )
ret = - hostOSConvertPlex86Errno(- ret);
return( ret );
}
int
#if LINUX_VERSION_CODE >= VERSION_CODE(2,1,0)
plex86_mmap(struct file * file, struct vm_area_struct * vma)
#else
plex86_mmap(struct inode * inode, struct file * file, struct vm_area_struct * vma)
#endif
{
vm_t *vm = (vm_t *)file->private_data;
UNUSED(vm);
return -EINVAL;
}
/************************************************************************/
/* Status reporting: /proc code */
/************************************************************************/
int
plex86_read_procmem(char *buf, char **start, off_t offset,
#if LINUX_VERSION_CODE >= VERSION_CODE(2,4,0)
int len
#else
int len, int unused
#endif
)
{
unsigned i;
len = 0;
len += sprintf(buf, "monitor-->host interrupt reflection counts\n");
for (i=0; i<256; i++) {
int count;
count = atomic_read( &interruptRedirCount[i] );
if (count)
len += sprintf(buf+len, " 0x%2x:%10u\n", i, count);
}
return(len);
}
int
retrieveKernelModulePages(void)
{
/*
* Retrieve start address and size of this module.
*
* Note that with old kernels, we cannot access the module info (size),
* hence we rely on the fact that Linux lets at least one page of
* virtual address space unused after the end of the module.
*/
#ifdef THIS_MODULE
Bit32u driverStartAddr = (Bit32u) THIS_MODULE;
unsigned size = THIS_MODULE->size;
#else
Bit32u driverStartAddr = (Bit32u) &mod_use_count_;
unsigned size = 0; /* Actual size determined below */
#endif
Bit32u driverStartAddrPageAligned = driverStartAddr & ~0xfff;
int nPages;
if (driverStartAddr != driverStartAddrPageAligned) {
/* Pretend this kernel module starts at the beginning of the page. */
/* If size is known, we have to add the extra offset from the beginning
* of the page.
*/
if (size)
size += (driverStartAddr & 0xfff);
}
nPages = retrievePhyPages(kernelModulePages.ppi, Plex86MaxKernelModulePages,
(void *) driverStartAddrPageAligned, size);
if (nPages == 0) {
printk(KERN_ERR "plex86: retrieveKernelModulePages: retrieve returned error.\n");
return( 0 ); /* Error. */
}
printk(KERN_WARNING "plex86: %u monitor pages located\n", nPages);
kernelModulePages.startOffset = driverStartAddr;
kernelModulePages.startOffsetPageAligned = driverStartAddrPageAligned;
kernelModulePages.nPages = nPages;
return( 1 ); /* OK. */
}
unsigned
retrievePhyPages(Bit32u *page, int max_pages, void *addr_v, unsigned size)
{
/*
* Grrr. There doesn't seem to be an exported mechanism to retrieve
* the physical pages underlying a vmalloc()'ed area. We do it the
* hard way ...
*/
pageEntry_t *host_pgd;
Bit32u host_cr3;
Bit32u addr; // start_addr;
unsigned n_pages;
int i;
addr = (Bit32u) addr_v;
if ( addr & 0xfff ) {
printk(KERN_ERR "plex86: retrievePhyPages: not page aligned!\n");
return 0;
}
if (!addr) {
printk(KERN_ERR "plex86: retrievePhyPages: addr NULL!\n");
return 0;
}
if (size == 0) {
/* Size unknown. Determine by cycling through page tables until
* we find one which is not present. We will assume that means
* the end of the data structure. Set the number of pages to
* cycle through, to one more than the maximum requested. This
* way we'll look through enough pages.
*/
n_pages = max_pages + 1;
}
else {
n_pages = BytesToPages(size);
if ( n_pages > max_pages ) {
printk(KERN_ERR "plex86: retrievePhyPages: n=%u > max=%u\n",
n_pages, max_pages);
return 0;
}
}
asm volatile ("movl %%cr3, %0" : "=r" (host_cr3));
host_pgd = (pageEntry_t *)(phys_to_virt(host_cr3 & ~0xfff));
for (i = 0; i < n_pages; i++) {
Bit32u laddr;
unsigned long lpage;
pgd_t *pgdPtr; pmd_t *pmdPtr; pte_t *ptePtr;
pgd_t pgdVal; pmd_t pmdVal; pte_t pteVal;
laddr = KERNEL_OFFSET + ((Bit32u) addr);
lpage = VMALLOC_VMADDR(laddr);
/* About to traverse the page tables. We need to lock others
* out of them briefly. Newer Linux versions can do a fine-grained
* lock on the page tables themselves. Older ones have to do
* a "big kernel lock".
*/
#if LINUX_VERSION_CODE >= VERSION_CODE(2,3,10)
spin_lock(&init_mm.page_table_lock);
#else
lock_kernel(); /* Big kernel lock. */
#endif
pgdPtr = pgd_offset(&init_mm, lpage);
pmdPtr = pmd_offset(pgdPtr, lpage);
ptePtr = pte_offset(pmdPtr, lpage);
pgdVal = *pgdPtr;
pmdVal = *pmdPtr;
pteVal = *ptePtr;
#if LINUX_VERSION_CODE >= VERSION_CODE(2,3,10)
spin_unlock(&init_mm.page_table_lock);
#else
unlock_kernel(); /* Big kernel unlock. */
#endif
if ( !(pgdVal.pgd & 1) ||
!(pmdVal.pmd & 1) ||
!(pteVal.pte_low & 1) ) {
if (size == 0)
return i; /* Report number of pages until area ended. */
printk(KERN_ERR "plex86: retrievePhyPages: "
"PDE.P==0: i=%u, n=%u laddr=0x%x\n", i, n_pages, laddr);
return 0; /* Error, ran into unmapped page in memory range. */
}
/* Abort if our page list is too small. */
if (i >= max_pages) {
printk(KERN_WARNING "plex86: page list is too small!\n");
printk(KERN_WARNING "plex86: n_pages=%u, max_pages=%u\n",
n_pages, max_pages);
return 0;
}
/* Get physical page address for this virtual page address. */
page[i] = pte_val(pteVal) >> 12;
/* Increment to the next virtual page address. */
addr += 4096;
}
return(n_pages);
}
/************************************************************************
* The requisite host-specific functions. An implementation of each of
* these functions needs to be offered for each host-XYZ.c file.
************************************************************************/
unsigned
hostOSIdle(void)
{
if (NEED_RESCHED)
schedule();
/* return !current_got_fatal_signal(); */
return( ! signal_pending(current) );
}
void *
hostOSAllocZeroedMem(unsigned long size)
{
void *ptr;
ptr = vmalloc(size);
if ( ((Bit32u) ptr) & 0x00000fff )
return( 0 ); /* Error. */
/* Zero pages. This also demand maps the pages in, which we need
* since we'll cycle through all the pages to get the physical
* address mappings.
*/
mon_memzero(ptr, size);
return( ptr );
}
void
hostOSFreeMem(void *ptr)
{
vfree(ptr);
}
void *
hostOSAllocZeroedPage(void)
{
return( (void *) get_zeroed_page(GFP_KERNEL) );
}
void
hostOSFreePage(void *ptr)
{
free_page( (Bit32u)ptr );
}
unsigned
hostOSGetAllocedMemPhyPages(Bit32u *page, int max_pages, void *ptr, unsigned size)
{
return( retrievePhyPages(page, max_pages, ptr, size) );
}
Bit32u
hostOSGetAllocedPagePhyPage(void *ptr)
{
if (!ptr) return 0;
/* return MAP_NR(ptr); */
return(__pa(ptr) >> PAGE_SHIFT);
}
void
hostOSPrint(char *fmt, ...)
{
#warning "Fix hostPrint"
#if 0
va_list args;
int ret;
unsigned char buffer[256];
va_start(args, fmt);
ret = mon_vsnprintf(buffer, 256, fmt, args);
if (ret == -1) {
printk(KERN_ERR "plex86: hostPrint: vsnprintf returns error.\n");
}
else {
printk(KERN_WARNING "plex86: %s\n", buffer);
}
#endif
}
int
hostOSConvertPlex86Errno(unsigned ret)
{
switch (ret) {
case 0: return(0);
case Plex86ErrnoEBUSY: return(EBUSY);
case Plex86ErrnoENOMEM: return(ENOMEM);
case Plex86ErrnoEFAULT: return(EFAULT);
case Plex86ErrnoEINVAL: return(EINVAL);
case Plex86ErrnoEACCES: return(EACCES);
case Plex86ErrnoEAGAIN: return(EAGAIN);
default:
printk(KERN_ERR "plex86: ioctlAllocVPhys: case %u\n", ret);
return(EINVAL);
}
}
Bit32u
hostOSKernelOffset(void)
{
return( KERNEL_OFFSET );
}
void
hostOSModuleCountReset(vm_t *vm, void *inode, void *filp)
{
#if LINUX_VERSION_CODE < VERSION_CODE(2,4,0)
while (MOD_IN_USE) {
MOD_DEC_USE_COUNT;
}
MOD_INC_USE_COUNT; /* bump back to 1 so release can decrement */
#endif
}
unsigned long
hostOSCopyFromUser(void *to, void *from, unsigned long len)
{
return( copy_from_user(to, from, len) );
}
unsigned long
hostOSCopyToUser(void *to, void *from, unsigned long len)
{
return( copy_to_user(to, from, len) );
}
Bit32u
hostOSGetAndPinUserPage(vm_t *vm, Bit32u userAddr, void **osSpecificPtr,
Bit32u *ppi, Bit32u *kernelAddr)
{
int ret;
struct page **pagePtr;
struct page *page;
pagePtr = (struct page **) osSpecificPtr;
ret = get_user_pages(current,
current->mm,
(unsigned long) userAddr,
1, /* 1 page. */
1, /* 'write': intent to write. */
0, /* 'force': ? */
pagePtr,
NULL /* struct vm_area_struct *[] */
);
if (ret != 1) {
printk(KERN_ERR "plex86: hostGetAndPinUserPages: failed.\n");
return(0); /* Error. */
}
page = *pagePtr; /* The returned "struct page *" value. */
/* Now that we have a list of "struct page *", one for each physical
* page of memory of the user space process's requested area, we can
* calculate the physical page address by simple pointer arithmetic
* based on "mem_map".
*/
*ppi = page - mem_map;
if (kernelAddr) {
/* Caller wants a kernel address returned which maps to this physical
* address.
*/
*kernelAddr = (Bit32u) kmap( page );
#warning "FIXME: Check return value here."
#warning "Also, conditionally compile for version and high memory support."
}
return(1); /* OK. */
}
void
hostOSUnpinUserPage(vm_t *vm, Bit32u userAddr, void *osSpecificPtr,
Bit32u ppi, Bit32u *kernelAddr, unsigned dirty)
{
#if 0
/* Here is some sample code from Linux 2.4.18, mm/memory.c:__free_pte() */
struct page *page = pte_page(pte);
if ((!VALID_PAGE(page)) || PageReserved(page))
return;
if (pte_dirty(pte))
set_page_dirty(page);
free_page_and_swap_cache(page);
#endif
struct page *page;
page = (struct page *) osSpecificPtr;
/* If a kernel address is passed, that means that previously we created
* a mapping for this physical page in the kernel address pace.
* We should unmap it. Only really useful for pages allocated from
* high memory.
*/
if (kernelAddr)
kunmap(page);
/* If the page was dirtied due to the guest running in the VM, we
* need to tell the kernel about that since it is not aware of
* the VM page tables.
*/
if (dirty)
set_page_dirty(page);
/* Release/unpin the page. */
put_page(page);
}
void
hostOSInstrumentIntRedirCount(unsigned interruptVector)
{
atomic_inc( &interruptRedirCount[interruptVector] );
}