From d94742232d15079a24982286f93236d34cb42bbf Mon Sep 17 00:00:00 2001
From: jym
Date: Sat, 24 Jul 2010 00:45:54 +0000
Subject: [PATCH] Welcome PAE inside i386 current.

This patch is inspired by work previously done by Jeremy Morse, ported by me
to -current, merged with the work previously done for port-xen, together with
additional fixes and improvements.

The PAE option is disabled by default in GENERIC (but will be enabled in ALL
in the next few days).

In short, PAE switches the CPU to a mode where physical addresses become
36 bits wide (64 GiB). The virtual address space remains 32 bits (4 GiB). To
cope with the increased size of physical addresses, the kernel and the MMU
manipulate them as 64-bit variables.

When supported by the CPU, PAE also allows use of the NX/XD bit, which
enforces no-execute rights on a per-physical-page basis.

Notes:

- reworked locore.S

- introduced cpu_load_pmap(), used to switch the pmap of the current CPU.
  Due to the different handling of pmap mappings with PAE vs. !PAE and Xen
  vs. native, the details are hidden within this function (see the condensed
  sketch at the end of this message). This makes it easy to call from
  assembly, as some features, like BIOS calls, switch to pmap_kernel()
  before mapping trampoline code in low memory.

- some changes in bioscall and kvm86_call, to reflect the above.

- the L3 is "pinned" per-CPU, and is only manipulated by a reduced set of
  functions within pmap. To track the L3, two elements were added to struct
  cpu_info, namely ci_pae_l3_pdirpa (PA of the L3) and ci_pae_l3_pdir (VA of
  the L3). The rest of the code assumes it runs "just like" a normal i386,
  except that the L2 is 4 pages long (PTP_LEVELS is still 2).

- similar to the ci_pae_l3_pdir{,pa} variables, amd64's xen_current_user_pgd
  becomes an element of cpu_info (slowly paving the way for the MP world).

- the bootinfo_source struct declaration is modified to cope with the
  paddr_t size change under PAE (it is not correct to assume that bs_addrs
  holds paddr_t entries when compiled with PAE - they should remain 32 bits).
  bs_addrs is now a void * array (in the bootloader code under i386/stand/,
  bs_addrs is a physaddr_t, which is an unsigned long).

- fixes in the multiboot code (same reason as bootinfo: the paddr_t size
  change). I used Elf32_* types, used RELOC() where necessary, and moved the
  memcpy() calls out of the if/else-if chain (I do not expect the sym and
  str tables to overlap with the ELF image).

- 64-bit atomic functions for pmap.

- all pmap_pdirpa accesses are now done through the pmap_pdirpa macro. It
  hides the L3/L2 stuff from PAE, as well as the pm_pdirpa change in struct
  pmap (it now becomes a PDP_SIZE array, with or without PAE).

- manipulation of the recursive mappings (PDIR_SLOT_{,A}PTE) is done via
  loops over PDP_SIZE.

See also http://mail-index.netbsd.org/port-i386/2010/07/17/msg002062.html

No objection was raised on port-i386@ and port-xen@ for about a week.

XXX kvm(3) will be fixed in another patch to properly handle both PAE and
!PAE kernel dumps (the VA => PA macros are slightly different, and proper
64-bit PA support is needed in kvm_i386).

XXX Mixing PAE and !PAE modules may lead to unwanted/unexpected results.
This cannot be solved easily, and needs lots of thinking before being
declared safe (paddr_t/bus_addr_t size handling, PD/PT macro abstractions).
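As an illustration of the cpu_load_pmap() note above, here is the native
(non-Xen) version, condensed from the sys/arch/x86/x86/cpu.c hunk below (a
sketch, not a verbatim copy; the Xen variant in sys/arch/xen/x86/cpu.c
instead queues hypervisor PTE updates for the L3 slots):

	void
	cpu_load_pmap(struct pmap *pmap)
	{
	#ifdef PAE
		int i, s;
		pd_entry_t *l3_pd;

		s = splvm();		/* just to be safe */
		/* repoint the per-CPU L3 entries at the new pmap's L2 pages */
		l3_pd = curcpu()->ci_pae_l3_pdir;
		for (i = 0; i < PDP_SIZE; i++)
			l3_pd[i] = pmap->pm_pdirpa[i] | PG_V;
		splx(s);
		tlbflush();	/* %cr3 keeps pointing at the pinned L3 */
	#else /* PAE */
		lcr3(pmap_pdirpa(pmap, 0));	/* classic page table switch */
	#endif /* PAE */
	}

Callers such as bioscall, kvm86_call and pmap_load() push a struct pmap *
and call this helper instead of loading %cr3 directly, which keeps the
PAE/!PAE and Xen/native differences out of the assembly.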
--- sys/arch/i386/conf/GENERIC | 5 +- sys/arch/i386/i386/bioscall.S | 21 +++-- sys/arch/i386/i386/kvm86call.S | 24 +++-- sys/arch/i386/i386/locore.S | 106 ++++++++++++++------- sys/arch/i386/i386/machdep.c | 23 +++-- sys/arch/i386/i386/mptramp.S | 10 +- sys/arch/i386/i386/multiboot.c | 51 +++++------ sys/arch/i386/include/pmap.h | 96 ++++++++++++------- sys/arch/x86/include/cpu.h | 15 ++- sys/arch/x86/include/pmap.h | 14 ++- sys/arch/x86/x86/cpu.c | 45 ++++++++- sys/arch/x86/x86/pmap.c | 162 +++++++++++++++++---------------- sys/arch/xen/x86/cpu.c | 65 ++++++++++++- sys/arch/xen/x86/x86_xpmap.c | 24 ++--- sys/arch/xen/x86/xenfunc.c | 8 +- 15 files changed, 438 insertions(+), 231 deletions(-) diff --git a/sys/arch/i386/conf/GENERIC b/sys/arch/i386/conf/GENERIC index 58acb70a3ffb..22a2ececbcfc 100644 --- a/sys/arch/i386/conf/GENERIC +++ b/sys/arch/i386/conf/GENERIC @@ -1,4 +1,4 @@ -# $NetBSD: GENERIC,v 1.988 2010/07/23 00:43:20 jakllsch Exp $ +# $NetBSD: GENERIC,v 1.989 2010/07/24 00:45:54 jym Exp $ # # GENERIC machine description file # @@ -22,7 +22,7 @@ include "arch/i386/conf/std.i386" options INCLUDE_CONFIG_FILE # embed config file in kernel binary -#ident "GENERIC-$Revision: 1.988 $" +#ident "GENERIC-$Revision: 1.989 $" maxusers 64 # estimated number of users @@ -35,6 +35,7 @@ no options COREDUMP # CPU-related options. options VM86 # virtual 8086 emulation options USER_LDT # user-settable LDT; used by WINE +#options PAE # PAE mode (36 bits physical addressing) # Enhanced SpeedStep Technology in the Pentium M options ENHANCED_SPEEDSTEP diff --git a/sys/arch/i386/i386/bioscall.S b/sys/arch/i386/i386/bioscall.S index b2acf17a5745..d1f7fc101819 100644 --- a/sys/arch/i386/i386/bioscall.S +++ b/sys/arch/i386/i386/bioscall.S @@ -1,4 +1,4 @@ -/* $NetBSD: bioscall.S,v 1.8 2008/04/28 20:23:24 martin Exp $ */ +/* $NetBSD: bioscall.S,v 1.9 2010/07/24 00:45:54 jym Exp $ */ /*- * Copyright (c) 1997 The NetBSD Foundation, Inc. 
@@ -30,7 +30,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: bioscall.S,v 1.8 2008/04/28 20:23:24 martin Exp $"); +__KERNEL_RCSID(0, "$NetBSD: bioscall.S,v 1.9 2010/07/24 00:45:54 jym Exp $"); #include @@ -39,8 +39,6 @@ __KERNEL_RCSID(0, "$NetBSD: bioscall.S,v 1.8 2008/04/28 20:23:24 martin Exp $"); /* LINTSTUB: include */ /* LINTSTUB: include */ - .globl _C_LABEL(PDPpaddr) /* from locore.S */ - .section ".rodata" _C_LABEL(biostramp_image): .globl _C_LABEL(biostramp_image) @@ -69,11 +67,11 @@ NENTRY(bioscall) pushl %ebp movl %esp,%ebp /* set up frame ptr */ - movl %cr3,%eax /* save PDP base register */ + /* install lwp0 pmap */ + movl _C_LABEL(kernel_pmap_ptr),%eax pushl %eax - - movl _C_LABEL(PDPpaddr),%eax /* install proc0 PDP */ - movl %eax,%cr3 + call _C_LABEL(cpu_load_pmap) + addl $4,%esp movl $(BIOSTRAMP_BASE),%eax /* address of trampoline area */ pushl 12(%ebp) @@ -81,8 +79,11 @@ NENTRY(bioscall) call *%eax /* machdep.c initializes it */ addl $8,%esp /* clear args from stack */ - popl %eax - movl %eax,%cr3 /* restore PTDB register */ + /* restore pmap - saved value is in curcpu()->ci_pmap */ + movl %fs:(CPU_INFO_PMAP),%eax + pushl %eax + call _C_LABEL(cpu_load_pmap) + addl $4,%esp leave ret diff --git a/sys/arch/i386/i386/kvm86call.S b/sys/arch/i386/i386/kvm86call.S index c77b9b408a3e..127bb451dafe 100644 --- a/sys/arch/i386/i386/kvm86call.S +++ b/sys/arch/i386/i386/kvm86call.S @@ -1,4 +1,4 @@ -/* $NetBSD: kvm86call.S,v 1.9 2008/01/04 15:55:31 yamt Exp $ */ +/* $NetBSD: kvm86call.S,v 1.10 2010/07/24 00:45:54 jym Exp $ */ /*- * Copyright (c) 1998 Jonathan Lemon @@ -34,7 +34,7 @@ #include "assym.h" -__KERNEL_RCSID(0, "$NetBSD: kvm86call.S,v 1.9 2008/01/04 15:55:31 yamt Exp $"); +__KERNEL_RCSID(0, "$NetBSD: kvm86call.S,v 1.10 2010/07/24 00:45:54 jym Exp $"); .data .align 4 @@ -79,10 +79,7 @@ ENTRY(kvm86_call) andl $~0x0200,4(%eax,%edi,1) /* reset "task busy" */ ltr %di - movl %cr3,%eax - pushl %eax /* save address space */ - movl PDPpaddr,%ecx - movl %ecx,%ebx + movl _C_LABEL(PDPpaddr),%ebx addl $KERNBASE,%ebx /* va of Idle PDP */ movl 0(%ebx),%eax pushl %eax /* old pde */ @@ -93,7 +90,12 @@ ENTRY(kvm86_call) movl vm86newptd,%eax /* mapping for vm86 page table */ movl %eax,0(%ebx) /* ... install as PDP entry 0 */ - movl %ecx,%cr3 /* new page tables */ + /* install Idle pmap (lwp0 pmap) */ + movl _C_LABEL(kernel_pmap_ptr),%eax + pushl %eax + call _C_LABEL(cpu_load_pmap) + addl $4,%esp + movl vm86frame,%esp /* switch to new stack */ movl $1,kvm86_incall /* set flag for trap() */ @@ -129,8 +131,12 @@ ENTRY(kvm86_ret) popl %ebx /* saved va of Idle PDP */ popl %eax movl %eax,0(%ebx) /* restore old pde */ - popl %eax - movl %eax,%cr3 /* install old page table */ + + /* restore pmap - saved value is in curcpu()->ci_pmap */ + movl %fs:(CPU_INFO_PMAP),%eax + pushl %eax + call _C_LABEL(cpu_load_pmap) + addl $4,%esp movl $0,kvm86_incall /* reset trapflag */ diff --git a/sys/arch/i386/i386/locore.S b/sys/arch/i386/i386/locore.S index e795556a0ad5..a7276e5da940 100644 --- a/sys/arch/i386/i386/locore.S +++ b/sys/arch/i386/i386/locore.S @@ -1,4 +1,4 @@ -/* $NetBSD: locore.S,v 1.92 2010/07/15 18:55:27 jym Exp $ */ +/* $NetBSD: locore.S,v 1.93 2010/07/24 00:45:54 jym Exp $ */ /* * Copyright-o-rama! @@ -129,7 +129,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: locore.S,v 1.92 2010/07/15 18:55:27 jym Exp $"); +__KERNEL_RCSID(0, "$NetBSD: locore.S,v 1.93 2010/07/24 00:45:54 jym Exp $"); #include "opt_compat_oldboot.h" #include "opt_ddb.h" @@ -482,29 +482,43 @@ try586: /* Use the `cpuid' instruction. 
*/ movl $_RELOC(tmpstk),%esp # bootstrap stack end location /* - * Virtual address space of kernel: + * Virtual address space of kernel, without PAE. The page dir is 1 page long. * * text | data | bss | [syms] | [blobs] | page dir | proc0 kstack | L1 ptp * 0 1 2 3 + * + * Virtual address space of kernel, with PAE. We need 4 pages for the page dir + * and 1 page for the L3. + * text | data | bss | [syms] | [blobs] | L3 | page dir | proc0 kstack | L1 ptp + * 0 1 5 6 7 */ - +#ifndef PAE #define PROC0_PDIR_OFF 0 -#define PROC0_STK_OFF (PROC0_PDIR_OFF + PAGE_SIZE) +#else +#define PROC0_L3_OFF 0 +#define PROC0_PDIR_OFF 1 * PAGE_SIZE +#endif + +#define PROC0_STK_OFF (PROC0_PDIR_OFF + PDP_SIZE * PAGE_SIZE) #define PROC0_PTP1_OFF (PROC0_STK_OFF + UPAGES * PAGE_SIZE) /* - * fillkpt + * fillkpt - Fill in a kernel page table * eax = pte (page frame | control | status) * ebx = page table address * ecx = number of pages to map + * + * For PAE, each entry is 8 bytes long: we must set the 4 upper bytes to 0. + * This is done by the first instruction of fillkpt. In the non-PAE case, this + * instruction just clears the page table entry. */ #define fillkpt \ -1: movl %eax,(%ebx) ; /* store phys addr */ \ - addl $4,%ebx ; /* next pte/pde */ \ - addl $PAGE_SIZE,%eax ; /* next phys page */ \ - loop 1b ; \ - +1: movl $0,(PDE_SIZE-4)(%ebx) ; /* clear bits */ \ + movl %eax,(%ebx) ; /* store phys addr */ \ + addl $PDE_SIZE,%ebx ; /* next pte/pde */ \ + addl $PAGE_SIZE,%eax ; /* next phys page */ \ + loop 1b ; /* Find end of kernel image. */ movl $RELOC(end),%edi @@ -538,9 +552,14 @@ try586: /* Use the `cpuid' instruction. */ incl %eax /* one more ptp for VAs stolen by bootstrap */ 1: movl %eax,RELOC(nkptp)+1*4 - /* tablesize = (1 + UPAGES + nkptp) << PGSHIFT; */ - addl $(1+UPAGES),%eax + /* tablesize = (PDP_SIZE + UPAGES + nkptp) << PGSHIFT; */ + addl $(PDP_SIZE+UPAGES),%eax +#ifdef PAE + incl %eax /* one more page for the L3 PD */ + shll $PGSHIFT+1,%eax /* PTP tables are twice larger with PAE */ +#else shll $PGSHIFT,%eax +#endif movl %eax,RELOC(tablesize) /* ensure that nkptp covers bootstrap tables */ @@ -578,7 +597,10 @@ try586: /* Use the `cpuid' instruction. */ */ movl $_RELOC(KERNTEXTOFF),%eax movl %eax,%ecx - shrl $(PGSHIFT-2),%ecx /* ((n >> PGSHIFT) << 2) for # pdes */ + shrl $(PGSHIFT-2),%ecx /* ((n >> PGSHIFT) << 2) for # pdes */ +#ifdef PAE + shll $1,%ecx /* pdes are twice larger with PAE */ +#endif addl %ecx,%ebx /* Map the kernel text read-only. */ @@ -605,36 +627,51 @@ try586: /* Use the `cpuid' instruction. */ * Construct a page table directory. */ /* Set up top level entries for identity mapping */ - leal (PROC0_PDIR_OFF)(%esi),%ebx + leal (PROC0_PDIR_OFF)(%esi),%ebx leal (PROC0_PTP1_OFF)(%esi),%eax orl $(PG_V|PG_KW), %eax movl RELOC(nkptp)+1*4,%ecx fillkpt /* Set up top level entries for actual kernel mapping */ - leal (PROC0_PDIR_OFF + L2_SLOT_KERNBASE*4)(%esi),%ebx + leal (PROC0_PDIR_OFF + L2_SLOT_KERNBASE*PDE_SIZE)(%esi),%ebx leal (PROC0_PTP1_OFF)(%esi),%eax orl $(PG_V|PG_KW), %eax movl RELOC(nkptp)+1*4,%ecx fillkpt /* Install a PDE recursively mapping page directory as a page table! 
*/ - leal (PROC0_PDIR_OFF + PDIR_SLOT_PTE*4)(%esi),%ebx - leal (PROC0_PDIR_OFF)(%esi),%eax + leal (PROC0_PDIR_OFF + PDIR_SLOT_PTE*PDE_SIZE)(%esi),%ebx + leal (PROC0_PDIR_OFF)(%esi),%eax orl $(PG_V|PG_KW),%eax - movl %eax,(%ebx) - + movl $PDP_SIZE,%ecx + fillkpt + +#ifdef PAE + /* Fill in proc0 L3 page with entries pointing to the page dirs */ + leal (PROC0_L3_OFF)(%esi),%ebx + leal (PROC0_PDIR_OFF)(%esi),%eax + orl $(PG_V),%eax + movl $PDP_SIZE,%ecx + fillkpt + + /* Enable PAE mode */ + movl %cr4,%eax + orl $CR4_PAE,%eax + movl %eax,%cr4 +#endif /* Save phys. addr of PDP, for libkvm. */ - movl %esi,RELOC(PDPpaddr) + leal (PROC0_PDIR_OFF)(%esi),%eax + movl %eax,RELOC(PDPpaddr) - /* - * Startup checklist: - * 1. Load %cr3 with pointer to PDIR. - */ + /* + * Startup checklist: + * 1. Load %cr3 with pointer to PDIR (or L3 PD page for PAE). + */ movl %esi,%eax # phys address of ptd in proc 0 movl %eax,%cr3 # load ptd addr into mmu - + /* * 2. Enable paging and the rest of it. */ @@ -653,10 +690,11 @@ begin: * memory, remove it. */ movl _C_LABEL(nkptp)+1*4,%ecx - leal (PROC0_PDIR_OFF)(%esi),%ebx # old, phys address of PDIR - addl $(KERNBASE), %ebx # new, virtual address of PDIR -1: movl $0,(%ebx) - addl $4,%ebx + leal (PROC0_PDIR_OFF)(%esi),%ebx # old, phys address of PDIR + addl $(KERNBASE), %ebx # new, virtual address of PDIR +1: movl $0,(PDE_SIZE-4)(%ebx) # Upper bits (for PAE) + movl $0,(%ebx) + addl $PDE_SIZE,%ebx loop 1b /* Relocate atdevbase. */ @@ -688,9 +726,13 @@ begin: movl _C_LABEL(tablesize),%eax addl %esi,%eax # skip past stack and page tables +#ifdef PAE + pushl $0 # init386() expects a 64 bits paddr_t with PAE +#endif pushl %eax call _C_LABEL(init386) # wire 386 chip for unix operation - addl $4+NGDT*8,%esp # pop temporary gdt + addl $PDE_SIZE,%esp # pop paddr_t + addl $NGDT*8,%esp # pop temporary gdt #ifdef SAFARI_FIFO_HACK movb $5,%al @@ -765,7 +807,7 @@ start: #endif pushl %esi call _C_LABEL(init386) # wire 386 chip for unix operation - addl $PDE_SIZE,%esp + addl $PDE_SIZE,%esp # pop paddr_t call _C_LABEL(main) #if defined(XEN) && !defined(XEN_COMPAT_030001) diff --git a/sys/arch/i386/i386/machdep.c b/sys/arch/i386/i386/machdep.c index 418ea244de68..e964b90730e7 100644 --- a/sys/arch/i386/i386/machdep.c +++ b/sys/arch/i386/i386/machdep.c @@ -1,4 +1,4 @@ -/* $NetBSD: machdep.c,v 1.690 2010/07/15 23:20:34 jym Exp $ */ +/* $NetBSD: machdep.c,v 1.691 2010/07/24 00:45:54 jym Exp $ */ /*- * Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009 @@ -67,7 +67,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.690 2010/07/15 23:20:34 jym Exp $"); +__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.691 2010/07/24 00:45:54 jym Exp $"); #include "opt_beep.h" #include "opt_compat_ibcs2.h" @@ -320,7 +320,7 @@ int biosmem_implicit; * boot loader. Only be used by native_loader(). */ struct bootinfo_source { uint32_t bs_naddrs; - paddr_t bs_addrs[1]; /* Actually longer. */ + void *bs_addrs[1]; /* Actually longer. */ }; /* Only called by locore.h; no need to be in a header file. 
*/ @@ -384,10 +384,10 @@ native_loader(int bl_boothowto, int bl_bootdev, for (i = 0; i < bl_bootinfo->bs_naddrs; i++) { struct btinfo_common *bc; - bc = (struct btinfo_common *)(bl_bootinfo->bs_addrs[i]); + bc = bl_bootinfo->bs_addrs[i]; - if ((paddr_t)(data + bc->len) > - (paddr_t)(&bidest->bi_data[0] + BOOTINFO_MAXSIZE)) + if ((data + bc->len) > + (&bidest->bi_data[0] + BOOTINFO_MAXSIZE)) break; memcpy(data, bc, bc->len); @@ -1312,6 +1312,14 @@ init386(paddr_t first_avail) (void *)atdevbase)); #endif +#if defined(PAE) && !defined(XEN) + /* + * Save VA and PA of L3 PD of boot processor (for Xen, this is done + * in xen_pmap_bootstrap()) + */ + cpu_info_primary.ci_pae_l3_pdirpa = rcr3(); + cpu_info_primary.ci_pae_l3_pdir = (pd_entry_t *)(rcr3() + KERNBASE); +#endif /* PAE && !XEN */ #ifdef XBOX /* @@ -1457,6 +1465,9 @@ init386(paddr_t first_avail) VM_PROT_ALL, 0); /* protection */ pmap_update(pmap_kernel()); memcpy((void *)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size); + + /* Needed early, for bioscall() and kvm86_call() */ + cpu_info_primary.ci_pmap = pmap_kernel(); #endif #endif /* !XEN */ diff --git a/sys/arch/i386/i386/mptramp.S b/sys/arch/i386/i386/mptramp.S index 8d75adb70139..a1599c8bfe34 100644 --- a/sys/arch/i386/i386/mptramp.S +++ b/sys/arch/i386/i386/mptramp.S @@ -1,4 +1,4 @@ -/* $NetBSD: mptramp.S,v 1.20 2010/02/09 23:09:47 jym Exp $ */ +/* $NetBSD: mptramp.S,v 1.21 2010/07/24 00:45:55 jym Exp $ */ /*- * Copyright (c) 2000 The NetBSD Foundation, Inc. @@ -76,7 +76,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: mptramp.S,v 1.20 2010/02/09 23:09:47 jym Exp $"); +__KERNEL_RCSID(0, "$NetBSD: mptramp.S,v 1.21 2010/07/24 00:45:55 jym Exp $"); #include "opt_mpbios.h" /* for MPDEBUG */ @@ -160,6 +160,12 @@ _TRMP_LABEL(mp_startup) movl %eax,%cr4 1: +#ifdef PAE /* Enable PAE */ + movl %cr4,%eax + or $CR4_PAE,%eax + movl %eax,%cr4 +#endif + movl RELOC(mp_pdirpa),%ecx HALTT(0x5,%ecx) diff --git a/sys/arch/i386/i386/multiboot.c b/sys/arch/i386/i386/multiboot.c index e7fcff6aef95..630de019b730 100644 --- a/sys/arch/i386/i386/multiboot.c +++ b/sys/arch/i386/i386/multiboot.c @@ -1,4 +1,4 @@ -/* $NetBSD: multiboot.c,v 1.19 2009/02/22 18:05:42 ahoka Exp $ */ +/* $NetBSD: multiboot.c,v 1.20 2010/07/24 00:45:55 jym Exp $ */ /*- * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc. @@ -30,7 +30,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: multiboot.c,v 1.19 2009/02/22 18:05:42 ahoka Exp $"); +__KERNEL_RCSID(0, "$NetBSD: multiboot.c,v 1.20 2010/07/24 00:45:55 jym Exp $"); #include "opt_multiboot.h" @@ -276,12 +276,11 @@ copy_syms(struct multiboot_info *mi) { #define RELOC(type, x) ((type)((vaddr_t)(x) - KERNBASE)) int i; - Elf32_Shdr *symtabp, *strtabp; struct multiboot_symbols *ms; - size_t symsize, strsize; - paddr_t symaddr, straddr; - paddr_t symstart, strstart; - + Elf32_Shdr *symtabp, *strtabp; + Elf32_Word symsize, strsize; + Elf32_Addr symaddr, straddr; + Elf32_Addr symstart, strstart; /* * Check if the Multiboot information header has symbols or not. @@ -336,38 +335,32 @@ copy_syms(struct multiboot_info *mi) * that if the tables start before the kernel's end address, * they will not grow over this address. 
*/ - if ((paddr_t)symtabp < (paddr_t)&end - KERNBASE && - (paddr_t)strtabp < (paddr_t)&end - KERNBASE) { - symstart = (paddr_t)((vaddr_t)&end - KERNBASE); + if ((void *)symtabp < RELOC(void *, &end) && + (void *)strtabp < RELOC(void *, &end)) { + symstart = RELOC(Elf32_Addr, &end); strstart = symstart + symsize; - memcpy((void *)symstart, (void *)symaddr, symsize); - memcpy((void *)strstart, (void *)straddr, strsize); - } else if ((paddr_t)symtabp > (paddr_t)&end - KERNBASE && - (paddr_t)strtabp < (paddr_t)&end - KERNBASE) { - symstart = (paddr_t)((vaddr_t)&end - KERNBASE); + } else if ((void *)symtabp > RELOC(void *, &end) && + (void *)strtabp < RELOC(void *, &end)) { + symstart = RELOC(Elf32_Addr, &end); strstart = symstart + symsize; - memcpy((void *)symstart, (void *)symaddr, symsize); - memcpy((void *)strstart, (void *)straddr, strsize); - } else if ((paddr_t)symtabp < (paddr_t)&end - KERNBASE && - (paddr_t)strtabp > (paddr_t)&end - KERNBASE) { - strstart = (paddr_t)((vaddr_t)&end - KERNBASE); + } else if ((void *)symtabp < RELOC(void *, &end) && + (void *)strtabp > RELOC(void *, &end)) { + strstart = RELOC(Elf32_Addr, &end); symstart = strstart + strsize; - memcpy((void *)strstart, (void *)straddr, strsize); - memcpy((void *)symstart, (void *)symaddr, symsize); } else { /* symtabp and strtabp are both over end */ - if ((paddr_t)symtabp < (paddr_t)strtabp) { - symstart = (paddr_t)((vaddr_t)&end - KERNBASE); + if (symtabp < strtabp) { + symstart = RELOC(Elf32_Addr, &end); strstart = symstart + symsize; - memcpy((void *)symstart, (void *)symaddr, symsize); - memcpy((void *)strstart, (void *)straddr, strsize); } else { - strstart = (paddr_t)((vaddr_t)&end - KERNBASE); + strstart = RELOC(Elf32_Addr, &end); symstart = strstart + strsize; - memcpy((void *)strstart, (void *)straddr, strsize); - memcpy((void *)symstart, (void *)symaddr, symsize); } } + + memcpy((void *)strstart, (void *)straddr, strsize); + memcpy((void *)symstart, (void *)symaddr, symsize); + *RELOC(int *, &esym) = (int)(symstart + symsize + strsize + KERNBASE); diff --git a/sys/arch/i386/include/pmap.h b/sys/arch/i386/include/pmap.h index be4be7e5f0fa..a4ee1f0155ee 100644 --- a/sys/arch/i386/include/pmap.h +++ b/sys/arch/i386/include/pmap.h @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.h,v 1.106 2010/07/15 18:58:40 jym Exp $ */ +/* $NetBSD: pmap.h,v 1.107 2010/07/24 00:45:55 jym Exp $ */ /* * @@ -181,25 +181,45 @@ * note that in the APTE_BASE space, the APDP appears at VA * "APDP_BASE" (0xfffff000). * - * When PAE is in use, the L3 page directory breaks up the address space in - * 4 1GB * regions, each of them broken in 512 2MB regions by the L2 PD - * (the size of the pages at the L1 level is still 4K). + * - PAE support - + * --------------- + * + * PAE adds another layer of indirection during address translation, breaking + * up the translation process in 3 different levels: + * - L3 page directory, containing 4 * 64-bits addresses (index determined by + * bits [31:30] from the virtual address). This breaks up the address space + * in 4 1GB regions. + * - the PD (L2), containing 512 64-bits addresses, breaking each L3 region + * in 512 * 2MB regions. + * - the PT (L1), also containing 512 64-bits addresses (at L1, the size of + * the pages is still 4K). + * * The kernel virtual space is mapped by the last entry in the L3 page, * the first 3 entries mapping the user VA space. 
+ * * Because the L3 has only 4 entries of 1GB each, we can't use recursive - * mappings at this level for PDP_PDE and APDP_PDE (this would eat 2 of the - * 4GB virtual space). There's also restrictions imposed by Xen on the - * last entry of the L3 PD, which makes it hard to use one L3 page per pmap - * switch %cr3 to switch pmaps. So we use one static L3 page which is - * always loaded in %cr3, and we use it as 2 virtual PD pointers: one for - * kernel space (L3[3], always loaded), and one for user space (in fact the - * first 3 entries of the L3 PD), and we claim the VM has only a 2-level - * PTP (with the L2 index extended by 2 bytes). - * PTE_BASE and APTE_BASE will need 4 entries in the L2 page table. - * In addition, we can't recursively map L3[3] (Xen wants the ref count on - * this page to be exactly once), so we use a shadow PD page for the last - * L2 PD. The shadow page could be static too, but to make pm_pdir[] - * contigous we'll allocate/copy one page per pmap. + * mappings at this level for PDP_PDE and APDP_PDE (this would eat up 2 of + * the 4GB virtual space). There are also restrictions imposed by Xen on the + * last entry of the L3 PD (reference count to this page cannot be bigger + * than 1), which makes it hard to use one L3 page per pmap to switch + * between pmaps using %cr3. + * + * As such, each CPU gets its own L3 page that is always loaded into its %cr3 + * (ci_pae_l3_pd in the associated cpu_info struct). We claim that the VM has + * only a 2-level PTP (similar to the non-PAE case). L2 PD is now 4 contiguous + * pages long (corresponding to the 4 entries of the L3), and the different + * index/slots (like PDP_PDE) are adapted accordingly. + * + * Kernel space remains in L3[3], L3[0-2] maps the user VA space. Switching + * between pmaps consists in modifying the first 3 entries of the CPU's L3 page. + * + * PTE_BASE and APTE_BASE will need 4 entries in the L2 PD pages to map the + * L2 pages recursively. + * + * In addition, for Xen, we can't recursively map L3[3] (Xen wants the ref + * count on this page to be exactly one), so we use a shadow PD page for + * the last L2 PD. The shadow page could be static too, but to make pm_pdir[] + * contiguous we'll allocate/copy one page per pmap. */ /* XXX MP should we allocate one APDP_PDE per processor?? */ @@ -219,12 +239,16 @@ #ifdef PAE #define L2_SLOT_PTE (KERNBASE/NBPD_L2-4) /* 1532: for recursive PDP map */ #define L2_SLOT_KERN (KERNBASE/NBPD_L2) /* 1536: start of kernel space */ -#define L2_SLOT_APTE 1960 /* 1964-2047 reserved by Xen */ +#ifndef XEN +#define L2_SLOT_APTE 2044 /* 2044: alternative recursive slot */ +#else +#define L2_SLOT_APTE 1960 /* 1964-2047 reserved by Xen */ +#endif #else /* PAE */ #define L2_SLOT_PTE (KERNBASE/NBPD_L2-1) /* 767: for recursive PDP map */ #define L2_SLOT_KERN (KERNBASE/NBPD_L2) /* 768: start of kernel space */ #ifndef XEN -#define L2_SLOT_APTE 1023 /* 1023: alternative recursive slot */ +#define L2_SLOT_APTE 1023 /* 1023: alternative recursive slot */ #else #define L2_SLOT_APTE 1007 /* 1008-1023 reserved by Xen */ #endif @@ -254,17 +278,17 @@ #define AL2_BASE ((pd_entry_t *)((char *)AL1_BASE + L2_SLOT_PTE * NBPD_L1)) #define PDP_PDE (L2_BASE + PDIR_SLOT_PTE) -#ifdef PAE +#if defined(PAE) && defined(XEN) /* - * when PAE is in use we can't write APDP_PDE though the recursive mapping, - * because it points to the shadow PD. 
Use the kernel PD instead, which is - * static + * when PAE is in use under Xen, we can't write APDP_PDE through the recursive + * mapping, because it points to the shadow PD. Use the kernel PD instead, + * which is static */ #define APDP_PDE (&pmap_kl2pd[l2tol2(PDIR_SLOT_APTE)]) #define APDP_PDE_SHADOW (L2_BASE + PDIR_SLOT_APTE) -#else /* PAE */ +#else /* PAE && XEN */ #define APDP_PDE (L2_BASE + PDIR_SLOT_APTE) -#endif /* PAE */ +#endif /* PAE && XEN */ #define PDP_BASE L2_BASE #define APDP_BASE AL2_BASE @@ -316,6 +340,17 @@ #define pmap_pa2pte(a) (a) #define pmap_pte2pa(a) ((a) & PG_FRAME) #define pmap_pte_set(p, n) do { *(p) = (n); } while (0) +#define pmap_pte_flush() /* nothing */ + +#ifdef PAE +#define pmap_pte_cas(p, o, n) atomic_cas_64((p), (o), (n)) +#define pmap_pte_testset(p, n) \ + atomic_swap_64((volatile uint64_t *)p, n) +#define pmap_pte_setbits(p, b) \ + atomic_or_64((volatile uint64_t *)p, b) +#define pmap_pte_clearbits(p, b) \ + atomic_and_64((volatile uint64_t *)p, ~(b)) +#else /* PAE */ #define pmap_pte_cas(p, o, n) atomic_cas_32((p), (o), (n)) #define pmap_pte_testset(p, n) \ atomic_swap_ulong((volatile unsigned long *)p, n) @@ -323,8 +358,9 @@ atomic_or_ulong((volatile unsigned long *)p, b) #define pmap_pte_clearbits(p, b) \ atomic_and_ulong((volatile unsigned long *)p, ~(b)) -#define pmap_pte_flush() /* nothing */ -#else +#endif /* PAE */ + +#else /* XEN */ static __inline pt_entry_t pmap_pa2pte(paddr_t pa) { @@ -400,11 +436,7 @@ pmap_pte_flush(void) #endif #ifdef PAE -/* addresses of static pages used for PAE pmap: */ -/* the L3 page */ -pd_entry_t *pmap_l3pd; -paddr_t pmap_l3paddr; -/* the kernel's L2 page */ +/* Address of the static kernel's L2 page */ pd_entry_t *pmap_kl2pd; paddr_t pmap_kl2paddr; #endif diff --git a/sys/arch/x86/include/cpu.h b/sys/arch/x86/include/cpu.h index db14e0fe3b4f..929c511a6569 100644 --- a/sys/arch/x86/include/cpu.h +++ b/sys/arch/x86/include/cpu.h @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.h,v 1.22 2010/05/09 20:32:41 rmind Exp $ */ +/* $NetBSD: cpu.h,v 1.23 2010/07/24 00:45:56 jym Exp $ */ /*- * Copyright (c) 1990 The Regents of the University of California. @@ -50,6 +50,7 @@ * Definitions unique to x86 cpu support. 
*/ #include +#include #include #include #include @@ -162,6 +163,17 @@ struct cpu_info { struct i386tss ci_doubleflt_tss; struct i386tss ci_ddbipi_tss; #endif + +#ifdef PAE + uint32_t ci_pae_l3_pdirpa; /* PA of L3 PD */ + pd_entry_t * ci_pae_l3_pdir; /* VA pointer to L3 PD */ +#endif + +#if defined(XEN) && defined(__x86_64__) + /* Currently active user PGD (can't use rcr3() with Xen) */ + paddr_t ci_xen_current_user_pgd; +#endif + char *ci_doubleflt_stack; char *ci_ddbipi_stack; @@ -276,6 +288,7 @@ lwp_t *x86_curlwp(void); void cpu_boot_secondary_processors(void); void cpu_init_idle_lwps(void); void cpu_init_msrs(struct cpu_info *, bool); +void cpu_load_pmap(struct pmap *); extern uint32_t cpus_attached; #ifndef XEN diff --git a/sys/arch/x86/include/pmap.h b/sys/arch/x86/include/pmap.h index 6a12db0d562b..cdc6bdabe46e 100644 --- a/sys/arch/x86/include/pmap.h +++ b/sys/arch/x86/include/pmap.h @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.h,v 1.32 2010/07/15 19:02:26 jym Exp $ */ +/* $NetBSD: pmap.h,v 1.33 2010/07/24 00:45:56 jym Exp $ */ /* * @@ -144,11 +144,7 @@ struct pmap { #define pm_lock pm_obj[0].vmobjlock LIST_ENTRY(pmap) pm_list; /* list (lck by pm_list lock) */ pd_entry_t *pm_pdir; /* VA of PD (lck by object lock) */ -#ifdef PAE - paddr_t pm_pdirpa[PDP_SIZE]; -#else - paddr_t pm_pdirpa; /* PA of PD (read-only after create) */ -#endif + paddr_t pm_pdirpa[PDP_SIZE]; /* PA of PDs (read-only after create) */ struct vm_page *pm_ptphint[PTP_LEVELS-1]; /* pointer to a PTP in our pmap */ struct pmap_statistics pm_stats; /* pmap stats (lck by object lock) */ @@ -166,13 +162,13 @@ struct pmap { of pmap */ }; -/* macro to access pm_pdirpa */ +/* macro to access pm_pdirpa slots */ #ifdef PAE #define pmap_pdirpa(pmap, index) \ ((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t)) #else #define pmap_pdirpa(pmap, index) \ - ((pmap)->pm_pdirpa + (index) * sizeof(pd_entry_t)) + ((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t)) #endif /* @@ -187,6 +183,8 @@ struct pmap { * PDPpaddr is the physical address of the kernel's PDP. * - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3 * value associated to the kernel process, proc0. + * - i386 PAE: it still represents the PA of the kernel's PDP (L2). Due to + * the L3 PD, it cannot be considered as the equivalent of a %cr3 any more. * - Xen: it corresponds to the PFN of the kernel's PDP. */ extern u_long PDPpaddr; diff --git a/sys/arch/x86/x86/cpu.c b/sys/arch/x86/x86/cpu.c index 9c772a294928..56677d6d6a66 100644 --- a/sys/arch/x86/x86/cpu.c +++ b/sys/arch/x86/x86/cpu.c @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.c,v 1.72 2010/07/08 11:22:24 rmind Exp $ */ +/* $NetBSD: cpu.c,v 1.73 2010/07/24 00:45:56 jym Exp $ */ /*- * Copyright (c) 2000, 2006, 2007, 2008 The NetBSD Foundation, Inc. 
@@ -62,7 +62,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.72 2010/07/08 11:22:24 rmind Exp $"); +__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.73 2010/07/24 00:45:56 jym Exp $"); #include "opt_ddb.h" #include "opt_mpbios.h" /* for MPDEBUG */ @@ -717,9 +717,18 @@ cpu_hatch(void *v) KASSERT((ci->ci_flags & CPUF_RUNNING) == 0); - lcr3(pmap_kernel()->pm_pdirpa); +#ifdef PAE + pd_entry_t * l3_pd = ci->ci_pae_l3_pdir; + for (i = 0 ; i < PDP_SIZE; i++) { + l3_pd[i] = pmap_kernel()->pm_pdirpa[i] | PG_V; + } + lcr3(ci->ci_pae_l3_pdirpa); +#else + lcr3(pmap_pdirpa(pmap_kernel(), 0)); +#endif + pcb = lwp_getpcb(curlwp); - pcb->pcb_cr3 = pmap_kernel()->pm_pdirpa; + pcb->pcb_cr3 = rcr3(); pcb = lwp_getpcb(ci->ci_data.cpu_idlelwp); lcr0(pcb->pcb_cr0); @@ -812,6 +821,8 @@ cpu_copy_trampoline(void) static void tss_init(struct i386tss *tss, void *stack, void *func) { + KASSERT(curcpu()->ci_pmap == pmap_kernel()); + memset(tss, 0, sizeof *tss); tss->tss_esp0 = tss->tss_esp = (int)((char *)stack + USPACE - 16); tss->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); @@ -819,7 +830,8 @@ tss_init(struct i386tss *tss, void *stack, void *func) tss->tss_fs = GSEL(GCPU_SEL, SEL_KPL); tss->tss_gs = tss->__tss_es = tss->__tss_ds = tss->__tss_ss = GSEL(GDATA_SEL, SEL_KPL); - tss->tss_cr3 = pmap_kernel()->pm_pdirpa; + /* %cr3 contains the value associated to pmap_kernel */ + tss->tss_cr3 = rcr3(); tss->tss_esp = (int)((char *)stack + USPACE - 16); tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL); tss->__tss_eflags = PSL_MBO | PSL_NT; /* XXX not needed? */ @@ -1094,3 +1106,26 @@ x86_cpu_idle_halt(void) x86_enable_intr(); } } + +/* + * Loads pmap for the current CPU. + */ +void +cpu_load_pmap(struct pmap *pmap) +{ +#ifdef PAE + int i, s; + struct cpu_info *ci; + + s = splvm(); /* just to be safe */ + ci = curcpu(); + pd_entry_t *l3_pd = ci->ci_pae_l3_pdir; + for (i = 0 ; i < PDP_SIZE; i++) { + l3_pd[i] = pmap->pm_pdirpa[i] | PG_V; + } + splx(s); + tlbflush(); +#else /* PAE */ + lcr3(pmap_pdirpa(pmap, 0)); +#endif /* PAE */ +} diff --git a/sys/arch/x86/x86/pmap.c b/sys/arch/x86/x86/pmap.c index 6902438498fe..4e6cdbd0628f 100644 --- a/sys/arch/x86/x86/pmap.c +++ b/sys/arch/x86/x86/pmap.c @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.c,v 1.112 2010/07/15 21:14:31 jym Exp $ */ +/* $NetBSD: pmap.c,v 1.113 2010/07/24 00:45:56 jym Exp $ */ /* * Copyright (c) 2007 Manuel Bouyer. 
@@ -149,7 +149,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.112 2010/07/15 21:14:31 jym Exp $"); +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.113 2010/07/24 00:45:56 jym Exp $"); #include "opt_user_ldt.h" #include "opt_lockdebug.h" @@ -422,8 +422,6 @@ paddr_t avail_end; /* PA of last available physical page */ #ifdef __x86_64__ /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ static paddr_t xen_dummy_user_pgd; -/* Currently active user PGD (can't use rcr3()) */ -static paddr_t xen_current_user_pgd = 0; #endif /* __x86_64__ */ paddr_t pmap_pa_start; /* PA of first physical page for this domain */ paddr_t pmap_pa_end; /* PA of last physical page for this domain */ @@ -1283,7 +1281,6 @@ pmap_bootstrap(vaddr_t kva_start) { struct pmap *kpm; pt_entry_t *pte; - struct pcb *pcb; int i; vaddr_t kva; #ifndef XEN @@ -1334,14 +1331,11 @@ pmap_bootstrap(vaddr_t kva_start) kpm->pm_ptphint[i] = NULL; } memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ - pcb = lwp_getpcb(&lwp0); - kpm->pm_pdir = (pd_entry_t *)(pcb->pcb_cr3 + KERNBASE); -#ifdef PAE + + kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); for (i = 0; i < PDP_SIZE; i++) - kpm->pm_pdirpa[i] = (paddr_t)pcb->pcb_cr3 + PAGE_SIZE * i; -#else - kpm->pm_pdirpa = (paddr_t)pcb->pcb_cr3; -#endif + kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; + kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); @@ -1612,7 +1606,7 @@ pmap_prealloc_lowmem_ptps(void) paddr_t newp; paddr_t pdes_pa; - pdes_pa = pmap_kernel()->pm_pdirpa; + pdes_pa = pmap_pdirpa(pmap_kernel(), 0); level = PTP_LEVELS; for (;;) { newp = avail_start; @@ -1715,6 +1709,40 @@ pmap_cpu_init_late(struct cpu_info *ci) evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, NULL, device_xname(ci->ci_dev), "TLB IPI"); + +#ifdef PAE + int ret; + struct pglist pg; + struct vm_page *vmap; + + /* The BP already has its own L3 page allocated in locore.S. */ + if (ci == &cpu_info_primary) + return; + + /* + * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, the PA must reside below the 4GB boundary. 
+ */ + ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0); + vmap = TAILQ_FIRST(&pg); + + if (ret != 0 || vmap == NULL) + panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n", + __func__, cpu_index(ci), ret); + + ci->ci_pae_l3_pdirpa = vmap->phys_addr; + + ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, + UVM_KMF_VAONLY | UVM_KMF_NOWAIT); + if (ci->ci_pae_l3_pdir == NULL) + panic("%s: failed to allocate L3 PD for CPU %d\n", + __func__, cpu_index(ci)); + + pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa, + VM_PROT_READ | VM_PROT_WRITE, 0); + + pmap_update(pmap_kernel()); +#endif } /* @@ -1931,7 +1959,7 @@ pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, * If ptp is a L3 currently mapped in kernel space, * clear it before freeing */ - if (pmap->pm_pdirpa == xen_current_user_pgd + if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd && level == PTP_LEVELS - 1) pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); #endif /* XEN && __x86_64__ */ @@ -2274,13 +2302,9 @@ pmap_create(void) goto try_again; } -#ifdef PAE for (i = 0; i < PDP_SIZE; i++) pmap->pm_pdirpa[i] = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); -#else - pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]); -#endif LIST_INSERT_HEAD(&pmaps, pmap, pm_list); @@ -2602,11 +2626,11 @@ pmap_reactivate(struct pmap *pmap) KASSERT(kpreempt_disabled()); #if defined(XEN) && defined(__x86_64__) - KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); + KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); #elif defined(PAE) - KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); + KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); #elif !defined(XEN) - KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); + KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); #endif /* @@ -2708,12 +2732,12 @@ pmap_load(void) atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); #if defined(XEN) && defined(__x86_64__) - KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd || + KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || oldpmap == pmap_kernel()); #elif defined(PAE) - KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0])); + KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); #elif !defined(XEN) - KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3())); + KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); #endif KASSERT((pmap->pm_cpus & cpumask) == 0); KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); @@ -2735,36 +2759,13 @@ pmap_load(void) * from other CPUs, we're good to load the page tables. 
*/ #ifdef PAE - pcb->pcb_cr3 = pmap_l3paddr; + pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; #else - pcb->pcb_cr3 = pmap->pm_pdirpa; + pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); #endif -#if defined(XEN) && defined(__x86_64__) - /* kernel pmap always in cr3 and should never go in user cr3 */ - if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) { - /* - * Map user space address in kernel space and load - * user cr3 - */ - int i, s; - pd_entry_t *old_pgd, *new_pgd; - paddr_t addr; - s = splvm(); - new_pgd = pmap->pm_pdir; - old_pgd = pmap_kernel()->pm_pdir; - addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0)); - for (i = 0; i < PDIR_SLOT_PTE; - i++, addr += sizeof(pd_entry_t)) { - if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V)) - xpq_queue_pte_update(addr, new_pgd[i]); - } - tlbflush(); - xen_set_user_pgd(pmap_pdirpa(pmap, 0)); - xen_current_user_pgd = pmap_pdirpa(pmap, 0); - splx(s); - } -#else /* XEN && x86_64 */ -#if defined(XEN) + +#ifdef i386 +#ifdef XEN /* * clear APDP slot, in case it points to a page table that has * been freed @@ -2773,34 +2774,19 @@ pmap_load(void) pmap_unmap_apdp(); } /* lldt() does pmap_pte_flush() */ -#else /* XEN */ -#if defined(i386) +#endif /* XEN */ + +#ifndef XEN ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; ci->ci_tss.tss_cr3 = pcb->pcb_cr3; -#endif -#endif /* XEN */ +#endif /* !XEN */ +#endif /* i386 */ + lldt(pmap->pm_ldt_sel); -#ifdef PAE - { - paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr); - int i; - int s = splvm(); - /* don't update the kernel L3 slot */ - for (i = 0 ; i < PDP_SIZE - 1; i++, l3_pd += sizeof(pd_entry_t)) { - xpq_queue_pte_update(l3_pd, - xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V); - } - tlbflush(); - splx(s); - } -#else /* PAE */ - { + u_int gen = uvm_emap_gen_return(); - lcr3(pcb->pcb_cr3); + cpu_load_pmap(pmap); uvm_emap_update(gen); - } -#endif /* PAE */ -#endif /* XEN && x86_64 */ ci->ci_want_pmapload = 0; @@ -2867,11 +2853,11 @@ pmap_deactivate(struct lwp *l) } #if defined(XEN) && defined(__x86_64__) - KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); + KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); #elif defined(PAE) - KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); + KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); #elif !defined(XEN) - KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); + KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); #endif KASSERT(ci->ci_pmap == pmap); @@ -4761,6 +4747,21 @@ pmap_init_tmp_pgtbl(paddr_t pg) tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; memcpy(tmp_pml, kernel_pml, PAGE_SIZE); +#ifdef PAE + /* + * Use the last 4 entries of the L2 page as L3 PD entries. These + * last entries are unlikely to be used for temporary mappings. 
+ * 508: maps 0->1GB (userland) + * 509: unused + * 510: unused + * 511: maps 3->4GB (kernel) + */ + tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; + tmp_pml[509] = 0; + tmp_pml[510] = 0; + tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V; +#endif + for (level = PTP_LEVELS - 1; level > 0; --level) { tmp_pml = (void *)x86_tmp_pml_vaddr[level]; @@ -4771,5 +4772,10 @@ pmap_init_tmp_pgtbl(paddr_t pg) tmp_pml = (void *)x86_tmp_pml_vaddr[0]; tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; +#ifdef PAE + /* Return the PA of the L3 page (entry 508 of the L2 page) */ + return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); +#endif + return x86_tmp_pml_paddr[PTP_LEVELS - 1]; } diff --git a/sys/arch/xen/x86/cpu.c b/sys/arch/xen/x86/cpu.c index b538ddc7a138..28b7f6c09329 100644 --- a/sys/arch/xen/x86/cpu.c +++ b/sys/arch/xen/x86/cpu.c @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.c,v 1.46 2010/07/06 20:50:35 cegger Exp $ */ +/* $NetBSD: cpu.c,v 1.47 2010/07/24 00:45:56 jym Exp $ */ /* NetBSD: cpu.c,v 1.18 2004/02/20 17:35:01 yamt Exp */ /*- @@ -66,7 +66,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.46 2010/07/06 20:50:35 cegger Exp $"); +__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.47 2010/07/24 00:45:56 jym Exp $"); #include "opt_ddb.h" #include "opt_multiprocessor.h" @@ -582,6 +582,11 @@ cpu_init(struct cpu_info *ci) lcr4(rcr4() | CR4_OSXMMEXCPT); } +#ifdef __x86_64__ + /* No user PGD mapped for this CPU yet */ + ci->ci_xen_current_user_pgd = 0; +#endif + atomic_or_32(&cpus_running, ci->ci_cpumask); atomic_or_32(&ci->ci_flags, CPUF_RUNNING); } @@ -1111,3 +1116,59 @@ x86_cpu_idle_xen(void) x86_enable_intr(); } } + +/* + * Loads pmap for the current CPU. + */ +void +cpu_load_pmap(struct pmap *pmap) +{ +#ifdef i386 +#ifdef PAE + int i, s; + struct cpu_info *ci; + + s = splvm(); /* just to be safe */ + ci = curcpu(); + paddr_t l3_pd = xpmap_ptom_masked(ci->ci_pae_l3_pdirpa); + /* don't update the kernel L3 slot */ + for (i = 0 ; i < PDP_SIZE - 1; i++) { + xpq_queue_pte_update(l3_pd + i * sizeof(pd_entry_t), + xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V); + } + splx(s); + tlbflush(); +#else /* PAE */ + lcr3(pmap_pdirpa(pmap, 0)); +#endif /* PAE */ +#endif /* i386 */ + +#ifdef __x86_64__ + int i, s; + pd_entry_t *old_pgd, *new_pgd; + paddr_t addr; + struct cpu_info *ci; + + /* kernel pmap always in cr3 and should never go in user cr3 */ + if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) { + ci = curcpu(); + /* + * Map user space address in kernel space and load + * user cr3 + */ + s = splvm(); + new_pgd = pmap->pm_pdir; + old_pgd = pmap_kernel()->pm_pdir; + addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0)); + for (i = 0; i < PDIR_SLOT_PTE; + i++, addr += sizeof(pd_entry_t)) { + if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V)) + xpq_queue_pte_update(addr, new_pgd[i]); + } + tlbflush(); + xen_set_user_pgd(pmap_pdirpa(pmap, 0)); + ci->ci_xen_current_user_pgd = pmap_pdirpa(pmap, 0); + splx(s); + } +#endif /* __x86_64__ */ +} diff --git a/sys/arch/xen/x86/x86_xpmap.c b/sys/arch/xen/x86/x86_xpmap.c index 1f0418e6b7d8..cbb0de1d9ccb 100644 --- a/sys/arch/xen/x86/x86_xpmap.c +++ b/sys/arch/xen/x86/x86_xpmap.c @@ -1,4 +1,4 @@ -/* $NetBSD: x86_xpmap.c,v 1.20 2010/07/15 23:20:34 jym Exp $ */ +/* $NetBSD: x86_xpmap.c,v 1.21 2010/07/24 00:45:56 jym Exp $ */ /* * Copyright (c) 2006 Mathieu Ropert @@ -69,7 +69,7 @@ #include -__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.20 2010/07/15 23:20:34 jym Exp $"); +__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.21 2010/07/24 00:45:56 jym Exp $"); 
#include "opt_xen.h" #include "opt_ddb.h" @@ -814,22 +814,26 @@ xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd, #else xpq_queue_pin_table(xpmap_ptom_masked(new_pgd - KERNBASE)); #endif -#ifdef __i386__ + /* Save phys. addr of PDP, for libkvm. */ - PDPpaddr = (long)pde - KERNBASE; #ifdef PAE - /* also save the address of the L3 page */ - pmap_l3pd = pdtpe; - pmap_l3paddr = (new_pgd - KERNBASE); -#endif /* PAE */ -#endif /* i386 */ + PDPpaddr = (u_long)pde - KERNBASE; /* PDP is the L2 with PAE */ +#else + PDPpaddr = (u_long)new_pgd - KERNBASE; +#endif + /* Switch to new tables */ __PRINTK(("switch to PGD\n")); xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE)); __PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry %#" PRIxPADDR "\n", bt_pgd[PDIR_SLOT_PTE])); + #ifdef PAE if (final) { + /* save the address of the L3 page */ + cpu_info_primary.ci_pae_l3_pdir = pdtpe; + cpu_info_primary.ci_pae_l3_pdirpa = (new_pgd - KERNBASE); + /* now enter kernel's PTE mappings */ addr = (u_long)pde - KERNBASE + PAGE_SIZE * 3; xpq_queue_pte_update( @@ -839,8 +843,6 @@ xen_bootstrap_tables (vaddr_t old_pgd, vaddr_t new_pgd, } #endif - - /* Now we can safely reclaim space taken by old tables */ __PRINTK(("unpin old PGD\n")); diff --git a/sys/arch/xen/x86/xenfunc.c b/sys/arch/xen/x86/xenfunc.c index 9f5f63fffdfb..61dfb444a986 100644 --- a/sys/arch/xen/x86/xenfunc.c +++ b/sys/arch/xen/x86/xenfunc.c @@ -1,4 +1,4 @@ -/* $NetBSD: xenfunc.c,v 1.10 2010/02/12 01:55:46 jym Exp $ */ +/* $NetBSD: xenfunc.c,v 1.11 2010/07/24 00:45:56 jym Exp $ */ /* * @@ -27,7 +27,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: xenfunc.c,v 1.10 2010/02/12 01:55:46 jym Exp $"); +__KERNEL_RCSID(0, "$NetBSD: xenfunc.c,v 1.11 2010/07/24 00:45:56 jym Exp $"); #include @@ -58,10 +58,10 @@ invlpg(vaddr_t addr) splx(s); } -#ifndef __x86_64__ void lldt(u_short sel) { +#ifndef __x86_64__ struct cpu_info *ci; ci = curcpu(); @@ -75,8 +75,8 @@ lldt(u_short sel) xen_set_ldt(ci->ci_gdt[IDXSELN(sel)].ld.ld_base, ci->ci_gdt[IDXSELN(sel)].ld.ld_entries); ci->ci_curldt = sel; -} #endif +} void ltr(u_short sel)