From 1788a8e60ca75f6e370e8c519957a29f94d9423a Mon Sep 17 00:00:00 2001 From: yamt Date: Tue, 15 Jan 2008 11:12:03 +0000 Subject: [PATCH] - use cpu-local temporary mapping for P->V operations and kill pmap_main_lock. fixes a part of PR/37193 from Andrew Doran. - use pmap_stats_update_bypte in some more places. some comments and fixes from Andrew Doran and Manuel Bouyer. --- sys/arch/x86/x86/pmap.c | 295 +++++++++++++++++++++++++--------------- 1 file changed, 184 insertions(+), 111 deletions(-) diff --git a/sys/arch/x86/x86/pmap.c b/sys/arch/x86/x86/pmap.c index dc7769f6c1d0..9005e7417b6a 100644 --- a/sys/arch/x86/x86/pmap.c +++ b/sys/arch/x86/x86/pmap.c @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.c,v 1.27 2008/01/13 11:34:18 yamt Exp $ */ +/* $NetBSD: pmap.c,v 1.28 2008/01/15 11:12:03 yamt Exp $ */ /* * Copyright (c) 2007 Manuel Bouyer. @@ -154,7 +154,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.27 2008/01/13 11:34:18 yamt Exp $"); +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.28 2008/01/15 11:12:03 yamt Exp $"); #include "opt_user_ldt.h" #include "opt_lockdebug.h" @@ -283,21 +283,6 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.27 2008/01/13 11:34:18 yamt Exp $"); * * we have the following locks that we must contend with: * - * RW locks: - * - * - pmap_main_lock - * this lock is used to prevent deadlock and/or provide mutex - * access to the pmap system. most operations lock the pmap - * structure first, then they lock the pv_lists (if needed). - * however, some operations such as pmap_page_protect lock - * the pv_lists and then lock pmaps. in order to prevent a - * cycle, we require a mutex lock when locking the pv_lists - * first. thus, the "pmap = >pv_list" lockers must gain a - * read-lock on pmap_main_lock before locking the pmap. and - * the "pv_list => pmap" lockers must gain a write-lock on - * pmap_main_lock before locking. since only one thread - * can write-lock a lock at a time, this provides mutex. - * * mutexes: * * - pmap lock (per pmap, part of uvm_object) @@ -346,7 +331,6 @@ pd_entry_t *normal_pdes[] = PDES_INITIALIZER; pd_entry_t *alternate_pdes[] = APDES_INITIALIZER; static kmutex_t pmaps_lock; -static krwlock_t pmap_main_lock; static vaddr_t pmap_maxkvaddr; @@ -1326,7 +1310,6 @@ pmap_bootstrap(vaddr_t kva_start) * again is never taken from interrupt context. */ - rw_init(&pmap_main_lock); mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); LIST_INIT(&pmaps); pmap_cpu_init_early(curcpu()); @@ -1532,6 +1515,8 @@ pmap_enter_pv(struct pv_head *pvh, vaddr_t va, struct vm_page *ptp) /* PTP in pmap that maps this VA */ { + + KASSERT(mutex_owned(&pvh->pvh_lock)); pve->pv_pmap = pmap; pve->pv_va = va; pve->pv_ptp = ptp; /* NULL for kernel pmap */ @@ -1553,6 +1538,7 @@ pmap_remove_pv(struct pv_head *pvh, struct pmap *pmap, vaddr_t va) { struct pv_entry tmp, *pve; + KASSERT(mutex_owned(&pvh->pvh_lock)); tmp.pv_pmap = pmap; tmp.pv_va = va; pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp); @@ -2678,6 +2664,56 @@ pmap_copy_page(paddr_t srcpa, paddr_t dstpa) #endif } +static pt_entry_t * +pmap_map_ptp(struct vm_page *ptp) +{ + pt_entry_t *ptppte; + void *ptpva; + int id; + + id = cpu_number(); + ptppte = PTESLEW(ptp_pte, id); + ptpva = VASLEW(ptpp, id); + pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | +#if !defined(XEN) + PG_RW | +#endif + PG_U | PG_k); + pmap_pte_flush(); + pmap_update_pg((vaddr_t)ptpva); + + return (pt_entry_t *)ptpva; +} + +static void +pmap_unmap_ptp(void) +{ +#if defined(DIAGNOSTIC) || defined(XEN) + pt_entry_t *pte; + + pte = PTESLEW(ptp_pte, cpu_number()); + pmap_pte_set(pte, 0); + pmap_pte_flush(); +#endif +} + +static pt_entry_t * +pmap_map_pte(struct vm_page *ptp, vaddr_t va) +{ + + if (ptp == NULL) { + return kvtopte(va); + } + return pmap_map_ptp(ptp) + pl1_pi(va); +} + +static void +pmap_unmap_pte(void) +{ + + pmap_unmap_ptp(); +} + /* * p m a p r e m o v e f u n c t i o n s * @@ -2888,12 +2924,6 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) struct vm_page *ptp, *empty_ptps = NULL; struct pmap *pmap2; - /* - * we lock in the pmap => pv_head direction - */ - - rw_enter(&pmap_main_lock, RW_READER); - pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ /* @@ -2995,7 +3025,6 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) } pmap_tlb_shootwait(); pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ - rw_exit(&pmap_main_lock); /* Now we can free unused PVs and ptps */ if (pv_tofree) @@ -3017,15 +3046,11 @@ void pmap_page_remove(struct vm_page *pg) { struct pv_head *pvh; - struct pv_entry *pve, *npve, *killlist = NULL; - pt_entry_t *ptes, opte; - pd_entry_t **pdes; -#ifdef DIAGNOSTIC - pd_entry_t pde; -#endif + struct pv_entry *pve, *killlist = NULL; struct vm_page *empty_ptps = NULL; struct vm_page *ptp; - struct pmap *pmap2; + pt_entry_t expect; + int count; #ifdef DIAGNOSTIC int bank, off; @@ -3040,37 +3065,44 @@ pmap_page_remove(struct vm_page *pg) return; } - /* set pv_head => pmap locking */ - rw_enter(&pmap_main_lock, RW_WRITER); - - for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) { - npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve); - - /* locks pmap */ - pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes); - -#ifdef DIAGNOSTIC - if (pve->pv_ptp && pmap_pdes_valid(pve->pv_va, pdes, &pde) && - pmap_pte2pa(pde) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { - printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n", - pg, pve->pv_va, pve->pv_ptp); - printf("pmap_page_remove: PTP's phys addr: " - "actual=%lx, recorded=%lx\n", - (unsigned long)pmap_pte2pa(pde), - (unsigned long)VM_PAGE_TO_PHYS(pve->pv_ptp)); - panic("pmap_page_remove: mapped managed page has " - "invalid pv_ptp field"); - } -#endif + expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; + count = SPINLOCK_BACKOFF_MIN; +startover: + mutex_spin_enter(&pvh->pvh_lock); + while ((pve = SPLAY_MIN(pvtree, &pvh->pvh_root)) != NULL) { + struct pmap *pmap = pve->pv_pmap; + pt_entry_t *ptep; + pt_entry_t opte; /* atomically save the old PTE and zap! it */ - opte = pmap_pte_testset(&ptes[pl1_i(pve->pv_va)], 0); - KASSERT(pmap_valid_entry(opte)); - KDASSERT(pmap_pte2pa(opte) == VM_PAGE_TO_PHYS(pg)); + ptep = pmap_map_pte(pve->pv_ptp, pve->pv_va); + do { + opte = *ptep; + if ((opte & (PG_FRAME | PG_V)) != expect) { +#if defined(MULTIPROCESSOR) + int hold_count; +#endif /* defined(MULTIPROCESSOR) */ - if (opte & PG_W) - pve->pv_pmap->pm_stats.wired_count--; - pve->pv_pmap->pm_stats.resident_count--; + /* + * we lost a race with a V->P operation like + * pmap_remove(). wait for the competitor + * reflecting pte bits into mp_attrs. + * + * issue a redundant TLB shootdown so that + * we can wait for its completion. + */ + + pmap_unmap_pte(); + pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, 0, + PG_G); + mutex_spin_exit(&pvh->pvh_lock); + KERNEL_UNLOCK_ALL(curlwp, &hold_count); + SPINLOCK_BACKOFF(count); + KERNEL_LOCK(hold_count, curlwp); + goto startover; + } + } while (pmap_pte_cas(ptep, opte, 0) != opte); + pmap_unmap_pte(); /* Shootdown only if referenced */ if (opte & PG_U) @@ -3079,21 +3111,41 @@ pmap_page_remove(struct vm_page *pg) /* sync R/M bits */ pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M)); + if (pve->pv_ptp) { + pmap_reference(pmap); + } + SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */ + mutex_spin_exit(&pvh->pvh_lock); + /* update the PTP reference count. free if last reference. */ if (pve->pv_ptp) { + struct pmap *pmap2; + pt_entry_t *ptes; + pd_entry_t **pdes; + + KASSERT(pmap != pmap_kernel()); + + crit_enter(); + pmap_tlb_shootwait(); + crit_exit(); + pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); + pmap_stats_update_bypte(pmap, 0, opte); pve->pv_ptp->wire_count--; if (pve->pv_ptp->wire_count <= 1) { pmap_free_ptp(pve->pv_pmap, pve->pv_ptp, pve->pv_va, ptes, pdes, &empty_ptps); } + pmap_unmap_ptes(pmap, pmap2); + pmap_destroy(pmap); + } else { + pmap_stats_update_bypte(pmap, 0, opte); } - - pmap_unmap_ptes(pve->pv_pmap, pmap2); /* unlocks pmap */ - SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */ + SPLAY_RIGHT(pve, pv_node) = killlist; /* mark it for death */ killlist = pve; + mutex_spin_enter(&pvh->pvh_lock); } - rw_exit(&pmap_main_lock); + mutex_spin_exit(&pvh->pvh_lock); crit_enter(); pmap_tlb_shootwait(); @@ -3127,8 +3179,9 @@ pmap_test_attrs(struct vm_page *pg, unsigned testbits) int *myattrs; struct pv_head *pvh; struct pv_entry *pve; - struct pmap *pmap2; pt_entry_t pte; + pt_entry_t expect; + int result; #if DIAGNOSTIC int bank, off; @@ -3155,27 +3208,30 @@ pmap_test_attrs(struct vm_page *pg, unsigned testbits) } /* nope, gonna have to do it the hard way */ - rw_enter(&pmap_main_lock, RW_WRITER); + expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; + mutex_spin_enter(&pvh->pvh_lock); for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL && (*myattrs & testbits) == 0; pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) { - pt_entry_t *ptes; - pd_entry_t **pdes; + pt_entry_t *ptep; - pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes); - pte = ptes[pl1_i(pve->pv_va)]; - pmap_unmap_ptes(pve->pv_pmap, pmap2); - *myattrs |= pte; + ptep = pmap_map_pte(pve->pv_ptp, pve->pv_va); + pte = *ptep; + pmap_unmap_pte(); + if ((pte & (PG_FRAME | PG_V)) == expect) { + *myattrs |= pte; + } } + result = *myattrs & testbits; + mutex_spin_exit(&pvh->pvh_lock); /* * note that we will exit the for loop with a non-null pve if * we have found the bits we are testing for. */ - rw_exit(&pmap_main_lock); - return((*myattrs & testbits) != 0); + return result != 0; } /* @@ -3192,10 +3248,9 @@ pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) uint32_t result; struct pv_head *pvh; struct pv_entry *pve; - pt_entry_t *ptes, opte; int *myattrs; - struct pmap *pmap2; - + pt_entry_t expect; + int count; #ifdef DIAGNOSTIC int bank, off; @@ -3204,33 +3259,47 @@ pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) panic("pmap_change_attrs: unmanaged page?"); #endif mdpg = &pg->mdpage; - - rw_enter(&pmap_main_lock, RW_WRITER); pvh = &mdpg->mp_pvhead; - myattrs = &mdpg->mp_attrs; - result = *myattrs & clearbits; + expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; + result = 0; + count = SPINLOCK_BACKOFF_MIN; +startover: + mutex_spin_enter(&pvh->pvh_lock); + result |= *myattrs & clearbits; *myattrs &= ~clearbits; - SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) { pt_entry_t *ptep; - pd_entry_t **pdes; + pt_entry_t opte; - /* locks pmap */ - pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes); -#ifdef DIAGNOSTIC - if (!pmap_pdes_valid(pve->pv_va, pdes, NULL)) - panic("pmap_change_attrs: mapping without PTP " - "detected"); -#endif - ptep = &ptes[pl1_i(pve->pv_va)]; + ptep = pmap_map_pte(pve->pv_ptp, pve->pv_va); +retry: opte = *ptep; - KASSERT(pmap_valid_entry(opte)); - KDASSERT(pmap_pte2pa(opte) == VM_PAGE_TO_PHYS(pg)); + if ((opte & (PG_FRAME | PG_V)) != expect) { +#if defined(MULTIPROCESSOR) + int hold_count; +#endif /* defined(MULTIPROCESSOR) */ + + /* + * we lost a race with a V->P operation like + * pmap_remove(). wait for the competitor + * reflecting pte bits into mp_attrs. + * + * issue a redundant TLB shootdown so that + * we can wait for its completion. + */ + + pmap_unmap_pte(); + pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, 0, PG_G); + mutex_spin_exit(&pvh->pvh_lock); + KERNEL_UNLOCK_ALL(curlwp, &hold_count); + SPINLOCK_BACKOFF(count); + KERNEL_LOCK(hold_count, curlwp); + goto startover; + } if (opte & clearbits) { /* We need to do something */ if (clearbits == PG_RW) { - result |= PG_RW; /* * On write protect we might not need to flush @@ -3238,8 +3307,12 @@ pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) */ /* First zap the RW bit! */ - pmap_pte_clearbits(ptep, PG_RW); - opte = *ptep; + if (pmap_pte_cas(ptep, opte, opte & ~PG_RW) + != opte) { + goto retry; + } + opte &= ~PG_RW; + result |= PG_RW; /* * Then test if it is not cached as RW the TLB @@ -3254,7 +3327,10 @@ pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) */ /* zap! */ - opte = pmap_pte_testset(ptep, (opte & ~(PG_U | PG_M))); + if (pmap_pte_cas(ptep, opte, opte & ~(PG_U | PG_M)) + != opte) { + goto retry; + } result |= (opte & clearbits); *myattrs |= (opte & ~(clearbits)); @@ -3262,10 +3338,9 @@ pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, 0, opte); } no_tlb_shootdown: - pmap_unmap_ptes(pve->pv_pmap, pmap2); /* unlocks pmap */ + pmap_unmap_pte(); } - - rw_exit(&pmap_main_lock); + mutex_spin_exit(&pvh->pvh_lock); crit_enter(); pmap_tlb_shootwait(); @@ -3390,14 +3465,18 @@ pmap_unwire(struct pmap *pmap, vaddr_t va) pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ if (pmap_pdes_valid(va, pdes, NULL)) { + pt_entry_t *ptep = &ptes[pl1_i(va)]; + pt_entry_t opte = *ptep; #ifdef DIAGNOSTIC - if (!pmap_valid_entry(ptes[pl1_i(va)])) + if (!pmap_valid_entry(opte)) panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); #endif - if ((ptes[pl1_i(va)] & PG_W) != 0) { - pmap_pte_clearbits(&ptes[pl1_i(va)], PG_W); - pmap->pm_stats.wired_count--; + if ((opte & PG_W) != 0) { + pt_entry_t npte = opte & ~PG_W; + + opte = pmap_pte_testset(ptep, npte); + pmap_stats_update_bypte(pmap, npte, opte); } #ifdef DIAGNOSTIC else { @@ -3512,9 +3591,6 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, /* get a pve. */ freepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); - /* get lock */ - rw_enter(&pmap_main_lock, RW_READER); - pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ if (pmap == pmap_kernel()) { ptp = NULL; @@ -3716,7 +3792,6 @@ shootdown_now: out: pmap_unmap_ptes(pmap, pmap2); - rw_exit(&pmap_main_lock); if (freepve != NULL) { /* put back the pv, we don't need it. */ @@ -3964,7 +4039,6 @@ pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) * we lock in the pmap => pv_head direction */ - rw_enter(&pmap_main_lock, RW_READER); pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ /* @@ -3992,7 +4066,6 @@ pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) } } pmap_unmap_ptes(pmap, pmap2); - rw_exit(&pmap_main_lock); } #endif