PERFORCE change 92587 for review
Kip Macy
kmacy at FreeBSD.org
Wed Mar 1 00:07:06 PST 2006
http://perforce.freebsd.org/chv.cgi?CH=92587
Change 92587 by kmacy at kmacy_storage:sun4v_work on 2006/03/01 08:06:57
implement pmap_enter as well as all of the functions that it depends on
simplify pmap_kextract with the help of the new tsb_lookup_tte
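
tsb_lookup_tte() itself is not part of this change (only pmap.c is
touched), so what follows is a minimal sketch of its assumed semantics,
inferred from the pmap_kextract() and pmap_enter() call sites below:
probe the kernel TSBs in turn and return the TTE data on a hit, or 0 on
a miss. Only the kernel TSBs are shown; a user context would presumably
have to consult that pmap's own TSB as well.

static uint64_t
tsb_lookup_tte(vm_offset_t va, uint64_t context)
{
	uint64_t tte_data;

	/* Try the 4M kernel TSB first, then fall back to the 8K TSB. */
	tte_data = tsb_get_tte(&kernel_td[TSB4M_INDEX], va, context);
	if (tte_data != 0)
		return (tte_data);
	return (tsb_get_tte(&kernel_td[TSB8K_INDEX], va, context));
}

With one lookup covering both TSBs, pmap_kextract() collapses from two
explicit tsb_get_tte() probes into a single call, as the pmap_kextract
hunk below shows.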
Affected files ...
.. //depot/projects/kmacy_sun4v/src/sys/sun4v/sun4v/pmap.c#12 edit
Differences ...
==== //depot/projects/kmacy_sun4v/src/sys/sun4v/sun4v/pmap.c#12 (text+ko) ====
@@ -34,6 +34,7 @@
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
+#include <vm/uma.h>
#include <machine/cpu.h>
#include <machine/cache.h>
@@ -68,7 +69,6 @@
struct msgbuf *msgbufp;
vm_paddr_t msgbuf_phys;
-
/*
* Map of physical memory regions.
*/
@@ -95,18 +95,22 @@
+#ifndef PMAP_SHPGPERPROC
+#define PMAP_SHPGPERPROC 200
+#endif
/*
- * Kernel pmap.
+ * Data for the pv entry allocation mechanism
*/
-struct pmap kernel_pmap_store;
+static uma_zone_t pvzone;
+static struct vm_object pvzone_obj;
+static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
/*
- * Kernel TSBs
+ * Kernel pmap.
*/
-#define TSB8K_INDEX 0
-#define TSB4M_INDEX 1
+struct pmap kernel_pmap_store;
-static hv_tsb_info_t kernel_td[MAX_TSB_INFO];
+hv_tsb_info_t kernel_td[MAX_TSB_INFO];
/*
@@ -136,7 +140,14 @@
#define UNIMPLEMENTED panic("%s not implemented", __FUNCTION__)
+static void free_pv_entry(pv_entry_t pv);
+static pv_entry_t get_pv_entry(pmap_t locked_pmap);
+
static void pmap_scrub_pages(vm_paddr_t pa, int64_t size);
+static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
+static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va);
+
/*
* Quick sort callout for comparing memory regions.
*/
@@ -173,6 +184,109 @@
return (0);
}
+
+static __inline void
+free_pv_entry(pv_entry_t pv)
+{
+ pv_entry_count--;
+ uma_zfree(pvzone, pv);
+}
+
+/*
+ * get a new pv_entry, allocating a block from the system
+ * when needed.
+ */
+static pv_entry_t
+get_pv_entry(pmap_t locked_pmap)
+{
+ static const struct timeval printinterval = { 60, 0 };
+ static struct timeval lastprint;
+ struct vpgqueues *vpq;
+ pmap_t pmap;
+ pv_entry_t allocated_pv, next_pv, pv;
+ vm_offset_t va;
+ vm_page_t m;
+
+ PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
+ if (allocated_pv != NULL) {
+ pv_entry_count++;
+ if (pv_entry_count > pv_entry_high_water)
+ pagedaemon_wakeup();
+ else
+ return (allocated_pv);
+ }
+
+ /*
+ * Reclaim pv entries: At first, destroy mappings to inactive
+ * pages. After that, if a pv entry is still needed, destroy
+ * mappings to active pages.
+ */
+ if (ratecheck(&lastprint, &printinterval))
+ printf("Approaching the limit on PV entries, "
+ "increase the vm.pmap.shpgperproc tunable.\n");
+
+ vpq = &vm_page_queues[PQ_INACTIVE];
+retry:
+ sched_pin();
+ TAILQ_FOREACH(m, &vpq->pl, pageq) {
+ if (m->hold_count || m->busy || (m->flags & PG_BUSY))
+ continue;
+ TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
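+ /* XXX TTE teardown for sun4v is not written yet (note the notyet blocks below), so panic rather than reclaim with incomplete code. */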
+ UNIMPLEMENTED;
+ va = pv->pv_va;
+ pmap = pv->pv_pmap;
+ /* Avoid deadlock and lock recursion. */
+ if (pmap > locked_pmap)
+ PMAP_LOCK(pmap);
+ else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
+ continue;
+ pmap->pm_stats.resident_count--;
+#ifdef notyet
+ pte = pmap_pte_quick(pmap, va);
+ tpte = pte_load_clear(pte);
+ KASSERT((tpte & PG_W) == 0,
+ ("get_pv_entry: wired pte %#jx", (uintmax_t)tpte));
+ if (tpte & PG_A)
+ vm_page_flag_set(m, PG_REFERENCED);
+ if (tpte & PG_M) {
+ KASSERT((tpte & PG_RW),
+ ("get_pv_entry: modified page not writable: va: %#x, pte: %#jx",
+ va, (uintmax_t)tpte));
+ if (pmap_track_modified(va))
+ vm_page_dirty(m);
+ }
+#endif
+ pmap_invalidate_page(pmap, va);
+ TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ if (TAILQ_EMPTY(&m->md.pv_list))
+ vm_page_flag_clear(m, PG_WRITEABLE);
+ m->md.pv_list_count--;
+#ifdef notyet
+ pmap_unuse_pt(pmap, va);
+#endif
+
+ if (pmap != locked_pmap)
+ PMAP_UNLOCK(pmap);
+ if (allocated_pv == NULL)
+ allocated_pv = pv;
+ else
+ free_pv_entry(pv);
+ }
+ }
+ sched_unpin();
+ if (allocated_pv == NULL) {
+ if (vpq == &vm_page_queues[PQ_INACTIVE]) {
+ vpq = &vm_page_queues[PQ_ACTIVE];
+ goto retry;
+ }
+ panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
+ }
+ return (allocated_pv);
+}
+
/*
* Allocate a physical page of memory directly from the phys_avail map.
* Can only be called from pmap_bootstrap before avail start and end are
@@ -184,7 +298,6 @@
vm_paddr_t pa;
int i;
- printf("looking for size %lx\n", size);
size = round_page(size);
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
@@ -306,6 +419,7 @@
kernel_td[TSB8K_INDEX].hvtsb_pa = pa;
tsb_4m_size = virtsz >> (PAGE_SHIFT_4M - TTE_SHIFT);
+
pa = pmap_bootstrap_alloc(tsb_4m_size);
kernel_td[TSB4M_INDEX].hvtsb_idxpgsz = TTE4M;
@@ -320,7 +434,6 @@
pmap_scrub_pages(kernel_td[TSB4M_INDEX].hvtsb_pa, tsb_4m_size);
-
/*
* Set up TSB descriptors for the hypervisor
*
@@ -332,7 +445,6 @@
/*
* allocate MMU fault status areas for all CPUS
*/
- printf("allocate fault status area\n");
mmu_fault_status_area = pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU);
/*
@@ -365,7 +477,6 @@
virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE;
kstack0 = virtual_avail;
virtual_avail += KSTACK_PAGES * PAGE_SIZE;
- printf("setting ttes\n");
for (i = 0; i < KSTACK_PAGES; i++) {
pa = kstack0_phys + i * PAGE_SIZE;
va = kstack0 + i * PAGE_SIZE;
@@ -405,18 +516,21 @@
translations[i].om_start > VM_MAX_PROM_ADDRESS)
continue;
#endif
- printf("om_size: %ld om_start: %lx om_tte: %lx\n", translations[i].om_size,
+ printf("om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size,
translations[i].om_start, translations[i].om_tte);
- if (translations[i].om_size == PAGE_SIZE_4M)
+ if (translations[i].om_size == PAGE_SIZE_4M) {
+ tsb_assert_invalid(&kernel_td[TSB4M_INDEX], translations[i].om_start);
tsb_set_tte(&kernel_td[TSB4M_INDEX], translations[i].om_start,
TTE_GET_PA(translations[i].om_tte), TTE_KERNEL | VTD_4M, 0);
- else
+ } else {
for (off = 0; off < translations[i].om_size;
off += PAGE_SIZE) {
va = translations[i].om_start + off;
pa = TTE_GET_PA(translations[i].om_tte) + off;
+ tsb_assert_invalid(&kernel_td[TSB8K_INDEX], va);
tsb_set_tte(&kernel_td[TSB8K_INDEX], va, pa, TTE_KERNEL | VTD_8K, 0);
}
+ }
}
/*
@@ -437,11 +551,9 @@
PMAP_LOCK_INIT(kernel_pmap);
TAILQ_INIT(&kernel_pmap->pm_pvlist);
-
- printf("physical address of kernel_td: 0x%lx\n", vtophys((vm_offset_t)&kernel_td));
- printf("set ctx0\n");
- error = hv_set_ctx0(2, vtophys((vm_offset_t)&kernel_td));
- printf("ctx0 set\n");
+ printf("physical address of kernel_td: 0x%lx\n", vtophys((vm_offset_t)&kernel_td));
+
+ error = hv_set_ctx0(MAX_TSB_INFO, vtophys((vm_offset_t)&kernel_td));
if (error != H_EOK)
panic("failed to set ctx0 TSBs, error: %ld", error);
@@ -477,12 +589,14 @@
pmap_clear_modify(vm_page_t m)
{
/* XXX Need to also clear this in the TSB if possible :-( */
+ UNIMPLEMENTED;
tte_clear_phys_bit(m, VTD_W);
}
void
pmap_clear_reference(vm_page_t m)
{
+ UNIMPLEMENTED;
tte_clear_phys_bit(m, VTD_REF);
}
@@ -508,7 +622,119 @@
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
boolean_t wired)
{
- UNIMPLEMENTED;
+ vm_paddr_t pa, opa;
+ uint64_t tte_data, otte_data;
+ vm_page_t om;
+ int invlva;
+#if 0
+ printf("ctx=%d va=%lx prot=%x wired=%x\n", pmap->pm_context,
+ va, prot, wired);
+#endif
+
+ vm_page_lock_queues();
+ om = NULL;
+ PMAP_LOCK(pmap);
+ sched_pin();
+
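+ /*
+ * The new TTE starts out as the page's physical address; permission
+ * and status bits are ORed into tte_data below and the result is
+ * installed at "validate".
+ */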
+ tte_data = pa = VM_PAGE_TO_PHYS(m);
+ otte_data = tsb_lookup_tte(va, pmap->pm_context);
+ opa = TTE_GET_PA(otte_data);
+ /*
+ * Mapping has not changed, must be protection or wiring change.
+ */
+ if (pa == opa) {
+ /*
+ * Wiring change, just update stats. We don't worry about
+ * wiring PT pages as they remain resident as long as there
+ * are valid mappings in them. Hence, if a user page is wired,
+ * the PT page will be also.
+ */
+ if (wired && ((otte_data & VTD_WIRED) == 0))
+ pmap->pm_stats.wired_count++;
+ else if (!wired && (otte_data & VTD_WIRED))
+ pmap->pm_stats.wired_count--;
+
+ /*
+ * We might be turning off write access to the page,
+ * so we go ahead and sense modify status.
+ */
+ if (otte_data & VTD_MANAGED) {
+ om = m;
+ tte_data |= VTD_MANAGED;
+ }
+ goto validate;
+
+ }
+ /*
+ * Mapping has changed, invalidate old range and fall through to
+ * handle validating new mapping.
+ */
+ if (opa) {
+ if (otte_data & VTD_WIRED)
+ pmap->pm_stats.wired_count--;
+ if (otte_data & VTD_MANAGED) {
+ om = PHYS_TO_VM_PAGE(opa);
+ pmap_remove_entry(pmap, om, va);
+ }
+ } else
+ pmap->pm_stats.resident_count++;
+
+ /*
+ * Enter on the PV list if part of our managed memory.
+ */
+ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
+ pmap_insert_entry(pmap, va, m);
+ tte_data |= VTD_MANAGED;
+ }
+ /*
+ * Increment counters
+ */
+ if (wired)
+ pmap->pm_stats.wired_count++;
+
+validate:
+ /*
+ * Now validate mapping with desired protection/wiring.
+ */
+ if ((prot & VM_PROT_WRITE) != 0)
+ tte_data |= (VTD_W|VTD_WR_PERM); /* XXX need to handle modify */
+ if ((prot & VM_PROT_EXECUTE) != 0)
+ tte_data |= VTD_X;
+ if (wired)
+ tte_data |= VTD_WIRED;
+ if (pmap == kernel_pmap)
+ tte_data |= TTE_KERNEL_MINFLAGS;
+
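+ /* Skip the TSB update when only the referenced/modified bits would change. */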
+ if ((otte_data & ~(VTD_W|VTD_REF)) != tte_data) {
+ if (otte_data & VTD_V) {
+ invlva = FALSE;
+ tsb_set_tte(&kernel_td[TSB8K_INDEX], va, pa, tte_data,
+ pmap->pm_context);
+ if (otte_data & VTD_REF) {
+ if (otte_data & VTD_MANAGED)
+ vm_page_flag_set(om, PG_REFERENCED);
+ if (opa != pa)
+ invlva = TRUE;
+ }
+ if (otte_data & VTD_W) {
+ if ((otte_data & VTD_MANAGED) &&
+ pmap_track_modified(pmap, va))
+ vm_page_dirty(om);
+ if ((prot & VM_PROT_WRITE) == 0)
+ invlva = TRUE;
+ }
+ if (invlva)
+ pmap_invalidate_page(pmap, va);
+ } else
+ tsb_set_tte(&kernel_td[TSB8K_INDEX], va, pa, tte_data,
+ pmap->pm_context);
+ }
+
+ sched_unpin();
+ vm_page_unlock_queues();
+ PMAP_UNLOCK(pmap);
+
}
vm_page_t
@@ -556,9 +782,42 @@
pmap_init(void)
{
/* allocate pv_entry zones */
- return;
+ int shpgperproc = PMAP_SHPGPERPROC;
+
+ /*
+ * Initialize the address space (zone) for the pv entries. Set a
+ * high water mark so that the system can recover from excessive
+ * numbers of pv entries.
+ */
+ pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
+ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
+ pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
+ TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
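+ /* Wake the pagedaemon once allocations pass 90% of the maximum; see get_pv_entry(). */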
+ pv_entry_high_water = 9 * (pv_entry_max / 10);
+ uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
+
}
+/*
+ * Create a pv entry for page at pa for
+ * (pmap, va).
+ */
+static void
+pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
+{
+ pv_entry_t pv;
+
+ pv = get_pv_entry(pmap);
+ pv->pv_va = va;
+ pv->pv_pmap = pmap;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+ TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
+ TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+ m->md.pv_list_count++;
+}
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
@@ -578,8 +837,14 @@
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
-
- printf("%s unimplemented\n", __FUNCTION__);
+ vm_offset_t tva;
+ printf("pmap_invalidate_range(sva=%lx, eva=%lx)\n", sva, eva);
+ /* XXX SUN4V_FIXME - oversimplified logic */
+ if (((sva & PAGE_MASK_4M) != 0) || ((eva & PAGE_MASK_4M) != 0)) {
+ for (tva = sva; tva < eva; tva += PAGE_SIZE_8K)
+ invlpg(tva, pmap->pm_context);
+ } else
+ UNIMPLEMENTED;
}
@@ -596,6 +861,8 @@
boolean_t
pmap_is_modified(vm_page_t m)
{
+ UNIMPLEMENTED;
+ /* Not properly handled yet */
return tte_get_phys_bit(m, VTD_W);
}
@@ -612,9 +879,7 @@
void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
- printf("pmap_kentering\n");
tsb_set_tte(&kernel_td[TSB8K_INDEX], va, pa, TTE_KERNEL | VTD_8K, 0);
- printf("pmap_kentered\n");
}
/*
@@ -627,20 +892,15 @@
{
uint64_t tte_data;
vm_paddr_t pa;
- /*
- * check 4M TSB
- */
- tte_data = tsb_get_tte(&kernel_td[TSB4M_INDEX], va, 0);
- pa = TTE_GET_PA(tte_data) | (va & PAGE_MASK_4M);
- if (TTE_GET_PA(tte_data) != 0)
- goto done;
- /*
- * check 8k TSB
- */
- tte_data = tsb_get_tte(&kernel_td[TSB8K_INDEX], va, 0);
- pa = TTE_GET_PA(tte_data)| (va & PAGE_MASK);
-
-done:
+
+ pa = 0;
+#if 0
+ printf("tte_data=%lx TTE_GET_PA(tte_data)=%lx (va & TTE_GET_PAGE_MASK(tte_data))=%lx\n",
+ tsb_lookup_tte(va, 0), TTE_GET_PA(tte_data), (va & TTE_GET_PAGE_MASK(tte_data)));
+#endif
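+ /* One lookup covers both kernel TSBs; TTE_GET_PAGE_MASK() yields the offset mask for the entry's page size (8K or 4M). */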
+ if ((tte_data = tsb_lookup_tte(va, 0)) != 0)
+ pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
+
return pa;
}
@@ -748,7 +1008,7 @@
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
- tte_clear_phys_bit(m, VTD_SW|VTD_W);
+ tte_clear_phys_bit(m, VTD_WR_PERM|VTD_W);
} else {
pmap_remove_all(m);
}
@@ -821,6 +1081,7 @@
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
+ UNIMPLEMENTED;
#ifdef notyet
vm_offset_t pdnxt;
pd_entry_t ptpaddr;
@@ -1048,6 +1309,33 @@
UNIMPLEMENTED;
}
+static void
+pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
+{
+ pv_entry_t pv;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ mtx_assert(&vm_page_queue_mtx, MA_OWNED);
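+ /* Search whichever list is likely to be shorter. */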
+ if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
+ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+ if (pmap == pv->pv_pmap && va == pv->pv_va)
+ break;
+ }
+ } else {
+ TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
+ if (va == pv->pv_va)
+ break;
+ }
+ }
+ KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ m->md.pv_list_count--;
+ if (TAILQ_EMPTY(&m->md.pv_list))
+ vm_page_flag_clear(m, PG_WRITEABLE);
+ TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
+ free_pv_entry(pv);
+}
+
void
pmap_remove_pages(pmap_t pmap, vm_offset_t start, vm_offset_t end)