svn commit: r339386 - in head/sys/amd64: amd64 include
Oliver Pinter
oliver.pinter at hardenedbsd.org
Tue Oct 16 20:31:20 UTC 2018
On 10/16/18, Konstantin Belousov <kib at freebsd.org> wrote:
> Author: kib
> Date: Tue Oct 16 17:28:10 2018
> New Revision: 339386
> URL: https://svnweb.freebsd.org/changeset/base/339386
>
> Log:
> Provide pmap_large_map() KPI on amd64.
>
> The KPI allows to map very large contiguous physical memory regions
> into KVA, which are not covered by DMAP.
>
> I see both with QEMU and with some real hardware started shipping, the
> regions for NVDIMMs might be very far apart from the normal RAM, and
> we expect that at least initial users of NVDIMM could install very
> large amount of such memory. IMO it is not reasonable to extend DMAP
> to cover that far-away regions both because it could overflow existing
> 4T window for DMAP in KVA, and because it costs in page table pages
> allocations, for gap and for possibly unused NV RAM.
>
> Also, KPI provides some special functionality for fast cache flushing
> based on the knowledge of the NVRAM mapping use.
>
> Reviewed by: alc, markj
> Sponsored by: The FreeBSD Foundation
> Approved by: re (gjb)
> MFC after: 1 week
> Differential revision: https://reviews.freebsd.org/D17070
>
> Modified:
> head/sys/amd64/amd64/pmap.c
> head/sys/amd64/include/pmap.h
> head/sys/amd64/include/vmparam.h
>
> Modified: head/sys/amd64/amd64/pmap.c
> ==============================================================================
> --- head/sys/amd64/amd64/pmap.c Tue Oct 16 17:17:11 2018 (r339385)
> +++ head/sys/amd64/amd64/pmap.c Tue Oct 16 17:28:10 2018 (r339386)
> @@ -409,6 +409,9 @@ static struct mtx qframe_mtx;
>
> static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */
>
> +static vmem_t *large_vmem;
> +static u_int lm_ents;
> +
> int pmap_pcid_enabled = 1;
> SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN |
> CTLFLAG_NOFETCH,
> &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
> @@ -655,6 +658,7 @@ static void pmap_invalidate_cache_range_all(vm_offset_
> static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
> pd_entry_t pde);
> static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
> +static vm_page_t pmap_large_map_getptp_unlocked(void);
> static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
> #if VM_NRESERVLEVEL > 0
> static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
> @@ -1313,7 +1317,7 @@ void
> pmap_init(void)
> {
> struct pmap_preinit_mapping *ppim;
> - vm_page_t mpte;
> + vm_page_t m, mpte;
> vm_size_t s;
> int error, i, pv_npg, ret, skz63;
>
> @@ -1440,6 +1444,28 @@ pmap_init(void)
> (vmem_addr_t *)&qframe);
> if (error != 0)
> panic("qframe allocation failed");
> +
> + lm_ents = 8;
> + TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
> + if (lm_ents > LMEPML4I - LMSPML4I + 1)
> + lm_ents = LMEPML4I - LMSPML4I + 1;
> + if (bootverbose)
> + printf("pmap: large map %u PML4 slots (%lu Gb)\n",
Isn't this GB (gigabyte) rather than Gb (gigabit)?
> + lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
> + if (lm_ents != 0) {
> + large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
> + (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
> + if (large_vmem == NULL) {
> + printf("pmap: cannot create large map\n");
> + lm_ents = 0;
> + }
> + for (i = 0; i < lm_ents; i++) {
> + m = pmap_large_map_getptp_unlocked();
> + kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
> + X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
> + VM_PAGE_TO_PHYS(m);
> + }
> + }
> }
>
> static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
> @@ -2315,14 +2341,6 @@ pmap_force_invalidate_cache_range(vm_offset_t sva,
> vm_
> {
>
> sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
> - if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) {
> - /*
> - * The supplied range is bigger than 2MB.
> - * Globally invalidate cache.
> - */
> - pmap_invalidate_cache();
> - return;
> - }
>
> /*
> * XXX: Some CPUs fault, hang, or trash the local APIC
> @@ -2406,6 +2424,64 @@ pmap_invalidate_cache_pages(vm_page_t *pages, int
> coun
> }
> }
>
> +void
> +pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
> +{
> +
> + pmap_invalidate_cache_range_check_align(sva, eva);
> +
> + if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) {
> + pmap_force_invalidate_cache_range(sva, eva);
> + return;
> + }
> +
> + /* See comment in pmap_force_invalidate_cache_range(). */
> + if (pmap_kextract(sva) == lapic_paddr)
> + return;
> +
> + sfence();
> + for (; sva < eva; sva += cpu_clflush_line_size)
> + clwb(sva);
> + sfence();
> +}
> +
> +void
> +pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t
> mattr)
> +{
> + pt_entry_t *pte;
> + vm_offset_t vaddr;
> + int error, pte_bits;
> +
> + KASSERT((spa & PAGE_MASK) == 0,
> + ("pmap_flush_cache_phys_range: spa not page-aligned"));
> + KASSERT((epa & PAGE_MASK) == 0,
> + ("pmap_flush_cache_phys_range: epa not page-aligned"));
> +
> + if (spa < dmaplimit) {
> + pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
> + dmaplimit, epa)));
> + if (dmaplimit >= epa)
> + return;
> + spa = dmaplimit;
> + }
> +
> + pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
> + X86_PG_V;
> + error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
> + &vaddr);
> + KASSERT(error == 0, ("vmem_alloc failed: %d", error));
> + pte = vtopte(vaddr);
> + for (; spa < epa; spa += PAGE_SIZE) {
> + sched_pin();
> + pte_store(pte, spa | pte_bits);
> + invlpg(vaddr);
> + /* XXXKIB sfences inside flush_cache_range are excessive */
> + pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
> + sched_unpin();
> + }
> + vmem_free(kernel_arena, vaddr, PAGE_SIZE);
> +}
> +
> /*
> * Routine: pmap_extract
> * Function:
> @@ -2812,6 +2888,10 @@ pmap_pinit_pml4(vm_page_t pml4pg)
> /* install self-referential address mapping entry(s) */
> pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
> X86_PG_A | X86_PG_M;
> +
> + /* install large map entries if configured */
> + for (i = 0; i < lm_ents; i++)
> + pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
> }
>
> static void
> @@ -3158,6 +3238,8 @@ pmap_release(pmap_t pmap)
> for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
> pmap->pm_pml4[DMPML4I + i] = 0;
> pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
> + for (i = 0; i < lm_ents; i++) /* Large Map */
> + pmap->pm_pml4[LMSPML4I + i] = 0;
>
> vm_page_unwire_noq(m);
> vm_page_free_zero(m);
> @@ -8099,6 +8181,477 @@ pmap_quick_remove_page(vm_offset_t addr)
> pte_store(vtopte(qframe), 0);
> invlpg(qframe);
> mtx_unlock_spin(&qframe_mtx);
> +}
> +
> +/*
> + * Pdp pages from the large map are managed differently from either
> + * kernel or user page table pages. They are permanently allocated at
> + * initialization time, and their wire count is permanently set to
> + * zero. The pml4 entries pointing to those pages are copied into
> + * each allocated pmap.
> + *
> + * In contrast, pd and pt pages are managed like user page table
> + * pages. They are dynamically allocated, and their wire count
> + * represents the number of valid entries within the page.
> + */
> +static vm_page_t
> +pmap_large_map_getptp_unlocked(void)
> +{
> + vm_page_t m;
> +
> + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
> + VM_ALLOC_ZERO);
> + if (m != NULL && (m->flags & PG_ZERO) == 0)
> + pmap_zero_page(m);
> + return (m);
> +}
> +
> +static vm_page_t
> +pmap_large_map_getptp(void)
> +{
> + vm_page_t m;
> +
> + PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
> + m = pmap_large_map_getptp_unlocked();
> + if (m == NULL) {
> + PMAP_UNLOCK(kernel_pmap);
> + vm_wait(NULL);
> + PMAP_LOCK(kernel_pmap);
> + /* Callers retry. */
> + }
> + return (m);
> +}
> +
> +static pdp_entry_t *
> +pmap_large_map_pdpe(vm_offset_t va)
> +{
> + vm_pindex_t pml4_idx;
> + vm_paddr_t mphys;
> +
> + pml4_idx = pmap_pml4e_index(va);
> + KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
> + ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
> + "%#jx lm_ents %d",
> + (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
> + KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
> + ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
> + "LMSPML4I %#jx lm_ents %d",
> + (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
> + mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
> + return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
> +}
> +
> +static pd_entry_t *
> +pmap_large_map_pde(vm_offset_t va)
> +{
> + pdp_entry_t *pdpe;
> + vm_page_t m;
> + vm_paddr_t mphys;
> +
> +retry:
> + pdpe = pmap_large_map_pdpe(va);
> + if (*pdpe == 0) {
> + m = pmap_large_map_getptp();
> + if (m == NULL)
> + goto retry;
> + mphys = VM_PAGE_TO_PHYS(m);
> + *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
> + } else {
> + MPASS((*pdpe & X86_PG_PS) == 0);
> + mphys = *pdpe & PG_FRAME;
> + }
> + return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va));
> +}
> +
> +static pt_entry_t *
> +pmap_large_map_pte(vm_offset_t va)
> +{
> + pd_entry_t *pde;
> + vm_page_t m;
> + vm_paddr_t mphys;
> +
> +retry:
> + pde = pmap_large_map_pde(va);
> + if (*pde == 0) {
> + m = pmap_large_map_getptp();
> + if (m == NULL)
> + goto retry;
> + mphys = VM_PAGE_TO_PHYS(m);
> + *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
> + PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++;
> + } else {
> + MPASS((*pde & X86_PG_PS) == 0);
> + mphys = *pde & PG_FRAME;
> + }
> + return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va));
> +}
> +
> +static int
> +pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
> + vmem_addr_t *vmem_res)
> +{
> +
> + /*
> + * Large mappings are all but static. Consequently, there
> + * is no point in waiting for an earlier allocation to be
> + * freed.
> + */
> + return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
> + VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res));
> +}
> +
> +int
> +pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
> + vm_memattr_t mattr)
> +{
> + pdp_entry_t *pdpe;
> + pd_entry_t *pde;
> + pt_entry_t *pte;
> + vm_offset_t va, inc;
> + vmem_addr_t vmem_res;
> + vm_paddr_t pa;
> + int error;
> +
> + if (len == 0 || spa + len < spa)
> + return (EINVAL);
> +
> + /* See if DMAP can serve. */
> + if (spa + len <= dmaplimit) {
> + va = PHYS_TO_DMAP(spa);
> + *addr = (void *)va;
> + return (pmap_change_attr(va, len, mattr));
> + }
> +
> + /*
> + * No, allocate KVA. Fit the address with best possible
> + * alignment for superpages. Fall back to worse align if
> + * failed.
> + */
> + error = ENOMEM;
> + if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
> + NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
> + error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
> + &vmem_res);
> + if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
> + NBPDR) + NBPDR)
> + error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
> + &vmem_res);
> + if (error != 0)
> + error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
> + if (error != 0)
> + return (error);
> +
> + /*
> + * Fill pagetable. PG_M is not pre-set, we scan modified bits
> + * in the pagetable to minimize flushing. No need to
> + * invalidate TLB, since we only update invalid entries.
> + */
> + PMAP_LOCK(kernel_pmap);
> + for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
> + len -= inc) {
> + if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
> + (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
> + pdpe = pmap_large_map_pdpe(va);
> + MPASS(*pdpe == 0);
> + *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
> + X86_PG_V | X86_PG_A | pg_nx |
> + pmap_cache_bits(kernel_pmap, mattr, TRUE);
> + inc = NBPDP;
> + } else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
> + (va & PDRMASK) == 0) {
> + pde = pmap_large_map_pde(va);
> + MPASS(*pde == 0);
> + *pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
> + X86_PG_V | X86_PG_A | pg_nx |
> + pmap_cache_bits(kernel_pmap, mattr, TRUE);
> + PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
> + wire_count++;
> + inc = NBPDR;
> + } else {
> + pte = pmap_large_map_pte(va);
> + MPASS(*pte == 0);
> + *pte = pa | pg_g | X86_PG_RW | X86_PG_V |
> + X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
> + mattr, FALSE);
> + PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
> + wire_count++;
> + inc = PAGE_SIZE;
> + }
> + }
> + PMAP_UNLOCK(kernel_pmap);
> + MPASS(len == 0);
> +
> + *addr = (void *)vmem_res;
> + return (0);
> +}
> +
> +void
> +pmap_large_unmap(void *svaa, vm_size_t len)
> +{
> + vm_offset_t sva, va;
> + vm_size_t inc;
> + pdp_entry_t *pdpe, pdp;
> + pd_entry_t *pde, pd;
> + pt_entry_t *pte;
> + vm_page_t m;
> + struct spglist spgf;
> +
> + sva = (vm_offset_t)svaa;
> + if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
> + sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
> + return;
> +
> + SLIST_INIT(&spgf);
> + KASSERT(LARGEMAP_MIN_ADDRESS <= sva && sva + len <=
> + LARGEMAP_MAX_ADDRESS + NBPML4 * (u_long)lm_ents,
> + ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
> + PMAP_LOCK(kernel_pmap);
> + for (va = sva; va < sva + len; va += inc) {
> + pdpe = pmap_large_map_pdpe(va);
> + pdp = *pdpe;
> + KASSERT((pdp & X86_PG_V) != 0,
> + ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
> + (u_long)pdpe, pdp));
> + if ((pdp & X86_PG_PS) != 0) {
> + KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
> + ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
> + (u_long)pdpe, pdp));
> + KASSERT((va & PDPMASK) == 0,
> + ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
> + (u_long)pdpe, pdp));
> + KASSERT(len <= NBPDP,
> + ("len < NBPDP, sva %#lx va %#lx pdpe %#lx pdp %#lx "
> + "len %#lx", sva, va, (u_long)pdpe, pdp, len));
> + *pdpe = 0;
> + inc = NBPDP;
> + continue;
> + }
> + pde = pmap_pdpe_to_pde(pdpe, va);
> + pd = *pde;
> + KASSERT((pd & X86_PG_V) != 0,
> + ("invalid pd va %#lx pde %#lx pd %#lx", va,
> + (u_long)pde, pd));
> + if ((pd & X86_PG_PS) != 0) {
> + KASSERT((va & PDRMASK) == 0,
> + ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
> + (u_long)pde, pd));
> + KASSERT(len <= NBPDR,
> + ("len < NBPDR, sva %#lx va %#lx pde %#lx pd %#lx "
> + "len %#lx", sva, va, (u_long)pde, pd, len));
> + pde_store(pde, 0);
> + inc = NBPDR;
> + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
> + m->wire_count--;
> + if (m->wire_count == 0) {
> + *pdpe = 0;
> + SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
> + }
> + continue;
> + }
> + pte = pmap_pde_to_pte(pde, va);
> + KASSERT((*pte & X86_PG_V) != 0,
> + ("invalid pte va %#lx pte %#lx pt %#lx", va,
> + (u_long)pte, *pte));
> + pte_clear(pte);
> + inc = PAGE_SIZE;
> + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
> + m->wire_count--;
> + if (m->wire_count == 0) {
> + *pde = 0;
> + SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
> + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
> + m->wire_count--;
> + if (m->wire_count == 0) {
> + *pdpe = 0;
> + SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
> + }
> + }
> + }
> + pmap_invalidate_range(kernel_pmap, sva, sva + len);
> + PMAP_UNLOCK(kernel_pmap);
> + vm_page_free_pages_toq(&spgf, false);
> + vmem_free(large_vmem, sva, len);
> +}
> +
> +static void
> +pmap_large_map_wb_fence_mfence(void)
> +{
> +
> + mfence();
> +}
> +
> +static void
> +pmap_large_map_wb_fence_sfence(void)
> +{
> +
> + sfence();
> +}
> +
> +static void
> +pmap_large_map_wb_fence_nop(void)
> +{
> +}
> +
> +DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void), static)
> +{
> +
> + if (cpu_vendor_id != CPU_VENDOR_INTEL)
> + return (pmap_large_map_wb_fence_mfence);
> + else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
> + CPUID_STDEXT_CLFLUSHOPT)) == 0)
> + return (pmap_large_map_wb_fence_sfence);
> + else
> + /* clflush is strongly enough ordered */
> + return (pmap_large_map_wb_fence_nop);
> +}
> +
> +static void
> +pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
> +{
> +
> + for (; len > 0; len -= cpu_clflush_line_size,
> + va += cpu_clflush_line_size)
> + clwb(va);
> +}
> +
> +static void
> +pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
> +{
> +
> + for (; len > 0; len -= cpu_clflush_line_size,
> + va += cpu_clflush_line_size)
> + clflushopt(va);
> +}
> +
> +static void
> +pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
> +{
> +
> + for (; len > 0; len -= cpu_clflush_line_size,
> + va += cpu_clflush_line_size)
> + clflush(va);
> +}
> +
> +static void
> +pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len
> __unused)
> +{
> +}
> +
> +DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t,
> vm_size_t),
> + static)
> +{
> +
> + if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
> + return (pmap_large_map_flush_range_clwb);
> + else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
> + return (pmap_large_map_flush_range_clflushopt);
> + else if ((cpu_feature & CPUID_CLFSH) != 0)
> + return (pmap_large_map_flush_range_clflush);
> + else
> + return (pmap_large_map_flush_range_nop);
> +}
> +
> +static void
> +pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
> +{
> + volatile u_long *pe;
> + u_long p;
> + vm_offset_t va;
> + vm_size_t inc;
> + bool seen_other;
> +
> + for (va = sva; va < eva; va += inc) {
> + inc = 0;
> + if ((amd_feature & AMDID_PAGE1GB) != 0) {
> + pe = (volatile u_long *)pmap_large_map_pdpe(va);
> + p = *pe;
> + if ((p & X86_PG_PS) != 0)
> + inc = NBPDP;
> + }
> + if (inc == 0) {
> + pe = (volatile u_long *)pmap_large_map_pde(va);
> + p = *pe;
> + if ((p & X86_PG_PS) != 0)
> + inc = NBPDR;
> + }
> + if (inc == 0) {
> + pe = (volatile u_long *)pmap_large_map_pte(va);
> + p = *pe;
> + inc = PAGE_SIZE;
> + }
> + seen_other = false;
> + for (;;) {
> + if ((p & X86_PG_AVAIL1) != 0) {
> + /*
> + * Spin-wait for the end of a parallel
> + * write-back.
> + */
> + cpu_spinwait();
> + p = *pe;
> +
> + /*
> + * If we saw other write-back
> + * occurring, we cannot rely on PG_M to
> + * indicate state of the cache. The
> + * PG_M bit is cleared before the
> + * flush to avoid ignoring new writes,
> + * and writes which are relevant for
> + * us might happen after.
> + */
> + seen_other = true;
> + continue;
> + }
> +
> + if ((p & X86_PG_M) != 0 || seen_other) {
> + if (!atomic_fcmpset_long(pe, &p,
> + (p & ~X86_PG_M) | X86_PG_AVAIL1))
> + /*
> + * If we saw PG_M without
> + * PG_AVAIL1, and then on the
> + * next attempt we do not
> + * observe either PG_M or
> + * PG_AVAIL1, the other
> + * write-back started after us
> + * and finished before us. We
> + * can rely on it doing our
> + * work.
> + */
> + continue;
> + pmap_large_map_flush_range(va, inc);
> + atomic_clear_long(pe, X86_PG_AVAIL1);
> + }
> + break;
> + }
> + maybe_yield();
> + }
> +}
> +
> +/*
> + * Write-back cache lines for the given address range.
> + *
> + * Must be called only on the range or sub-range returned from
> + * pmap_large_map(). Must not be called on the coalesced ranges.
> + *
> + * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH
> + * instructions support.
> + */
> +void
> +pmap_large_map_wb(void *svap, vm_size_t len)
> +{
> + vm_offset_t eva, sva;
> +
> + sva = (vm_offset_t)svap;
> + eva = sva + len;
> + pmap_large_map_wb_fence();
> + if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
> + pmap_large_map_flush_range(sva, len);
> + } else {
> + KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
> + eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
> + ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
> + pmap_large_map_wb_large(sva, eva);
> + }
> + pmap_large_map_wb_fence();
> }
>
> static vm_page_t
>
> Modified: head/sys/amd64/include/pmap.h
> ==============================================================================
> --- head/sys/amd64/include/pmap.h Tue Oct 16 17:17:11 2018 (r339385)
> +++ head/sys/amd64/include/pmap.h Tue Oct 16 17:28:10 2018 (r339386)
> @@ -216,6 +216,10 @@
> #define KPML4I (NPML4EPG-1)
> #define KPDPI (NPDPEPG-2) /* kernbase at -2GB */
>
> +/* Large map: index of the first and max last pml4 entry */
> +#define LMSPML4I (PML4PML4I + 1)
> +#define LMEPML4I (DMPML4I - 1)
> +
> /*
> * XXX doesn't really belong here I guess...
> */
> @@ -413,11 +417,16 @@ void pmap_bootstrap(vm_paddr_t *);
> int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde);
> int pmap_change_attr(vm_offset_t, vm_size_t, int);
> void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t
> invalidate);
> +void pmap_flush_cache_range(vm_offset_t, vm_offset_t);
> +void pmap_flush_cache_phys_range(vm_paddr_t, vm_paddr_t, vm_memattr_t);
> void pmap_init_pat(void);
> void pmap_kenter(vm_offset_t va, vm_paddr_t pa);
> void *pmap_kenter_temporary(vm_paddr_t pa, int i);
> vm_paddr_t pmap_kextract(vm_offset_t);
> void pmap_kremove(vm_offset_t);
> +int pmap_large_map(vm_paddr_t, vm_size_t, void **, vm_memattr_t);
> +void pmap_large_map_wb(void *sva, vm_size_t len);
> +void pmap_large_unmap(void *sva, vm_size_t len);
> void *pmap_mapbios(vm_paddr_t, vm_size_t);
> void *pmap_mapdev(vm_paddr_t, vm_size_t);
> void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
>
> Modified: head/sys/amd64/include/vmparam.h
> ==============================================================================
> --- head/sys/amd64/include/vmparam.h Tue Oct 16 17:17:11 2018 (r339385)
> +++ head/sys/amd64/include/vmparam.h Tue Oct 16 17:28:10 2018 (r339386)
> @@ -156,7 +156,9 @@
> * 0x0000000000000000 - 0x00007fffffffffff user map
> * 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole)
> * 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB
> slot)
> - * 0xffff804020101000 - 0xfffff7ffffffffff unused
> + * 0xffff804020100fff - 0xffff807fffffffff unused
> + * 0xffff808000000000 - 0xffff847fffffffff large map (can be tuned up)
> + * 0xffff848000000000 - 0xfffff7ffffffffff unused (large map extends
> there)
> * 0xfffff80000000000 - 0xfffffbffffffffff 4TB direct map
> * 0xfffffc0000000000 - 0xfffffdffffffffff unused
> * 0xfffffe0000000000 - 0xffffffffffffffff 2TB kernel map
> @@ -172,6 +174,9 @@
>
> #define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0)
> #define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
> +
> +#define LARGEMAP_MIN_ADDRESS KVADDR(LMSPML4I, 0, 0, 0)
> +#define LARGEMAP_MAX_ADDRESS KVADDR(LMEPML4I + 1, 0, 0, 0)
>
> #define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0)
>
> _______________________________________________
> svn-src-head at freebsd.org mailing list
> https://lists.freebsd.org/mailman/listinfo/svn-src-head
> To unsubscribe, send any mail to "svn-src-head-unsubscribe at freebsd.org"
>
More information about the svn-src-all
mailing list