git: a165b4591e48 - releng/12.2 - MFC r368649 / 3fd989da by kib: amd64 pmap: fix PCID mode invalidations
Date: Tue, 11 Jan 2022 18:32:04 UTC
The branch releng/12.2 has been updated by emaste:
URL: https://cgit.FreeBSD.org/src/commit/?id=a165b4591e48cd2adce8215fca73147c016e6cea
commit a165b4591e48cd2adce8215fca73147c016e6cea
Author: Andriy Gapon <avg@FreeBSD.org>
AuthorDate: 2021-12-14 14:43:29 +0000
Commit: Ed Maste <emaste@FreeBSD.org>
CommitDate: 2022-01-10 18:04:06 +0000
MFC r368649 / 3fd989da by kib: amd64 pmap: fix PCID mode invalidations
r368649 fixed a regression introduced in r362031, which was MFC-ed to
stable/12 as part of r362572. The regressing commit reordered the IPI
send and the local TLB flush performed during TLB invalidations.
Without this fix we've been seeing problems with stale memory content,
where changes done under a mutex were not immediately observed by
another thread after taking the same mutex. Those inconsistencies were
correlated with copy-on-write faults for pages containing the data.
The change needed some adaptations as I elected to skip two significant
intermediate changes:
- r363195 / dc43978a, amd64: allow parallel shootdown IPIs
- r363311 / 3ec7e169, amd64 pmap: microoptimize local shootdowns for
PCID PTI configurations
Reviewed by: kib
Differential Revision: https://reviews.freebsd.org/D33413
(cherry picked from commit 1820ca2154611d6f27ce5a5fdd561a16ac54fdd8)
Approved by: so
Errata: FreeBSD-EN-22:04.pcid
---
sys/amd64/amd64/pmap.c | 273 ++++++++++++++++++++++++-------------------------
sys/x86/x86/mp_x86.c | 32 ++++--
2 files changed, 153 insertions(+), 152 deletions(-)
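In outline, the fixed invalidation path has the following shape. This is
only a sketch: example_invalidate() is a hypothetical wrapper, while
pmap_invalidate_preipi(), pmap_invalidate_cpu_mask(), smp_masked_invlpg()
and pmap_invalidate_page_curcpu_cb() are the functions visible in the
diff below.

/*
 * Sketch of the ordering the patch establishes (hypothetical wrapper,
 * real callees from the diff below).
 */
static void
example_invalidate(pmap_t pmap, vm_offset_t va)
{
	/*
	 * Pin the thread and, in PCID mode, zero the cached PCID
	 * generations of the other CPUs before any IPI is sent, so a
	 * CPU that switches to this pmap concurrently cannot keep
	 * using a stale translation.
	 */
	pmap_invalidate_preipi(pmap);

	/*
	 * Send the IPIs; the local flush callback runs in a critical
	 * section while waiting for the remote CPUs, and the shootdown
	 * code unpins the thread before returning.
	 */
	smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
	    pmap_invalidate_page_curcpu_cb);
}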
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index a030fb3fd21b..aa72830ff3e5 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -2328,45 +2328,19 @@ pmap_invalidate_ept(pmap_t pmap)
static cpuset_t
pmap_invalidate_cpu_mask(pmap_t pmap)
{
-
return (pmap == kernel_pmap ? all_cpus : pmap->pm_active);
}
static inline void
-pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
- const bool invpcid_works1)
+pmap_invalidate_preipi_pcid(pmap_t pmap)
{
- struct invpcid_descr d;
- uint64_t kcr3, ucr3;
- uint32_t pcid;
u_int cpuid, i;
+ sched_pin();
+
cpuid = PCPU_GET(cpuid);
- if (pmap == PCPU_GET(curpmap)) {
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- /*
- * Because pm_pcid is recalculated on a
- * context switch, we must disable switching.
- * Otherwise, we might use a stale value
- * below.
- */
- critical_enter();
- pcid = pmap->pm_pcids[cpuid].pm_pcid;
- if (invpcid_works1) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = va;
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlpg(ucr3, kcr3, va);
- }
- critical_exit();
- }
- } else
- pmap->pm_pcids[cpuid].pm_gen = 0;
+ if (pmap != PCPU_GET(curpmap))
+ cpuid = 0xffffffff; /* An impossible value */
CPU_FOREACH(i) {
if (cpuid != i)
@@ -2387,52 +2361,96 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
}
static void
-pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_preipi_nopcid(pmap_t pmap __unused)
{
+ sched_pin();
+}
- pmap_invalidate_page_pcid(pmap, va, true);
+DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t), static)
+{
+ return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid :
+ pmap_invalidate_preipi_nopcid);
+}
+
+static inline void
+pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va,
+ const bool invpcid_works1)
+{
+ struct invpcid_descr d;
+ uint64_t kcr3, ucr3;
+ uint32_t pcid;
+ u_int cpuid;
+
+ /*
+ * Because pm_pcid is recalculated on a context switch, we
+ * must ensure there is no preemption, not just pinning.
+ * Otherwise, we might use a stale value below.
+ */
+ CRITICAL_ASSERT(curthread);
+
+ /*
+ * No need to do anything with user page tables invalidation
+ * if there is no user page table.
+ */
+ if (pmap->pm_ucr3 == PMAP_NO_CR3)
+ return;
+
+ cpuid = PCPU_GET(cpuid);
+
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
+ if (invpcid_works1) {
+ d.pcid = pcid | PMAP_PCID_USER_PT;
+ d.pad = 0;
+ d.addr = va;
+ invpcid(&d, INVPCID_ADDR);
+ } else {
+ kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+ ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+ pmap_pti_pcid_invlpg(ucr3, kcr3, va);
+ }
}
static void
-pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va)
{
+ pmap_invalidate_page_pcid_cb(pmap, va, true);
+}
- pmap_invalidate_page_pcid(pmap, va, false);
+static void
+pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va)
+{
+ pmap_invalidate_page_pcid_cb(pmap, va, false);
}
static void
-pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
+pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused)
{
}
-DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t),
+DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t),
static)
{
-
if (pmap_pcid_enabled)
- return (invpcid_works ? pmap_invalidate_page_pcid_invpcid :
- pmap_invalidate_page_pcid_noinvpcid);
- return (pmap_invalidate_page_nopcid);
+ return (invpcid_works ? pmap_invalidate_page_pcid_invpcid_cb :
+ pmap_invalidate_page_pcid_noinvpcid_cb);
+ return (pmap_invalidate_page_nopcid_cb);
}
static void
pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
vm_offset_t addr2 __unused)
{
-
if (pmap == kernel_pmap) {
invlpg(va);
- } else {
- if (pmap == PCPU_GET(curpmap))
- invlpg(va);
- pmap_invalidate_page_mode(pmap, va);
+ } else if (pmap == PCPU_GET(curpmap)) {
+ invlpg(va);
+ pmap_invalidate_page_cb(pmap, va);
}
}
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
-
if (pmap_type_guest(pmap)) {
pmap_invalidate_ept(pmap);
return;
@@ -2441,6 +2459,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
KASSERT(pmap->pm_type == PT_X86,
("pmap_invalidate_page: invalid type %d", pmap->pm_type));
+ pmap_invalidate_preipi(pmap);
smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap,
pmap_invalidate_page_curcpu_cb);
}
@@ -2449,73 +2468,62 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
#define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE)
static void
-pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
const bool invpcid_works1)
{
struct invpcid_descr d;
uint64_t kcr3, ucr3;
uint32_t pcid;
- u_int cpuid, i;
+ u_int cpuid;
+
+ CRITICAL_ASSERT(curthread);
+
+ if (pmap != PCPU_GET(curpmap) ||
+ pmap->pm_ucr3 == PMAP_NO_CR3)
+ return;
cpuid = PCPU_GET(cpuid);
- if (pmap == PCPU_GET(curpmap)) {
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- critical_enter();
- pcid = pmap->pm_pcids[cpuid].pm_pcid;
- if (invpcid_works1) {
- d.pcid = pcid | PMAP_PCID_USER_PT;
- d.pad = 0;
- d.addr = sva;
- for (; d.addr < eva; d.addr += PAGE_SIZE)
- invpcid(&d, INVPCID_ADDR);
- } else {
- kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
- ucr3 = pmap->pm_ucr3 | pcid |
- PMAP_PCID_USER_PT | CR3_PCID_SAVE;
- pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
- }
- critical_exit();
- }
- } else
- pmap->pm_pcids[cpuid].pm_gen = 0;
- CPU_FOREACH(i) {
- if (cpuid != i)
- pmap->pm_pcids[i].pm_gen = 0;
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
+ if (invpcid_works1) {
+ d.pcid = pcid | PMAP_PCID_USER_PT;
+ d.pad = 0;
+ for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE)
+ invpcid(&d, INVPCID_ADDR);
+ } else {
+ kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+ ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+ pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
}
- /* See the comment in pmap_invalidate_page_pcid(). */
- atomic_thread_fence_seq_cst();
}
static void
-pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva,
+pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva,
vm_offset_t eva)
{
-
- pmap_invalidate_range_pcid(pmap, sva, eva, true);
+ pmap_invalidate_range_pcid_cb(pmap, sva, eva, true);
}
static void
-pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva,
+pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva,
vm_offset_t eva)
{
-
- pmap_invalidate_range_pcid(pmap, sva, eva, false);
+ pmap_invalidate_range_pcid_cb(pmap, sva, eva, false);
}
static void
-pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused,
+ vm_offset_t eva __unused)
{
}
-DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t,
+DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t,
vm_offset_t), static)
{
-
if (pmap_pcid_enabled)
- return (invpcid_works ? pmap_invalidate_range_pcid_invpcid :
- pmap_invalidate_range_pcid_noinvpcid);
- return (pmap_invalidate_range_nopcid);
+ return (invpcid_works ? pmap_invalidate_range_pcid_invpcid_cb :
+ pmap_invalidate_range_pcid_noinvpcid_cb);
+ return (pmap_invalidate_range_nopcid_cb);
}
static void
@@ -2526,19 +2534,16 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
if (pmap == kernel_pmap) {
for (addr = sva; addr < eva; addr += PAGE_SIZE)
invlpg(addr);
- } else {
- if (pmap == PCPU_GET(curpmap)) {
- for (addr = sva; addr < eva; addr += PAGE_SIZE)
- invlpg(addr);
- }
- pmap_invalidate_range_mode(pmap, sva, eva);
+ } else if (pmap == PCPU_GET(curpmap)) {
+ for (addr = sva; addr < eva; addr += PAGE_SIZE)
+ invlpg(addr);
+ pmap_invalidate_range_cb(pmap, sva, eva);
}
}
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
-
if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
pmap_invalidate_all(pmap);
return;
@@ -2552,17 +2557,18 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
KASSERT(pmap->pm_type == PT_X86,
("pmap_invalidate_range: invalid type %d", pmap->pm_type));
+ pmap_invalidate_preipi(pmap);
smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap,
pmap_invalidate_range_curcpu_cb);
}
static inline void
-pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
+pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1)
{
struct invpcid_descr d;
uint64_t kcr3, ucr3;
uint32_t pcid;
- u_int cpuid, i;
+ u_int cpuid;
if (pmap == kernel_pmap) {
if (invpcid_works1) {
@@ -2571,87 +2577,72 @@ pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
} else {
invltlb_glob();
}
- } else {
+ } else if (pmap == PCPU_GET(curpmap)) {
+ CRITICAL_ASSERT(curthread);
cpuid = PCPU_GET(cpuid);
- if (pmap == PCPU_GET(curpmap)) {
- critical_enter();
- pcid = pmap->pm_pcids[cpuid].pm_pcid;
- if (invpcid_works1) {
- d.pcid = pcid;
- d.pad = 0;
- d.addr = 0;
+
+ pcid = pmap->pm_pcids[cpuid].pm_pcid;
+ if (invpcid_works1) {
+ d.pcid = pcid;
+ d.pad = 0;
+ d.addr = 0;
+ invpcid(&d, INVPCID_CTX);
+ if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+ d.pcid |= PMAP_PCID_USER_PT;
invpcid(&d, INVPCID_CTX);
- if (pmap->pm_ucr3 != PMAP_NO_CR3) {
- d.pcid |= PMAP_PCID_USER_PT;
- invpcid(&d, INVPCID_CTX);
- }
+ }
+ } else {
+ kcr3 = pmap->pm_cr3 | pcid;
+ ucr3 = pmap->pm_ucr3;
+ if (ucr3 != PMAP_NO_CR3) {
+ ucr3 |= pcid | PMAP_PCID_USER_PT;
+ pmap_pti_pcid_invalidate(ucr3, kcr3);
} else {
- kcr3 = pmap->pm_cr3 | pcid;
- ucr3 = pmap->pm_ucr3;
- if (ucr3 != PMAP_NO_CR3) {
- ucr3 |= pcid | PMAP_PCID_USER_PT;
- pmap_pti_pcid_invalidate(ucr3, kcr3);
- } else {
- load_cr3(kcr3);
- }
+ load_cr3(kcr3);
}
- critical_exit();
- } else
- pmap->pm_pcids[cpuid].pm_gen = 0;
- CPU_FOREACH(i) {
- if (cpuid != i)
- pmap->pm_pcids[i].pm_gen = 0;
}
}
- /* See the comment in pmap_invalidate_page_pcid(). */
- atomic_thread_fence_seq_cst();
}
static void
-pmap_invalidate_all_pcid_invpcid(pmap_t pmap)
+pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap)
{
-
- pmap_invalidate_all_pcid(pmap, true);
+ pmap_invalidate_all_pcid_cb(pmap, true);
}
static void
-pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap)
+pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap)
{
-
- pmap_invalidate_all_pcid(pmap, false);
+ pmap_invalidate_all_pcid_cb(pmap, false);
}
static void
-pmap_invalidate_all_nopcid(pmap_t pmap)
+pmap_invalidate_all_nopcid_cb(pmap_t pmap)
{
-
if (pmap == kernel_pmap)
invltlb_glob();
else if (pmap == PCPU_GET(curpmap))
invltlb();
}
-DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static)
+DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t), static)
{
-
if (pmap_pcid_enabled)
- return (invpcid_works ? pmap_invalidate_all_pcid_invpcid :
- pmap_invalidate_all_pcid_noinvpcid);
- return (pmap_invalidate_all_nopcid);
+ return (invpcid_works ? pmap_invalidate_all_pcid_invpcid_cb :
+ pmap_invalidate_all_pcid_noinvpcid_cb);
+ return (pmap_invalidate_all_nopcid_cb);
}
static void
pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused,
vm_offset_t addr2 __unused)
{
-
- pmap_invalidate_all_mode(pmap);
+ pmap_invalidate_all_cb(pmap);
}
void
pmap_invalidate_all(pmap_t pmap)
{
-
if (pmap_type_guest(pmap)) {
pmap_invalidate_ept(pmap);
return;
@@ -2660,6 +2651,7 @@ pmap_invalidate_all(pmap_t pmap)
KASSERT(pmap->pm_type == PT_X86,
("pmap_invalidate_all: invalid type %d", pmap->pm_type));
+ pmap_invalidate_preipi(pmap);
smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap,
pmap_invalidate_all_curcpu_cb);
}
@@ -2668,14 +2660,13 @@ static void
pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused,
vm_offset_t addr2 __unused)
{
-
wbinvd();
}
void
pmap_invalidate_cache(void)
{
-
+ sched_pin();
smp_cache_flush(pmap_invalidate_cache_curcpu_cb);
}
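The per-mode callbacks above are selected once at boot through the
kernel's ifunc mechanism rather than by a branch on every call. A
generic sketch of the pattern, mirroring the DEFINE_IFUNC form this
branch uses (do_flush* and the feature split are hypothetical;
invpcid_works is the real flag tested above):

static void
do_flush_invpcid(void)
{
	/* Would use the INVPCID instruction. */
}

static void
do_flush_cr3(void)
{
	/* Would fall back to reloading %cr3. */
}

DEFINE_IFUNC(static, void, do_flush, (void), static)
{
	/*
	 * The resolver runs once during early boot; every later call
	 * to do_flush() jumps directly to the selected implementation.
	 */
	return (invpcid_works ? do_flush_invpcid : do_flush_cr3);
}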
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index f24e406a52b4..813c006d4c63 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1643,13 +1643,16 @@ volatile uint32_t smp_tlb_generation;
* Used by pmap to request invalidation of TLB or cache on local and
* remote processors. Mask provides the set of remote CPUs which are
* to be signalled with the IPI specified by vector. The curcpu_cb
- * callback is invoked on the calling CPU while waiting for remote
- * CPUs to complete the operation.
+ * callback is invoked on the calling CPU in a critical section while
+ * waiting for remote CPUs to complete the operation.
*
* The callback function is called unconditionally on the caller's
* underlying processor, even when this processor is not set in the
* mask. So, the callback function must be prepared to handle such
* spurious invocations.
+ *
+ * This function must be called with the thread pinned, and it unpins on
+ * completion.
*/
static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
@@ -1664,23 +1667,21 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
* It is not necessary to signal other CPUs while booting or
* when in the debugger.
*/
- if (kdb_active || panicstr != NULL || !smp_started) {
- curcpu_cb(pmap, addr1, addr2);
- return;
- }
+ if (kdb_active || panicstr != NULL || !smp_started)
+ goto local_cb;
- sched_pin();
+ KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
/*
* Check for other cpus. Return if none.
*/
if (CPU_ISFULLSET(&mask)) {
if (mp_ncpus <= 1)
- goto nospinexit;
+ goto local_cb;
} else {
CPU_CLR(PCPU_GET(cpuid), &mask);
if (CPU_EMPTY(&mask))
- goto nospinexit;
+ goto local_cb;
}
if (!(read_eflags() & PSL_I))
@@ -1712,13 +1713,22 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
while (*p_cpudone != generation)
ia32_pause();
}
- mtx_unlock_spin(&smp_ipi_mtx);
+
+ /*
+ * Unpin before unlocking smp_ipi_mtx. If the thread owes
+ * preemption, this allows scheduler to select thread on any
+ * CPU from its cpuset.
+ */
sched_unpin();
+ mtx_unlock_spin(&smp_ipi_mtx);
+
return;
-nospinexit:
+local_cb:
+ critical_enter();
curcpu_cb(pmap, addr1, addr2);
sched_unpin();
+ critical_exit();
}
void
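The wait loop above (while (*p_cpudone != generation)) is a per-CPU
generation handshake: the initiator publishes a new generation, signals
the targets, and spins until every target has echoed it back. A
self-contained userland sketch of the idea, with all names hypothetical
stand-ins rather than kernel API:

#include <stdatomic.h>
#include <stdint.h>

#define NCPU 4

static _Atomic uint32_t tlb_generation;
static _Atomic uint32_t cpu_done[NCPU];

static void send_ipi_to_all(void) { }	/* stand-in for the IPI send */
static void flush_local_tlb(void) { }	/* stand-in for invltlb() */

/*
 * Initiator: bump the generation, signal the targets, then spin until
 * every target has published the new generation.
 */
static void
initiator(void)
{
	uint32_t gen;

	gen = atomic_fetch_add(&tlb_generation, 1) + 1;
	send_ipi_to_all();
	for (int i = 0; i < NCPU; i++)
		while (atomic_load(&cpu_done[i]) != gen)
			;	/* the kernel uses ia32_pause() here */
}

/*
 * IPI handler on each target CPU: perform the flush, then publish the
 * generation so the initiator can stop waiting.
 */
static void
ipi_handler(int cpu)
{
	uint32_t gen = atomic_load(&tlb_generation);

	flush_local_tlb();
	atomic_store(&cpu_done[cpu], gen);
}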