git: 649db49403a7 - releng/15.0 - pkru: Fix handling of 1GB largepage mappings
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Tue, 21 Apr 2026 15:44:28 UTC
The branch releng/15.0 has been updated by markj:
URL: https://cgit.FreeBSD.org/src/commit/?id=649db49403a727739552bf2d546ca2a08df1d944
commit 649db49403a727739552bf2d546ca2a08df1d944
Author: Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2026-03-31 13:37:43 +0000
Commit: Mark Johnston <markj@FreeBSD.org>
CommitDate: 2026-04-20 19:41:17 +0000
pkru: Fix handling of 1GB largepage mappings
pmap_pkru_update_range() did not handle the case where a PDPE has PG_PS
set. More generally, the SET_PKRU and CLEAR_PKRU sysarch
implementations did not check whether the request only partially covers a
"boundary" vm map entry. Fix this, add the missing PG_PS test, and add some
tests.
Approved by: so
Security: FreeBSD-SA-26:11.amd64
Security: CVE-2026-6386
Reported by: Nicholas Carlini <npc@anthropic.com>
Reviewed by: kib, alc
Differential Revision: https://reviews.freebsd.org/D56184
---
lib/libsys/x86/pkru.3 | 3 +
sys/amd64/amd64/pmap.c | 20 +++-
sys/amd64/amd64/sys_machdep.c | 43 +++++++--
sys/vm/vm_map.c | 32 +++++++
sys/vm/vm_map.h | 1 +
tests/sys/posixshm/posixshm_test.c | 187 +++++++++++++++++++++++++++++++++++++
6 files changed, 274 insertions(+), 12 deletions(-)
diff --git a/lib/libsys/x86/pkru.3 b/lib/libsys/x86/pkru.3
index 95bc66c979ac..033dc07c4b06 100644
--- a/lib/libsys/x86/pkru.3
+++ b/lib/libsys/x86/pkru.3
@@ -179,6 +179,9 @@ The supplied
argument for
.Fn x86_pkru_protect_range
has reserved bits set.
+.It Bq Er EINVAL
+The range of the request partially covers a mapping of an object created by
+.Xr shm_create_largepage 3 .
.It Bq Er EFAULT
The supplied address range does not completely fit into the user-managed
address range.
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 243a6625bece..573f78fe3da4 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -11542,7 +11542,7 @@ pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
u_int keyidx)
{
pml4_entry_t *pml4e;
- pdp_entry_t *pdpe;
+ pdp_entry_t newpdpe, *pdpe;
pd_entry_t newpde, ptpaddr, *pde;
pt_entry_t newpte, *ptep, pte;
vm_offset_t va, va_next;
@@ -11568,6 +11568,22 @@ pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
va_next = eva;
continue;
}
+ if ((*pdpe & PG_PS) != 0) {
+ va_next = (va + NBPDP) & ~PDPMASK;
+ if (va_next < va)
+ va_next = eva;
+ KASSERT(va_next <= eva,
+ ("partial update of non-transparent 1G mapping "
+ "pdpe %#lx va %#lx eva %#lx va_next %#lx",
+ *pdpe, va, eva, va_next));
+ newpdpe = (*pdpe & ~X86_PG_PKU_MASK) |
+ X86_PG_PKU(keyidx);
+ if (newpdpe != *pdpe) {
+ *pdpe = newpdpe;
+ changed = true;
+ }
+ continue;
+ }
va_next = (va + NBPDR) & ~PDRMASK;
if (va_next < va)
@@ -11620,8 +11636,6 @@ pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
(flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
return (EINVAL);
- if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
- return (EFAULT);
if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
return (ENOTSUP);
return (0);
diff --git a/sys/amd64/amd64/sys_machdep.c b/sys/amd64/amd64/sys_machdep.c
index 51f55687bbcf..1df73a25c05e 100644
--- a/sys/amd64/amd64/sys_machdep.c
+++ b/sys/amd64/amd64/sys_machdep.c
@@ -30,7 +30,6 @@
* SUCH DAMAGE.
*/
-#include <sys/cdefs.h>
#include "opt_capsicum.h"
#include "opt_ktrace.h"
@@ -369,32 +368,58 @@ sysarch(struct thread *td, struct sysarch_args *uap)
break;
case I386_SET_PKRU:
- case AMD64_SET_PKRU:
+ case AMD64_SET_PKRU: {
+ vm_offset_t addr, start, end;
+ vm_size_t len;
+
+ addr = (uintptr_t)a64pkru.addr;
+ len = a64pkru.len;
+
/*
* Read-lock the map to synchronize with parallel
* pmap_vmspace_copy() on fork.
*/
map = &td->td_proc->p_vmspace->vm_map;
vm_map_lock_read(map);
- error = pmap_pkru_set(PCPU_GET(curpmap),
- (vm_offset_t)a64pkru.addr, (vm_offset_t)a64pkru.addr +
- a64pkru.len, a64pkru.keyidx, a64pkru.flags);
+ if (len == 0 || !vm_map_check_boundary(map, addr, addr + len)) {
+ vm_map_unlock_read(map);
+ error = EINVAL;
+ break;
+ }
+ start = trunc_page(addr);
+ end = round_page(addr + len);
+ error = pmap_pkru_set(PCPU_GET(curpmap), start, end,
+ a64pkru.keyidx, a64pkru.flags);
vm_map_unlock_read(map);
break;
+ }
case I386_CLEAR_PKRU:
- case AMD64_CLEAR_PKRU:
+ case AMD64_CLEAR_PKRU: {
+ vm_offset_t addr, start, end;
+ vm_size_t len;
+
if (a64pkru.flags != 0 || a64pkru.keyidx != 0) {
error = EINVAL;
break;
}
+
+ addr = (uintptr_t)a64pkru.addr;
+ len = a64pkru.len;
+
map = &td->td_proc->p_vmspace->vm_map;
vm_map_lock_read(map);
- error = pmap_pkru_clear(PCPU_GET(curpmap),
- (vm_offset_t)a64pkru.addr,
- (vm_offset_t)a64pkru.addr + a64pkru.len);
+ if (len == 0 || !vm_map_check_boundary(map, addr, addr + len)) {
+ vm_map_unlock_read(map);
+ error = EINVAL;
+ break;
+ }
+ start = trunc_page(addr);
+ end = round_page(addr + len);
+ error = pmap_pkru_clear(PCPU_GET(curpmap), start, end);
vm_map_unlock_read(map);
break;
+ }
case AMD64_DISABLE_TLSBASE:
clear_pcb_flags(pcb, PCB_TLSBASE);
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 6b09552c5fee..e43a061617ad 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -4146,6 +4146,38 @@ vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
return (TRUE);
}
+/*
+ * Check whether the specified range partially overlaps a map entry with
+ * fixed boundaries, and return false if so.
+ *
+ * The map must be locked.
+ */
+bool
+vm_map_check_boundary(vm_map_t map, vm_offset_t start, vm_offset_t end)
+{
+ vm_map_entry_t entry;
+ int bdry_idx;
+
+ if (!vm_map_range_valid(map, start, end))
+ return (false);
+ if (start == end)
+ return (true);
+
+ if (vm_map_lookup_entry(map, start, &entry)) {
+ bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
+ if (bdry_idx != 0 &&
+ (start & (pagesizes[bdry_idx] - 1)) != 0)
+ return (false);
+ }
+ if (vm_map_lookup_entry(map, end - 1, &entry)) {
+ bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
+ if (bdry_idx != 0 &&
+ (end & (pagesizes[bdry_idx] - 1)) != 0)
+ return (false);
+ }
+ return (true);
+}
+
/*
*
* vm_map_copy_swap_object:
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 6af3dba42685..0b0edb24a64d 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -479,6 +479,7 @@ vm_map_entry_read_succ(void *token, struct vm_map_entry *const clone,
#endif /* ! _KERNEL */
#ifdef _KERNEL
+bool vm_map_check_boundary(vm_map_t, vm_offset_t, vm_offset_t);
boolean_t vm_map_check_protection (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t);
int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t);
int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t,
diff --git a/tests/sys/posixshm/posixshm_test.c b/tests/sys/posixshm/posixshm_test.c
index 55514a5f4bde..1250fcb63a93 100644
--- a/tests/sys/posixshm/posixshm_test.c
+++ b/tests/sys/posixshm/posixshm_test.c
@@ -38,10 +38,17 @@
#include <sys/sysctl.h>
#include <sys/wait.h>
+#ifdef __amd64__
+#include <machine/sysarch.h>
+#endif
+
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
+#include <paths.h>
+#include <setjmp.h>
#include <signal.h>
+#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -1901,6 +1908,183 @@ ATF_TC_BODY(largepage_pipe, tc)
}
}
+#ifdef __amd64__
+static sigjmp_buf jmpbuf;
+static _Atomic(void *) faultaddr;
+static _Atomic(int) faultsig;
+
+#define KEY_RW 1
+#define KEY_RO 2
+#define KEY_WO 3
+#define KEY_NO 4
+#define VAL 0xdeadfacec0debeef
+static void
+set_keys(void)
+{
+ int error;
+
+ error = x86_pkru_set_perm(KEY_RW, 1, 1);
+ ATF_REQUIRE(error == 0);
+ error = x86_pkru_set_perm(KEY_RO, 1, 0);
+ ATF_REQUIRE(error == 0);
+ error = x86_pkru_set_perm(KEY_WO, 0, 1);
+ ATF_REQUIRE(error == 0);
+ error = x86_pkru_set_perm(KEY_NO, 0, 0);
+ ATF_REQUIRE(error == 0);
+}
+
+static void
+sigsegv(int sig, siginfo_t *si, void *uc __unused)
+{
+ faultsig = sig;
+ faultaddr = si->si_addr;
+ siglongjmp(jmpbuf, 1);
+}
+
+static bool
+try_read(volatile uint64_t *p, uint64_t *outp)
+{
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ *outp = *p;
+ return (true);
+ } else {
+ atomic_signal_fence(memory_order_relaxed);
+ ATF_REQUIRE(faultsig == SIGSEGV);
+ ATF_REQUIRE(faultaddr == p);
+ set_keys(); /* PKRU is not restored by siglongjmp? */
+ return (false);
+ }
+}
+
+static bool
+try_write(volatile uint64_t *p, uint64_t val)
+{
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ *p = val;
+ return (true);
+ } else {
+ atomic_signal_fence(memory_order_relaxed);
+ ATF_REQUIRE(faultsig == SIGSEGV);
+ ATF_REQUIRE(faultaddr == p);
+ set_keys(); /* PKRU is not restored by siglongjmp? */
+ return (false);
+ }
+}
+
+ATF_TC_WITHOUT_HEAD(largepage_pkru);
+ATF_TC_BODY(largepage_pkru, tc)
+{
+ size_t ps[MAXPAGESIZES];
+ struct sigaction sa;
+ char *addr, *addr1;
+ int error, fd, pscnt;
+
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = sigsegv;
+ sa.sa_flags = SA_SIGINFO;
+ sigemptyset(&sa.sa_mask);
+ error = sigaction(SIGSEGV, &sa, NULL);
+ ATF_REQUIRE(error == 0);
+
+ pscnt = pagesizes(ps);
+
+ for (int i = 1; i < pscnt; i++) {
+ uint64_t val;
+
+ fd = shm_open_large(i, SHM_LARGEPAGE_ALLOC_DEFAULT, ps[i]);
+ addr = mmap(NULL, ps[i], PROT_READ | PROT_WRITE, MAP_SHARED, fd,
+ 0);
+ ATF_REQUIRE_MSG(addr != MAP_FAILED,
+ "mmap(%zu bytes) failed; error=%d", ps[i], errno);
+
+ /*
+ * Ensure that the page is faulted into the pmap.
+ */
+ memset(addr, 0, ps[i]);
+
+ set_keys();
+
+ /*
+ * Make sure we can't partially cover a largepage mapping.
+ */
+ error = x86_pkru_protect_range(addr, PAGE_SIZE, KEY_RW, 0);
+ ATF_REQUIRE_ERRNO(EINVAL, error != 0);
+ error = x86_pkru_protect_range(addr, ps[i] - PAGE_SIZE, KEY_RW,
+ 0);
+ ATF_REQUIRE_ERRNO(EINVAL, error != 0);
+ error = x86_pkru_protect_range(addr + PAGE_SIZE, ps[i] - PAGE_SIZE,
+ KEY_RW, 0);
+ ATF_REQUIRE_ERRNO(EINVAL, error != 0);
+ error = x86_pkru_protect_range(addr + 1, ps[i], KEY_RW, 0);
+ ATF_REQUIRE_ERRNO(EINVAL, error != 0);
+
+ /*
+ * Make sure that protections are honoured.
+ */
+ for (int j = 1; j <= 4; j++) {
+ volatile uint64_t *addr64;
+
+ error = x86_pkru_protect_range(addr, ps[i], 0, 0);
+ ATF_REQUIRE(error == 0);
+
+ addr64 = (volatile uint64_t *)(void *)addr;
+ *addr64 = VAL;
+
+ error = x86_pkru_protect_range(addr, ps[i], j, 0);
+ ATF_REQUIRE(error == 0);
+ switch (j) {
+ case KEY_RW:
+ ATF_REQUIRE(try_write(addr64, VAL));
+ ATF_REQUIRE(try_read(addr64, &val));
+ ATF_REQUIRE(val == VAL);
+ break;
+ case KEY_RO:
+ ATF_REQUIRE(try_read(addr64, &val));
+ ATF_REQUIRE(val == VAL);
+ ATF_REQUIRE(!try_write(addr64, VAL));
+ break;
+ case KEY_WO:
+ /* !access implies !modify */
+ case KEY_NO:
+ ATF_REQUIRE(!try_read(addr64, &val));
+ ATF_REQUIRE(!try_write(addr64, VAL));
+ break;
+ default:
+ __unreachable();
+ }
+ }
+ error = munmap(addr, ps[i]);
+ ATF_CHECK(error == 0);
+
+ /*
+ * Try mapping a large page in a region partially covered by a
+ * key.
+ *
+ * Rather than detecting the mismatch when the logical mapping
+ * is created, we currently only fail once pmap_enter() is
+ * called from the fault handler. This is not ideal and might
+ * be improved in the future.
+ */
+ error = x86_pkru_protect_range(addr, ps[i], 0, 0);
+ ATF_REQUIRE(error == 0);
+ error = x86_pkru_protect_range(addr + PAGE_SIZE,
+ ps[i] - PAGE_SIZE, KEY_RW, 0);
+ ATF_REQUIRE(error == 0);
+
+ addr1 = mmap(addr, ps[i], PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, fd, 0);
+ ATF_REQUIRE(addr1 != MAP_FAILED);
+ ATF_REQUIRE(addr == addr1);
+ ATF_REQUIRE(!try_read((volatile uint64_t *)(void *)addr, &val));
+ ATF_REQUIRE(!try_write((volatile uint64_t *)(void *)addr, VAL));
+ }
+}
+#undef KEY_RW
+#undef KEY_RO
+#undef KEY_WO
+#undef KEY_NO
+#endif
+
ATF_TC_WITHOUT_HEAD(largepage_reopen);
ATF_TC_BODY(largepage_reopen, tc)
{
@@ -1991,6 +2175,9 @@ ATF_TP_ADD_TCS(tp)
ATF_TP_ADD_TC(tp, largepage_mprotect);
ATF_TP_ADD_TC(tp, largepage_minherit);
ATF_TP_ADD_TC(tp, largepage_pipe);
+#ifdef __amd64__
+ ATF_TP_ADD_TC(tp, largepage_pkru);
+#endif
ATF_TP_ADD_TC(tp, largepage_reopen);
return (atf_no_error());