git: a340b5b4bd48 - stable/13 - linux(4); Almost complete the vDSO.

From: Dmitry Chagin <dchagin_at_FreeBSD.org>
Date: Fri, 17 Jun 2022 19:36:53 UTC
The branch stable/13 has been updated by dchagin:

URL: https://cgit.FreeBSD.org/src/commit/?id=a340b5b4bd4814ad2010c5e7bfaa51082427c4ae

commit a340b5b4bd4814ad2010c5e7bfaa51082427c4ae
Author:     Dmitry Chagin <dchagin@FreeBSD.org>
AuthorDate: 2022-06-17 19:33:07 +0000
Commit:     Dmitry Chagin <dchagin@FreeBSD.org>
CommitDate: 2022-06-17 19:33:07 +0000

    linux(4); Almost complete the vDSO.
    
    The vDSO (virtual dynamic shared object) is a small shared library that the
    kernel maps R/O into the address space of all Linux processes on image
    activation. The vDSO is a fully formed ELF image, shared by all processes
    with the same ABI, and has no process-private data.
    
    The primary purpose of the vDSO:
    - non-executable stack, signal trampolines not copied to the stack;
    - signal trampolines unwind, mandatory for the NPTL;
    - to avoid context-switch overhead, frequently used system calls can be
      implemented in the vDSO: for now gettimeofday, clock_gettime.
    
    The first two have been implemented, so add the implementation of system
    calls.
    
    The system call implementation is based on the native timekeeping code,
    with some limitations:
    - ifunc can't be used, as the vDSO is mapped r/o into the process VA and
      rtld can't relocate symbols;
    - reading HPET memory is not implemented for now (TODO).
    
    In case of any error, the vDSO system calls fall back to the kernel system
    calls. For unimplemented vDSO system calls, prototypes are added which call
    the corresponding kernel system call.
    
    Relnotes:               yes
    Tested by:              trasz (arm64)
    Differential revision:  https://reviews.freebsd.org/D30900
    MFC after:              2 weeks
    
    (cherry picked from commit 9931033bbfbe56a037723638cf3712366c6d943f)
---
 sys/amd64/linux/linux_locore.asm            |   2 +-
 sys/amd64/linux/linux_sysvec.c              | 161 ++++++++++---
 sys/amd64/linux/linux_vdso.lds.s            |  12 +-
 sys/amd64/linux/linux_vdso_gtod.c           | 146 ++++++++++++
 sys/amd64/linux32/linux32_locore.asm        |   6 +-
 sys/amd64/linux32/linux32_sysvec.c          | 158 ++++++++++---
 sys/amd64/linux32/linux32_vdso.lds.s        |  24 +-
 sys/amd64/linux32/linux32_vdso_gtod.c       | 146 ++++++++++++
 sys/arm64/linux/linux_sysvec.c              | 153 ++++++++++---
 sys/arm64/linux/linux_vdso.lds.s            |  65 +++++-
 sys/arm64/linux/linux_vdso_gtod.c           | 153 +++++++++++++
 sys/compat/linux/linux_vdso.c               | 211 ++++++-----------
 sys/compat/linux/linux_vdso.h               |  10 +-
 sys/compat/linux/linux_vdso_gtod.inc        | 337 ++++++++++++++++++++++++++++
 sys/i386/linux/linux.h                      |   3 -
 sys/i386/linux/linux_locore.asm             |   6 +-
 sys/i386/linux/linux_sysvec.c               | 161 ++++++++++---
 sys/i386/linux/linux_vdso.lds.s             |  25 ++-
 sys/i386/linux/linux_vdso_gtod.c            | 145 ++++++++++++
 sys/modules/linux/Makefile                  |  60 +++--
 sys/modules/linux64/Makefile                |  49 ++--
 sys/x86/linux/linux_vdso_gettc_x86.inc      | 164 ++++++++++++++
 sys/x86/linux/linux_vdso_tsc_selector_x86.c |  57 +++++
 sys/x86/linux/linux_x86.h                   |  33 +++
 24 files changed, 1955 insertions(+), 332 deletions(-)

diff --git a/sys/amd64/linux/linux_locore.asm b/sys/amd64/linux/linux_locore.asm
index 4ac44c35274b..8f7431d42737 100644
--- a/sys/amd64/linux/linux_locore.asm
+++ b/sys/amd64/linux/linux_locore.asm
@@ -17,7 +17,7 @@ linux_platform:
  * To avoid excess stack frame the signal trampoline code emulates
  * the 'call' instruction.
  */
-NON_GPROF_ENTRY(linux_rt_sigcode)
+ENTRY(linux_rt_sigcode)
 	movq	%rsp, %rbx			/* preserve sigframe */
 	call	.getip
 .getip:
diff --git a/sys/amd64/linux/linux_sysvec.c b/sys/amd64/linux/linux_sysvec.c
index bcc8cbf0b0bd..f13526b00d85 100644
--- a/sys/amd64/linux/linux_sysvec.c
+++ b/sys/amd64/linux/linux_sysvec.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
+#include <sys/stddef.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
@@ -72,6 +73,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/specialreg.h>
 #include <machine/trap.h>
 
+#include <x86/linux/linux_x86.h>
 #include <amd64/linux/linux.h>
 #include <amd64/linux/linux_proto.h>
 #include <compat/linux/linux_emul.h>
@@ -85,11 +87,24 @@ __FBSDID("$FreeBSD$");
 
 MODULE_VERSION(linux64, 1);
 
+#define	LINUX_VDSOPAGE_SIZE	PAGE_SIZE * 2
+#define	LINUX_VDSOPAGE_LA48	(VM_MAXUSER_ADDRESS_LA48 - \
+				    LINUX_VDSOPAGE_SIZE)
+#define	LINUX_SHAREDPAGE_LA48	(LINUX_VDSOPAGE_LA48 - PAGE_SIZE)
+				/*
+				 * PAGE_SIZE - the size
+				 * of the native SHAREDPAGE
+				 */
+#define	LINUX_USRSTACK_LA48	LINUX_SHAREDPAGE_LA48
+#define	LINUX_PS_STRINGS_LA48	(LINUX_USRSTACK_LA48 - \
+				    sizeof(struct ps_strings))
+
 static int linux_szsigcode;
-static vm_object_t linux_shared_page_obj;
-static char *linux_shared_page_mapping;
-extern char _binary_linux_locore_o_start;
-extern char _binary_linux_locore_o_end;
+static vm_object_t linux_vdso_obj;
+static char *linux_vdso_mapping;
+extern char _binary_linux_vdso_so_o_start;
+extern char _binary_linux_vdso_so_o_end;
+static vm_offset_t linux_vdso_base;
 
 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
 
@@ -102,10 +117,12 @@ static int	linux_fixup_elf(uintptr_t *stack_base,
 static bool	linux_trans_osrel(const Elf_Note *note, int32_t *osrel);
 static void	linux_vdso_install(void *param);
 static void	linux_vdso_deinstall(void *param);
+static void	linux_vdso_reloc(char *mapping, Elf_Addr offset);
 static void	linux_set_syscall_retval(struct thread *td, int error);
 static int	linux_fetch_syscall_args(struct thread *td);
 static void	linux_exec_setregs(struct thread *td, struct image_params *imgp,
 		    uintptr_t stack);
+static void	linux_exec_sysvec_init(void *param);
 static int	linux_on_exec_vmspace(struct proc *p,
 		    struct image_params *imgp);
 static int	linux_vsyscall(struct thread *td);
@@ -151,6 +168,8 @@ static int _bsd_to_linux_trapcode[] = {
 
 LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode);
 LINUX_VDSO_SYM_CHAR(linux_platform);
+LINUX_VDSO_SYM_INTPTR(kern_timekeep_base);
+LINUX_VDSO_SYM_INTPTR(kern_tsc_selector);
 
 /*
  * If FreeBSD & Linux have a difference of opinion about what a trap
@@ -263,8 +282,7 @@ linux_copyout_auxargs(struct image_params *imgp, uintptr_t base)
 	    M_WAITOK | M_ZERO);
 
 	issetugid = p->p_flag & P_SUGID ? 1 : 0;
-	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
-	    imgp->proc->p_sysent->sv_shared_page_base);
+	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base);
 	AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature);
 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
 	AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz);
@@ -732,7 +750,7 @@ struct sysentvec elf_linux_sysvec = {
 	.sv_transtrap	= linux_translate_traps,
 	.sv_fixup	= linux_fixup_elf,
 	.sv_sendsig	= linux_rt_sendsig,
-	.sv_sigcode	= &_binary_linux_locore_o_start,
+	.sv_sigcode	= &_binary_linux_vdso_so_o_start,
 	.sv_szsigcode	= &linux_szsigcode,
 	.sv_name	= "Linux ELF64",
 	.sv_coredump	= elf64_coredump,
@@ -743,8 +761,8 @@ struct sysentvec elf_linux_sysvec = {
 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
 	.sv_minuser	= VM_MIN_ADDRESS,
 	.sv_maxuser	= VM_MAXUSER_ADDRESS_LA48,
-	.sv_usrstack	= USRSTACK_LA48,
-	.sv_psstrings	= PS_STRINGS_LA48,
+	.sv_usrstack	= LINUX_USRSTACK_LA48,
+	.sv_psstrings	= LINUX_PS_STRINGS_LA48,
 	.sv_psstringssz	= sizeof(struct ps_strings),
 	.sv_stackprot	= VM_PROT_ALL,
 	.sv_copyout_auxargs = linux_copyout_auxargs,
@@ -753,11 +771,11 @@ struct sysentvec elf_linux_sysvec = {
 	.sv_fixlimit	= NULL,
 	.sv_maxssiz	= NULL,
 	.sv_flags	= SV_ABI_LINUX | SV_LP64 | SV_SHP | SV_SIG_DISCIGN |
-	    SV_SIG_WAITNDQ,
+	    SV_SIG_WAITNDQ | SV_TIMEKEEP,
 	.sv_set_syscall_retval = linux_set_syscall_retval,
 	.sv_fetch_syscall_args = linux_fetch_syscall_args,
 	.sv_syscallnames = NULL,
-	.sv_shared_page_base = SHAREDPAGE_LA48,
+	.sv_shared_page_base = LINUX_SHAREDPAGE_LA48,
 	.sv_shared_page_len = PAGE_SIZE,
 	.sv_schedtail	= linux_schedtail,
 	.sv_thread_detach = linux_thread_detach,
@@ -771,47 +789,130 @@ struct sysentvec elf_linux_sysvec = {
 static int
 linux_on_exec_vmspace(struct proc *p, struct image_params *imgp)
 {
+	int error;
 
-	linux_on_exec(p, imgp);
-	return (0);
+	error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base,
+	    LINUX_VDSOPAGE_SIZE, imgp);
+	if (error == 0)
+		linux_on_exec(p, imgp);
+	return (error);
 }
 
 static void
-linux_vdso_install(void *param)
+linux_exec_sysvec_init(void *param)
 {
+	l_uintptr_t *ktimekeep_base, *ktsc_selector;
+	struct sysentvec *sv;
+	ptrdiff_t tkoff;
+
+	sv = param;
+	amd64_lower_shared_page(sv);
+	/* Fill timekeep_base */
+	exec_sysvec_init(sv);
+
+	tkoff = kern_timekeep_base - linux_vdso_base;
+	ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+	*ktimekeep_base = sv->sv_timekeep_base;
+
+	tkoff = kern_tsc_selector - linux_vdso_base;
+	ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+	*ktsc_selector = linux_vdso_tsc_selector_idx();
+	if (bootverbose)
+		printf("Linux x86-64 vDSO tsc_selector: %lu\n", *ktsc_selector);
+}
+SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY,
+    linux_exec_sysvec_init, &elf_linux_sysvec);
 
-	amd64_lower_shared_page(&elf_linux_sysvec);
-
-	linux_szsigcode = (&_binary_linux_locore_o_end -
-	    &_binary_linux_locore_o_start);
+static void
+linux_vdso_install(void *param)
+{
+	char *vdso_start = &_binary_linux_vdso_so_o_start;
+	char *vdso_end = &_binary_linux_vdso_so_o_end;
 
-	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
-		panic("Linux invalid vdso size\n");
+	linux_szsigcode = vdso_end - vdso_start;
+	MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE);
 
-	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
+	linux_vdso_base = LINUX_VDSOPAGE_LA48;
+	if (hw_lower_amd64_sharedpage != 0)
+		linux_vdso_base -= PAGE_SIZE;
 
-	linux_shared_page_obj = __elfN(linux_shared_page_init)
-	    (&linux_shared_page_mapping);
+	__elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base);
 
-	__elfN(linux_vdso_reloc)(&elf_linux_sysvec);
+	linux_vdso_obj = __elfN(linux_shared_page_init)
+	    (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
+	bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode);
 
-	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
-	    linux_szsigcode);
-	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
+	linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base);
 }
-SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
+SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST,
     linux_vdso_install, NULL);
 
 static void
 linux_vdso_deinstall(void *param)
 {
 
-	__elfN(linux_shared_page_fini)(linux_shared_page_obj,
-	    linux_shared_page_mapping);
+	__elfN(linux_shared_page_fini)(linux_vdso_obj,
+	    linux_vdso_mapping, LINUX_VDSOPAGE_SIZE);
 }
 SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
     linux_vdso_deinstall, NULL);
 
+static void
+linux_vdso_reloc(char *mapping, Elf_Addr offset)
+{
+	const Elf_Ehdr *ehdr;
+	const Elf_Shdr *shdr;
+	Elf64_Addr *where, val;
+	Elf_Size rtype, symidx;
+	const Elf_Rela *rela;
+	Elf_Addr addr, addend;
+	int relacnt;
+	int i, j;
+
+	MPASS(offset != 0);
+
+	relacnt = 0;
+	ehdr = (const Elf_Ehdr *)mapping;
+	shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff);
+	for (i = 0; i < ehdr->e_shnum; i++)
+	{
+		switch (shdr[i].sh_type) {
+		case SHT_REL:
+			printf("Linux x86_64 vDSO: unexpected Rel section\n");
+			break;
+		case SHT_RELA:
+			rela = (const Elf_Rela *)(mapping + shdr[i].sh_offset);
+			relacnt = shdr[i].sh_size / sizeof(*rela);
+		}
+	}
+
+	for (j = 0; j < relacnt; j++, rela++) {
+		where = (Elf_Addr *)(mapping + rela->r_offset);
+		addend = rela->r_addend;
+		rtype = ELF_R_TYPE(rela->r_info);
+		symidx = ELF_R_SYM(rela->r_info);
+
+		switch (rtype) {
+		case R_X86_64_NONE:	/* none */
+			break;
+
+		case R_X86_64_RELATIVE:	/* B + A */
+			addr = (Elf_Addr)(offset + addend);
+			val = addr;
+			if (*where != val)
+				*where = val;
+			break;
+		case R_X86_64_IRELATIVE:
+			printf("Linux x86_64 vDSO: unexpected ifunc relocation, "
+			    "symbol index %ld\n", symidx);
+			break;
+		default:
+			printf("Linux x86_64 vDSO: unexpected relocation type %ld, "
+			    "symbol index %ld\n", rtype, symidx);
+		}
+	}
+}
+
 static char GNULINUX_ABI_VENDOR[] = "GNU";
 static int GNULINUX_ABI_DESC = 0;
 
diff --git a/sys/amd64/linux/linux_vdso.lds.s b/sys/amd64/linux/linux_vdso.lds.s
index 94f0266095fb..ccf7c80565bb 100644
--- a/sys/amd64/linux/linux_vdso.lds.s
+++ b/sys/amd64/linux/linux_vdso.lds.s
@@ -54,16 +54,20 @@ VERSION
 {
 	LINUX_2.6 {
 	global:
-		time;
 		__vdso_time;
-		gettimeofday;
 		__vdso_gettimeofday;
-		getcpu;
 		__vdso_getcpu;
-		clock_gettime;
 		__vdso_clock_gettime;
+		__vdso_clock_getres;
+	local: *;
+	};
+
+	LINUX_0.0 {
+	global:
 		linux_rt_sigcode;
 		linux_platform;
+		kern_timekeep_base;
+		kern_tsc_selector;
 	local: *;
 	};
 }
diff --git a/sys/amd64/linux/linux_vdso_gtod.c b/sys/amd64/linux/linux_vdso_gtod.c
new file mode 100644
index 000000000000..ad23dc33575a
--- /dev/null
+++ b/sys/amd64/linux/linux_vdso_gtod.c
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/elf.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stddef.h>
+#define	_KERNEL
+#include <sys/vdso.h>
+#undef	_KERNEL
+#include <stdbool.h>
+#include <strings.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <amd64/linux/linux.h>
+#include <amd64/linux/linux_syscall.h>
+#include <compat/linux/linux_errno.h>
+#include <compat/linux/linux_timer.h>
+
+/* The kernel fixes these up at vDSO install */
+uintptr_t *kern_timekeep_base = NULL;
+uint32_t kern_tsc_selector = 0;
+
+#include <x86/linux/linux_vdso_gettc_x86.inc>
+
+/* for debug purpose */
+static int
+write(int fd, const void *buf, size_t size)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "syscall"
+	    : "=a"(res)
+	    : "a"(LINUX_SYS_write), "D"(fd), "S"(buf), "d"(size)
+	    : "cc", "rcx", "r11", "memory"
+	);
+	return (res);
+}
+
+static int
+__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "syscall"
+	    : "=a"(res)
+	    : "a"(LINUX_SYS_linux_clock_gettime), "D"(clock_id), "S"(ts)
+	    : "cc", "rcx", "r11", "memory"
+	);
+	return (res);
+}
+
+static int
+__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "syscall"
+	    : "=a"(res)
+	    : "a"(LINUX_SYS_gettimeofday), "D"(tv), "S"(tz)
+	    : "cc", "rcx", "r11", "memory"
+	);
+	return (res);
+}
+
+static int
+__vdso_clock_getres_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "syscall"
+	    : "=a"(res)
+	    : "a"(LINUX_SYS_linux_clock_getres), "D"(clock_id), "S"(ts)
+	    : "cc", "rcx", "r11", "memory"
+	);
+	return (res);
+}
+
+static int
+__vdso_getcpu_fallback(uint32_t *cpu, uint32_t *node, void *cache)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "syscall"
+	    : "=a"(res)
+	    : "a"(LINUX_SYS_linux_getcpu), "D"(cpu), "S"(node), "d"(cache)
+	    : "cc", "rcx", "r11", "memory"
+	);
+	return (res);
+}
+
+static int
+__vdso_time_fallback(long *tm)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "syscall"
+	    : "=a"(res)
+	    : "a"(LINUX_SYS_linux_time), "D"(tm)
+	    : "cc", "rcx", "r11", "memory"
+	);
+	return (res);
+}
+
+#include <compat/linux/linux_vdso_gtod.inc>
diff --git a/sys/amd64/linux32/linux32_locore.asm b/sys/amd64/linux32/linux32_locore.asm
index 5862f0a0d674..f96b3e730f9f 100644
--- a/sys/amd64/linux32/linux32_locore.asm
+++ b/sys/amd64/linux32/linux32_locore.asm
@@ -18,7 +18,7 @@ linux_platform:
  * To avoid excess stack frame the signal trampoline code emulates
  * the 'call' instruction.
  */
-NON_GPROF_ENTRY(linux32_sigcode)
+ENTRY(__kernel_sigreturn)
 	movl	%esp, %ebx			/* preserve sigframe */
 	call .getip0
 .getip0:
@@ -33,7 +33,7 @@ NON_GPROF_ENTRY(linux32_sigcode)
 .endsigcode:
 0:	jmp	0b
 
-NON_GPROF_ENTRY(linux32_rt_sigcode)
+ENTRY(__kernel_rt_sigreturn)
 	leal	LINUX_RT_SIGF_UC(%esp),%ebx	/* linux ucp */
 	leal	LINUX_RT_SIGF_SC(%ebx),%ecx	/* linux sigcontext */
 	movl	%esp, %edi
@@ -49,7 +49,7 @@ NON_GPROF_ENTRY(linux32_rt_sigcode)
 .endrtsigcode:
 0:	jmp	0b
 
-NON_GPROF_ENTRY(linux32_vsyscall)
+ENTRY(__kernel_vsyscall)
 .startvsyscall:
 	int $0x80
 	ret
diff --git a/sys/amd64/linux32/linux32_sysvec.c b/sys/amd64/linux32/linux32_sysvec.c
index 2a3fde78852d..10f616c56510 100644
--- a/sys/amd64/linux32/linux32_sysvec.c
+++ b/sys/amd64/linux32/linux32_sysvec.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
+#include <sys/stddef.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
@@ -78,6 +79,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/specialreg.h>
 #include <machine/trap.h>
 
+#include <x86/linux/linux_x86.h>
 #include <amd64/linux32/linux.h>
 #include <amd64/linux32/linux32_proto.h>
 #include <compat/linux/linux_emul.h>
@@ -91,14 +93,21 @@ __FBSDID("$FreeBSD$");
 MODULE_VERSION(linux, 1);
 
 #define	LINUX32_MAXUSER		((1ul << 32) - PAGE_SIZE)
-#define	LINUX32_SHAREDPAGE	(LINUX32_MAXUSER - PAGE_SIZE)
+#define	LINUX32_VDSOPAGE_SIZE	PAGE_SIZE * 2
+#define	LINUX32_VDSOPAGE	(LINUX32_MAXUSER - LINUX32_VDSOPAGE_SIZE)
+#define	LINUX32_SHAREDPAGE	(LINUX32_VDSOPAGE - PAGE_SIZE)
+				/*
+				 * PAGE_SIZE - the size
+				 * of the native SHAREDPAGE
+				 */
 #define	LINUX32_USRSTACK	LINUX32_SHAREDPAGE
 
 static int linux_szsigcode;
-static vm_object_t linux_shared_page_obj;
-static char *linux_shared_page_mapping;
-extern char _binary_linux32_locore_o_start;
-extern char _binary_linux32_locore_o_end;
+static vm_object_t linux_vdso_obj;
+static char *linux_vdso_mapping;
+extern char _binary_linux32_vdso_so_o_start;
+extern char _binary_linux32_vdso_so_o_end;
+static vm_offset_t linux_vdso_base;
 
 extern struct sysent linux32_sysent[LINUX32_SYS_MAXSYSCALL];
 
@@ -111,12 +120,14 @@ static int	linux_copyout_strings(struct image_params *imgp,
 static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
 static void	linux_exec_setregs(struct thread *td,
 				   struct image_params *imgp, uintptr_t stack);
+static void	linux_exec_sysvec_init(void *param);
 static int	linux_on_exec_vmspace(struct proc *p,
 		    struct image_params *imgp);
 static void	linux32_fixlimit(struct rlimit *rl, int which);
 static bool	linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
 static void	linux_vdso_install(void *param);
 static void	linux_vdso_deinstall(void *param);
+static void	linux_vdso_reloc(char *mapping, Elf_Addr offset);
 static void	linux32_set_syscall_retval(struct thread *td, int error);
 
 #define LINUX_T_UNKNOWN  255
@@ -167,9 +178,11 @@ struct linux32_ps_strings {
 #define	LINUX32_PS_STRINGS	(LINUX32_USRSTACK - \
 				    sizeof(struct linux32_ps_strings))
 
-LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
-LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
-LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
+LINUX_VDSO_SYM_INTPTR(__kernel_vsyscall);
+LINUX_VDSO_SYM_INTPTR(__kernel_sigreturn);
+LINUX_VDSO_SYM_INTPTR(__kernel_rt_sigreturn);
+LINUX_VDSO_SYM_INTPTR(kern_timekeep_base);
+LINUX_VDSO_SYM_INTPTR(kern_tsc_selector);
 LINUX_VDSO_SYM_CHAR(linux_platform);
 
 /*
@@ -206,9 +219,8 @@ linux_copyout_auxargs(struct image_params *imgp, uintptr_t base)
 	    M_WAITOK | M_ZERO);
 
 	issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0;
-	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
-	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
-	    imgp->proc->p_sysent->sv_shared_page_base);
+	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall);
+	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base);
 	AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature);
 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
 
@@ -354,7 +366,7 @@ linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 
 	/* Build context to run handler in. */
 	regs->tf_rsp = PTROUT(fp);
-	regs->tf_rip = linux32_rt_sigcode;
+	regs->tf_rip = __kernel_rt_sigreturn;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
@@ -460,7 +472,7 @@ linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 
 	/* Build context to run handler in. */
 	regs->tf_rsp = PTROUT(fp);
-	regs->tf_rip = linux32_sigcode;
+	regs->tf_rip = __kernel_sigreturn;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
@@ -901,7 +913,7 @@ struct sysentvec elf_linux_sysvec = {
 	.sv_transtrap	= linux_translate_traps,
 	.sv_fixup	= linux_fixup_elf,
 	.sv_sendsig	= linux_sendsig,
-	.sv_sigcode	= &_binary_linux32_locore_o_start,
+	.sv_sigcode	= &_binary_linux32_vdso_so_o_start,
 	.sv_szsigcode	= &linux_szsigcode,
 	.sv_name	= "Linux ELF32",
 	.sv_coredump	= elf32_coredump,
@@ -922,7 +934,7 @@ struct sysentvec elf_linux_sysvec = {
 	.sv_fixlimit	= linux32_fixlimit,
 	.sv_maxssiz	= &linux32_maxssiz,
 	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP |
-	    SV_SIG_DISCIGN | SV_SIG_WAITNDQ,
+	    SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP,
 	.sv_set_syscall_retval = linux32_set_syscall_retval,
 	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
 	.sv_syscallnames = NULL,
@@ -940,45 +952,127 @@ struct sysentvec elf_linux_sysvec = {
 static int
 linux_on_exec_vmspace(struct proc *p, struct image_params *imgp)
 {
+	int error;
 
-	linux_on_exec(p, imgp);
-	return (0);
+	error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base,
+	    LINUX32_VDSOPAGE_SIZE, imgp);
+	if (error == 0)
+		linux_on_exec(p, imgp);
+	return (error);
 }
 
 static void
-linux_vdso_install(void *param)
+linux_exec_sysvec_init(void *param)
 {
+	l_uintptr_t *ktimekeep_base, *ktsc_selector;
+	struct sysentvec *sv;
+	ptrdiff_t tkoff;
+
+	sv = param;
+	/* Fill timekeep_base */
+	exec_sysvec_init(sv);
+
+	tkoff = kern_timekeep_base - linux_vdso_base;
+	ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+	*ktimekeep_base = sv->sv_timekeep_base;
+
+	tkoff = kern_tsc_selector - linux_vdso_base;
+	ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff);
+	*ktsc_selector = linux_vdso_tsc_selector_idx();
+	if (bootverbose)
+		printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector);
+}
+SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC, SI_ORDER_ANY,
+    linux_exec_sysvec_init, &elf_linux_sysvec);
 
-	linux_szsigcode = (&_binary_linux32_locore_o_end -
-	    &_binary_linux32_locore_o_start);
+static void
+linux_vdso_install(void *param)
+{
+	char *vdso_start = &_binary_linux32_vdso_so_o_start;
+	char *vdso_end = &_binary_linux32_vdso_so_o_end;
 
-	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
-		panic("Linux invalid vdso size\n");
+	linux_szsigcode = vdso_end - vdso_start;
+	MPASS(linux_szsigcode <= LINUX32_VDSOPAGE_SIZE);
 
-	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
+	linux_vdso_base = LINUX32_VDSOPAGE;
 
-	linux_shared_page_obj = __elfN(linux_shared_page_init)
-	    (&linux_shared_page_mapping);
+	__elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base);
 
-	__elfN(linux_vdso_reloc)(&elf_linux_sysvec);
+	linux_vdso_obj = __elfN(linux_shared_page_init)
+	    (&linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE);
+	bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode);
 
-	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
-	    linux_szsigcode);
-	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
+	linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base);
 }
-SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
+SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_FIRST,
     linux_vdso_install, NULL);
 
 static void
 linux_vdso_deinstall(void *param)
 {
 
-	__elfN(linux_shared_page_fini)(linux_shared_page_obj,
-	    linux_shared_page_mapping);
+	__elfN(linux_shared_page_fini)(linux_vdso_obj,
+	    linux_vdso_mapping, LINUX32_VDSOPAGE_SIZE);
 }
 SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
     linux_vdso_deinstall, NULL);
 
+static void
+linux_vdso_reloc(char *mapping, Elf_Addr offset)
+{
+	const Elf_Shdr *shdr;
+	const Elf_Rel *rel;
+	const Elf_Ehdr *ehdr;
+	Elf32_Addr *where;
+	Elf_Size rtype, symidx;
+	Elf32_Addr addr, addend;
+	int i, relcnt;
+
+	MPASS(offset != 0);
+
+	relcnt = 0;
+	ehdr = (const Elf_Ehdr *)mapping;
+	shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff);
+	for (i = 0; i < ehdr->e_shnum; i++)
+	{
+		switch (shdr[i].sh_type) {
+		case SHT_REL:
+			rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset);
+			relcnt = shdr[i].sh_size / sizeof(*rel);
+			break;
+		case SHT_RELA:
+			printf("Linux i386 vDSO: unexpected Rela section\n");
+			break;
+		}
+	}
+
+	for (i = 0; i < relcnt; i++, rel++) {
+		where = (Elf32_Addr *)(mapping + rel->r_offset);
+		addend = *where;
+		rtype = ELF_R_TYPE(rel->r_info);
+		symidx = ELF_R_SYM(rel->r_info);
+
+		switch (rtype) {
+		case R_386_NONE:	/* none */
+			break;
+
+		case R_386_RELATIVE:	/* B + A */
+			addr = (Elf32_Addr)PTROUT(offset + addend);
+			if (*where != addr)
+				*where = addr;
+			break;
+
+		case R_386_IRELATIVE:
+			printf("Linux i386 vDSO: unexpected ifunc relocation, "
+			    "symbol index %ld\n", (intmax_t)symidx);
+			break;
+		default:
+			printf("Linux i386 vDSO: unexpected relocation type %ld, "
+			    "symbol index %ld\n", (intmax_t)rtype, (intmax_t)symidx);
+		}
+	}
+}
+
 static char GNU_ABI_VENDOR[] = "GNU";
 static int GNULINUX_ABI_DESC = 0;
 
diff --git a/sys/amd64/linux32/linux32_vdso.lds.s b/sys/amd64/linux32/linux32_vdso.lds.s
index a49c209a1ebc..0a392e6380b6 100644
--- a/sys/amd64/linux32/linux32_vdso.lds.s
+++ b/sys/amd64/linux32/linux32_vdso.lds.s
@@ -51,16 +51,30 @@ PHDRS
 	eh_frame_hdr	PT_GNU_EH_FRAME;
 }
 
-ENTRY(linux32_vsyscall);
-
 VERSION
 {
+	LINUX_2.6 {
+	global:
+		__vdso_clock_gettime;
+		__vdso_gettimeofday;
+		__vdso_time;
+		__vdso_clock_getres;
+		__vdso_clock_gettime64;
+	};
+
 	LINUX_2.5 {
 	global:
-		linux32_vsyscall;
-		linux32_sigcode;
-		linux32_rt_sigcode;
+		__kernel_vsyscall;
+		__kernel_sigreturn;
+		__kernel_rt_sigreturn;
+	local: *;
+	};
+
+	LINUX_0.0 {
+	global:
 		linux_platform;
+		kern_timekeep_base;
+		kern_tsc_selector;
 	local: *;
 	};
 }
diff --git a/sys/amd64/linux32/linux32_vdso_gtod.c b/sys/amd64/linux32/linux32_vdso_gtod.c
new file mode 100644
index 000000000000..f1573ca3c1b1
--- /dev/null
+++ b/sys/amd64/linux32/linux32_vdso_gtod.c
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Dmitry Chagin <dchagin@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/elf.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stddef.h>
+#define	_KERNEL
+#include <sys/vdso.h>
+#undef	_KERNEL
+#include <stdbool.h>
+#include <strings.h>
+
+#include <machine/atomic.h>
+#include <machine/stdarg.h>
+
+#include <amd64/linux32/linux.h>
+#include <amd64/linux32/linux32_syscall.h>
+#include <compat/linux/linux_errno.h>
+#include <compat/linux/linux_timer.h>
+
+/* The kernel fixes these up at vDSO install */
+uintptr_t *kern_timekeep_base = NULL;
+uint32_t kern_tsc_selector = 0;
+
+#include <x86/linux/linux_vdso_gettc_x86.inc>
+
+static int
+write(int fd, const void *buf, size_t size)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "int $0x80"
+	    : "=a"(res)
+	    : "a"(LINUX32_SYS_write), "b"(fd), "c"(buf), "d"(size)
+	    : "cc", "memory"
+	);
+	return (res);
+}
+
+static int
+__vdso_clock_gettime_fallback(clockid_t clock_id, struct l_timespec *ts)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "int $0x80"
+	    : "=a"(res)
+	    : "a"(LINUX32_SYS_linux_clock_gettime), "b"(clock_id), "c"(ts)
+	    : "cc", "memory"
+	);
+	return (res);
+}
+
+static int
+__vdso_clock_gettime64_fallback(clockid_t clock_id, struct l_timespec64 *ts)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "int $0x80"
+	    : "=a"(res)
+	    : "a"(LINUX32_SYS_linux_clock_gettime64), "b"(clock_id), "c"(ts)
+	    : "cc", "memory"
+	);
+	return (res);
+}
+
+static int
+__vdso_gettimeofday_fallback(l_timeval *tv, struct timezone *tz)
+{
+	int res;
+
+	__asm__ __volatile__
+	(
+	    "int $0x80"
+	    : "=a"(res)
+	    : "a"(LINUX32_SYS_linux_gettimeofday), "b"(tv), "c"(tz)
+	    : "cc", "memory"
+	);
*** 2174 LINES SKIPPED ***