git: b27e30232927 - stable/13 - linux(4): Implement clone3 system call.

From: Dmitry Chagin <dchagin_at_FreeBSD.org>
Date: Fri, 17 Jun 2022 19:38:02 UTC
The branch stable/13 has been updated by dchagin:

URL: https://cgit.FreeBSD.org/src/commit/?id=b27e30232927dffd86a4498aa418f798a19cc3b0

commit b27e30232927dffd86a4498aa418f798a19cc3b0
Author:     Dmitry Chagin <dchagin@FreeBSD.org>
AuthorDate: 2021-08-12 08:49:36 +0000
Commit:     Dmitry Chagin <dchagin@FreeBSD.org>
CommitDate: 2022-06-17 19:33:30 +0000

    linux(4): Implement clone3 system call.
    
    clone3 system call is used by glibc-2.34.
    
    Differential revision:  https://reviews.freebsd.org/D31475
    MFC after:              2 weeks
    
    (cherry picked from commit 17913b0b6b707568d63559255820f3212cd31cdf)
---
 sys/amd64/linux/syscalls.master   |  5 ++-
 sys/amd64/linux32/syscalls.master |  5 ++-
 sys/arm64/linux/syscalls.master   |  5 ++-
 sys/compat/linux/linux_fork.c     | 80 +++++++++++++++++++++++++++++++++++++++
 sys/compat/linux/linux_fork.h     |  9 +++++
 sys/compat/linux/linux_misc.h     |  2 +
 sys/i386/linux/syscalls.master    |  5 ++-
 7 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/sys/amd64/linux/syscalls.master b/sys/amd64/linux/syscalls.master
index cdf663ce2e06..d3ebedbfed01 100644
--- a/sys/amd64/linux/syscalls.master
+++ b/sys/amd64/linux/syscalls.master
@@ -2082,7 +2082,10 @@
 		int linux_pidfd_open(void);
 	}
 435    AUE_NULL		STD {
-		int linux_clone3(void);
+		int linux_clone3(
+		    struct l_user_clone_args *uargs,
+		    l_size_t usize
+		);
 	}
 436    AUE_NULL		STD {
 		int linux_close_range(void);
diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master
index ff7ab7f98ca8..9d55fb1ade48 100644
--- a/sys/amd64/linux32/syscalls.master
+++ b/sys/amd64/linux32/syscalls.master
@@ -2484,7 +2484,10 @@
 		int linux_pidfd_open(void);
 	}
 435    AUE_NULL		STD {
-		int linux_clone3(void);
+		int linux_clone3(
+		    struct l_user_clone_args *uargs,
+		    l_size_t usize
+		);
 	}
 436    AUE_NULL		STD {
 		int linux_close_range(void);
diff --git a/sys/arm64/linux/syscalls.master b/sys/arm64/linux/syscalls.master
index 6e163cc3360d..a6bb14a5ed63 100644
--- a/sys/arm64/linux/syscalls.master
+++ b/sys/arm64/linux/syscalls.master
@@ -1731,7 +1731,10 @@
 		int linux_pidfd_open(void);
 	}
 435    AUE_NULL		STD {
-		int linux_clone3(void);
+		int linux_clone3(
+		    struct l_user_clone_args *uargs,
+		    l_size_t usize
+		);
 	}
 436    AUE_NULL		STD {
 		int linux_close_range(void);
diff --git a/sys/compat/linux/linux_fork.c b/sys/compat/linux/linux_fork.c
index 97f5b7d89de4..db3e9e1ea27b 100644
--- a/sys/compat/linux/linux_fork.c
+++ b/sys/compat/linux/linux_fork.c
@@ -377,6 +377,86 @@ linux_clone(struct thread *td, struct linux_clone_args *args)
 		return (linux_clone_proc(td, &ca));
 }
 
+/* Check copied-in clone3 arguments; returns 0 if valid, EINVAL otherwise. */
+static int
+linux_clone3_args_valid(struct l_user_clone_args *uca)
+{
+
+	/* Reject unknown flags, CLONE_DETACHED, and any CSIGNAL bits. */
+	if ((uca->flags & ~(LINUX_CLONE_LEGACY_FLAGS |
+	    LINUX_CLONE_CLEAR_SIGHAND | LINUX_CLONE_INTO_CGROUP)) != 0)
+		return (EINVAL);
+	if ((uca->flags & (LINUX_CLONE_DETACHED | LINUX_CSIGNAL)) != 0)
+		return (EINVAL);
+	/* No SIGHAND+CLEAR_SIGHAND; no exit_signal with THREAD or PARENT. */
+	if ((uca->flags & (LINUX_CLONE_SIGHAND | LINUX_CLONE_CLEAR_SIGHAND)) ==
+	    (LINUX_CLONE_SIGHAND | LINUX_CLONE_CLEAR_SIGHAND))
+		return (EINVAL);
+	if ((uca->flags & (LINUX_CLONE_THREAD | LINUX_CLONE_PARENT)) != 0 &&
+	    uca->exit_signal != 0)
+		return (EINVAL);
+
+	/* set_tid is not supported; only its pointer and size are validated. */
+	if (uca->set_tid_size > LINUX_MAX_PID_NS_LEVEL)
+		return (EINVAL);
+	if (uca->set_tid == 0 && uca->set_tid_size > 0)
+		return (EINVAL);
+	if (uca->set_tid != 0 && uca->set_tid_size == 0)
+		return (EINVAL);
+	/* stack and stack_size must be both zero or both non-zero. */
+	if (uca->stack == 0 && uca->stack_size > 0)
+		return (EINVAL);
+	if (uca->stack != 0 && uca->stack_size == 0)
+		return (EINVAL);
+
+	return (0);
+}
+/* clone3: copy in and check uargs, then call linux_clone_{thread,proc}(). */
+int
+linux_clone3(struct thread *td, struct linux_clone3_args *args)
+{
+	struct l_user_clone_args *uca;
+	struct l_clone_args *ca;
+	size_t size;
+	int error;
+
+	if (args->usize > PAGE_SIZE)
+		return (E2BIG);
+	if (args->usize < LINUX_CLONE_ARGS_SIZE_VER0)
+		return (EINVAL);
+
+	/*
+	 * usize may be smaller than *uca, so allocate at least sizeof(*uca)
+	 * zeroed bytes: fields the caller did not supply then read as 0.
+	 * NOTE(review): bytes past sizeof(*uca) are copied in but not checked.
+	 */
+	size = max(args->usize, sizeof(*uca));
+	uca = malloc(size, M_LINUX, M_WAITOK | M_ZERO);
+	error = copyin(args->uargs, uca, args->usize);
+	if (error != 0)
+		goto out;
+	error = linux_clone3_args_valid(uca);
+	if (error != 0)
+		goto out;
+	ca = malloc(sizeof(*ca), M_LINUX, M_WAITOK | M_ZERO);
+	ca->flags = uca->flags;
+	ca->child_tid = PTRIN(uca->child_tid);
+	ca->parent_tid = PTRIN(uca->parent_tid);
+	ca->exit_signal = uca->exit_signal;
+	ca->stack = uca->stack + uca->stack_size;	/* end of stack area */
+	ca->stack_size = uca->stack_size;
+	ca->tls = uca->tls;
+
+	if ((ca->flags & LINUX_CLONE_THREAD) != 0)
+		error = linux_clone_thread(td, ca);
+	else
+		error = linux_clone_proc(td, ca);
+	free(ca, M_LINUX);
+out:
+	free(uca, M_LINUX);
+	return (error);
+}
+
 int
 linux_exit(struct thread *td, struct linux_exit_args *args)
 {
diff --git a/sys/compat/linux/linux_fork.h b/sys/compat/linux/linux_fork.h
index 04dfb8ac8a70..fa7b39544450 100644
--- a/sys/compat/linux/linux_fork.h
+++ b/sys/compat/linux/linux_fork.h
@@ -53,6 +53,13 @@
 #define	LINUX_CLONE_NEWNET		0x40000000
 #define	LINUX_CLONE_IO			0x80000000
 
+/* Flags for the clone3() syscall; the first two do not fit in 32 bits. */
+#define	LINUX_CLONE_CLEAR_SIGHAND	0x100000000ULL
+#define	LINUX_CLONE_INTO_CGROUP		0x200000000ULL
+#define	LINUX_CLONE_NEWTIME		0x00000080	/* within LINUX_CSIGNAL */
+/* Mask covering the low 32 flag bits. */
+#define	LINUX_CLONE_LEGACY_FLAGS	0xffffffffULL
+
 #define	LINUX_CSIGNAL			0x000000ff
 
 /*
@@ -85,6 +92,8 @@ struct l_clone_args {
 	l_ulong tls;
 };
 
+#define	LINUX_CLONE_ARGS_SIZE_VER0	64	/* minimum clone3 usize */
+
 int linux_set_upcall(struct thread *, register_t);
 int linux_set_cloned_tls(struct thread *, void *);
 void linux_thread_detach(struct thread *);
diff --git a/sys/compat/linux/linux_misc.h b/sys/compat/linux/linux_misc.h
index 80f6b8a58e81..ceb140d3da75 100644
--- a/sys/compat/linux/linux_misc.h
+++ b/sys/compat/linux/linux_misc.h
@@ -33,6 +33,8 @@
 
 #include <sys/sysctl.h>
 
+#define	LINUX_MAX_PID_NS_LEVEL	32	/* max clone3 set_tid_size */
+
 				/* bits per mask */
 #define	LINUX_NFDBITS		sizeof(l_fd_mask) * 8
 
diff --git a/sys/i386/linux/syscalls.master b/sys/i386/linux/syscalls.master
index aecb852e21c7..27bbca9e65e7 100644
--- a/sys/i386/linux/syscalls.master
+++ b/sys/i386/linux/syscalls.master
@@ -2502,7 +2502,10 @@
 		int linux_pidfd_open(void);
 	}
 435    AUE_NULL		STD {
-		int linux_clone3(void);
+		int linux_clone3(
+		    struct l_user_clone_args *uargs,
+		    l_size_t usize
+		);
 	}
 436    AUE_NULL		STD {
 		int linux_close_range(void);