Re: git: af93fea71038 - main - timerfd: Move implementation from linux compat to sys/kern
Date: Thu, 24 Aug 2023 22:18:18 UTC
On Thu, Aug 24, 2023 at 08:29:48PM +0000, Warner Losh wrote:
> The branch main has been updated by imp:
>
> URL: https://cgit.FreeBSD.org/src/commit/?id=af93fea710385b2b11f0cabd377e7ed6f3d97c34
>
> commit af93fea710385b2b11f0cabd377e7ed6f3d97c34
> Author: Jake Freeland <jfree@freebsd.org>
> AuthorDate: 2023-08-24 04:39:54 +0000
> Commit: Warner Losh <imp@FreeBSD.org>
> CommitDate: 2023-08-24 20:28:56 +0000
>
> timerfd: Move implementation from linux compat to sys/kern
>
> Move the timerfd implementation from linux compat code to sys/kern. Use
> it to implement the new system calls for timerfd. Add a hook to kern_tc
> to allow timerfd to know when the system time has stepped. Add kqueue
> support to timerfd. Adjust a few names to be less Linux centric.
>
> RelNotes: YES
> Reviewed by: markj (on irc), imp, kib (with reservations), jhb (slack)
> Differential Revision: https://reviews.freebsd.org/D38459
> ---
> lib/libc/sys/Symbol.map | 3 +
> sys/bsm/audit_kevents.h | 1 +
> sys/compat/freebsd32/freebsd32_proto.h | 14 +
> sys/compat/freebsd32/freebsd32_syscall.h | 5 +-
> sys/compat/freebsd32/freebsd32_syscalls.c | 3 +
> sys/compat/freebsd32/freebsd32_sysent.c | 3 +
> sys/compat/freebsd32/freebsd32_systrace_args.c | 86 ++++
> sys/compat/linux/linux_event.c | 443 ++---------------
> sys/compat/linux/linux_event.h | 11 -
> sys/conf/files | 1 +
> sys/kern/init_sysent.c | 3 +
> sys/kern/kern_descrip.c | 4 +-
> sys/kern/kern_tc.c | 2 +
> sys/kern/sys_timerfd.c | 632 +++++++++++++++++++++++++
> sys/kern/syscalls.c | 3 +
> sys/kern/syscalls.master | 20 +
> sys/kern/systrace_args.c | 86 ++++
> sys/sys/file.h | 2 +-
> sys/sys/syscall.h | 5 +-
> sys/sys/syscall.mk | 5 +-
> sys/sys/sysproto.h | 20 +
> sys/sys/timerfd.h | 66 +++
> sys/sys/user.h | 6 +
> 23 files changed, 999 insertions(+), 425 deletions(-)
>
> diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
> index 9a07bb457eb8..7937661e3787 100644
> --- a/lib/libc/sys/Symbol.map
> +++ b/lib/libc/sys/Symbol.map
> @@ -421,6 +421,9 @@ FBSD_1.7 {
> kqueuex;
> membarrier;
> swapoff;
> + timerfd_create;
> + timerfd_gettime;
> + timerfd_settime;
> };
>
> FBSDprivate_1.0 {
> diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
> index a6b50a67ee6a..d06381837aad 100644
> --- a/sys/bsm/audit_kevents.h
> +++ b/sys/bsm/audit_kevents.h
> @@ -661,6 +661,7 @@
> #define AUE_AIO_WRITEV 43267 /* FreeBSD-specific. */
> #define AUE_AIO_READV 43268 /* FreeBSD-specific. */
> #define AUE_FSPACECTL 43269 /* FreeBSD-specific. */
> +#define AUE_TIMERFD 43270 /* FreeBSD/Linux. */
>
> /*
> * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
> diff --git a/sys/compat/freebsd32/freebsd32_proto.h b/sys/compat/freebsd32/freebsd32_proto.h
> index bb333e0321a0..50448b6dce16 100644
> --- a/sys/compat/freebsd32/freebsd32_proto.h
> +++ b/sys/compat/freebsd32/freebsd32_proto.h
> @@ -684,6 +684,16 @@ struct freebsd32_aio_writev_args {
> struct freebsd32_aio_readv_args {
> char aiocbp_l_[PADL_(struct aiocb32 *)]; struct aiocb32 * aiocbp; char aiocbp_r_[PADR_(struct aiocb32 *)];
> };
> +struct freebsd32_timerfd_gettime_args {
> + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
> + char curr_value_l_[PADL_(struct itimerspec32 *)]; struct itimerspec32 * curr_value; char curr_value_r_[PADR_(struct itimerspec32 *)];
> +};
> +struct freebsd32_timerfd_settime_args {
> + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
> + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
> + char new_value_l_[PADL_(const struct itimerspec32 *)]; const struct itimerspec32 * new_value; char new_value_r_[PADR_(const struct itimerspec32 *)];
> + char old_value_l_[PADL_(struct itimerspec32 *)]; struct itimerspec32 * old_value; char old_value_r_[PADR_(struct itimerspec32 *)];
> +};
> int freebsd32_wait4(struct thread *, struct freebsd32_wait4_args *);
> int freebsd32_ptrace(struct thread *, struct freebsd32_ptrace_args *);
> int freebsd32_recvmsg(struct thread *, struct freebsd32_recvmsg_args *);
> @@ -799,6 +809,8 @@ int freebsd32_cpuset_setdomain(struct thread *, struct freebsd32_cpuset_setdomai
> int freebsd32___sysctlbyname(struct thread *, struct freebsd32___sysctlbyname_args *);
> int freebsd32_aio_writev(struct thread *, struct freebsd32_aio_writev_args *);
> int freebsd32_aio_readv(struct thread *, struct freebsd32_aio_readv_args *);
> +int freebsd32_timerfd_gettime(struct thread *, struct freebsd32_timerfd_gettime_args *);
> +int freebsd32_timerfd_settime(struct thread *, struct freebsd32_timerfd_settime_args *);
>
> #ifdef COMPAT_43
>
> @@ -1292,6 +1304,8 @@ int freebsd11_freebsd32_fstatat(struct thread *, struct freebsd11_freebsd32_fsta
> #define FREEBSD32_SYS_AUE_freebsd32___sysctlbyname AUE_SYSCTL
> #define FREEBSD32_SYS_AUE_freebsd32_aio_writev AUE_AIO_WRITEV
> #define FREEBSD32_SYS_AUE_freebsd32_aio_readv AUE_AIO_READV
> +#define FREEBSD32_SYS_AUE_freebsd32_timerfd_gettime AUE_TIMERFD
> +#define FREEBSD32_SYS_AUE_freebsd32_timerfd_settime AUE_TIMERFD
>
> #undef PAD_
> #undef PADL_
> diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h
> index c3d8617abf4b..e3777730be1c 100644
> --- a/sys/compat/freebsd32/freebsd32_syscall.h
> +++ b/sys/compat/freebsd32/freebsd32_syscall.h
> @@ -502,4 +502,7 @@
> #define FREEBSD32_SYS_swapoff 582
> #define FREEBSD32_SYS_kqueuex 583
> #define FREEBSD32_SYS_membarrier 584
> -#define FREEBSD32_SYS_MAXSYSCALL 585
> +#define FREEBSD32_SYS_timerfd_create 585
> +#define FREEBSD32_SYS_freebsd32_timerfd_gettime 586
> +#define FREEBSD32_SYS_freebsd32_timerfd_settime 587
> +#define FREEBSD32_SYS_MAXSYSCALL 588
> diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c
> index 19d454743c55..ccc910ee5ca9 100644
> --- a/sys/compat/freebsd32/freebsd32_syscalls.c
> +++ b/sys/compat/freebsd32/freebsd32_syscalls.c
> @@ -590,4 +590,7 @@ const char *freebsd32_syscallnames[] = {
> "swapoff", /* 582 = swapoff */
> "kqueuex", /* 583 = kqueuex */
> "membarrier", /* 584 = membarrier */
> + "timerfd_create", /* 585 = timerfd_create */
> + "freebsd32_timerfd_gettime", /* 586 = freebsd32_timerfd_gettime */
> + "freebsd32_timerfd_settime", /* 587 = freebsd32_timerfd_settime */
> };
> diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c
> index 971f06a643c5..fec6f4a47bd6 100644
> --- a/sys/compat/freebsd32/freebsd32_sysent.c
> +++ b/sys/compat/freebsd32/freebsd32_sysent.c
> @@ -646,4 +646,7 @@ struct sysent freebsd32_sysent[] = {
> { .sy_narg = AS(swapoff_args), .sy_call = (sy_call_t *)sys_swapoff, .sy_auevent = AUE_SWAPOFF, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 582 = swapoff */
> { .sy_narg = AS(kqueuex_args), .sy_call = (sy_call_t *)sys_kqueuex, .sy_auevent = AUE_KQUEUE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 583 = kqueuex */
> { .sy_narg = AS(membarrier_args), .sy_call = (sy_call_t *)sys_membarrier, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 584 = membarrier */
> + { .sy_narg = AS(timerfd_create_args), .sy_call = (sy_call_t *)sys_timerfd_create, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 585 = timerfd_create */
> + { .sy_narg = AS(freebsd32_timerfd_gettime_args), .sy_call = (sy_call_t *)freebsd32_timerfd_gettime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 586 = freebsd32_timerfd_gettime */
> + { .sy_narg = AS(freebsd32_timerfd_settime_args), .sy_call = (sy_call_t *)freebsd32_timerfd_settime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 587 = freebsd32_timerfd_settime */
> };
> diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c
> index 5dfc82c30b7b..2c26a0ddab2f 100644
> --- a/sys/compat/freebsd32/freebsd32_systrace_args.c
> +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c
> @@ -3336,6 +3336,32 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
> *n_args = 3;
> break;
> }
> + /* timerfd_create */
> + case 585: {
> + struct timerfd_create_args *p = params;
> + iarg[a++] = p->clockid; /* int */
> + iarg[a++] = p->flags; /* int */
> + *n_args = 2;
> + break;
> + }
> + /* freebsd32_timerfd_gettime */
> + case 586: {
> + struct freebsd32_timerfd_gettime_args *p = params;
> + iarg[a++] = p->fd; /* int */
> + uarg[a++] = (intptr_t)p->curr_value; /* struct itimerspec32 * */
> + *n_args = 2;
> + break;
> + }
> + /* freebsd32_timerfd_settime */
> + case 587: {
> + struct freebsd32_timerfd_settime_args *p = params;
> + iarg[a++] = p->fd; /* int */
> + iarg[a++] = p->flags; /* int */
> + uarg[a++] = (intptr_t)p->new_value; /* const struct itimerspec32 * */
> + uarg[a++] = (intptr_t)p->old_value; /* struct itimerspec32 * */
> + *n_args = 4;
> + break;
> + }
> default:
> *n_args = 0;
> break;
> @@ -9005,6 +9031,51 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
> break;
> };
> break;
> + /* timerfd_create */
> + case 585:
> + switch (ndx) {
> + case 0:
> + p = "int";
> + break;
> + case 1:
> + p = "int";
> + break;
> + default:
> + break;
> + };
> + break;
> + /* freebsd32_timerfd_gettime */
> + case 586:
> + switch (ndx) {
> + case 0:
> + p = "int";
> + break;
> + case 1:
> + p = "userland struct itimerspec32 *";
> + break;
> + default:
> + break;
> + };
> + break;
> + /* freebsd32_timerfd_settime */
> + case 587:
> + switch (ndx) {
> + case 0:
> + p = "int";
> + break;
> + case 1:
> + p = "int";
> + break;
> + case 2:
> + p = "userland const struct itimerspec32 *";
> + break;
> + case 3:
> + p = "userland struct itimerspec32 *";
> + break;
> + default:
> + break;
> + };
> + break;
> default:
> break;
> };
> @@ -10873,6 +10944,21 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
> if (ndx == 0 || ndx == 1)
> p = "int";
> break;
> + /* timerfd_create */
> + case 585:
> + if (ndx == 0 || ndx == 1)
> + p = "int";
> + break;
> + /* freebsd32_timerfd_gettime */
> + case 586:
> + if (ndx == 0 || ndx == 1)
> + p = "int";
> + break;
> + /* freebsd32_timerfd_settime */
> + case 587:
> + if (ndx == 0 || ndx == 1)
> + p = "int";
> + break;
> default:
> break;
> };
> diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
> index a7db8516e5f0..816c68a90f1d 100644
> --- a/sys/compat/linux/linux_event.c
> +++ b/sys/compat/linux/linux_event.c
> @@ -44,6 +44,7 @@
> #include <sys/specialfd.h>
> #include <sys/sx.h>
> #include <sys/syscallsubr.h>
> +#include <sys/timerfd.h>
> #include <sys/timespec.h>
> #include <sys/user.h>
>
> @@ -99,55 +100,6 @@ struct epoll_copyout_args {
> int error;
> };
>
> -/* timerfd */
> -typedef uint64_t timerfd_t;
> -
> -static fo_rdwr_t timerfd_read;
> -static fo_ioctl_t timerfd_ioctl;
> -static fo_poll_t timerfd_poll;
> -static fo_kqfilter_t timerfd_kqfilter;
> -static fo_stat_t timerfd_stat;
> -static fo_close_t timerfd_close;
> -static fo_fill_kinfo_t timerfd_fill_kinfo;
> -
> -static struct fileops timerfdops = {
> - .fo_read = timerfd_read,
> - .fo_write = invfo_rdwr,
> - .fo_truncate = invfo_truncate,
> - .fo_ioctl = timerfd_ioctl,
> - .fo_poll = timerfd_poll,
> - .fo_kqfilter = timerfd_kqfilter,
> - .fo_stat = timerfd_stat,
> - .fo_close = timerfd_close,
> - .fo_chmod = invfo_chmod,
> - .fo_chown = invfo_chown,
> - .fo_sendfile = invfo_sendfile,
> - .fo_fill_kinfo = timerfd_fill_kinfo,
> - .fo_flags = DFLAG_PASSABLE
> -};
> -
> -static void filt_timerfddetach(struct knote *kn);
> -static int filt_timerfdread(struct knote *kn, long hint);
> -
> -static struct filterops timerfd_rfiltops = {
> - .f_isfd = 1,
> - .f_detach = filt_timerfddetach,
> - .f_event = filt_timerfdread
> -};
> -
> -struct timerfd {
> - clockid_t tfd_clockid;
> - struct itimerspec tfd_time;
> - struct callout tfd_callout;
> - timerfd_t tfd_count;
> - bool tfd_canceled;
> - struct selinfo tfd_sel;
> - struct mtx tfd_lock;
> -};
> -
> -static void linux_timerfd_expire(void *);
> -static void linux_timerfd_curval(struct timerfd *, struct itimerspec *);
> -
> static int
> epoll_create_common(struct thread *td, int flags)
> {
> @@ -658,255 +610,14 @@ linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
> int
> linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
> {
> - struct timerfd *tfd;
> - struct file *fp;
> clockid_t clockid;
> - int fflags, fd, error;
> -
> - if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
> - return (EINVAL);
> -
> - error = linux_to_native_clockid(&clockid, args->clockid);
> - if (error != 0)
> - return (error);
> - if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
> - return (EINVAL);
> -
> - fflags = 0;
> - if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
> - fflags |= O_CLOEXEC;
> -
> - error = falloc(td, &fp, &fd, fflags);
> - if (error != 0)
> - return (error);
> -
> - tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
> - tfd->tfd_clockid = clockid;
> - mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
> -
> - callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
> - knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
> -
> - fflags = FREAD;
> - if ((args->flags & LINUX_O_NONBLOCK) != 0)
> - fflags |= FNONBLOCK;
> -
> - finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
> - fdrop(fp, td);
> -
> - td->td_retval[0] = fd;
> - return (error);
> -}
> -
> -static int
> -timerfd_close(struct file *fp, struct thread *td)
> -{
> - struct timerfd *tfd;
> -
> - tfd = fp->f_data;
> - if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> - return (EINVAL);
> -
> - timespecclear(&tfd->tfd_time.it_value);
> - timespecclear(&tfd->tfd_time.it_interval);
> -
> - callout_drain(&tfd->tfd_callout);
> -
> - seldrain(&tfd->tfd_sel);
> - knlist_destroy(&tfd->tfd_sel.si_note);
> -
> - fp->f_ops = &badfileops;
> - mtx_destroy(&tfd->tfd_lock);
> - free(tfd, M_EPOLL);
> -
> - return (0);
> -}
> -
> -static int
> -timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
> - int flags, struct thread *td)
> -{
> - struct timerfd *tfd;
> - timerfd_t count;
> - int error;
> -
> - tfd = fp->f_data;
> - if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> - return (EINVAL);
> -
> - if (uio->uio_resid < sizeof(timerfd_t))
> - return (EINVAL);
> -
> - error = 0;
> - mtx_lock(&tfd->tfd_lock);
> -retry:
> - if (tfd->tfd_canceled) {
> - tfd->tfd_count = 0;
> - mtx_unlock(&tfd->tfd_lock);
> - return (ECANCELED);
> - }
> - if (tfd->tfd_count == 0) {
> - if ((fp->f_flag & FNONBLOCK) != 0) {
> - mtx_unlock(&tfd->tfd_lock);
> - return (EAGAIN);
> - }
> - error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
> - if (error == 0)
> - goto retry;
> - }
> - if (error == 0) {
> - count = tfd->tfd_count;
> - tfd->tfd_count = 0;
> - mtx_unlock(&tfd->tfd_lock);
> - error = uiomove(&count, sizeof(timerfd_t), uio);
> - } else
> - mtx_unlock(&tfd->tfd_lock);
> -
> - return (error);
> -}
> -
> -static int
> -timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
> - struct thread *td)
> -{
> - struct timerfd *tfd;
> - int revents = 0;
> -
> - tfd = fp->f_data;
> - if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> - return (POLLERR);
> -
> - mtx_lock(&tfd->tfd_lock);
> - if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
> - revents |= events & (POLLIN|POLLRDNORM);
> - if (revents == 0)
> - selrecord(td, &tfd->tfd_sel);
> - mtx_unlock(&tfd->tfd_lock);
> -
> - return (revents);
> -}
> -
> -static int
> -timerfd_kqfilter(struct file *fp, struct knote *kn)
> -{
> - struct timerfd *tfd;
> -
> - tfd = fp->f_data;
> - if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> - return (EINVAL);
> -
> - if (kn->kn_filter == EVFILT_READ)
> - kn->kn_fop = &timerfd_rfiltops;
> - else
> - return (EINVAL);
> -
> - kn->kn_hook = tfd;
> - knlist_add(&tfd->tfd_sel.si_note, kn, 0);
> -
> - return (0);
> -}
> -
> -static void
> -filt_timerfddetach(struct knote *kn)
> -{
> - struct timerfd *tfd = kn->kn_hook;
> -
> - mtx_lock(&tfd->tfd_lock);
> - knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
> - mtx_unlock(&tfd->tfd_lock);
> -}
> -
> -static int
> -filt_timerfdread(struct knote *kn, long hint)
> -{
> - struct timerfd *tfd = kn->kn_hook;
> -
> - return (tfd->tfd_count > 0);
> -}
> -
> -static int
> -timerfd_ioctl(struct file *fp, u_long cmd, void *data,
> - struct ucred *active_cred, struct thread *td)
> -{
> -
> - if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
> - return (EINVAL);
> -
> - switch (cmd) {
> - case FIONBIO:
> - case FIOASYNC:
> - return (0);
> - }
> -
> - return (ENOTTY);
> -}
> -
> -static int
> -timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
> -{
> -
> - return (ENXIO);
> -}
> -
> -static int
> -timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
> -{
> -
> - kif->kf_type = KF_TYPE_UNKNOWN;
> - return (0);
> -}
> -
> -static void
> -linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
> -{
> -
> - if (tfd->tfd_clockid == CLOCK_REALTIME)
> - getnanotime(ts);
> - else /* CLOCK_MONOTONIC */
> - getnanouptime(ts);
> -}
> -
> -static void
> -linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
> -{
> - struct timespec cts;
> -
> - linux_timerfd_clocktime(tfd, &cts);
> - *ots = tfd->tfd_time;
> - if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
> - timespecsub(&ots->it_value, &cts, &ots->it_value);
> - if (ots->it_value.tv_sec < 0 ||
> - (ots->it_value.tv_sec == 0 &&
> - ots->it_value.tv_nsec == 0)) {
> - ots->it_value.tv_sec = 0;
> - ots->it_value.tv_nsec = 1;
> - }
> - }
> -}
> -
> -static int
> -linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
> -{
> - struct timerfd *tfd;
> - struct file *fp;
> int error;
>
> - error = fget(td, fd, &cap_read_rights, &fp);
> + error = linux_to_native_clockid(&clockid, args->clockid);
> if (error != 0)
> return (error);
> - tfd = fp->f_data;
> - if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
> - error = EINVAL;
> - goto out;
> - }
> -
> - mtx_lock(&tfd->tfd_lock);
> - linux_timerfd_curval(tfd, ots);
> - mtx_unlock(&tfd->tfd_lock);
>
> -out:
> - fdrop(fp, td);
> - return (error);
> + return (kern_timerfd_create(td, clockid, args->flags));
> }
>
> int
> @@ -916,84 +627,14 @@ linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args
> struct itimerspec ots;
> int error;
>
> - error = linux_timerfd_gettime_common(td, args->fd, &ots);
> + error = kern_timerfd_gettime(td, args->fd, &ots);
> if (error != 0)
> return (error);
> - error = native_to_linux_itimerspec(&lots, &ots);
> - if (error == 0)
> - error = copyout(&lots, args->old_value, sizeof(lots));
> - return (error);
> -}
> -
> -#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
> -int
> -linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
> -{
> - struct l_itimerspec64 lots;
> - struct itimerspec ots;
> - int error;
>
> - error = linux_timerfd_gettime_common(td, args->fd, &ots);
> - if (error != 0)
> - return (error);
> - error = native_to_linux_itimerspec64(&lots, &ots);
> + error = native_to_linux_itimerspec(&lots, &ots);
> if (error == 0)
> error = copyout(&lots, args->old_value, sizeof(lots));
> - return (error);
> -}
> -#endif
> -
> -static int
> -linux_timerfd_settime_common(struct thread *td, int fd, int flags,
> - struct itimerspec *nts, struct itimerspec *oval)
> -{
> - struct timespec cts, ts;
> - struct timerfd *tfd;
> - struct timeval tv;
> - struct file *fp;
> - int error;
> -
> - if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
> - return (EINVAL);
> -
> - error = fget(td, fd, &cap_write_rights, &fp);
> - if (error != 0)
> - return (error);
> - tfd = fp->f_data;
> - if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
> - error = EINVAL;
> - goto out;
> - }
> -
> - mtx_lock(&tfd->tfd_lock);
> - if (!timespecisset(&nts->it_value))
> - timespecclear(&nts->it_interval);
> - if (oval != NULL)
> - linux_timerfd_curval(tfd, oval);
> -
> - bcopy(nts, &tfd->tfd_time, sizeof(*nts));
> - tfd->tfd_count = 0;
> - if (timespecisset(&nts->it_value)) {
> - linux_timerfd_clocktime(tfd, &cts);
> - ts = nts->it_value;
> - if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
> - timespecadd(&tfd->tfd_time.it_value, &cts,
> - &tfd->tfd_time.it_value);
> - } else {
> - timespecsub(&ts, &cts, &ts);
> - }
> - TIMESPEC_TO_TIMEVAL(&tv, &ts);
> - callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> - linux_timerfd_expire, tfd);
> - tfd->tfd_canceled = false;
> - } else {
> - tfd->tfd_canceled = true;
> - callout_stop(&tfd->tfd_callout);
> - }
> - mtx_unlock(&tfd->tfd_lock);
>
> -out:
> - fdrop(fp, td);
> return (error);
> }
>
> @@ -1001,7 +642,7 @@ int
> linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
> {
> struct l_itimerspec lots;
> - struct itimerspec nts, ots, *pots;
> + struct itimerspec nts, ots;
> int error;
>
> error = copyin(args->new_value, &lots, sizeof(lots));
> @@ -1010,23 +651,43 @@ linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args
> error = linux_to_native_itimerspec(&nts, &lots);
> if (error != 0)
> return (error);
> - pots = (args->old_value != NULL ? &ots : NULL);
> - error = linux_timerfd_settime_common(td, args->fd, args->flags,
> - &nts, pots);
> + if (args->old_value == NULL)
> + error = kern_timerfd_settime(td, args->fd, args->flags, &nts, NULL);
> + else
> + error = kern_timerfd_settime(td, args->fd, args->flags, &nts, &ots);
> if (error == 0 && args->old_value != NULL) {
> error = native_to_linux_itimerspec(&lots, &ots);
> if (error == 0)
> error = copyout(&lots, args->old_value, sizeof(lots));
> }
> +
> return (error);
> }
>
> #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
> +int
> +linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
> +{
> + struct l_itimerspec64 lots;
> + struct itimerspec ots;
> + int error;
> +
> + error = kern_timerfd_gettime(td, args->fd, &ots);
> + if (error != 0)
> + return (error);
> +
> + error = native_to_linux_itimerspec64(&lots, &ots);
> + if (error == 0)
> + error = copyout(&lots, args->old_value, sizeof(lots));
> +
> + return (error);
> +}
> +
> int
> linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
> {
> struct l_itimerspec64 lots;
> - struct itimerspec nts, ots, *pots;
> + struct itimerspec nts, ots;
> int error;
>
> error = copyin(args->new_value, &lots, sizeof(lots));
> @@ -1035,50 +696,16 @@ linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *
> error = linux_to_native_itimerspec64(&nts, &lots);
> if (error != 0)
> return (error);
> - pots = (args->old_value != NULL ? &ots : NULL);
> - error = linux_timerfd_settime_common(td, args->fd, args->flags,
> - &nts, pots);
> + if (args->old_value == NULL)
> + error = kern_timerfd_settime(td, args->fd, args->flags, &nts, NULL);
> + else
> + error = kern_timerfd_settime(td, args->fd, args->flags, &nts, &ots);
> if (error == 0 && args->old_value != NULL) {
> error = native_to_linux_itimerspec64(&lots, &ots);
> if (error == 0)
> error = copyout(&lots, args->old_value, sizeof(lots));
> }
> +
> return (error);
> }
> #endif
> -
> -static void
> -linux_timerfd_expire(void *arg)
> -{
> - struct timespec cts, ts;
> - struct timeval tv;
> - struct timerfd *tfd;
> -
> - tfd = (struct timerfd *)arg;
> -
> - linux_timerfd_clocktime(tfd, &cts);
> - if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
> - if (timespecisset(&tfd->tfd_time.it_interval))
> - timespecadd(&tfd->tfd_time.it_value,
> - &tfd->tfd_time.it_interval,
> - &tfd->tfd_time.it_value);
> - else
> - /* single shot timer */
> - timespecclear(&tfd->tfd_time.it_value);
> - if (timespecisset(&tfd->tfd_time.it_value)) {
> - timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
> - TIMESPEC_TO_TIMEVAL(&tv, &ts);
> - callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> - linux_timerfd_expire, tfd);
> - }
> - tfd->tfd_count++;
> - KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
> - selwakeup(&tfd->tfd_sel);
> - wakeup(&tfd->tfd_count);
> - } else if (timespecisset(&tfd->tfd_time.it_value)) {
> - timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
> - TIMESPEC_TO_TIMEVAL(&tv, &ts);
> - callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> - linux_timerfd_expire, tfd);
> - }
> -}
> diff --git a/sys/compat/linux/linux_event.h b/sys/compat/linux/linux_event.h
> index 32269b0070bc..fa63371b5170 100644
> --- a/sys/compat/linux/linux_event.h
> +++ b/sys/compat/linux/linux_event.h
> @@ -54,15 +54,4 @@
>
> #define LINUX_EFD_SEMAPHORE (1 << 0)
>
> -#define LINUX_TFD_TIMER_ABSTIME (1 << 0)
> -#define LINUX_TFD_TIMER_CANCEL_ON_SET (1 << 1)
> -#define LINUX_TFD_CLOEXEC LINUX_O_CLOEXEC
> -#define LINUX_TFD_NONBLOCK LINUX_O_NONBLOCK
> -
> -#define LINUX_TFD_SHARED_FCNTL_FLAGS (LINUX_TFD_CLOEXEC \
> - |LINUX_TFD_NONBLOCK)
> -#define LINUX_TFD_CREATE_FLAGS LINUX_TFD_SHARED_FCNTL_FLAGS
> -#define LINUX_TFD_SETTIME_FLAGS (LINUX_TFD_TIMER_ABSTIME \
> - |LINUX_TFD_TIMER_CANCEL_ON_SET)
> -
> #endif /* !_LINUX_EVENT_H_ */
> diff --git a/sys/conf/files b/sys/conf/files
> index 3f79ce752c80..8d38b9cc8a2e 100644
> --- a/sys/conf/files
> +++ b/sys/conf/files
> @@ -3908,6 +3908,7 @@ kern/sys_pipe.c standard
> kern/sys_procdesc.c standard
> kern/sys_process.c standard
> kern/sys_socket.c standard
> +kern/sys_timerfd.c standard
> kern/syscalls.c standard
> kern/sysv_ipc.c standard
> kern/sysv_msg.c optional sysvmsg
> diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
> index 1e62c46b8be0..d44fec54fcd7 100644
> --- a/sys/kern/init_sysent.c
> +++ b/sys/kern/init_sysent.c
> @@ -645,4 +645,7 @@ struct sysent sysent[] = {
> { .sy_narg = AS(swapoff_args), .sy_call = (sy_call_t *)sys_swapoff, .sy_auevent = AUE_SWAPOFF, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 582 = swapoff */
> { .sy_narg = AS(kqueuex_args), .sy_call = (sy_call_t *)sys_kqueuex, .sy_auevent = AUE_KQUEUE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 583 = kqueuex */
> { .sy_narg = AS(membarrier_args), .sy_call = (sy_call_t *)sys_membarrier, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 584 = membarrier */
> + { .sy_narg = AS(timerfd_create_args), .sy_call = (sy_call_t *)sys_timerfd_create, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 585 = timerfd_create */
> + { .sy_narg = AS(timerfd_gettime_args), .sy_call = (sy_call_t *)sys_timerfd_gettime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 586 = timerfd_gettime */
> + { .sy_narg = AS(timerfd_settime_args), .sy_call = (sy_call_t *)sys_timerfd_settime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 587 = timerfd_settime */
> };
> diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
> index c5226288afc5..35046c856d54 100644
> --- a/sys/kern/kern_descrip.c
> +++ b/sys/kern/kern_descrip.c
> @@ -5001,8 +5001,8 @@ file_type_to_name(short type)
> return ("proc");
> case DTYPE_EVENTFD:
> return ("eventfd");
> - case DTYPE_LINUXTFD:
> - return ("ltimer");
> + case DTYPE_TIMERFD:
> + return ("timerfd");
> default:
> return ("unkn");
> }
> diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
> index 170f35830923..26f09cb60260 100644
> --- a/sys/kern/kern_tc.c
> +++ b/sys/kern/kern_tc.c
> @@ -34,6 +34,7 @@
> #include <sys/systm.h>
> #include <sys/timeffc.h>
> #include <sys/timepps.h>
> +#include <sys/timerfd.h>
> #include <sys/timetc.h>
> #include <sys/timex.h>
> #include <sys/vdso.h>
> @@ -1305,6 +1306,7 @@ tc_setclock(struct timespec *ts)
>
> /* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */
> atomic_add_rel_int(&rtc_generation, 2);
> + timerfd_jumped();
> sleepq_chains_remove_matching(sleeping_on_old_rtc);
> if (timestepwarnings) {
> nanotime(&taft);
> diff --git a/sys/kern/sys_timerfd.c b/sys/kern/sys_timerfd.c
> new file mode 100644
> index 000000000000..6948fa059b8c
> --- /dev/null
> +++ b/sys/kern/sys_timerfd.c
> @@ -0,0 +1,632 @@
> +/*-
> + * SPDX-License-Identifier: BSD-2-Clause
> + *
> + * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
> + * Copyright (c) 2023 Jake Freeland <jfree@FreeBSD.org>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/callout.h>
> +#include <sys/fcntl.h>
> +#include <sys/file.h>
> +#include <sys/filedesc.h>
> +#include <sys/filio.h>
> +#include <sys/kernel.h>
> +#include <sys/lock.h>
> +#include <sys/malloc.h>
> +#include <sys/mount.h>
> +#include <sys/mutex.h>
> +#include <sys/poll.h>
> +#include <sys/proc.h>
> +#include <sys/queue.h>
> +#include <sys/selinfo.h>
> +#include <sys/stat.h>
> +#include <sys/sysctl.h>
> +#include <sys/sysent.h>
> +#include <sys/sysproto.h>
> +#include <sys/timerfd.h>
> +#include <sys/timespec.h>
> +#include <sys/uio.h>
> +#include <sys/user.h>
> +
> +#include <security/audit/audit.h>
> +
> +#ifdef COMPAT_FREEBSD32
> +#include <compat/freebsd32/freebsd32.h>
> +#include <compat/freebsd32/freebsd32_proto.h>
> +#endif
> +
> +static MALLOC_DEFINE(M_TIMERFD, "timerfd", "timerfd structures");
> +static LIST_HEAD(, timerfd) timerfd_head;
> +static struct unrhdr64 tfdino_unr;
> +
> +#define TFD_NOJUMP 0 /* Realtime clock has not jumped. */
> +#define TFD_READ 1 /* Jumped, tfd has been read since. */
> +#define TFD_ZREAD 2 /* Jumped backwards, CANCEL_ON_SET=false. */
> +#define TFD_CANCELED 4 /* Jumped, CANCEL_ON_SET=true. */
> +#define TFD_JUMPED (TFD_ZREAD | TFD_CANCELED)
> +
> +struct timerfd {
> + /* User specified. */
> + struct itimerspec tfd_time; /* tfd timer */
> + clockid_t tfd_clockid; /* timing base */
> + int tfd_flags; /* creation flags */
> + int tfd_timflags; /* timer flags */
> +
> + /* Used internally. */
> + timerfd_t tfd_count; /* expiration count since last read */
> + bool tfd_expired; /* true upon initial expiration */
> + struct mtx tfd_lock; /* mtx lock */
> + struct callout tfd_callout; /* expiration notification */
> + struct selinfo tfd_sel; /* I/O alerts */
> + struct timespec tfd_boottim; /* cached boottime */
> + int tfd_jumped; /* timer jump status */
> + LIST_ENTRY(timerfd) entry; /* entry in list */
> +
> + /* For stat(2). */
> + ino_t tfd_ino; /* inode number */
> + struct timespec tfd_atim; /* time of last read */
> + struct timespec tfd_mtim; /* time of last settime */
> + struct timespec tfd_birthtim; /* creation time */
> +};
> +
> +static void
> +timerfd_init(void *data)
> +{
> + new_unrhdr64(&tfdino_unr, 1);
> +}
> +
> +SYSINIT(timerfd, SI_SUB_VFS, SI_ORDER_ANY, timerfd_init, NULL);
> +
> +static inline void
> +timerfd_getboottime(struct timespec *ts)
> +{
> + struct timeval tv;
> + getboottime(&tv);
> + TIMEVAL_TO_TIMESPEC(&tv, ts);
> +}
> +
> +/*
> + * Call when a discontinuous jump has occured in CLOCK_REALTIME and
> + * update timerfd's cached boottime. A jump can be triggered using
> + * functions like clock_settime(2) or settimeofday(2).
> + *
> + * Timer is marked TFD_CANCELED if TFD_TIMER_CANCEL_ON_SET is set
> *** 850 LINES SKIPPED ***
I took a very quick look over the added code.
I do not see any protection for the timerfd_head list manipulation.
It is not clear what is protected by tfd->tfd_lock: e.g. in timerfd_stat()
it covers the reading of fields whose writes are not protected by the mtx
(that is, everything except tfd_atim).
There are no annotations in the timer structure for the locking regime.
stat st_ctim is always zero; this is somewhat strange.
The
tfd = fp->f_data;
if (tfd == NULL || fp->f_type != DTYPE_TIMERFD) {
triggers UB when f_type is not DTYPE_TIMERFD.
The compat32 stuff was put into sys/kern instead of sys/compat/freebsd32.
sys/timerfd.h pollutes userspace with sys/proc.h.
The regenerated files were put in the same commit as the (probably)
human-written files. Why?