git: 626ea75ed2e9 - main - time: use precise callout for clock_nanosleep(2) and nanosleep(2)
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 30 Apr 2025 16:49:20 UTC
The branch main has been updated by glebius:
URL: https://cgit.FreeBSD.org/src/commit/?id=626ea75ed2e9e9365ef8d7a4fa8ef219020c98c6
commit 626ea75ed2e9e9365ef8d7a4fa8ef219020c98c6
Author: Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2025-04-30 16:47:57 +0000
Commit: Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2025-04-30 16:47:57 +0000
time: use precise callout for clock_nanosleep(2) and nanosleep(2)
Don't apply tc_precexp and TIMESEL() that uses sbt_timethreshold (both
derivatives of kern.timecounter.alloweddeviation) to sleep callout when
processing the default and precise clocks. The default timer deviation of
5% is our internal optimization in the kernel, and we shouldn't leak that
into the POSIX APIs. Note that an application has no way to cancel the
deviation; only a superuser can change the global tunable [with side
effects].
Leave the deviation for CLOCK_*_FAST and CLOCK_SECOND that are documented
as imprecise.
Provide a sysctl kern.timecounter.nanosleep_precise that allows restoring
the previous behavior.
Improve documentation.
Reviewed by: ziaee, vangyzen, imp, kib
Differential Revision: https://reviews.freebsd.org/D50075
---
lib/libsys/nanosleep.2 | 52 +++++++++++++++++++++++++++++++++++++++++---------
sys/kern/kern_time.c | 36 +++++++++++++++++++++++++++-------
2 files changed, 72 insertions(+), 16 deletions(-)
diff --git a/lib/libsys/nanosleep.2 b/lib/libsys/nanosleep.2
index 8a4931e51413..ba9aae1edf57 100644
--- a/lib/libsys/nanosleep.2
+++ b/lib/libsys/nanosleep.2
@@ -27,7 +27,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd April 3, 2022
+.Dd April 29, 2025
.Dt NANOSLEEP 2
.Os
.Sh NAME
@@ -87,14 +87,6 @@ If, at the time of the call, the time value specified by
is less than or equal to the time value of the specified clock, then
.Fn clock_nanosleep
returns immediately and the calling thread is not suspended.
-.Pp
-The suspension time may be longer than requested due to the
-scheduling of other activity by the system.
-It is also subject to the allowed time interval deviation
-specified by the
-.Va kern.timecounter.alloweddeviation
-.Xr sysctl 8
-variable.
An unmasked signal will terminate the sleep early, regardless of the
.Dv SA_RESTART
value on the interrupting signal.
@@ -131,6 +123,32 @@ CLOCK_UPTIME_FAST
CLOCK_UPTIME_PRECISE
.El
.Pp
+The suspension time may be longer than requested due to the
+scheduling of other activity by the system.
+The clocks with the
+.Dv _FAST
+suffix and the
+.Dv CLOCK_SECOND
+are subject to the allowed time interval deviation specified by the
+.Va kern.timecounter.alloweddeviation
+.Xr sysctl 8
+variable.
+The clocks with the
+.Dv _PRECISE
+suffix are always as precise as possible.
+The
+.Dv CLOCK_MONOTONIC ,
+.Dv CLOCK_REALTIME
+and
+.Dv CLOCK_UPTIME
+are precise by default.
+Setting the
+.Va kern.timecounter.nanosleep_precise
+.Xr sysctl 8
+to a false value makes those clocks behave like the
+.Dv _FAST
+clocks.
+.Pp
The
.Fn nanosleep
function behaves like
@@ -217,3 +235,19 @@ and was ported to
.Ox 2.1
and
.Fx 3.0 .
+The
+.Fn clock_nanosleep
+system call has been available since
+.Fx 11.1 .
+.Pp
+In
+.Fx 15.0
+the default behavior of
+.Fn clock_nanosleep
+with
+.Dv CLOCK_MONOTONIC ,
+.Dv CLOCK_REALTIME ,
+.Dv CLOCK_UPTIME
+clocks and
+.Fn nanosleep
+has been switched to use a precise clock.
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
index d7dc78366292..0c31c1563d99 100644
--- a/sys/kern/kern_time.c
+++ b/sys/kern/kern_time.c
@@ -494,6 +494,10 @@ kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
rmt));
}
+static __read_mostly bool nanosleep_precise = true;
+SYSCTL_BOOL(_kern_timecounter, OID_AUTO, nanosleep_precise, CTLFLAG_RW,
+ &nanosleep_precise, 0, "clock_nanosleep() with CLOCK_REALTIME, "
+ "CLOCK_MONOTONIC, CLOCK_UPTIME and nanosleep(2) use precise clock");
static uint8_t nanowait[MAXCPU];
int
@@ -504,7 +508,7 @@ kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
sbintime_t sbt, sbtt, prec, tmp;
time_t over;
int error;
- bool is_abs_real;
+ bool is_abs_real, precise;
if (rqt->tv_nsec < 0 || rqt->tv_nsec >= NS_PER_SEC)
return (EINVAL);
@@ -512,17 +516,31 @@ kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
return (EINVAL);
switch (clock_id) {
case CLOCK_REALTIME:
+ precise = nanosleep_precise;
+ is_abs_real = (flags & TIMER_ABSTIME) != 0;
+ break;
case CLOCK_REALTIME_PRECISE:
+ precise = true;
+ is_abs_real = (flags & TIMER_ABSTIME) != 0;
+ break;
case CLOCK_REALTIME_FAST:
case CLOCK_SECOND:
+ precise = false;
is_abs_real = (flags & TIMER_ABSTIME) != 0;
break;
case CLOCK_MONOTONIC:
- case CLOCK_MONOTONIC_PRECISE:
- case CLOCK_MONOTONIC_FAST:
case CLOCK_UPTIME:
+ precise = nanosleep_precise;
+ is_abs_real = false;
+ break;
+ case CLOCK_MONOTONIC_PRECISE:
case CLOCK_UPTIME_PRECISE:
+ precise = true;
+ is_abs_real = false;
+ break;
+ case CLOCK_MONOTONIC_FAST:
case CLOCK_UPTIME_FAST:
+ precise = false;
is_abs_real = false;
break;
case CLOCK_VIRTUAL:
@@ -553,10 +571,14 @@ kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
} else
over = 0;
tmp = tstosbt(ts);
- prec = tmp;
- prec >>= tc_precexp;
- if (TIMESEL(&sbt, tmp))
- sbt += tc_tick_sbt;
+ if (precise) {
+ prec = 0;
+ sbt = sbinuptime();
+ } else {
+ prec = tmp >> tc_precexp;
+ if (TIMESEL(&sbt, tmp))
+ sbt += tc_tick_sbt;
+ }
sbt += tmp;
error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp",
sbt, prec, C_ABSOLUTE);