git: f1e93df44ba0 - stable/14 - libc: Drop incorrect qsort optimization

From: Dag-Erling Smørgrav <des_at_FreeBSD.org>
Date: Wed, 27 Aug 2025 19:22:09 UTC
The branch stable/14 has been updated by des:

URL: https://cgit.FreeBSD.org/src/commit/?id=f1e93df44ba0b494c6eda01958a71fec501bc32b

commit f1e93df44ba0b494c6eda01958a71fec501bc32b
Author:     Dag-Erling Smørgrav <des@FreeBSD.org>
AuthorDate: 2025-08-15 07:22:33 +0000
Commit:     Dag-Erling Smørgrav <des@FreeBSD.org>
CommitDate: 2025-08-27 18:49:58 +0000

    libc: Drop incorrect qsort optimization
    
    As pointed out in the PR and the article linked below, the switch to
    insertion sort in the BSD qsort code is based on a misunderstanding of
    Knuth's TAOCP and is actually a pessimization.  As demonstrated by the
    added test, it is trivially easy to construct pathological input which
    results in quadratic runtime.  Without that misguided optimization, the
    same input runs in nearly linearithmic time.
    
    https://www.raygard.net/2022/02/26/Re-engineering-a-qsort-part-3
    
    PR:             287089
    MFC after:      1 week
    Reviewed by:    imp
    Differential Revision:  https://reviews.freebsd.org/D51907
    
    (cherry picked from commit 5205b32de3fb7702e96b3991f5b1a61eee406d8b)
---
 lib/libc/stdlib/qsort.c             |  13 -----
 lib/libc/tests/stdlib/Makefile      |   1 +
 lib/libc/tests/stdlib/qsort_bench.c | 113 ++++++++++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 13 deletions(-)

diff --git a/lib/libc/stdlib/qsort.c b/lib/libc/stdlib/qsort.c
index 22308887de1a..0735cf635b51 100644
--- a/lib/libc/stdlib/qsort.c
+++ b/lib/libc/stdlib/qsort.c
@@ -109,13 +109,11 @@ local_qsort(void *a, size_t n, size_t es, cmp_t *cmp, void *thunk)
 	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
 	size_t d1, d2;
 	int cmp_result;
-	int swap_cnt;
 
 	/* if there are less than 2 elements, then sorting is not needed */
 	if (__predict_false(n < 2))
 		return;
 loop:
-	swap_cnt = 0;
 	if (n < 7) {
 		for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
 			for (pl = pm; 
@@ -144,7 +142,6 @@ loop:
 	for (;;) {
 		while (pb <= pc && (cmp_result = CMP(thunk, pb, a)) <= 0) {
 			if (cmp_result == 0) {
-				swap_cnt = 1;
 				swapfunc(pa, pb, es);
 				pa += es;
 			}
@@ -152,7 +149,6 @@ loop:
 		}
 		while (pb <= pc && (cmp_result = CMP(thunk, pc, a)) >= 0) {
 			if (cmp_result == 0) {
-				swap_cnt = 1;
 				swapfunc(pc, pd, es);
 				pd -= es;
 			}
@@ -161,18 +157,9 @@ loop:
 		if (pb > pc)
 			break;
 		swapfunc(pb, pc, es);
-		swap_cnt = 1;
 		pb += es;
 		pc -= es;
 	}
-	if (swap_cnt == 0) {  /* Switch to insertion sort */
-		for (pm = (char *)a + es; pm < (char *)a + n * es; pm += es)
-			for (pl = pm; 
-			     pl > (char *)a && CMP(thunk, pl - es, pl) > 0;
-			     pl -= es)
-				swapfunc(pl, pl - es, es);
-		return;
-	}
 
 	pn = (char *)a + n * es;
 	d1 = MIN(pa - (char *)a, pb - pa);
diff --git a/lib/libc/tests/stdlib/Makefile b/lib/libc/tests/stdlib/Makefile
index 974bbf7c0704..14f1ab022c37 100644
--- a/lib/libc/tests/stdlib/Makefile
+++ b/lib/libc/tests/stdlib/Makefile
@@ -13,6 +13,7 @@ ATF_TESTS_C+=		qsort_b_test
 ATF_TESTS_C+=		qsort_r_compat_test
 ATF_TESTS_C+=		qsort_r_test
 ATF_TESTS_C+=		qsort_s_test
+ATF_TESTS_C+=		qsort_bench
 ATF_TESTS_C+=		quick_exit_test
 ATF_TESTS_C+=		set_constraint_handler_s_test
 ATF_TESTS_C+=		strfmon_test
diff --git a/lib/libc/tests/stdlib/qsort_bench.c b/lib/libc/tests/stdlib/qsort_bench.c
new file mode 100644
index 000000000000..5f2cfae40140
--- /dev/null
+++ b/lib/libc/tests/stdlib/qsort_bench.c
@@ -0,0 +1,113 @@
+/*-
+ * Copyright (c) 2025 Dag-Erling Smørgrav <des@FreeBSD.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <atf-c.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+/*-
+ * Measures qsort(3) runtime with pathological input and verify that it
+ * stays close to N * log2(N).
+ *
+ * Thanks to Vivian Hussey for the proof of concept.
+ *
+ * The input we construct is similar to a sweep from 0 to N where each
+ * half, except for the first element, has been reversed; for instance,
+ * with N = 8, we get { 0, 3, 2, 1, 4, 8, 7, 6 }.  This triggers a bug in
+ * the BSD qsort(3) where it will switch to insertion sort if the pivots
+ * are sorted.
+ *
+ * This article goes into more detail about the bug and its origin:
+ *
+ * https://www.raygard.net/2022/02/26/Re-engineering-a-qsort-part-3
+ *
+ * With this optimization (the `if (swap_cnt == 0)` block), qsort(3) needs
+ * roughly N * N / 4 comparisons to sort our pathological input.  Without
+ * it, it needs only a little more than N * log2(N) comparisons.
+ */
+
+/* we stop testing once a single takes longer than this */
+#define MAXRUNSECS 10
+
+static bool debugging;
+
+static uintmax_t ncmp;
+
+static int
+intcmp(const void *a, const void *b)
+{
+	ncmp++;
+	return ((*(int *)a > *(int *)b) - (*(int *)a < *(int *)b));
+}
+
+static void
+qsort_bench(int log2n)
+{
+	uintmax_t n = 1LLU << log2n;
+	int *buf;
+
+	/* fill an array with a pathological pattern */
+	ATF_REQUIRE(buf = malloc(n * sizeof(*buf)));
+	buf[0] = 0;
+	buf[n / 2] = n / 2;
+	for (unsigned int i = 1; i < n / 2; i++) {
+		buf[i] = n / 2 - i;
+		buf[n / 2 + i] = n - i;
+	}
+
+	ncmp = 0;
+	qsort(buf, n, sizeof(*buf), intcmp);
+
+	/* check result and free array */
+	if (debugging) {
+		for (unsigned int i = 1; i < n; i++) {
+			ATF_REQUIRE_MSG(buf[i] > buf[i - 1],
+			    "array is not sorted");
+		}
+	}
+	free(buf);
+
+	/* check that runtime does not exceed N² */
+	ATF_CHECK_MSG(ncmp / n < n,
+	    "runtime %ju exceeds N² for N = %ju", ncmp, n);
+
+	/* check that runtime does not exceed N log N by much */
+	ATF_CHECK_MSG(ncmp / n <= log2n + 1,
+	    "runtime %ju exceeds N log N for N = %ju", ncmp, n);
+}
+
+ATF_TC_WITHOUT_HEAD(qsort_bench);
+ATF_TC_BODY(qsort_bench, tc)
+{
+	struct timespec t0, t1;
+	uintmax_t tus;
+
+	for (int i = 10; i <= 30; i++) {
+		clock_gettime(CLOCK_UPTIME, &t0);
+		qsort_bench(i);
+		clock_gettime(CLOCK_UPTIME, &t1);
+		tus = t1.tv_sec * 1000000 + t1.tv_nsec / 1000;
+		tus -= t0.tv_sec * 1000000 + t0.tv_nsec / 1000;
+		if (debugging) {
+			fprintf(stderr, "N = 2^%d in %ju.%06jus\n",
+			    i, tus / 1000000, tus % 1000000);
+		}
+		/* stop once an individual run exceeds our limit */
+		if (tus / 1000000 >= MAXRUNSECS)
+			break;
+	}
+}
+
+ATF_TP_ADD_TCS(tp)
+{
+	debugging = !getenv("__RUNNING_INSIDE_ATF_RUN") &&
+	    isatty(STDERR_FILENO);
+	ATF_TP_ADD_TC(tp, qsort_bench);
+	return (atf_no_error());
+}