git: 8665ca1fce6e - stable/12 - random(4): Fortuna: allow increased concurrency

From: David E. O'Brien <obrien_at_FreeBSD.org>
Date: Tue, 15 Feb 2022 02:09:25 UTC
The branch stable/12 has been updated by obrien:

URL: https://cgit.FreeBSD.org/src/commit/?id=8665ca1fce6e2ddf4cc3f57ca4c9f82aa8bbea89

commit 8665ca1fce6e2ddf4cc3f57ca4c9f82aa8bbea89
Author:     Conrad Meyer <cem@FreeBSD.org>
AuthorDate: 2019-06-17 20:29:13 +0000
Commit:     David E. O'Brien <obrien@FreeBSD.org>
CommitDate: 2022-02-15 02:09:08 +0000

    random(4): Fortuna: allow increased concurrency
    
    Add experimental feature to increase concurrency in Fortuna.  As this
    diverges slightly from canonical Fortuna, and due to the security
    sensitivity of random(4), it is off by default.  To enable it, set the
    tunable kern.random.fortuna.concurrent_read="1".  The rest of this commit
    message describes the behavior when enabled.
    
    Readers continue to update shared Fortuna state under global mutex, as they
    do in the status quo implementation of the algorithm, but shift the actual
    PRF generation out from under the global lock.  This massively reduces the
    CPU time readers spend holding the global lock, allowing for increased
    concurrency on SMP systems and less bullying of the harvestq kthread.
    
    It is somewhat of a deviation from FS&K.  I think the primary difference is
    that the specific sequence of AES keys will differ if READ_RANDOM_UIO is
    accessed concurrently (as the 2nd thread to take the mutex will no longer
    receive a key derived from rekeying the first thread).  However, I believe
    the goals of rekeying AES are maintained: trivially, we continue to rekey
    every 1MB for the statistical property; and each consumer gets a
    forward-secret, independent AES key for their PRF.
    
    Since Chacha doesn't need to rekey for sequences of any length, this change
    makes no difference to the sequence of Chacha keys and PRF generated when
    Chacha is used in place of AES.
    
    On a GENERIC 4-thread VM (so, INVARIANTS/WITNESS, numbers not necessarily
    representative), 3x concurrent AES performance jumped from ~55 MiB/s per
    thread to ~197 MB/s per thread.  Concurrent Chacha20 at 3 threads went from
    roughly ~113 MB/s per thread to ~430 MB/s per thread.
    
    Prior to this change, the system was extremely unresponsive with 3-4
    concurrent random readers; each thread had high variance in latency and
    throughput, depending on who got lucky and won the lock.  "rand_harvestq"
    thread CPU use was high (double digits), seemingly due to spinning on the
    global lock.
    
    After the change, concurrent random readers and the system in general are
    much more responsive, and rand_harvestq CPU use dropped to basically zero.
    
    Tests are added to the devrandom suite to ensure the uint128_add64 primitive
    utilized by unlocked read functions to specification.
    
    (cherry picked from commit 179f62805cf05c3ae334c03dd94cb4c2da46c716)
---
 sys/dev/random/fortuna.c           | 359 +++++++++++++++++++++++++++----------
 sys/dev/random/fortuna.h           |   2 +
 sys/dev/random/hash.c              |   2 +-
 sys/dev/random/hash.h              |   4 +
 sys/dev/random/uint128.h           |  15 ++
 tests/sys/devrandom/uint128_test.c |  57 ++++++
 6 files changed, 348 insertions(+), 91 deletions(-)

diff --git a/sys/dev/random/fortuna.c b/sys/dev/random/fortuna.c
index 48620a14c349..f7f8a753c790 100644
--- a/sys/dev/random/fortuna.c
+++ b/sys/dev/random/fortuna.c
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
 #include "unit_test.h"
 #endif /* _KERNEL */
 
+#include <crypto/chacha20/chacha.h>
 #include <crypto/rijndael/rijndael-api-fst.h>
 #include <crypto/sha2/sha256.h>
 
@@ -75,7 +76,10 @@ __FBSDID("$FreeBSD$");
 /* Defined in FS&K */
 #define	RANDOM_FORTUNA_NPOOLS 32		/* The number of accumulation pools */
 #define	RANDOM_FORTUNA_DEFPOOLSIZE 64		/* The default pool size/length for a (re)seed */
-#define	RANDOM_FORTUNA_MAX_READ (1 << 20)	/* Max bytes in a single read */
+#define	RANDOM_FORTUNA_MAX_READ (1 << 20)	/* Max bytes from AES before rekeying */
+#define	RANDOM_FORTUNA_BLOCKS_PER_KEY (1 << 16)	/* Max blocks from AES before rekeying */
+CTASSERT(RANDOM_FORTUNA_BLOCKS_PER_KEY * RANDOM_BLOCKSIZE ==
+    RANDOM_FORTUNA_MAX_READ);
 
 /*
  * The allowable range of RANDOM_FORTUNA_DEFPOOLSIZE. The default value is above.
@@ -120,6 +124,26 @@ static struct fortuna_state {
 	mtx_t fs_mtx;
 } fortuna_state;
 
+/*
+ * Experimental concurrent reads feature.  For now, disabled by default.  But
+ * we may enable it in the future.
+ *
+ * The benefit is improved concurrency in Fortuna.  That is reflected in two
+ * related aspects:
+ *
+ * 1. Concurrent devrandom readers can achieve similar throughput to a single
+ *    reader thread.
+ *
+ * 2. The rand_harvestq process spends much less time spinning when one or more
+ *    readers is processing a large request.  Partially this is due to
+ *    rand_harvestq / ra_event_processor design, which only passes one event at
+ *    a time to the underlying algorithm.  Each time, Fortuna must take its
+ *    global state mutex, potentially blocking on a reader.  Our adaptive
+ *    mutexes assume that a lock holder currently on CPU will release the lock
+ *    quickly, and spin if the owning thread is currently running.
+ */
+static bool fortuna_concurrent_read __read_frequently = false;
+
 #ifdef _KERNEL
 static struct sysctl_ctx_list random_clist;
 RANDOM_CHECK_UINT(fs_minpoolsize, RANDOM_FORTUNA_MINPOOLSIZE, RANDOM_FORTUNA_MAXPOOLSIZE);
@@ -176,6 +200,11 @@ random_fortuna_init_alg(void *unused __unused)
 		random_check_uint_fs_minpoolsize, "IU",
 		"Minimum pool size necessary to cause a reseed");
 	KASSERT(fortuna_state.fs_minpoolsize > 0, ("random: Fortuna threshold must be > 0 at startup"));
+
+	SYSCTL_ADD_BOOL(&random_clist, SYSCTL_CHILDREN(random_fortuna_o),
+	    OID_AUTO, "concurrent_read", CTLFLAG_RDTUN,
+	    &fortuna_concurrent_read, 0, "If non-zero, enable EXPERIMENTAL "
+	    "feature to improve concurrent Fortuna performance.");
 #endif
 
 	/*-
@@ -305,48 +334,6 @@ random_fortuna_reseed_internal(uint32_t *entropy_data, u_int blockcount)
 	uint128_increment(&fortuna_state.fs_counter);
 }
 
-/*-
- * FS&K - PseudoRandomData()
- *
- * If Chacha20 is used, output size is unrestricted.  If AES-CTR is used,
- * output size MUST be <= 1MB and a multiple of RANDOM_BLOCKSIZE.  The
- * reasoning for this is discussed in FS&K 9.4; the significant distinction
- * between the two ciphers is that AES has a *block* size of 128 bits while
- * Chacha has a *block* size of 512 bits.
- */
-static __inline void
-random_fortuna_genrandom(uint8_t *buf, size_t bytecount)
-{
-	uint8_t newkey[RANDOM_KEYSIZE];
-
-	RANDOM_RESEED_ASSERT_LOCK_OWNED();
-
-	/*-
-	 * FS&K - assert(n < 2^20 (== 1 MB)) when 128-bit block cipher is used
-	 *      - r = first-n-bytes(GenerateBlocks(ceil(n/16)))
-	 *      - K = GenerateBlocks(2)
-	 */
-	KASSERT(random_chachamode || bytecount <= RANDOM_FORTUNA_MAX_READ,
-	    ("%s: invalid large read request: %zu bytes", __func__,
-	     bytecount));
-
-	/*
-	 * This is where FS&K would invoke GenerateBlocks().  GenerateBlocks()
-	 * doesn't make a lot of sense or have much value if we use bytecount
-	 * for the API (which is useful for ciphers that do not require
-	 * block-sized output, like Chacha20).
-	 *
-	 * Just invoke our PRF abstraction directly, which is responsible for
-	 * updating fs_counter ('C').
-	 */
-	randomdev_keystream(&fortuna_state.fs_key, &fortuna_state.fs_counter,
-	    buf, bytecount);
-	randomdev_keystream(&fortuna_state.fs_key, &fortuna_state.fs_counter,
-	    newkey, sizeof(newkey));
-	randomdev_encrypt_init(&fortuna_state.fs_key, newkey);
-	explicit_bzero(newkey, sizeof(newkey));
-}
-
 /*-
  * FS&K - RandomData() (Part 1)
  * Used to return processed entropy from the PRNG. There is a pre_read
@@ -433,70 +420,144 @@ random_fortuna_pre_read(void)
 	explicit_bzero(temp, sizeof(temp));
 }
 
-/*-
- * FS&K - RandomData() (Part 2)
- * Main read from Fortuna, continued. May be called multiple times after
- * the random_fortuna_pre_read() above.
+/*
+ * This is basically GenerateBlocks() from FS&K.
  *
- * The supplied buf MAY not be a multiple of RANDOM_BLOCKSIZE in size; it is
- * the responsibility of the algorithm to accommodate partial block reads, if a
- * block output mode is used.
+ * It differs in two ways:
+ *
+ * 1. Chacha20 is tolerant of non-block-multiple request sizes, so we do not
+ * need to handle any remainder bytes specially and can just pass the length
+ * directly to the PRF construction; and
+ *
+ * 2. Chacha20 is a 512-bit block size cipher (whereas AES has 128-bit block
+ * size, regardless of key size).  This means Chacha does not require re-keying
+ * every 1MiB.  This is implied by the math in FS&K 9.4 and mentioned
+ * explicitly in the conclusion, "If we had a block cipher with a 256-bit [or
+ * greater] block size, then the collisions would not have been an issue at
+ * all" (p. 144).
+ *
+ * 3. In conventional ("locked") mode, we produce a maximum of PAGE_SIZE output
+ * at a time before dropping the lock, to not bully the lock especially.  This
+ * has been the status quo since 2015 (r284959).
+ *
+ * The upstream caller random_fortuna_read is responsible for zeroing out
+ * sensitive buffers provided as parameters to this routine.
  */
-void
-random_fortuna_read(uint8_t *buf, size_t bytecount)
+enum {
+	FORTUNA_UNLOCKED = false,
+	FORTUNA_LOCKED = true
+};
+static void
+random_fortuna_genbytes(uint8_t *buf, size_t bytecount,
+    uint8_t newkey[static RANDOM_KEYSIZE], uint128_t *p_counter,
+    union randomdev_key *p_key, bool locked)
 {
 	uint8_t remainder_buf[RANDOM_BLOCKSIZE];
-	size_t read_directly_len, read_chunk;
+	size_t chunk_size;
 
-	/*
-	 * The underlying AES generator expects multiples of RANDOM_BLOCKSIZE.
-	 */
-	if (random_chachamode)
-		read_directly_len = bytecount;
+	if (locked)
+		RANDOM_RESEED_ASSERT_LOCK_OWNED();
 	else
-		read_directly_len = rounddown(bytecount, RANDOM_BLOCKSIZE);
+		RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED();
 
-	RANDOM_RESEED_LOCK();
-	KASSERT(!uint128_is_zero(fortuna_state.fs_counter), ("FS&K: C != 0"));
+	/*
+	 * Easy case: don't have to worry about bullying the global mutex,
+	 * don't have to worry about rekeying Chacha; API is byte-oriented.
+	 */
+	if (!locked && random_chachamode) {
+		randomdev_keystream(p_key, p_counter, buf, bytecount);
+		return;
+	}
 
-	while (read_directly_len > 0) {
+	if (locked) {
 		/*
-		 * 128-bit block ciphers like AES must be re-keyed at 1MB
-		 * intervals to avoid unacceptable statistical differentiation
-		 * from true random data.
-		 *
-		 * 512-bit block ciphers like Chacha20 do not have this
-		 * problem. (FS&K 9.4)
+		 * While holding the global lock, limit PRF generation to
+		 * mitigate, but not eliminate, bullying symptoms.
 		 */
-		if (random_chachamode)
-			read_chunk = read_directly_len;
-		else
-			read_chunk = MIN(read_directly_len,
-			    RANDOM_FORTUNA_MAX_READ);
-
+		chunk_size = PAGE_SIZE;
+	} else {
 		/*
-		 * For now, we hold the global Fortuna mutex, so yield
-		 * periodically to provide vague availability to other lock
-		 * users.  PAGE_SIZE is chosen to match existing behavior.
-		 */
-		read_chunk = MIN(read_chunk, PAGE_SIZE);
+		* 128-bit block ciphers like AES must be re-keyed at 1MB
+		* intervals to avoid unacceptable statistical differentiation
+		* from true random data (FS&K 9.4, p. 143-144).
+		*/
+		MPASS(!random_chachamode);
+		chunk_size = RANDOM_FORTUNA_MAX_READ;
+	}
 
-		random_fortuna_genrandom(buf, read_chunk);
-		buf += read_chunk;
-		read_directly_len -= read_chunk;
-		bytecount -= read_chunk;
+	chunk_size = MIN(bytecount, chunk_size);
+	if (!random_chachamode)
+		chunk_size = rounddown(chunk_size, RANDOM_BLOCKSIZE);
 
-		/* Perform the actual yield. */
-		if (read_directly_len != 0) {
-			RANDOM_RESEED_UNLOCK();
-			RANDOM_RESEED_LOCK();
+	while (bytecount >= chunk_size) {
+		randomdev_keystream(p_key, p_counter, buf, chunk_size);
+
+		buf += chunk_size;
+		bytecount -= chunk_size;
+
+		/* We have to rekey if there is any data remaining to be
+		 * generated, in two scenarios:
+		 *
+		 * locked: we need to rekey before we unlock and release the
+		 * global state to another consumer; or
+		 *
+		 * unlocked: we need to rekey because we're in AES mode and are
+		 * required to rekey at chunk_size==1MB.  But we do not need to
+		 * rekey during the last trailing <1MB chunk.
+		 */
+		if (bytecount > 0) {
+			if (locked || chunk_size == RANDOM_FORTUNA_MAX_READ) {
+				randomdev_keystream(p_key, p_counter, newkey,
+				    RANDOM_KEYSIZE);
+				randomdev_encrypt_init(p_key, newkey);
+			}
+
+			/*
+			 * If we're holding the global lock, yield it briefly
+			 * now.
+			 */
+			if (locked) {
+				RANDOM_RESEED_UNLOCK();
+				RANDOM_RESEED_LOCK();
+			}
+
+			/*
+			 * At the trailing end, scale down chunk_size from 1MB or
+			 * PAGE_SIZE to all remaining full blocks (AES) or all
+			 * remaining bytes (Chacha).
+			 */
+			if (bytecount < chunk_size) {
+				if (random_chachamode)
+					chunk_size = bytecount;
+				else if (bytecount >= RANDOM_BLOCKSIZE)
+					chunk_size = rounddown(bytecount,
+					    RANDOM_BLOCKSIZE);
+				else
+					break;
+			}
 		}
 	}
 
-	if (bytecount > 0)
-		random_fortuna_genrandom(remainder_buf, sizeof(remainder_buf));
+	/*
+	 * Generate any partial AES block remaining into a temporary buffer and
+	 * copy the desired substring out.
+	 */
+	if (bytecount > 0) {
+		MPASS(!random_chachamode);
 
-	RANDOM_RESEED_UNLOCK();
+		randomdev_keystream(p_key, p_counter, remainder_buf,
+		    sizeof(remainder_buf));
+	}
+
+	/*
+	 * In locked mode, re-key global K before dropping the lock, which we
+	 * don't need for memcpy/bzero below.
+	 */
+	if (locked) {
+		randomdev_keystream(p_key, p_counter, newkey, RANDOM_KEYSIZE);
+		randomdev_encrypt_init(p_key, newkey);
+		RANDOM_RESEED_UNLOCK();
+	}
 
 	if (bytecount > 0) {
 		memcpy(buf, remainder_buf, bytecount);
@@ -504,6 +565,124 @@ random_fortuna_read(uint8_t *buf, size_t bytecount)
 	}
 }
 
+
+/*
+ * Handle only "concurrency-enabled" Fortuna reads to simplify logic.
+ *
+ * Caller (random_fortuna_read) is responsible for zeroing out sensitive
+ * buffers provided as parameters to this routine.
+ */
+static void
+random_fortuna_read_concurrent(uint8_t *buf, size_t bytecount,
+    uint8_t newkey[static RANDOM_KEYSIZE])
+{
+	union randomdev_key key_copy;
+	uint128_t counter_copy;
+	size_t blockcount;
+
+	MPASS(fortuna_concurrent_read);
+
+	/*
+	 * Compute number of blocks required for the PRF request ('delta C').
+	 * We will step the global counter 'C' by this number under lock, and
+	 * then actually consume the counter values outside the lock.
+	 *
+	 * This ensures that contemporaneous but independent requests for
+	 * randomness receive distinct 'C' values and thus independent PRF
+	 * results.
+	 */
+	if (random_chachamode) {
+		blockcount = howmany(bytecount, CHACHA_BLOCKLEN);
+	} else {
+		blockcount = howmany(bytecount, RANDOM_BLOCKSIZE);
+
+		/*
+		 * Need to account for the additional blocks generated by
+		 * rekeying when updating the global fs_counter.
+		 */
+		blockcount += RANDOM_KEYS_PER_BLOCK *
+		    (blockcount / RANDOM_FORTUNA_BLOCKS_PER_KEY);
+	}
+
+	RANDOM_RESEED_LOCK();
+	KASSERT(!uint128_is_zero(fortuna_state.fs_counter), ("FS&K: C != 0"));
+	/*
+	 * Technically, we only need mutual exclusion to update shared state
+	 * appropriately.  Nothing about updating the shared internal state
+	 * requires that we perform (most) expensive cryptographic keystream
+	 * generation under lock.  (We still need to generate 256 bits of
+	 * keystream to re-key between consumers.)
+	 *
+	 * Save the original counter and key values that will be used as the
+	 * PRF for this particular consumer.
+	 */
+	memcpy(&counter_copy, &fortuna_state.fs_counter, sizeof(counter_copy));
+	memcpy(&key_copy, &fortuna_state.fs_key, sizeof(key_copy));
+
+	/*
+	 * Step the counter as if we had generated 'bytecount' blocks for this
+	 * consumer.  I.e., ensure that the next consumer gets an independent
+	 * range of counter values once we drop the global lock.
+	 */
+	uint128_add64(&fortuna_state.fs_counter, blockcount);
+
+	/*
+	 * We still need to Rekey the global 'K' between independent calls;
+	 * this is no different from conventional Fortuna.  Note that
+	 * 'randomdev_keystream()' will step the fs_counter 'C' appropriately
+	 * for the blocks needed for the 'newkey'.
+	 *
+	 * (This is part of PseudoRandomData() in FS&K, 9.4.4.)
+	 */
+	randomdev_keystream(&fortuna_state.fs_key, &fortuna_state.fs_counter,
+	    newkey, RANDOM_KEYSIZE);
+	randomdev_encrypt_init(&fortuna_state.fs_key, newkey);
+
+	/*
+	 * We have everything we need to generate a unique PRF for this
+	 * consumer without touching global state.
+	 */
+	RANDOM_RESEED_UNLOCK();
+
+	random_fortuna_genbytes(buf, bytecount, newkey, &counter_copy,
+	    &key_copy, FORTUNA_UNLOCKED);
+	RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED();
+
+	explicit_bzero(&counter_copy, sizeof(counter_copy));
+	explicit_bzero(&key_copy, sizeof(key_copy));
+}
+
+/*-
+ * FS&K - RandomData() (Part 2)
+ * Main read from Fortuna, continued. May be called multiple times after
+ * the random_fortuna_pre_read() above.
+ *
+ * The supplied buf MAY not be a multiple of RANDOM_BLOCKSIZE in size; it is
+ * the responsibility of the algorithm to accommodate partial block reads, if a
+ * block output mode is used.
+ */
+void
+random_fortuna_read(uint8_t *buf, size_t bytecount)
+{
+	uint8_t newkey[RANDOM_KEYSIZE];
+
+	if (fortuna_concurrent_read) {
+		random_fortuna_read_concurrent(buf, bytecount, newkey);
+		goto out;
+	}
+
+	RANDOM_RESEED_LOCK();
+	KASSERT(!uint128_is_zero(fortuna_state.fs_counter), ("FS&K: C != 0"));
+
+	random_fortuna_genbytes(buf, bytecount, newkey,
+	    &fortuna_state.fs_counter, &fortuna_state.fs_key, FORTUNA_LOCKED);
+	/* Returns unlocked */
+	RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED();
+
+out:
+	explicit_bzero(newkey, sizeof(newkey));
+}
+
 #ifdef _KERNEL
 static bool block_seeded_status = false;
 SYSCTL_BOOL(_kern_random, OID_AUTO, block_seeded_status, CTLFLAG_RWTUN,
diff --git a/sys/dev/random/fortuna.h b/sys/dev/random/fortuna.h
index 43aad04fb13c..3b75a008d91d 100644
--- a/sys/dev/random/fortuna.h
+++ b/sys/dev/random/fortuna.h
@@ -36,12 +36,14 @@ typedef struct mtx mtx_t;
 #define	RANDOM_RESEED_LOCK(x)			mtx_lock(&fortuna_state.fs_mtx)
 #define	RANDOM_RESEED_UNLOCK(x)			mtx_unlock(&fortuna_state.fs_mtx)
 #define	RANDOM_RESEED_ASSERT_LOCK_OWNED(x)	mtx_assert(&fortuna_state.fs_mtx, MA_OWNED)
+#define	RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED()	mtx_assert(&fortuna_state.fs_mtx, MA_NOTOWNED)
 #else
 #define	RANDOM_RESEED_INIT_LOCK(x)		mtx_init(&fortuna_state.fs_mtx, mtx_plain)
 #define	RANDOM_RESEED_DEINIT_LOCK(x)		mtx_destroy(&fortuna_state.fs_mtx)
 #define	RANDOM_RESEED_LOCK(x)			mtx_lock(&fortuna_state.fs_mtx)
 #define	RANDOM_RESEED_UNLOCK(x)			mtx_unlock(&fortuna_state.fs_mtx)
 #define	RANDOM_RESEED_ASSERT_LOCK_OWNED(x)
+#define	RANDOM_RESEED_ASSERT_LOCK_NOT_OWNED()
 #endif
 
 #endif /* SYS_DEV_RANDOM_FORTUNA_H_INCLUDED */
diff --git a/sys/dev/random/hash.c b/sys/dev/random/hash.c
index cd29b74c43a4..99965513350d 100644
--- a/sys/dev/random/hash.c
+++ b/sys/dev/random/hash.c
@@ -74,7 +74,7 @@ _Static_assert(CHACHA_STATELEN == RANDOM_BLOCKSIZE, "");
  * Benefits include somewhat faster keystream generation compared with
  * unaccelerated AES-ICM.
  */
-bool random_chachamode = false;
+bool random_chachamode __read_frequently = false;
 #ifdef _KERNEL
 SYSCTL_BOOL(_kern_random, OID_AUTO, use_chacha20_cipher, CTLFLAG_RDTUN,
     &random_chachamode, 0,
diff --git a/sys/dev/random/hash.h b/sys/dev/random/hash.h
index ca2f3db56a00..92a5950f9932 100644
--- a/sys/dev/random/hash.h
+++ b/sys/dev/random/hash.h
@@ -32,6 +32,10 @@
 #include <crypto/chacha20/_chacha.h>
 #include <dev/random/uint128.h>
 
+#ifndef _KERNEL
+#define	__read_frequently
+#endif
+
 /* Keys are formed from cipher blocks */
 #define	RANDOM_KEYSIZE		32	/* (in bytes) == 256 bits */
 #define	RANDOM_KEYSIZE_WORDS	(RANDOM_KEYSIZE/sizeof(uint32_t))
diff --git a/sys/dev/random/uint128.h b/sys/dev/random/uint128.h
index 63de28a3864a..71056a63a93b 100644
--- a/sys/dev/random/uint128.h
+++ b/sys/dev/random/uint128.h
@@ -65,6 +65,21 @@ uint128_increment(uint128_t *big_uintp)
 #endif
 }
 
+static __inline void
+uint128_add64(uint128_t *big_uintp, uint64_t add)
+{
+#ifdef USE_REAL_UINT128_T
+	(*big_uintp) += add;
+#else
+	uint64_t word0p;
+
+	word0p = big_uintp->u128t_word0 + add;
+	if (word0p < big_uintp->u128t_word0)
+		big_uintp->u128t_word1++;
+	big_uintp->u128t_word0 = word0p;
+#endif
+}
+
 static __inline bool
 uint128_equals(uint128_t a, uint128_t b)
 {
diff --git a/tests/sys/devrandom/uint128_test.c b/tests/sys/devrandom/uint128_test.c
index c621b0cc1a8d..288e8a2e9f79 100644
--- a/tests/sys/devrandom/uint128_test.c
+++ b/tests/sys/devrandom/uint128_test.c
@@ -152,6 +152,62 @@ ATF_TC_BODY(uint128_inc, tc)
 	}
 }
 
+ATF_TC_WITHOUT_HEAD(uint128_add64);
+ATF_TC_BODY(uint128_add64, tc)
+{
+	static const struct u128_add64_tc {
+		uint32_t input[4];
+		uint64_t addend;
+		uint32_t expected[4];
+		const char *descr;
+	} tests[] = {
+		{
+			.input = { 0, 0, 0, 0 },
+			.addend = 1,
+			.expected = { 1, 0, 0, 0 },
+			.descr = "0 + 1 -> 1",
+		},
+		{
+			.input = { 1, 0, 0, 0 },
+			.addend = UINT32_MAX,
+			.expected = { 0, 1, 0, 0 },
+			.descr = "1 + (2^32 - 1) -> 2^32 (word carry)",
+		},
+		{
+			.input = { 1, 0, 0, 0 },
+			.addend = UINT64_MAX,
+			.expected = { 0, 0, 1, 0 },
+			.descr = "1 + (2^64 - 1) -> 2^64 (u128t_word0 carry)",
+		},
+		{
+			.input = { 0x11111111, 0x11111111, 0, 0 },
+			.addend = 0xf0123456789abcdeULL,
+			.expected = { 0x89abcdef, 0x01234567, 1, 0 },
+			.descr = "0x1111_1111_1111_1111 +"
+				 "0xf012_3456_789a_bcde ->"
+			       "0x1_0123_4567_89ab_cdef",
+		},
+		{
+			.input = { 1, 0, UINT32_MAX, 0 },
+			.addend = UINT64_MAX,
+			.expected = { 0, 0, 0, 1 },
+			.descr = "Carry ~2^96",
+		},
+	};
+	uint8_t inputle[16], expectedle[16];
+	uint128_t a;
+	size_t i;
+
+	for (i = 0; i < nitems(tests); i++) {
+		vec_u32_tole128(inputle, tests[i].input);
+		vec_u32_tole128(expectedle, tests[i].expected);
+
+		a = le128dec(inputle);
+		uint128_add64(&a, tests[i].addend);
+		u128_check_equality(le128dec(expectedle), a, tests[i].descr);
+	}
+}
+
 /*
  * Test assumptions about Chacha incrementing counter in the same way as
  * uint128.h
@@ -219,6 +275,7 @@ ATF_TP_ADD_TCS(tp)
 {
 
 	ATF_TP_ADD_TC(tp, uint128_inc);
+	ATF_TP_ADD_TC(tp, uint128_add64);
 	ATF_TP_ADD_TC(tp, uint128_chacha_ctr);
 	return (atf_no_error());
 }