git: 9b1d87286c78 - main - ossl: Add a fallback AES-GCM implementation using AES-NI

From: Mark Johnston <markj@FreeBSD.org>
Date: Fri, 02 Jun 2023 16:19:23 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=9b1d87286c78266dc76d32a06ed19bf3e93c0a3b

commit 9b1d87286c78266dc76d32a06ed19bf3e93c0a3b
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2023-06-02 16:01:41 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2023-06-02 16:17:11 +0000

    ossl: Add a fallback AES-GCM implementation using AES-NI
    
    This lets one use ossl(4) for AES-GCM operations on contemporary amd64
    platforms.  A kernel benchmark indicates that this gives roughly
    equivalent throughput to aesni(4) for various buffer sizes.
    
    Bulk processing is done in aesni-gcm-x86_64.S; the rest is handled in a
    C wrapper ported from OpenSSL's gcm128.c.
    
    Sponsored by:   Stormshield
    Sponsored by:   Klara, Inc.
    Reviewed by:    jhb
    MFC after:      3 months
    Differential Revision:  https://reviews.freebsd.org/D39967
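
For readers skimming the patch, the split between the assembly and the C
wrapper is easiest to see in the new encrypt entry point.  The sketch below
is a simplified restatement of gcm_encrypt_aesni() from the diff (error
handling and the symmetric decrypt path omitted); every identifier in it
comes from the patch itself:

	static int
	gcm_encrypt_aesni(struct ossl_gcm_context *ctx, const unsigned char *in,
	    unsigned char *out, size_t len)
	{
		size_t bulk, res;

		/* Finish any partial block left over from an earlier call. */
		res = (AES_BLOCK_LEN - ctx->gcm.mres) % AES_BLOCK_LEN;
		gcm_encrypt(ctx, in, out, res);	/* C port of gcm128.c */

		/* Hand whole blocks to aesni-gcm-x86_64.S. */
		bulk = aesni_gcm_encrypt(in + res, out + res, len - res,
		    &ctx->aes_ks, ctx->gcm.Yi.c, ctx->gcm.Xi.u);
		ctx->gcm.len.u[1] += bulk;
		bulk += res;

		/* Whatever the assembly did not consume goes back through C. */
		return (gcm_encrypt_ctr32(ctx, in + bulk, out + bulk, len - bulk));
	}
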
---
 share/man/man4/ossl.4                   |   4 +-
 sys/crypto/openssl/amd64/ossl_aes_gcm.c | 475 +++++++++++++++++++++++++++++++-
 sys/crypto/openssl/ossl_x86.c           |   7 +
 sys/modules/ossl/Makefile               |   2 +
 4 files changed, 484 insertions(+), 4 deletions(-)
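
The ported C wrapper also keeps OpenSSL's per-nonce length checks.  The
constants that recur in gcm_aad_aesni(), gcm_encrypt*() and gcm_decrypt*()
below are the GCM limits from NIST SP 800-38D: at most 2^39 - 256 bits
(2^36 - 32 bytes) of plaintext and a 2^64-bit cap on AAD, expressed here in
bytes:

	/* Plaintext: at most 2^36 - 32 bytes per nonce. */
	if (mlen > ((1ull << 36) - 32) || (sizeof(len) == 8 && mlen < len))
		return -1;

	/* AAD: at most 2^61 bytes (2^64 bits) per nonce. */
	if (alen > (1ull << 61) || (sizeof(len) == 8 && alen < len))
		return -1;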

diff --git a/share/man/man4/ossl.4 b/share/man/man4/ossl.4
index 039ce301ac29..288678ce601c 100644
--- a/share/man/man4/ossl.4
+++ b/share/man/man4/ossl.4
@@ -26,7 +26,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd September 24, 2021
+.Dd May 4, 2023
 .Dt OSSL 4
 .Os
 .Sh NAME
@@ -76,6 +76,8 @@ driver includes support for the following algorithms:
 .It
 AES-CBC
 .It
+AES-GCM (amd64 only)
+.It
 ChaCha20
 .It
 ChaCha20-Poly1305 (RFC 8439)
diff --git a/sys/crypto/openssl/amd64/ossl_aes_gcm.c b/sys/crypto/openssl/amd64/ossl_aes_gcm.c
index 3381d35557f2..0d205ec3ff90 100644
--- a/sys/crypto/openssl/amd64/ossl_aes_gcm.c
+++ b/sys/crypto/openssl/amd64/ossl_aes_gcm.c
@@ -9,9 +9,11 @@
  */
 
 /*
- * This file contains a AES-GCM wrapper implementation from OpenSSL 3.1,
- * targeting amd64 VAES extensions.  This was ported from
- * cipher_aes_gcm_hw_vaes_avx512.inc.
+ * This file contains 2 AES-GCM wrapper implementations from OpenSSL, using
+ * AES-NI and VAES extensions respectively.  These were ported from
+ * cipher_aes_gcm_hw_aesni.inc and cipher_aes_gcm_hw_vaes_avx512.inc.  The
+ * AES-NI implementation makes use of a generic C implementation for partial
+ * blocks, ported from gcm128.c with OPENSSL_SMALL_FOOTPRINT defined.
  */
 
 #include <sys/endian.h>
@@ -218,6 +220,473 @@ static const struct ossl_aes_gcm_ops gcm_ops_avx512 = {
 	.tag = gcm_tag,
 };
 
+size_t aesni_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len,
+    const void *key, unsigned char ivec[16], uint64_t *Xi);
+size_t aesni_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len,
+    const void *key, unsigned char ivec[16], uint64_t *Xi);
+void aesni_encrypt(const unsigned char *in, unsigned char *out, void *ks);
+void aesni_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+    size_t blocks, void *ks, const unsigned char *iv);
+
+void gcm_init_avx(__uint128_t Htable[16], uint64_t Xi[2]);
+void gcm_gmult_avx(uint64_t Xi[2], const __uint128_t Htable[16]);
+void gcm_ghash_avx(uint64_t Xi[2], const __uint128_t Htable[16], const void *in,
+    size_t len);
+
+static void
+gcm_init_aesni(struct ossl_gcm_context *ctx, const void *key, size_t keylen)
+{
+	aesni_encrypt(ctx->gcm.H.c, ctx->gcm.H.c, &ctx->aes_ks);
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+	ctx->gcm.H.u[0] = bswap64(ctx->gcm.H.u[0]);
+	ctx->gcm.H.u[1] = bswap64(ctx->gcm.H.u[1]);
+#endif
+
+	gcm_init_avx(ctx->gcm.Htable, ctx->gcm.H.u);
+}
+
+static void
+gcm_setiv_aesni(struct ossl_gcm_context *ctx, const unsigned char *iv,
+    size_t len)
+{
+	uint32_t ctr;
+
+	KASSERT(len == AES_GCM_IV_LEN,
+	    ("%s: invalid IV length %zu", __func__, len));
+
+	ctx->gcm.len.u[0] = 0;
+	ctx->gcm.len.u[1] = 0;
+	ctx->gcm.ares = ctx->gcm.mres = 0;
+
+	memcpy(ctx->gcm.Yi.c, iv, len);
+	ctx->gcm.Yi.c[12] = 0;
+	ctx->gcm.Yi.c[13] = 0;
+	ctx->gcm.Yi.c[14] = 0;
+	ctx->gcm.Yi.c[15] = 1;
+	ctr = 1;
+
+	ctx->gcm.Xi.u[0] = 0;
+	ctx->gcm.Xi.u[1] = 0;
+
+	aesni_encrypt(ctx->gcm.Yi.c, ctx->gcm.EK0.c, &ctx->aes_ks);
+	ctr++;
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+	ctx->gcm.Yi.d[3] = bswap32(ctr);
+#else
+	ctx->gcm.Yi.d[3] = ctr;
+#endif
+}
+
+static int
+gcm_aad_aesni(struct ossl_gcm_context *ctx, const unsigned char *aad,
+    size_t len)
+{
+	size_t i;
+	unsigned int n;
+	uint64_t alen = ctx->gcm.len.u[0];
+
+	if (ctx->gcm.len.u[1])
+		return -2;
+
+	alen += len;
+	if (alen > (1ull << 61) || (sizeof(len) == 8 && alen < len))
+		return -1;
+	ctx->gcm.len.u[0] = alen;
+
+	n = ctx->gcm.ares;
+	if (n) {
+		while (n && len) {
+			ctx->gcm.Xi.c[n] ^= *(aad++);
+			--len;
+			n = (n + 1) % 16;
+		}
+		if (n == 0)
+			gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+		else {
+			ctx->gcm.ares = n;
+			return 0;
+		}
+	}
+	if ((i = (len & (size_t)-AES_BLOCK_LEN))) {
+		gcm_ghash_avx(ctx->gcm.Xi.u, ctx->gcm.Htable, aad, i);
+		aad += i;
+		len -= i;
+	}
+	if (len) {
+		n = (unsigned int)len;
+		for (i = 0; i < len; ++i)
+			ctx->gcm.Xi.c[i] ^= aad[i];
+	}
+
+	ctx->gcm.ares = n;
+	return 0;
+}
+
+static int
+gcm_encrypt(struct ossl_gcm_context *ctx, const unsigned char *in,
+    unsigned char *out, size_t len)
+{
+	unsigned int n, ctr, mres;
+	size_t i;
+	uint64_t mlen = ctx->gcm.len.u[1];
+
+	mlen += len;
+	if (mlen > ((1ull << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+		return -1;
+	ctx->gcm.len.u[1] = mlen;
+
+	mres = ctx->gcm.mres;
+
+	if (ctx->gcm.ares) {
+		/* First call to encrypt finalizes GHASH(AAD) */
+		gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+		ctx->gcm.ares = 0;
+	}
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+	ctr = bswap32(ctx->gcm.Yi.d[3]);
+#else
+	ctr = ctx->gcm.Yi.d[3];
+#endif
+
+	n = mres % 16;
+	for (i = 0; i < len; ++i) {
+		if (n == 0) {
+			aesni_encrypt(ctx->gcm.Yi.c, ctx->gcm.EKi.c,
+			    &ctx->aes_ks);
+			++ctr;
+#if BYTE_ORDER == LITTLE_ENDIAN
+			ctx->gcm.Yi.d[3] = bswap32(ctr);
+#else
+			ctx->gcm.Yi.d[3] = ctr;
+#endif
+		}
+		ctx->gcm.Xi.c[n] ^= out[i] = in[i] ^ ctx->gcm.EKi.c[n];
+		mres = n = (n + 1) % 16;
+		if (n == 0)
+			gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+	}
+
+	ctx->gcm.mres = mres;
+	return 0;
+}
+
+static int
+gcm_encrypt_ctr32(struct ossl_gcm_context *ctx, const unsigned char *in,
+    unsigned char *out, size_t len)
+{
+	unsigned int n, ctr, mres;
+	size_t i;
+	uint64_t mlen = ctx->gcm.len.u[1];
+
+	mlen += len;
+	if (mlen > ((1ull << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+		return -1;
+	ctx->gcm.len.u[1] = mlen;
+
+	mres = ctx->gcm.mres;
+
+	if (ctx->gcm.ares) {
+		/* First call to encrypt finalizes GHASH(AAD) */
+		gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+		ctx->gcm.ares = 0;
+	}
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+	ctr = bswap32(ctx->gcm.Yi.d[3]);
+#else
+	ctr = ctx->gcm.Yi.d[3];
+#endif
+
+	n = mres % 16;
+	if (n) {
+		while (n && len) {
+			ctx->gcm.Xi.c[n] ^= *(out++) = *(in++) ^ ctx->gcm.EKi.c[n];
+			--len;
+			n = (n + 1) % 16;
+		}
+		if (n == 0) {
+			gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+			mres = 0;
+		} else {
+			ctx->gcm.mres = n;
+			return 0;
+		}
+	}
+	if ((i = (len & (size_t)-16))) {
+		size_t j = i / 16;
+
+		aesni_ctr32_encrypt_blocks(in, out, j, &ctx->aes_ks, ctx->gcm.Yi.c);
+		ctr += (unsigned int)j;
+#if BYTE_ORDER == LITTLE_ENDIAN
+		ctx->gcm.Yi.d[3] = bswap32(ctr);
+#else
+		ctx->gcm.Yi.d[3] = ctr;
+#endif
+		in += i;
+		len -= i;
+		while (j--) {
+			for (i = 0; i < 16; ++i)
+				ctx->gcm.Xi.c[i] ^= out[i];
+			gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+			out += 16;
+		}
+	}
+	if (len) {
+		aesni_encrypt(ctx->gcm.Yi.c, ctx->gcm.EKi.c, &ctx->aes_ks);
+		++ctr;
+#if BYTE_ORDER == LITTLE_ENDIAN
+		ctx->gcm.Yi.d[3] = bswap32(ctr);
+#else
+		ctx->gcm.Yi.d[3] = ctr;
+#endif
+		while (len--) {
+			ctx->gcm.Xi.c[mres++] ^= out[n] = in[n] ^ ctx->gcm.EKi.c[n];
+			++n;
+		}
+	}
+
+	ctx->gcm.mres = mres;
+	return 0;
+}
+
+static int
+gcm_encrypt_aesni(struct ossl_gcm_context *ctx, const unsigned char *in,
+    unsigned char *out, size_t len)
+{
+	size_t bulk = 0, res;
+	int error;
+
+	res = (AES_BLOCK_LEN - ctx->gcm.mres) % AES_BLOCK_LEN;
+	if ((error = gcm_encrypt(ctx, in, out, res)) != 0)
+		return error;
+
+	bulk = aesni_gcm_encrypt(in + res, out + res, len - res,
+	    &ctx->aes_ks, ctx->gcm.Yi.c, ctx->gcm.Xi.u);
+	ctx->gcm.len.u[1] += bulk;
+	bulk += res;
+
+	if ((error = gcm_encrypt_ctr32(ctx, in + bulk, out + bulk,
+	    len - bulk)) != 0)
+		return error;
+
+	return 0;
+}
+
+static int
+gcm_decrypt(struct ossl_gcm_context *ctx, const unsigned char *in,
+    unsigned char *out, size_t len)
+{
+	unsigned int n, ctr, mres;
+	size_t i;
+	uint64_t mlen = ctx->gcm.len.u[1];
+
+	mlen += len;
+	if (mlen > ((1ull << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+		return -1;
+	ctx->gcm.len.u[1] = mlen;
+
+	mres = ctx->gcm.mres;
+
+	if (ctx->gcm.ares) {
+		/* First call to encrypt finalizes GHASH(AAD) */
+		gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+		ctx->gcm.ares = 0;
+	}
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+	ctr = bswap32(ctx->gcm.Yi.d[3]);
+#else
+	ctr = ctx->gcm.Yi.d[3];
+#endif
+
+	n = mres % 16;
+	for (i = 0; i < len; ++i) {
+		uint8_t c;
+		if (n == 0) {
+			aesni_encrypt(ctx->gcm.Yi.c, ctx->gcm.EKi.c,
+			    &ctx->aes_ks);
+			++ctr;
+#if BYTE_ORDER == LITTLE_ENDIAN
+			ctx->gcm.Yi.d[3] = bswap32(ctr);
+#else
+			ctx->gcm.Yi.d[3] = ctr;
+#endif
+		}
+		c = in[i];
+		out[i] = c ^ ctx->gcm.EKi.c[n];
+		ctx->gcm.Xi.c[n] ^= c;
+		mres = n = (n + 1) % 16;
+		if (n == 0)
+			gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+	}
+
+	ctx->gcm.mres = mres;
+	return 0;
+}
+
+static int
+gcm_decrypt_ctr32(struct ossl_gcm_context *ctx, const unsigned char *in,
+    unsigned char *out, size_t len)
+{
+	unsigned int n, ctr, mres;
+	size_t i;
+	uint64_t mlen = ctx->gcm.len.u[1];
+
+	mlen += len;
+	if (mlen > ((1ull << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+		return -1;
+	ctx->gcm.len.u[1] = mlen;
+
+	mres = ctx->gcm.mres;
+
+	if (ctx->gcm.ares) {
+		/* First call to decrypt finalizes GHASH(AAD) */
+		gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+		ctx->gcm.ares = 0;
+	}
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+	ctr = bswap32(ctx->gcm.Yi.d[3]);
+#else
+	ctr = ctx->gcm.Yi.d[3];
+#endif
+
+	n = mres % 16;
+	if (n) {
+		while (n && len) {
+			uint8_t c = *(in++);
+			*(out++) = c ^ ctx->gcm.EKi.c[n];
+			ctx->gcm.Xi.c[n] ^= c;
+			--len;
+			n = (n + 1) % 16;
+		}
+		if (n == 0) {
+			gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+			mres = 0;
+		} else {
+			ctx->gcm.mres = n;
+			return 0;
+		}
+	}
+	if ((i = (len & (size_t)-16))) {
+		size_t j = i / 16;
+
+		while (j--) {
+			size_t k;
+			for (k = 0; k < 16; ++k)
+				ctx->gcm.Xi.c[k] ^= in[k];
+			gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+			in += 16;
+		}
+		j = i / 16;
+		in -= i;
+		aesni_ctr32_encrypt_blocks(in, out, j, &ctx->aes_ks, ctx->gcm.Yi.c);
+		ctr += (unsigned int)j;
+#if BYTE_ORDER == LITTLE_ENDIAN
+		ctx->gcm.Yi.d[3] = bswap32(ctr);
+#else
+		ctx->gcm.Yi.d[3] = ctr;
+#endif
+		out += i;
+		in += i;
+		len -= i;
+	}
+	if (len) {
+		aesni_encrypt(ctx->gcm.Yi.c, ctx->gcm.EKi.c, &ctx->aes_ks);
+		++ctr;
+#if BYTE_ORDER == LITTLE_ENDIAN
+		ctx->gcm.Yi.d[3] = bswap32(ctr);
+#else
+		ctx->gcm.Yi.d[3] = ctr;
+#endif
+		while (len--) {
+			uint8_t c = in[n];
+			ctx->gcm.Xi.c[mres++] ^= c;
+			out[n] = c ^ ctx->gcm.EKi.c[n];
+			++n;
+		}
+	}
+
+	ctx->gcm.mres = mres;
+	return 0;
+}
+
+static int
+gcm_decrypt_aesni(struct ossl_gcm_context *ctx, const unsigned char *in,
+    unsigned char *out, size_t len)
+{
+	size_t bulk = 0, res;
+	int error;
+
+	res = (AES_BLOCK_LEN - ctx->gcm.mres) % AES_BLOCK_LEN;
+	if ((error = gcm_decrypt(ctx, in, out, res)) != 0)
+		return error;
+
+	bulk = aesni_gcm_decrypt(in, out, len, &ctx->aes_ks, ctx->gcm.Yi.c,
+	    ctx->gcm.Xi.u);
+	ctx->gcm.len.u[1] += bulk;
+	bulk += res;
+
+	if ((error = gcm_decrypt_ctr32(ctx, in + bulk, out + bulk, len - bulk)) != 0)
+		return error;
+
+	return 0;
+}
+
+static int
+gcm_finish_aesni(struct ossl_gcm_context *ctx, const unsigned char *tag,
+    size_t len)
+{
+	uint64_t alen = ctx->gcm.len.u[0] << 3;
+	uint64_t clen = ctx->gcm.len.u[1] << 3;
+
+	if (ctx->gcm.mres || ctx->gcm.ares)
+		gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+	alen = bswap64(alen);
+	clen = bswap64(clen);
+#endif
+
+	ctx->gcm.Xi.u[0] ^= alen;
+	ctx->gcm.Xi.u[1] ^= clen;
+	gcm_gmult_avx(ctx->gcm.Xi.u, ctx->gcm.Htable);
+
+	ctx->gcm.Xi.u[0] ^= ctx->gcm.EK0.u[0];
+	ctx->gcm.Xi.u[1] ^= ctx->gcm.EK0.u[1];
+
+	if (tag != NULL)
+		return timingsafe_bcmp(ctx->gcm.Xi.c, tag, len);
+	return 0;
+}
+
+static const struct ossl_aes_gcm_ops gcm_ops_aesni = {
+	.init = gcm_init_aesni,
+	.setiv = gcm_setiv_aesni,
+	.aad = gcm_aad_aesni,
+	.encrypt = gcm_encrypt_aesni,
+	.decrypt = gcm_decrypt_aesni,
+	.finish = gcm_finish_aesni,
+	.tag = gcm_tag,
+};
+
+int ossl_aes_gcm_setkey_aesni(const unsigned char *key, int klen, void *_ctx);
+
+int
+ossl_aes_gcm_setkey_aesni(const unsigned char *key, int klen,
+    void *_ctx)
+{
+	struct ossl_gcm_context *ctx;
+
+	ctx = _ctx;
+	ctx->ops = &gcm_ops_aesni;
+	gcm_init(ctx, key, klen);
+	return (0);
+}
+
 int ossl_aes_gcm_setkey_avx512(const unsigned char *key, int klen, void *_ctx);
 
 int
diff --git a/sys/crypto/openssl/ossl_x86.c b/sys/crypto/openssl/ossl_x86.c
index 594aee2ab97f..d60e903edd38 100644
--- a/sys/crypto/openssl/ossl_x86.c
+++ b/sys/crypto/openssl/ossl_x86.c
@@ -58,6 +58,7 @@ ossl_cipher_setkey_t aesni_set_decrypt_key;
 
 #ifdef __amd64__
 int ossl_vaes_vpclmulqdq_capable(void);
+ossl_cipher_setkey_t ossl_aes_gcm_setkey_aesni;
 ossl_cipher_setkey_t ossl_aes_gcm_setkey_avx512;
 #endif
 
@@ -139,6 +140,12 @@ ossl_cpuid(struct ossl_softc *sc)
 		ossl_cipher_aes_gcm.set_decrypt_key =
 		    ossl_aes_gcm_setkey_avx512;
 		sc->has_aes_gcm = true;
+	} else if ((cpu_feature2 &
+	    (CPUID2_AVX | CPUID2_PCLMULQDQ | CPUID2_MOVBE)) ==
+	    (CPUID2_AVX | CPUID2_PCLMULQDQ | CPUID2_MOVBE)) {
+		ossl_cipher_aes_gcm.set_encrypt_key = ossl_aes_gcm_setkey_aesni;
+		ossl_cipher_aes_gcm.set_decrypt_key = ossl_aes_gcm_setkey_aesni;
+		sc->has_aes_gcm = true;
 	} else {
 		sc->has_aes_gcm = false;
 	}
diff --git a/sys/modules/ossl/Makefile b/sys/modules/ossl/Makefile
index d56fef428494..a29649b2f5c8 100644
--- a/sys/modules/ossl/Makefile
+++ b/sys/modules/ossl/Makefile
@@ -29,7 +29,9 @@ SRCS.aarch64= \
 SRCS.amd64= \
 	aes-gcm-avx512.S \
 	aesni-x86_64.S \
+	aesni-gcm-x86_64.S \
 	chacha-x86_64.S \
+	ghash-x86_64.S \
 	poly1305-x86_64.S \
 	sha1-x86_64.S \
 	sha256-x86_64.S \
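
At attach time ossl_cpuid() picks an AES-GCM implementation from CPU
features (see the ossl_x86.c hunk above).  A condensed view of that
selection follows; the shape of the pre-existing AVX-512 branch is an
assumption, since the hunk only shows its tail, while the else-if branch is
taken verbatim from the patch:

	if (ossl_vaes_vpclmulqdq_capable()) {	/* assumed existing guard */
		/* Existing path: aes-gcm-avx512.S. */
		ossl_cipher_aes_gcm.set_encrypt_key = ossl_aes_gcm_setkey_avx512;
		ossl_cipher_aes_gcm.set_decrypt_key = ossl_aes_gcm_setkey_avx512;
		sc->has_aes_gcm = true;
	} else if ((cpu_feature2 &
	    (CPUID2_AVX | CPUID2_PCLMULQDQ | CPUID2_MOVBE)) ==
	    (CPUID2_AVX | CPUID2_PCLMULQDQ | CPUID2_MOVBE)) {
		/* New fallback: aesni-gcm-x86_64.S plus ghash-x86_64.S. */
		ossl_cipher_aes_gcm.set_encrypt_key = ossl_aes_gcm_setkey_aesni;
		ossl_cipher_aes_gcm.set_decrypt_key = ossl_aes_gcm_setkey_aesni;
		sc->has_aes_gcm = true;
	} else {
		sc->has_aes_gcm = false;
	}

The new ossl_aes_gcm_setkey_aesni() then stores &gcm_ops_aesni in ctx->ops
and calls gcm_init(), mirroring the existing AVX-512 setkey routine, so the
rest of the driver's AES-GCM plumbing is shared between the two
implementations.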