From nobody Fri Oct 24 10:18:49 2025 X-Original-To: dev-commits-src-all@mlmmj.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mlmmj.nyi.freebsd.org (Postfix) with ESMTP id 4ctJlZ1h8wz6F2y9; Fri, 24 Oct 2025 10:18:50 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "R12" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 4ctJlY5yD1z3fyq; Fri, 24 Oct 2025 10:18:49 +0000 (UTC) (envelope-from git@FreeBSD.org) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=freebsd.org; s=dkim; t=1761301129; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding; bh=kWZIn5mMXNi2TMEaqTFK2pl183/69J7jnrBQhxRRwk0=; b=tpLYkFSU/BWx0KFQKRjj/Gm4QGui3y3fwpIK4mpJQpfopzdvSThv3RAo21jcFQrWzg9Uy0 LrXTBd7nm+puJZQDPRTVisu9gbT2E+b3qr49Gh3BOxIgtlWve5d5ubdrud/Knp+MMurnNa 4hctAqIPyiI/6+ZRLx2JHCikr5BVv3UA52tgCE7TPGuferSabcktQE9YgrP0Bs9OkPHzsN P5iZDf5J9pxqHWld+sZGHszj+b1WOgDbaGOw9tkl0y1uRUF+S/DqCVNNSqwF9H+/hgofxS SPTCLaCS7Cj3+UlnDEEGM2tP7lp+sblDDxr8Ykz7hLupLC95oFpXADQrobHNnw== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=freebsd.org; s=dkim; t=1761301129; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding; bh=kWZIn5mMXNi2TMEaqTFK2pl183/69J7jnrBQhxRRwk0=; b=drS+E6WavNHLnL/j952PhlGLVJ6fPJqJeTeowVzlkCApmmyHyk/X/CWo0KDC01/qG4EIVh Vdu1NuUDShvmuXCmcFtMTXbni5AIuPJ/kqlrw4LZ/aTIHXnqOUyvVTZffejLoxXofcOLJ3 U/EJp1iTEXZ9TrHW32OJ67btkzEk0ayleEQZwU/RGoBwQIdtuTRTPFW0MAjB7T4Qvnpm3W VaGuHHH6Hc84Bjx9f5yWIseoEOXhiULpma6S+7nPkdAf3UhE+CrUx8Ht6FANEZgP5K3ub4 J5rUtBmRBTFmsG2stYUk/szJWEy2YOPGTIr9Ns0hV1c5eWLETf/SwUXSmKTCVg== ARC-Seal: i=1; s=dkim; d=freebsd.org; t=1761301129; a=rsa-sha256; cv=none; b=xyYx1CSDwgSJYVpe2iA4kbmvM3wI6dNFYljkD36tP/YL1ABOtbyPjEae86hecY4RpYCz6D 3lb3YoX7ePIup/0KUECMcTQLSZZ6/3HtGjP/6O3Dk2ajvxpF7p+eqpWUzs4so14reR3EZy Pm7LMQ6GdAPxSbyIVr9Pjqk6Dk9q92XfC14Kp0fCPCaeyz7VlFYTfxNyl/mEkPcJWDHDDN NFA+/Q2+vMeRJK7PTwI1iNFLjbl9lTDR72WTcBy28IHNuF1d966xqfo79X6qjs9eqQDtag 0t8SzfeOzXBrSIvzrsqSsHzM/dv+p3CajC8xqslsLnKJiKn7cS+mO7n72s5yPA== ARC-Authentication-Results: i=1; mx1.freebsd.org; none Received: from gitrepo.freebsd.org (gitrepo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:5]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 4ctJlY5WJdzwwm; Fri, 24 Oct 2025 10:18:49 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org ([127.0.1.44]) by gitrepo.freebsd.org (8.18.1/8.18.1) with ESMTP id 59OAInNX033769; Fri, 24 Oct 2025 10:18:49 GMT (envelope-from git@gitrepo.freebsd.org) Received: (from git@localhost) by gitrepo.freebsd.org (8.18.1/8.18.1/Submit) id 59OAInxp033766; Fri, 24 Oct 2025 10:18:49 GMT (envelope-from git) Date: Fri, 24 Oct 2025 10:18:49 GMT Message-Id: <202510241018.59OAInxp033766@gitrepo.freebsd.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org From: Robert Clausecker Subject: git: d92e98742100 - main - lib/libmd: import md5 amd64 kernels List-Id: Commit messages for all branches of the src repository List-Archive: https://lists.freebsd.org/archives/dev-commits-src-all List-Help: List-Post: List-Subscribe: List-Unsubscribe: X-BeenThere: dev-commits-src-all@freebsd.org Sender: owner-dev-commits-src-all@FreeBSD.org MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-Git-Committer: fuz X-Git-Repository: src X-Git-Refname: refs/heads/main X-Git-Reftype: branch X-Git-Commit: d92e987421001c365216b039f8c3303939c195f7 Auto-Submitted: auto-generated The branch main has been updated by fuz: URL: https://cgit.FreeBSD.org/src/commit/?id=d92e987421001c365216b039f8c3303939c195f7 commit d92e987421001c365216b039f8c3303939c195f7 Author: Robert Clausecker AuthorDate: 2025-10-10 17:40:49 +0000 Commit: Robert Clausecker CommitDate: 2025-10-24 10:17:05 +0000 lib/libmd: import md5 amd64 kernels Differential Revision: https://reviews.freebsd.org/D45670 Reviewed by: imp Approved by: markj (mentor) MFC after: 1 month --- lib/libmd/Makefile | 7 + lib/libmd/amd64/md5block.S | 363 ++++++++++++++++++++++++++++++++++++++++++ lib/libmd/amd64/md5dispatch.c | 41 +++++ 3 files changed, 411 insertions(+) diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile index 59a519a882af..c4ab767c8b2f 100644 --- a/lib/libmd/Makefile +++ b/lib/libmd/Makefile @@ -117,6 +117,13 @@ USE_ASM_SOURCES:=0 .endif .if ${USE_ASM_SOURCES} != 0 +.if exists(${MACHINE_ARCH}/md5block.S) +SRCS+= md5block.S +CFLAGS+= -DMD5_ASM +.if exists(${MACHINE_ARCH}/md5dispatch.c) +SRCS+= md5dispatch.c +.endif +.endif .if exists(${MACHINE_ARCH}/sha1block.S) SRCS+= sha1block.S CFLAGS+= -DSHA1_ASM diff --git a/lib/libmd/amd64/md5block.S b/lib/libmd/amd64/md5block.S new file mode 100644 index 000000000000..0dd594dd5dc2 --- /dev/null +++ b/lib/libmd/amd64/md5block.S @@ -0,0 +1,363 @@ +/*- + * Copyright (c) 2024, 2025 Robert Clausecker + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +/* apply the round keys to the four round functions */ +.macro allrounds rfn0, rfn1, rfn2, rfn3 + \rfn0 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee + \rfn0 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 + \rfn0 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be + \rfn0 12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 + + \rfn1 16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa + \rfn1 20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 + \rfn1 24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed + \rfn1 28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a + + \rfn2 32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c + \rfn2 36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 + \rfn2 40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 + \rfn2 44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 + + \rfn3 48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 + \rfn3 52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 + \rfn3 56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 + \rfn3 60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 +.endm + + // md5block(MD5_CTX, buf, len) +ENTRY(_libmd_md5block_baseline) +.macro round a, b, c, d, f, k, m, s + \f %ebp, \b, \c, \d + add $\k, \a // a + k[i] + add ((\m)%16*4)(%rsi), \a // a + k[i] + m[g] + add %ebp, \a // a + k[i] + m[g] + f + rol $\s, \a + add \b, \a +.endm + + // f = b ? c : d +.macro f0 f, b, c, d + mov \c, \f + xor \d, \f + and \b, \f + xor \d, \f +.endm + + // f = d ? b : c +.macro f1 f, b, c, d + mov \c, \f + xor \b, \f + and \d, \f + xor \c, \f +.endm + + // f = b ^ c ^ d +.macro f2 f, b, c, d + mov \c, \f + xor \d, \f + xor \b, \f +.endm + + // f = c ^ (b | ~d) +.macro f3 f, b, c, d + mov $-1, \f + xor \d, \f + or \b, \f + xor \c, \f +.endm + + // do 4 rounds +.macro rounds f, p, q, s0, s1, s2, s3, k0, k1, k2, k3 + round %eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0 + round %edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1 + round %ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2 + round %ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3 +.endm + + // do 4 rounds with f0, f1, f2, f3 +.macro rounds0 i, k0, k1, k2, k3 + rounds f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3 +.endm + +.macro rounds1 i, k0, k1, k2, k3 + rounds f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3 +.endm + +.macro rounds2 i, k0, k1, k2, k3 + rounds f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3 +.endm + +.macro rounds3 i, k0, k1, k2, k3 + rounds f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3 +.endm + + push %rbx + push %rbp + push %r12 + + and $~63, %rdx // length in blocks + lea (%rsi, %rdx, 1), %r12 // end pointer + + mov (%rdi), %eax // a + mov 4(%rdi), %ebx // b + mov 8(%rdi), %ecx // c + mov 12(%rdi), %edx // d + + cmp %rsi, %r12 // any data to process? + je .Lend + + .balign 16 +.Lloop: mov %eax, %r8d + mov %ebx, %r9d + mov %ecx, %r10d + mov %edx, %r11d + + allrounds rounds0, rounds1, rounds2, rounds3 + + add %r8d, %eax + add %r9d, %ebx + add %r10d, %ecx + add %r11d, %edx + + add $64, %rsi + cmp %rsi, %r12 + jne .Lloop + + mov %eax, (%rdi) + mov %ebx, 4(%rdi) + mov %ecx, 8(%rdi) + mov %edx, 12(%rdi) + +.Lend: pop %r12 + pop %rbp + pop %rbx + ret +END(_libmd_md5block_baseline) + + /* + * An implementation leveraging the ANDN instruction + * from BMI1 to shorten some dependency chains. + */ +ENTRY(_libmd_md5block_bmi1) + // special-cased round 1 + // f1 = d ? b : c = (d & b) + (~d & c) +.macro round1 a, b, c, d, k, m, s + andn \c, \d, %edi // ~d & c + add $\k, \a // a + k[i] + mov \d, %ebp + add ((\m)%16*4)(%rsi), \a // a + k[i] + m[g] + and \b, %ebp // d & b + add %edi, \a // a + k[i] + m[g] + (~d & c) + add %ebp, \a // a + k[i] + m[g] + (~d & c) + (d & b) + rol $\s, \a + add \b, \a +.endm + + // special-cased round 3 + // f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d) +.macro round3 a, b, c, d, k, m, s + andn \d, \b, %ebp + add $\k - 1, \a // a + k[i] - 1 + add ((\m)%16*4)(%rsi), \a // a + k[i] + m[g] + xor \c, %ebp + sub %ebp, \a // a + k[i] + m[g] + f + rol $\s, \a + add \b, \a +.endm + + .purgem rounds1 +.macro rounds1 i, k0, k1, k2, k3 + round1 %eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1, 5 + round1 %edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6, 9 + round1 %ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14 + round1 %ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20 +.endm + + .purgem rounds3 +.macro rounds3 i, k0, k1, k2, k3 + round3 %eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0, 6 + round3 %edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10 + round3 %ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15 + round3 %ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21 +.endm + + push %rbx + push %rbp + push %r12 + + and $~63, %rdx // length in blocks + lea (%rsi, %rdx, 1), %r12 // end pointer + + mov (%rdi), %eax // a + mov 4(%rdi), %ebx // b + mov 8(%rdi), %ecx // c + mov 12(%rdi), %edx // d + + cmp %rsi, %r12 // any data to process? + je 0f + + push %rdi + + .balign 16 +1: mov %eax, %r8d + mov %ebx, %r9d + mov %ecx, %r10d + mov %edx, %r11d + + allrounds rounds0, rounds1, rounds2, rounds3 + + add %r8d, %eax + add %r9d, %ebx + add %r10d, %ecx + add %r11d, %edx + + add $64, %rsi + cmp %rsi, %r12 + jne 1b + + pop %rdi + mov %eax, (%rdi) + mov %ebx, 4(%rdi) + mov %ecx, 8(%rdi) + mov %edx, 12(%rdi) + +0: pop %r12 + pop %rbp + pop %rbx + ret +END(_libmd_md5block_bmi1) + +#ifndef _KERNEL + /* + * An implementation leveraging AVX-512 for its VPTERNLOGD + * instruction. We're using only XMM registers here, + * avoiding costly thermal licensing. + */ +ENTRY(_libmd_md5block_avx512) +.macro vround a, b, c, d, f, i, m, mi, s + vmovdqa \b, %xmm4 + vpternlogd $\f, \d, \c, %xmm4 + vpaddd 4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i] +.if \mi != 0 + vpshufd $0x55 * \mi, %xmm5, %xmm5 // broadcast to each dword +.endif + vpaddd %xmm5, \a, \a // a + k[i] + m[g] + vpaddd %xmm4, \a, \a // a + k[i] + m[g] + f + vprold $\s, \a, \a + vpaddd \b, \a, \a +.endm + +.macro vrounds f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3 + vround %xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0 + vround %xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1 + vround %xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2 + vround %xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3 +.endm + +/* + * d c b f0 f1 f2 f3 + * 0 0 0 0 0 0 1 + * 1 0 0 1 0 1 0 + * 0 1 0 0 1 1 0 + * 1 1 0 1 0 0 1 + * 0 0 1 0 0 1 1 + * 1 0 1 0 1 0 1 + * 0 1 1 1 1 0 0 + * 1 1 1 1 1 1 0 + */ + +.macro vrounds0 i, m + vrounds 0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22 +.endm + +.macro vrounds1 i, m0, i0, m1, i1, m2, i2, m3, i3 + vrounds 0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20 +.endm + +.macro vrounds2 i, m0, i0, m1, i1, m2, i2, m3, i3 + vrounds 0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23 +.endm + +.macro vrounds3 i, m0, i0, m1, i1, m2, i2, m3, i3 + vrounds 0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21 +.endm + + and $~63, %rdx // length in blocks + add %rsi, %rdx // end pointer + + vmovd (%rdi), %xmm0 // a + vmovd 4(%rdi), %xmm1 // b + vmovd 8(%rdi), %xmm2 // c + vmovd 12(%rdi), %xmm3 // d + + lea keys(%rip), %rax + + cmp %rsi, %rdx // any data to process? + je 0f + + .balign 16 +1: vmovdqu 0*4(%rsi), %xmm8 // message words + vmovdqu 4*4(%rsi), %xmm9 + vmovdqu 8*4(%rsi), %xmm10 + vmovdqu 12*4(%rsi), %xmm11 + + vmovdqa %xmm0, %xmm12 // stash old state variables + vmovdqa %xmm1, %xmm13 + vmovdqa %xmm2, %xmm14 + vmovdqa %xmm3, %xmm15 + + vrounds0 0, %xmm8 + vrounds0 4, %xmm9 + vrounds0 8, %xmm10 + vrounds0 12, %xmm11 + + vrounds1 16, %xmm8, 1, %xmm9, 2, %xmm10, 3, %xmm8, 0 + vrounds1 20, %xmm9, 1, %xmm10, 2, %xmm11, 3, %xmm9, 0 + vrounds1 24, %xmm10, 1, %xmm11, 2, %xmm8, 3, %xmm10, 0 + vrounds1 28, %xmm11, 1, %xmm8, 2, %xmm9, 3, %xmm11, 0 + + vrounds2 32, %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2 + vrounds2 36, %xmm8, 1, %xmm9, 0, %xmm9, 3, %xmm10, 2 + vrounds2 40, %xmm11, 1, %xmm8, 0, %xmm8, 3, %xmm9, 2 + vrounds2 44 %xmm10, 1, %xmm11, 0, %xmm11, 3, %xmm8, 2 + + vrounds3 48, %xmm8, 0, %xmm9, 3, %xmm11, 2, %xmm9, 1 + vrounds3 52, %xmm11, 0, %xmm8, 3, %xmm10, 2, %xmm8, 1 + vrounds3 56, %xmm10, 0, %xmm11, 3, %xmm9, 2, %xmm11, 1 + vrounds3 60, %xmm9, 0, %xmm10, 3, %xmm8, 2, %xmm10, 1 + + vpaddd %xmm12, %xmm0, %xmm0 + vpaddd %xmm13, %xmm1, %xmm1 + vpaddd %xmm14, %xmm2, %xmm2 + vpaddd %xmm15, %xmm3, %xmm3 + + add $64, %rsi + cmp %rsi, %rdx + jne 1b + + vmovd %xmm0, (%rdi) + vmovd %xmm1, 4(%rdi) + vmovd %xmm2, 8(%rdi) + vmovd %xmm3, 12(%rdi) + +0: ret +END(_libmd_md5block_avx512) + + // round keys, for use in md5block_avx512 + .section .rodata + .balign 16 + +.macro putkeys i, a, b, c, d + .4byte \a, \b, \c, \d +.endm + +keys: allrounds putkeys, putkeys, putkeys, putkeys + .size keys, .-keys +#endif /* !defined(_KERNEL) */ + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libmd/amd64/md5dispatch.c b/lib/libmd/amd64/md5dispatch.c new file mode 100644 index 000000000000..dd2131c5a57c --- /dev/null +++ b/lib/libmd/amd64/md5dispatch.c @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2024 Robert Clausecker + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include + +#include +#include +#include +#include +#include + +extern void _libmd_md5block_baseline(MD5_CTX *, const void *, size_t); +extern void _libmd_md5block_bmi1(MD5_CTX *, const void *, size_t); +extern void _libmd_md5block_avx512(MD5_CTX *, const void *, size_t); + +DEFINE_UIFUNC(, void, _libmd_md5block, (MD5_CTX *, const void *, size_t)) +{ + if ((cpu_stdext_feature & (CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL)) + == (CPUID_STDEXT_AVX512F | CPUID_STDEXT_AVX512VL)) { + u_int regs[4]; + char cpu_vendor[12]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + + /* the AVX-512 kernel performs poorly on AMD */ + if (memcmp(cpu_vendor, AMD_VENDOR_ID, sizeof(cpu_vendor)) != 0) + return (_libmd_md5block_avx512); + } + + if (cpu_stdext_feature & CPUID_STDEXT_BMI1) + return (_libmd_md5block_bmi1); + else + return (_libmd_md5block_baseline); +}