git: 8b4684afcde3 - main - lib/libmd: add optimised SHA1 implementations for amd64
Date: Wed, 14 May 2025 23:40:38 UTC
The branch main has been updated by fuz:
URL: https://cgit.FreeBSD.org/src/commit/?id=8b4684afcde3930eb49490f0b8431c4cb2ad9a46
commit 8b4684afcde3930eb49490f0b8431c4cb2ad9a46
Author: Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2024-05-28 15:20:41 +0000
Commit: Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-05-14 23:39:58 +0000
lib/libmd: add optimised SHA1 implementations for amd64
Three implementations are provided: one using just scalar
instructions, one using AVX2, and one using the SHA instructions
(SHANI). The AVX2 version uses a complicated multi-block carry
scheme described in an Intel whitepaper; the code was
carefully transcribed from the implementation shipped with the
Go runtime. The performance is quite good; these numbers are
from my Tiger Lake-based NUC:
old: 16.7s ( 613 MB/s)
scalar: 14.5s ( 706 MB/s)
avx2: 10.5s ( 975 MB/s)
shani: 5.6s (1829 MB/s)
Reviewed by: getz
Obtained from: https://github.com/golang/go/blob/b0dfcb74651b82123746273bbf6bb9988cd96e18/src/crypto/sha1/sha1block_amd64.s
Differential Revision: https://reviews.freebsd.org/D45444
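
For orientation, the scalar routine in sha1block.S below follows the
textbook SHA-1 round structure described in its opening comment. Here
is a minimal C sketch of one type-1 round showing the argument-rotation
idiom the round macros rely on; this is illustrative only, not the
sha1c.c the comment points to, and ROL32/ROUND1 are names made up here:

    #include <stdint.h>

    #define ROL32(x, n)     (((x) << (n)) | ((x) >> (32 - (n))))

    /*
     * One type-1 round: e += F1(b,c,d) + ROL32(a,5) + w + K1, then b
     * is rotated left by 30.  The five state words are never moved
     * between rounds; successive invocations permute the arguments
     * instead, which is exactly what the round1..round4 macros do
     * with registers in sha1block.S.
     */
    #define ROUND1(a, b, c, d, e, w)                                \
            do {                                                    \
                    (e) += ((((c) ^ (d)) & (b)) ^ (d))  /* F1 */    \
                        + ROL32((a), 5) + (w) + 0x5a827999u;        \
                    (b) = ROL32((b), 30);                           \
            } while (0)

    /*
     * Usage: the argument rotation replaces register moves.
     *      ROUND1(a, b, c, d, e, w[0]);
     *      ROUND1(e, a, b, c, d, w[1]);
     *      ROUND1(d, e, a, b, c, w[2]);
     */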
---
lib/libmd/Makefile | 3 +
lib/libmd/amd64/sha1block.S | 1851 ++++++++++++++++++++++++++++++++++++++++
lib/libmd/amd64/sha1dispatch.c | 77 ++
3 files changed, 1931 insertions(+)
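
lib/libmd/amd64/sha1dispatch.c is listed above, but its body falls in
the skipped portion of this mail. Below is a rough sketch of what such
a runtime dispatcher could look like, assuming all three entry points
share the sha1block(ctx, buf, len) signature noted above the scalar
routine; only _libmd_sha1block_scalar is visible in this mail, so the
avx2/shani symbol names and the function-pointer wiring are assumptions:

    #include <cpuid.h>
    #include <stddef.h>

    typedef struct sha1_ctx SHA1_CTX;   /* stand-in for libmd's SHA1_CTX */

    extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t);
    extern void _libmd_sha1block_avx2(SHA1_CTX *, const void *, size_t);  /* assumed name */
    extern void _libmd_sha1block_shani(SHA1_CTX *, const void *, size_t); /* assumed name */

    void (*sha1block)(SHA1_CTX *, const void *, size_t) =
        _libmd_sha1block_scalar;

    static void __attribute__((constructor))
    sha1block_pick(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0)
                    return;                         /* no leaf 7: keep scalar */

            if (ebx & (1u << 29))                   /* SHA extensions (SHANI) */
                    sha1block = _libmd_sha1block_shani;
            else if ((ebx & (1u << 5)) != 0 &&      /* AVX2 */
                (ebx & (1u << 3)) != 0 &&           /* BMI1 */
                (ebx & (1u << 8)) != 0)             /* BMI2 */
                    sha1block = _libmd_sha1block_avx2;

            /*
             * A production dispatcher would also confirm OS support for
             * the AVX state (OSXSAVE/XGETBV) before taking the AVX2 path.
             */
    }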
diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile
index 427da5b9d68f..547a134fc440 100644
--- a/lib/libmd/Makefile
+++ b/lib/libmd/Makefile
@@ -120,6 +120,9 @@ USE_ASM_SOURCES:=0
.if exists(${MACHINE_ARCH}/sha1block.S)
SRCS+= sha1block.S
CFLAGS+= -DSHA1_ASM
+.if exists(${MACHINE_ARCH}/sha1dispatch.c)
+SRCS+= sha1dispatch.c
+.endif
.endif
.if exists(${MACHINE_ARCH}/rmd160.S)
SRCS+= rmd160.S
diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S
new file mode 100644
index 000000000000..0307dcdece32
--- /dev/null
+++ b/lib/libmd/amd64/sha1block.S
@@ -0,0 +1,1851 @@
+/*-
+ * Copyright (c) 2013 The Go Authors. All rights reserved.
+ * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ *
+ * Adapted from Go's crypto/sha1/sha1block_amd64.s.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+/*
+ * SHA-1 block routine. See sha1c.c for C equivalent.
+ *
+ * There are 80 rounds of 4 types:
+ * - rounds 0-15 are type 1 and load data (round1 macro).
+ * - rounds 16-19 are type 1 and do not load data (round1x macro).
+ * - rounds 20-39 are type 2 and do not load data (round2 macro).
+ * - rounds 40-59 are type 3 and do not load data (round3 macro).
+ * - rounds 60-79 are type 4 and do not load data (round4 macro).
+ *
+ * Each round loads or shuffles the data, then computes a per-round
+ * function of b, c, d, and then mixes the result into and rotates the
+ * five registers a, b, c, d, e holding the intermediate results.
+ *
+ * The register rotation is implemented by rotating the arguments to
+ * the round macros instead of by explicit move instructions.
+ */
+.macro load index
+ mov (\index)*4(%rsi), %r10d
+ bswap %r10d
+ mov %r10d, (\index)*4(%rsp)
+.endm
+
+.macro shuffle index
+ mov ((\index )&0xf)*4(%rsp), %r10d
+ xor ((\index- 3)&0xf)*4(%rsp), %r10d
+ xor ((\index- 8)&0xf)*4(%rsp), %r10d
+ xor ((\index-14)&0xf)*4(%rsp), %r10d
+ rol $1, %r10d
+ mov %r10d, ((\index)&0xf)*4(%rsp)
+.endm
+
+.macro func1 a, b, c, d, e
+ mov \d, %r9d
+ xor \c, %r9d
+ and \b, %r9d
+ xor \d, %r9d
+.endm
+
+.macro func2 a, b, c, d, e
+ mov \b, %r9d
+ xor \c, %r9d
+ xor \d, %r9d
+.endm
+
+.macro func3 a, b, c, d, e
+ mov \b, %r8d
+ or \c, %r8d
+ and \d, %r8d
+ mov \b, %r9d
+ and \c, %r9d
+ or %r8d, %r9d
+.endm
+
+.macro func4 a, b, c, d, e
+ func2 \a, \b, \c, \d, \e
+.endm
+
+.macro mix a, b, c, d, e, const
+ rol $30, \b
+ add %r9d, \e
+ mov \a, %r8d
+ rol $5, %r8d
+ lea \const(\e, %r10d, 1), \e
+ add %r8d, \e
+.endm
+
+.macro round1 a, b, c, d, e, index
+ load \index
+ func1 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x5a827999
+.endm
+
+.macro round1x a, b, c, d, e, index
+ shuffle \index
+ func1 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x5a827999
+.endm
+
+.macro round2 a, b, c, d, e, index
+ shuffle \index
+ func2 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x6ed9eba1
+.endm
+
+.macro round3 a, b, c, d, e, index
+ shuffle \index
+ func3 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x8f1bbcdc
+.endm
+
+.macro round4 a, b, c, d, e, index
+ shuffle \index
+ func4 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0xca62c1d6
+.endm
+
+ // sha1block(SHA1_CTX, buf, len)
+ENTRY(_libmd_sha1block_scalar)
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ push %rdi // rdi: SHA1_CTX
+ sub $64+8, %rsp // 64 bytes for round keys
+ // plus alignment
+
+ mov %rdi, %rbp
+ // rsi: buf
+ and $~63, %rdx // rdx: length in blocks
+ lea (%rsi, %rdx, 1), %rdi // rdi: end pointer
+ mov (%rbp), %eax // c->h0
+ mov 4(%rbp), %ebx // c->h1
+ mov 8(%rbp), %ecx // c->h2
+ mov 12(%rbp), %edx // c->h3
+ mov 16(%rbp), %ebp // c->h4
+
+ cmp %rsi, %rdi // any data to process?
+ je .Lend
+
+.Lloop: mov %eax, %r11d
+ mov %ebx, %r12d
+ mov %ecx, %r13d
+ mov %edx, %r14d
+ mov %ebp, %r15d
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 0
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 1
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 2
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 3
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 4
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 5
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 6
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 7
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 8
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 9
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 10
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 11
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 12
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 13
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 14
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 15
+ round1x %ebp, %eax, %ebx, %ecx, %edx, 16
+ round1x %edx, %ebp, %eax, %ebx, %ecx, 17
+ round1x %ecx, %edx, %ebp, %eax, %ebx, 18
+ round1x %ebx, %ecx, %edx, %ebp, %eax, 19
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 20
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 21
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 22
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 23
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 24
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 25
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 26
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 27
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 28
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 29
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 30
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 31
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 32
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 33
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 34
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 35
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 36
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 37
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 38
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 39
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 40
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 41
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 42
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 43
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 44
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 45
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 46
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 47
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 48
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 49
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 50
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 51
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 52
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 53
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 54
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 55
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 56
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 57
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 58
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 59
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 60
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 61
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 62
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 63
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 64
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 65
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 66
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 67
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 68
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 69
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 70
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 71
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 72
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 73
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 74
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 75
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 76
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 77
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 78
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 79
+
+ add %r11d, %eax
+ add %r12d, %ebx
+ add %r13d, %ecx
+ add %r14d, %edx
+ add %r15d, %ebp
+
+ add $64, %rsi
+ cmp %rdi, %rsi
+ jb .Lloop
+
+.Lend: add $64+8, %rsp
+ pop %rdi // SHA1_CTX
+ mov %eax, (%rdi)
+ mov %ebx, 4(%rdi)
+ mov %ecx, 8(%rdi)
+ mov %edx, 12(%rdi)
+ mov %ebp, 16(%rdi)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+END(_libmd_sha1block_scalar)
+
+/*
+ * This is the implementation using AVX2, BMI1 and BMI2. It is based on:
+ * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
+ * From http://software.intel.com/en-us/articles
+ * (look for improving-the-performance-of-the-secure-hash-algorithm-1)
+ * This implementation is 2x unrolled, and interleaves vector instructions,
+ * used to precompute W, with scalar computation of current round
+ * for optimal scheduling.
+ */
+
+ /* trivial helper macros */
+.macro update_hash a, tb, c, d, e
+ add (%r9), \a
+ mov \a, (%r9)
+ add 4(%r9), \tb
+ mov \tb, 4(%r9)
+ add 8(%r9), \c
+ mov \c, 8(%r9)
+ add 12(%r9), \d
+ mov \d, 12(%r9)
+ add 16(%r9), \e
+ mov \e, 16(%r9)
+.endm
+
+ /* help macros for recalc, which does precomputations */
+.macro precalc0 offset
+ vmovdqu \offset(%r10), %xmm0
+.endm
+
+.macro precalc1 offset
+ vinserti128 $1, \offset(%r13), %ymm0, %ymm0
+.endm
+
+.macro precalc2 yreg
+ vpshufb %ymm10, %ymm0, \yreg
+.endm
+
+.macro precalc4 yreg, k_offset
+ vpaddd \k_offset(%r8), \yreg, %ymm0
+.endm
+
+.macro precalc7 offset
+ vmovdqu %ymm0, (\offset)*2(%r14)
+.endm
+
+/*
+ * Message scheduling pre-compute for rounds 0-15
+ * r13 is a pointer to the even 64-byte block
+ * r10 is a pointer to the odd 64-byte block
+ * r14 is a pointer to the temp buffer
+ * xmm0 is used as a temp register
+ * yreg is clobbered as part of the computation
+ * offset chooses a 16 byte chunk within a block
+ * r8 is a pointer to the constants block
+ * k_offset chooses K constants relevant to this round
+ * xmm10 holds the swap mask
+ */
+.macro precalc00_15 offset, yreg
+ precalc0 \offset
+ precalc1 \offset
+ precalc2 \yreg
+ precalc4 \yreg, 0
+ precalc7 \offset
+.endm
+
+ /* helper macros for precalc16_31 */
+.macro precalc16 reg_sub16, reg_sub12, reg_sub4, reg
+ vpalignr $8, \reg_sub16, \reg_sub12, \reg // w[i - 14]
+ vpsrldq $4, \reg_sub4, %ymm0 // w[i - 3]
+.endm
+
+.macro precalc17 reg_sub16, reg_sub8, reg
+ vpxor \reg_sub8, \reg, \reg
+ vpxor \reg_sub16, %ymm0, %ymm0
+.endm
+
+.macro precalc18 reg
+ vpxor %ymm0, \reg, \reg
+ vpslldq $12, \reg, %ymm9
+.endm
+
+.macro precalc19 reg
+ vpslld $1, \reg, %ymm0
+ vpsrld $31, \reg, \reg
+.endm
+
+.macro precalc20 reg
+ vpor \reg, %ymm0, %ymm0
+ vpslld $2, %ymm9, \reg
+.endm
+
+.macro precalc21 reg
+ vpsrld $30, %ymm9, %ymm9
+ vpxor \reg, %ymm0, %ymm0
+.endm
+
+.macro precalc23 reg, k_offset, offset
+ vpxor %ymm9, %ymm0, \reg
+ vpaddd \k_offset(%r8), \reg, %ymm0
+ vmovdqu %ymm0, (\offset)(%r14)
+.endm
+
+/*
+ * Message scheduling pre-compute for rounds 16-31
+ * calculating last 32 w[i] values in 8 XMM registers
+ * pre-calculate K+w[i] values and store to mem
+ * for later load by ALU add instruction.
+ * "brute force" vectorization for rounds 16-31 only
+ * due to w[i]->w[i-3] dependency.
+ * clobbers 5 input ymm registers REG_SUB*
+ * uses xmm0 and xmm9 as temp registers
+ * As always, r8 is a pointer to constants block
+ * and r14 is a pointer to temp buffer
+ */
+.macro precalc16_31 reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset
+ precalc16 \reg_sub16, \reg_sub12, \reg_sub4, \reg
+ precalc17 \reg_sub16, \reg_sub8, \reg
+ precalc18 \reg
+ precalc19 \reg
+ precalc20 \reg
+ precalc21 \reg
+ precalc23 \reg, \k_offset, \offset
+.endm
+
+ /* helper macros for precalc_32_79 */
+.macro precalc32 reg_sub8, reg_sub4
+ vpalignr $8, \reg_sub8, \reg_sub4, %ymm0
+.endm
+
+.macro precalc33 reg_sub28, reg
+ vpxor \reg_sub28, \reg, \reg
+.endm
+
+.macro precalc34 reg_sub16
+ vpxor \reg_sub16, %ymm0, %ymm0
+.endm
+
+.macro precalc35 reg
+ vpxor %ymm0, \reg, \reg
+.endm
+
+.macro precalc36 reg
+ vpslld $2, \reg, %ymm0
+.endm
+
+.macro precalc37 reg
+ vpsrld $30, \reg, \reg
+ vpor \reg, %ymm0, \reg
+.endm
+
+.macro precalc39 reg, k_offset, offset
+ vpaddd \k_offset(%r8), \reg, %ymm0
+ vmovdqu %ymm0, \offset(%r14)
+.endm
+
+.macro precalc32_79 reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset
+ precalc32 \reg_sub8, \reg_sub4
+ precalc33 \reg_sub28, \reg
+ precalc34 \reg_sub16
+ precalc35 \reg
+ precalc36 \reg
+ precalc37 \reg
+ precalc39 \reg, \k_offset, \offset
+.endm
+
+.macro precalc
+ precalc00_15 0x00, %ymm15
+ precalc00_15 0x10, %ymm14
+ precalc00_15 0x20, %ymm13
+ precalc00_15 0x30, %ymm12
+ precalc16_31 %ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080
+ precalc16_31 %ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0
+ precalc16_31 %ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0
+ precalc16_31 %ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0
+ precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100
+ precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120
+ precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140
+ precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160
+ precalc32_79 %ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180
+ precalc32_79 %ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0
+ precalc32_79 %ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0
+ precalc32_79 %ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0
+ precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200
+ precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220
+ precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240
+ precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260
+.endm
+
+/*
+ * Macros calculating individual rounds have general form
+ * calc_round_pre + precalc_round + calc_round_post
+ * calc_round_{pre,post} macros follow
+ */
+.macro calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e
+ add \offset(%r15), \reg_e
+ andn \reg_c, \reg_a, %ebp
+ add \reg_b, \reg_e // add F from the previous round
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_b // for the next round
+.endm
+
+/*
+ * Calculate F for the next round
+ */
+.macro calc_f1_post reg_a, reg_b, reg_e
+ and \reg_b, \reg_a // b & c
+ xor %ebp, \reg_a // F1 = (b&c) ^ (~b&d)
+ add %r12d, \reg_e
+.endm
+
+/*
+ * Registers are cyclically rotated:
+ * edx -> eax -> edi -> esi -> ebx -> ecx
+ */
+.macro calc0
+ mov %esi, %ebx // precalculate first round
+ rorx $2, %esi, %esi
+ andn %eax, %ebx, %ebp
+ and %edi, %ebx
+ xor %ebp, %ebx
+ calc_f1_pre 0x0, %ecx, %ebx, %edi, %edx
+ precalc0 0x80
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc1
+ calc_f1_pre 0x4, %edx, %ecx, %esi, %eax
+ precalc1 0x80
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc2
+ calc_f1_pre 0x8, %eax, %edx, %ebx, %edi
+ precalc2 %ymm15
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc3
+ calc_f1_pre 0xc, %edi, %eax, %ecx, %esi
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc4
+ calc_f1_pre 0x20, %esi, %edi, %edx, %ebx
+ precalc4 %ymm15, 0x0
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc5
+ calc_f1_pre 0x24, %ebx, %esi, %eax, %ecx
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc6
+ calc_f1_pre 0x28, %ecx, %ebx, %edi, %edx
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc7
+ calc_f1_pre 0x2c, %edx, %ecx, %esi, %eax
+ precalc7 0x0
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc8
+ calc_f1_pre 0x40, %eax, %edx, %ebx, %edi
+ precalc0 0x90
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc9
+ calc_f1_pre 0x44, %edi, %eax, %ecx, %esi
+ precalc1 0x90
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc10
+ calc_f1_pre 0x48, %esi, %edi, %edx, %ebx
+ precalc2 %ymm14
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc11
+ calc_f1_pre 0x4c, %ebx, %esi, %eax, %ecx
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc12
+ calc_f1_pre 0x60, %ecx, %ebx, %edi, %edx
+ precalc4 %ymm14, 0
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc13
+ calc_f1_pre 0x64, %edx, %ecx, %esi, %eax
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc14
+ calc_f1_pre 0x68, %eax, %edx, %ebx, %edi
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc15
+ calc_f1_pre 0x6c, %edi, %eax, %ecx, %esi
+ precalc7 0x10
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc16
+ calc_f1_pre 0x80, %esi, %edi, %edx, %ebx
+ precalc0 0xa0
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc17
+ calc_f1_pre 0x84, %ebx, %esi, %eax, %ecx
+ precalc1 0xa0
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc18
+ calc_f1_pre 0x88, %ecx, %ebx, %edi, %edx
+ precalc2 %ymm13
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc_f2_pre offset, reg_a, reg_b, reg_e
+ add \offset(%r15), \reg_e
+ add \reg_b, \reg_e // add F from the previous round
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_b // for next round
+.endm
+
+.macro calc_f2_post reg_a, reg_b, reg_c, reg_e
+ xor \reg_b, \reg_a
+ add %r12d, \reg_e
+ xor \reg_c, \reg_a
+.endm
+
+.macro calc19
+ calc_f2_pre 0x8c, %edx, %ecx, %eax
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc20
+ calc_f2_pre 0xa0, %eax, %edx, %edi
+ precalc4 %ymm13, 0x0
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc21
+ calc_f2_pre 0xa4, %edi, %eax, %esi
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc22
+ calc_f2_pre 0xa8, %esi, %edi, %ebx
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc23
+ calc_f2_pre 0xac, %ebx, %esi, %ecx
+ precalc7 0x20
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc24
+ calc_f2_pre 0xc0, %ecx, %ebx, %edx
+ precalc0 0xb0
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc25
+ calc_f2_pre 0xc4, %edx, %ecx, %eax
+ precalc1 0xb0
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc26
+ calc_f2_pre 0xc8, %eax, %edx, %edi
+ precalc2 %ymm12
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc27
+ calc_f2_pre 0xcc, %edi, %eax, %esi
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc28
+ calc_f2_pre 0xe0, %esi, %edi, %ebx
+ precalc4 %ymm12, 0x0
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc29
+ calc_f2_pre 0xe4, %ebx, %esi, %ecx
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc30
+ calc_f2_pre 0xe8, %ecx, %ebx, %edx
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc31
+ calc_f2_pre 0xec, %edx, %ecx, %eax
+ precalc7 0x30
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc32
+ calc_f2_pre 0x100, %eax, %edx, %edi
+ precalc16 %ymm15, %ymm14, %ymm12, %ymm8
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc33
+ calc_f2_pre 0x104, %edi, %eax, %esi
+ precalc17 %ymm15, %ymm13, %ymm8
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc34
+ calc_f2_pre 0x108, %esi, %edi, %ebx
+ precalc18 %ymm8
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc35
+ calc_f2_pre 0x10c, %ebx, %esi, %ecx
+ precalc19 %ymm8
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc36
+ calc_f2_pre 0x120, %ecx, %ebx, %edx
+ precalc20 %ymm8
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc37
+ calc_f2_pre 0x124, %edx, %ecx, %eax
+ precalc21 %ymm8
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc38
+ calc_f2_pre 0x128, %eax, %edx, %edi
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc_f3_pre offset, reg_e
+ add \offset(%r15), \reg_e
+.endm
+
+.macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb
+ add \reg_tb, \reg_e // add F from the previous round
+ mov \reg_b, %ebp
+ or \reg_a, %ebp
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_tb
+ and \reg_c, %ebp // calculate F for the next round
+ and \reg_b, \reg_a
+ or %ebp, \reg_a
+ add %r12d, \reg_e
+.endm
+
+.macro calc39
+ calc_f3_pre 0x12c, %esi
+ precalc23 %ymm8, 0x0, 0x80
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc40
+ calc_f3_pre 0x140, %ebx
+ precalc16 %ymm14, %ymm13, %ymm8, %ymm7
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc41
+ calc_f3_pre 0x144, %ecx
+ precalc17 %ymm14, %ymm12, %ymm7
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc42
+ calc_f3_pre 0x148, %edx
+ precalc18 %ymm7
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc43
+ calc_f3_pre 0x14c, %eax
+ precalc19 %ymm7
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc44
+ calc_f3_pre 0x160, %edi
+ precalc20 %ymm7
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc45
+ calc_f3_pre 0x164, %esi
+ precalc21 %ymm7
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc46
+ calc_f3_pre 0x168, %ebx
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc47
+ calc_f3_pre 0x16c, %ecx
+ vpxor %ymm9, %ymm0, %ymm7
+ vpaddd 0x20(%r8), %ymm7, %ymm0
+ vmovdqu %ymm0, 0xa0(%r14)
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc48
+ calc_f3_pre 0x180, %edx
+ precalc16 %ymm13, %ymm12, %ymm7, %ymm5
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc49
+ calc_f3_pre 0x184, %eax
+ precalc17 %ymm13, %ymm8, %ymm5
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc50
+ calc_f3_pre 0x188, %edi
+ precalc18 %ymm5
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc51
+ calc_f3_pre 0x18c, %esi
+ precalc19 %ymm5
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc52
+ calc_f3_pre 0x1a0, %ebx
+ precalc20 %ymm5
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc53
+ calc_f3_pre 0x1a4, %ecx
+ precalc21 %ymm5
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc54
+ calc_f3_pre 0x1a8, %edx
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc55
+ calc_f3_pre 0x1ac, %eax
+ precalc23 %ymm5, 0x20, 0xc0
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc56
+ calc_f3_pre 0x1c0, %edi
+ precalc16 %ymm12, %ymm8, %ymm5, %ymm3
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc57
+ calc_f3_pre 0x1c4, %esi
+ precalc17 %ymm12, %ymm7, %ymm3
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc58
+ calc_f3_pre 0x1c8, %ebx
+ precalc18 %ymm3
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc59
+ calc_f2_pre 0x1cc, %ebx, %esi, %ecx
+ precalc19 %ymm3
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc60
+ calc_f2_pre 0x1e0, %ecx, %ebx, %edx
+ precalc20 %ymm3
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc61
+ calc_f2_pre 0x1e4, %edx, %ecx, %eax
+ precalc21 %ymm3
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc62
+ calc_f2_pre 0x1e8, %eax, %edx, %edi
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc63
+ calc_f2_pre 0x1ec, %edi, %eax, %esi
+ precalc23 %ymm3, 0x20, 0xe0
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc64
+ calc_f2_pre 0x200, %esi, %edi, %ebx
+ precalc32 %ymm5, %ymm3
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc65
+ calc_f2_pre 0x204, %ebx, %esi, %ecx
+ precalc33 %ymm14, %ymm15
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc66
+ calc_f2_pre 0x208, %ecx, %ebx, %edx
+ precalc34 %ymm8
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc67
+ calc_f2_pre 0x20c, %edx, %ecx, %eax
+ precalc35 %ymm15
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc68
+ calc_f2_pre 0x220, %eax, %edx, %edi
+ precalc36 %ymm15
+ calc_f2_post %eax, %ecx, %ebx, %edi
*** 1002 LINES SKIPPED ***