git: 8b4684afcde3 - main - lib/libmd: add optimised SHA1 implementations for amd64
Date: Wed, 14 May 2025 23:40:38 UTC
The branch main has been updated by fuz:
URL: https://cgit.FreeBSD.org/src/commit/?id=8b4684afcde3930eb49490f0b8431c4cb2ad9a46
commit 8b4684afcde3930eb49490f0b8431c4cb2ad9a46
Author: Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2024-05-28 15:20:41 +0000
Commit: Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2025-05-14 23:39:58 +0000
lib/libmd: add optimised SHA1 implementations for amd64
Three implementations are provided: one using just scalar
instructions, one using AVX2, and one using the SHA instructions
(SHANI). The AVX2 version uses a complicated multi-block carry
scheme described in an Intel whitepaper; the code was
carefully transcribed from the implementation shipped with the
Go runtime. The performance is quite good; these numbers are
from my Tiger Lake-based NUC:
old: 16.7s ( 613 MB/s)
scalar: 14.5s ( 706 MB/s)
avx2: 10.5s ( 975 MB/s)
shani: 5.6s (1829 MB/s)
Reviewed by: getz
Obtained from: https://github.com/golang/go/blob/b0dfcb74651b82123746273bbf6bb9988cd96e18/src/crypto/sha1/sha1block_amd64.s
Differential Revision: https://reviews.freebsd.org/D45444
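
For orientation, the scalar routine in sha1block.S below follows the
textbook SHA-1 round structure described in its opening comment. Here
is a minimal C sketch of one type-1 round showing the argument-rotation
idiom the round macros rely on; this is illustrative only, not the
sha1c.c the comment points to, and ROL32/ROUND1 are names made up here:

    #include <stdint.h>

    #define ROL32(x, n)     (((x) << (n)) | ((x) >> (32 - (n))))

    /*
     * One type-1 round: e += F1(b,c,d) + ROL32(a,5) + w + K1, then b
     * is rotated left by 30.  The five state words are never moved
     * between rounds; successive invocations permute the arguments
     * instead, which is exactly what the round1..round4 macros do
     * with registers in sha1block.S.
     */
    #define ROUND1(a, b, c, d, e, w)                                \
            do {                                                    \
                    (e) += ((((c) ^ (d)) & (b)) ^ (d))  /* F1 */    \
                        + ROL32((a), 5) + (w) + 0x5a827999u;        \
                    (b) = ROL32((b), 30);                           \
            } while (0)

    /*
     * Usage: the argument rotation replaces register moves.
     *      ROUND1(a, b, c, d, e, w[0]);
     *      ROUND1(e, a, b, c, d, w[1]);
     *      ROUND1(d, e, a, b, c, w[2]);
     */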
---
lib/libmd/Makefile | 3 +
lib/libmd/amd64/sha1block.S | 1851 ++++++++++++++++++++++++++++++++++++++++
lib/libmd/amd64/sha1dispatch.c | 77 ++
3 files changed, 1931 insertions(+)
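
lib/libmd/amd64/sha1dispatch.c is listed above, but its body falls in
the skipped portion of this mail. Below is a rough sketch of what such
a runtime dispatcher could look like, assuming all three entry points
share the sha1block(ctx, buf, len) signature noted above the scalar
routine; only _libmd_sha1block_scalar is visible in this mail, so the
avx2/shani symbol names and the function-pointer wiring are assumptions:

    #include <cpuid.h>
    #include <stddef.h>

    typedef struct sha1_ctx SHA1_CTX;   /* stand-in for libmd's SHA1_CTX */

    extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t);
    extern void _libmd_sha1block_avx2(SHA1_CTX *, const void *, size_t);  /* assumed name */
    extern void _libmd_sha1block_shani(SHA1_CTX *, const void *, size_t); /* assumed name */

    void (*sha1block)(SHA1_CTX *, const void *, size_t) =
        _libmd_sha1block_scalar;

    static void __attribute__((constructor))
    sha1block_pick(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0)
                    return;                         /* no leaf 7: keep scalar */

            if (ebx & (1u << 29))                   /* SHA extensions (SHANI) */
                    sha1block = _libmd_sha1block_shani;
            else if ((ebx & (1u << 5)) != 0 &&      /* AVX2 */
                (ebx & (1u << 3)) != 0 &&           /* BMI1 */
                (ebx & (1u << 8)) != 0)             /* BMI2 */
                    sha1block = _libmd_sha1block_avx2;

            /*
             * A production dispatcher would also confirm OS support for
             * the AVX state (OSXSAVE/XGETBV) before taking the AVX2 path.
             */
    }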
diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile
index 427da5b9d68f..547a134fc440 100644
--- a/lib/libmd/Makefile
+++ b/lib/libmd/Makefile
@@ -120,6 +120,9 @@ USE_ASM_SOURCES:=0
.if exists(${MACHINE_ARCH}/sha1block.S)
SRCS+= sha1block.S
CFLAGS+= -DSHA1_ASM
+.if exists(${MACHINE_ARCH}/sha1dispatch.c)
+SRCS+= sha1dispatch.c
+.endif
.endif
.if exists(${MACHINE_ARCH}/rmd160.S)
SRCS+= rmd160.S
diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S
new file mode 100644
index 000000000000..0307dcdece32
--- /dev/null
+++ b/lib/libmd/amd64/sha1block.S
@@ -0,0 +1,1851 @@
+/*-
+ * Copyright (c) 2013 The Go Authors. All rights reserved.
+ * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ *
+ * Adapted from Go's crypto/sha1/sha1block_amd64.s.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+/*
+ * SHA-1 block routine. See sha1c.c for C equivalent.
+ *
+ * There are 80 rounds of 4 types:
+ * - rounds 0-15 are type 1 and load data (round1 macro).
+ * - rounds 16-19 are type 1 and do not load data (round1x macro).
+ * - rounds 20-39 are type 2 and do not load data (round2 macro).
+ * - rounds 40-59 are type 3 and do not load data (round3 macro).
+ * - rounds 60-79 are type 4 and do not load data (round4 macro).
+ *
+ * Each round loads or shuffles the data, then computes a per-round
+ * function of b, c, d, and then mixes the result into and rotates the
+ * five registers a, b, c, d, e holding the intermediate results.
+ *
+ * The register rotation is implemented by rotating the arguments to
+ * the round macros instead of by explicit move instructions.
+ */
+.macro load index
+ mov (\index)*4(%rsi), %r10d
+ bswap %r10d
+ mov %r10d, (\index)*4(%rsp)
+.endm
+
+.macro shuffle index
+ mov ((\index )&0xf)*4(%rsp), %r10d
+ xor ((\index- 3)&0xf)*4(%rsp), %r10d
+ xor ((\index- 8)&0xf)*4(%rsp), %r10d
+ xor ((\index-14)&0xf)*4(%rsp), %r10d
+ rol $1, %r10d
+ mov %r10d, ((\index)&0xf)*4(%rsp)
+.endm
+
+.macro func1 a, b, c, d, e
+ mov \d, %r9d
+ xor \c, %r9d
+ and \b, %r9d
+ xor \d, %r9d
+.endm
+
+.macro func2 a, b, c, d, e
+ mov \b, %r9d
+ xor \c, %r9d
+ xor \d, %r9d
+.endm
+
+.macro func3 a, b, c, d, e
+ mov \b, %r8d
+ or \c, %r8d
+ and \d, %r8d
+ mov \b, %r9d
+ and \c, %r9d
+ or %r8d, %r9d
+.endm
+
+.macro func4 a, b, c, d, e
+ func2 \a, \b, \c, \d, \e
+.endm
+
+.macro mix a, b, c, d, e, const
+ rol $30, \b
+ add %r9d, \e
+ mov \a, %r8d
+ rol $5, %r8d
+ lea \const(\e, %r10d, 1), \e
+ add %r8d, \e
+.endm
+
+.macro round1 a, b, c, d, e, index
+ load \index
+ func1 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x5a827999
+.endm
+
+.macro round1x a, b, c, d, e, index
+ shuffle \index
+ func1 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x5a827999
+.endm
+
+.macro round2 a, b, c, d, e, index
+ shuffle \index
+ func2 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x6ed9eba1
+.endm
+
+.macro round3 a, b, c, d, e, index
+ shuffle \index
+ func3 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x8f1bbcdc
+.endm
+
+.macro round4 a, b, c, d, e, index
+ shuffle \index
+ func4 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0xca62c1d6
+.endm
+
+ // sha1block(SHA1_CTX, buf, len)
+ENTRY(_libmd_sha1block_scalar)
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ push %rdi // rdi: SHA1_CTX
+ sub $64+8, %rsp // 64 bytes for round keys
+ // plus alignment
+
+ mov %rdi, %rbp
+ // rsi: buf
+ and $~63, %rdx // rdx: length in blocks
+ lea (%rsi, %rdx, 1), %rdi // rdi: end pointer
+ mov (%rbp), %eax // c->h0
+ mov 4(%rbp), %ebx // c->h1
+ mov 8(%rbp), %ecx // c->h2
+ mov 12(%rbp), %edx // c->h3
+ mov 16(%rbp), %ebp // c->h4
+
+ cmp %rsi, %rdi // any data to process?
+ je .Lend
+
+.Lloop: mov %eax, %r11d
+ mov %ebx, %r12d
+ mov %ecx, %r13d
+ mov %edx, %r14d
+ mov %ebp, %r15d
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 0
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 1
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 2
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 3
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 4
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 5
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 6
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 7
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 8
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 9
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 10
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 11
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 12
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 13
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 14
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 15
+ round1x %ebp, %eax, %ebx, %ecx, %edx, 16
+ round1x %edx, %ebp, %eax, %ebx, %ecx, 17
+ round1x %ecx, %edx, %ebp, %eax, %ebx, 18
+ round1x %ebx, %ecx, %edx, %ebp, %eax, 19
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 20
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 21
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 22
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 23
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 24
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 25
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 26
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 27
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 28
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 29
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 30
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 31
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 32
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 33
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 34
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 35
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 36
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 37
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 38
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 39
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 40
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 41
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 42
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 43
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 44
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 45
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 46
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 47
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 48
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 49
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 50
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 51
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 52
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 53
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 54
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 55
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 56
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 57
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 58
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 59
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 60
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 61
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 62
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 63
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 64
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 65
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 66
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 67
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 68
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 69
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 70
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 71
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 72
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 73
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 74
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 75
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 76
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 77
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 78
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 79
+
+ add %r11d, %eax
+ add %r12d, %ebx
+ add %r13d, %ecx
+ add %r14d, %edx
+ add %r15d, %ebp
+
+ add $64, %rsi
+ cmp %rdi, %rsi
+ jb .Lloop
+
+.Lend: add $64+8, %rsp
+ pop %rdi // SHA1_CTX
+ mov %eax, (%rdi)
+ mov %ebx, 4(%rdi)
+ mov %ecx, 8(%rdi)
+ mov %edx, 12(%rdi)
+ mov %ebp, 16(%rdi)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+END(_libmd_sha1block_scalar)
+
+/*
+ * This is the implementation using AVX2, BMI1 and BMI2. It is based on:
+ * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
+ * From http://software.intel.com/en-us/articles
+ * (look for improving-the-performance-of-the-secure-hash-algorithm-1)
+ * This implementation is 2x unrolled, and interleaves vector instructions,
+ * used to precompute W, with scalar computation of current round
+ * for optimal scheduling.
+ */
+
+ /* trivial helper macros */
+.macro update_hash a, tb, c, d, e
+ add (%r9), \a
+ mov \a, (%r9)
+ add 4(%r9), \tb
+ mov \tb, 4(%r9)
+ add 8(%r9), \c
+ mov \c, 8(%r9)
+ add 12(%r9), \d
+ mov \d, 12(%r9)
+ add 16(%r9), \e
+ mov \e, 16(%r9)
+.endm
+
+ /* help macros for recalc, which does precomputations */
+.macro precalc0 offset
+ vmovdqu \offset(%r10), %xmm0
+.endm
+
+.macro precalc1 offset
+ vinserti128 $1, \offset(%r13), %ymm0, %ymm0
+.endm
+
+.macro precalc2 yreg
+ vpshufb %ymm10, %ymm0, \yreg
+.endm
+
+.macro precalc4 yreg, k_offset
+ vpaddd \k_offset(%r8), \yreg, %ymm0
+.endm
+
+.macro precalc7 offset
+ vmovdqu %ymm0, (\offset)*2(%r14)
+.endm
+
+/*
+ * Message scheduling pre-compute for rounds 0-15
+ * r13 is a pointer to the even 64-byte block
+ * r10 is a pointer to the odd 64-byte block
+ * r14 is a pointer to the temp buffer
+ * xmm0 is used as a temp register
+ * yreg is clobbered as part of the computation
+ * offset chooses a 16 byte chunk within a block
+ * r8 is a pointer to the constants block
+ * k_offset chooses K constants relevant to this round
+ * xmm10 holds the swap mask
+ */
+.macro precalc00_15 offset, yreg
+ precalc0 \offset
+ precalc1 \offset
+ precalc2 \yreg
+ precalc4 \yreg, 0
+ precalc7 \offset
+.endm
+
+ /* helper macros for precalc16_31 */
+.macro precalc16 reg_sub16, reg_sub12, reg_sub4, reg
+ vpalignr $8, \reg_sub16, \reg_sub12, \reg // w[i - 14]
+ vpsrldq $4, \reg_sub4, %ymm0 // w[i - 3]
+.endm
+
+.macro precalc17 reg_sub16, reg_sub8, reg
+ vpxor \reg_sub8, \reg, \reg
+ vpxor \reg_sub16, %ymm0, %ymm0
+.endm
+
+.macro precalc18 reg
+ vpxor %ymm0, \reg, \reg
+ vpslldq $12, \reg, %ymm9
+.endm
+
+.macro precalc19 reg
+ vpslld $1, \reg, %ymm0
+ vpsrld $31, \reg, \reg
+.endm
+
+.macro precalc20 reg
+ vpor \reg, %ymm0, %ymm0
+ vpslld $2, %ymm9, \reg
+.endm
+
+.macro precalc21 reg
+ vpsrld $30, %ymm9, %ymm9
+ vpxor \reg, %ymm0, %ymm0
+.endm
+
+.macro precalc23 reg, k_offset, offset
+ vpxor %ymm9, %ymm0, \reg
+ vpaddd \k_offset(%r8), \reg, %ymm0
+ vmovdqu %ymm0, (\offset)(%r14)
+.endm
+
+/*
+ * Message scheduling pre-compute for rounds 16-31
+ * calculating last 32 w[i] values in 8 XMM registers
+ * pre-calculate K+w[i] values and store to mem
+ * for later load by ALU add instruction.
+ * "brute force" vectorization for rounds 16-31 only
+ * due to w[i]->w[i-3] dependency.
+ * clobbers 5 input ymm registers REG_SUB*
+ * uses xmm0 and xmm9 as temp registers
+ * As always, r8 is a pointer to constants block
+ * and r14 is a pointer to temp buffer
+ */
+.macro precalc16_31 reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset
+ precalc16 \reg_sub16, \reg_sub12, \reg_sub4, \reg
+ precalc17 \reg_sub16, \reg_sub8, \reg
+ precalc18 \reg
+ precalc19 \reg
+ precalc20 \reg
+ precalc21 \reg
+ precalc23 \reg, \k_offset, \offset
+.endm
+
+ /* helper macros for precalc_32_79 */
+.macro precalc32 reg_sub8, reg_sub4
+ vpalignr $8, \reg_sub8, \reg_sub4, %ymm0
+.endm
+
+.macro precalc33 reg_sub28, reg
+ vpxor \reg_sub28, \reg, \reg
+.endm
+
+.macro precalc34 reg_sub16
+ vpxor \reg_sub16, %ymm0, %ymm0
+.endm
+
+.macro precalc35 reg
+ vpxor %ymm0, \reg, \reg
+.endm
+
+.macro precalc36 reg
+ vpslld $2, \reg, %ymm0
+.endm
+
+.macro precalc37 reg
+ vpsrld $30, \reg, \reg
+ vpor \reg, %ymm0, \reg
+.endm
+
+.macro precalc39 reg, k_offset, offset
+ vpaddd \k_offset(%r8), \reg, %ymm0
+ vmovdqu %ymm0, \offset(%r14)
+.endm
+
+.macro precalc32_79 reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset
+ precalc32 \reg_sub8, \reg_sub4
+ precalc33 \reg_sub28, \reg
+ precalc34 \reg_sub16
+ precalc35 \reg
+ precalc36 \reg
+ precalc37 \reg
+ precalc39 \reg, \k_offset, \offset
+.endm
+
+.macro precalc
+ precalc00_15 0x00, %ymm15
+ precalc00_15 0x10, %ymm14
+ precalc00_15 0x20, %ymm13
+ precalc00_15 0x30, %ymm12
+ precalc16_31 %ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080
+ precalc16_31 %ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0
+ precalc16_31 %ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0
+ precalc16_31 %ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0
+ precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100
+ precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120
+ precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140
+ precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160
+ precalc32_79 %ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180
+ precalc32_79 %ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0
+ precalc32_79 %ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0
+ precalc32_79 %ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0
+ precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200
+ precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220
+ precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240
+ precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260
+.endm
+
+/*
+ * Macros calculating individual rounds have general form
+ * calc_round_pre + precalc_round + calc_round_post
+ * calc_round_{pre,post} macros follow
+ */
+.macro calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e
+ add \offset(%r15), \reg_e
+ andn \reg_c, \reg_a, %ebp
+ add \reg_b, \reg_e // add F from the previous round
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_b // for the next round
+.endm
+
+/*
+ * Calculate F for the next round
+ */
+.macro calc_f1_post reg_a, reg_b, reg_e
+ and \reg_b, \reg_a // b & c
+ xor %ebp, \reg_a // F1 = (b&c) ^ (~b&d)
+ add %r12d, \reg_e
+.endm
+
+/*
+ * Registers are cyclically rotated:
+ * edx -> eax -> edi -> esi -> ebx -> ecx
+ */
+.macro calc0
+ mov %esi, %ebx // precalculate first round
+ rorx $2, %esi, %esi
+ andn %eax, %ebx, %ebp
+ and %edi, %ebx
+ xor %ebp, %ebx
+ calc_f1_pre 0x0, %ecx, %ebx, %edi, %edx
+ precalc0 0x80
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc1
+ calc_f1_pre 0x4, %edx, %ecx, %esi, %eax
+ precalc1 0x80
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc2
+ calc_f1_pre 0x8, %eax, %edx, %ebx, %edi
+ precalc2 %ymm15
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc3
+ calc_f1_pre 0xc, %edi, %eax, %ecx, %esi
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc4
+ calc_f1_pre 0x20, %esi, %edi, %edx, %ebx
+ precalc4 %ymm15, 0x0
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc5
+ calc_f1_pre 0x24, %ebx, %esi, %eax, %ecx
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc6
+ calc_f1_pre 0x28, %ecx, %ebx, %edi, %edx
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc7
+ calc_f1_pre 0x2c, %edx, %ecx, %esi, %eax
+ precalc7 0x0
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc8
+ calc_f1_pre 0x40, %eax, %edx, %ebx, %edi
+ precalc0 0x90
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc9
+ calc_f1_pre 0x44, %edi, %eax, %ecx, %esi
+ precalc1 0x90
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc10
+ calc_f1_pre 0x48, %esi, %edi, %edx, %ebx
+ precalc2 %ymm14
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc11
+ calc_f1_pre 0x4c, %ebx, %esi, %eax, %ecx
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc12
+ calc_f1_pre 0x60, %ecx, %ebx, %edi, %edx
+ precalc4 %ymm14, 0
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc13
+ calc_f1_pre 0x64, %edx, %ecx, %esi, %eax
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc14
+ calc_f1_pre 0x68, %eax, %edx, %ebx, %edi
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc15
+ calc_f1_pre 0x6c, %edi, %eax, %ecx, %esi
+ precalc7 0x10
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc16
+ calc_f1_pre 0x80, %esi, %edi, %edx, %ebx
+ precalc0 0xa0
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc17
+ calc_f1_pre 0x84, %ebx, %esi, %eax, %ecx
+ precalc1 0xa0
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc18
+ calc_f1_pre 0x88, %ecx, %ebx, %edi, %edx
+ precalc2 %ymm13
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc_f2_pre offset, reg_a, reg_b, reg_e
+ add \offset(%r15), \reg_e
+ add \reg_b, \reg_e // add F from the previous round
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_b // for next round
+.endm
+
+.macro calc_f2_post reg_a, reg_b, reg_c, reg_e
+ xor \reg_b, \reg_a
+ add %r12d, \reg_e
+ xor \reg_c, \reg_a
+.endm
+
+.macro calc19
+ calc_f2_pre 0x8c, %edx, %ecx, %eax
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc20
+ calc_f2_pre 0xa0, %eax, %edx, %edi
+ precalc4 %ymm13, 0x0
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc21
+ calc_f2_pre 0xa4, %edi, %eax, %esi
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc22
+ calc_f2_pre 0xa8, %esi, %edi, %ebx
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc23
+ calc_f2_pre 0xac, %ebx, %esi, %ecx
+ precalc7 0x20
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc24
+ calc_f2_pre 0xc0, %ecx, %ebx, %edx
+ precalc0 0xb0
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc25
+ calc_f2_pre 0xc4, %edx, %ecx, %eax
+ precalc1 0xb0
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc26
+ calc_f2_pre 0xc8, %eax, %edx, %edi
+ precalc2 %ymm12
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc27
+ calc_f2_pre 0xcc, %edi, %eax, %esi
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc28
+ calc_f2_pre 0xe0, %esi, %edi, %ebx
+ precalc4 %ymm12, 0x0
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc29
+ calc_f2_pre 0xe4, %ebx, %esi, %ecx
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc30
+ calc_f2_pre 0xe8, %ecx, %ebx, %edx
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc31
+ calc_f2_pre 0xec, %edx, %ecx, %eax
+ precalc7 0x30
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc32
+ calc_f2_pre 0x100, %eax, %edx, %edi
+ precalc16 %ymm15, %ymm14, %ymm12, %ymm8
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc33
+ calc_f2_pre 0x104, %edi, %eax, %esi
+ precalc17 %ymm15, %ymm13, %ymm8
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc34
+ calc_f2_pre 0x108, %esi, %edi, %ebx
+ precalc18 %ymm8
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc35
+ calc_f2_pre 0x10c, %ebx, %esi, %ecx
+ precalc19 %ymm8
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc36
+ calc_f2_pre 0x120, %ecx, %ebx, %edx
+ precalc20 %ymm8
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc37
+ calc_f2_pre 0x124, %edx, %ecx, %eax
+ precalc21 %ymm8
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc38
+ calc_f2_pre 0x128, %eax, %edx, %edi
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc_f3_pre offset, reg_e
+ add \offset(%r15), \reg_e
+.endm
+
+.macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb
+ add \reg_tb, \reg_e // add F from the previous round
+ mov \reg_b, %ebp
+ or \reg_a, %ebp
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_tb
+ and \reg_c, %ebp // calculate F for the next round
+ and \reg_b, \reg_a
+ or %ebp, \reg_a
+ add %r12d, \reg_e
+.endm
+
+.macro calc39
+ calc_f3_pre 0x12c, %esi
+ precalc23 %ymm8, 0x0, 0x80
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc40
+ calc_f3_pre 0x140, %ebx
+ precalc16 %ymm14, %ymm13, %ymm8, %ymm7
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc41
+ calc_f3_pre 0x144, %ecx
+ precalc17 %ymm14, %ymm12, %ymm7
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc42
+ calc_f3_pre 0x148, %edx
+ precalc18 %ymm7
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc43
+ calc_f3_pre 0x14c, %eax
+ precalc19 %ymm7
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc44
+ calc_f3_pre 0x160, %edi
+ precalc20 %ymm7
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc45
+ calc_f3_pre 0x164, %esi
+ precalc21 %ymm7
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc46
+ calc_f3_pre 0x168, %ebx
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc47
+ calc_f3_pre 0x16c, %ecx
+ vpxor %ymm9, %ymm0, %ymm7
+ vpaddd 0x20(%r8), %ymm7, %ymm0
+ vmovdqu %ymm0, 0xa0(%r14)
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc48
+ calc_f3_pre 0x180, %edx
+ precalc16 %ymm13, %ymm12, %ymm7, %ymm5
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc49
+ calc_f3_pre 0x184, %eax
+ precalc17 %ymm13, %ymm8, %ymm5
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc50
+ calc_f3_pre 0x188, %edi
+ precalc18 %ymm5
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc51
+ calc_f3_pre 0x18c, %esi
+ precalc19 %ymm5
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc52
+ calc_f3_pre 0x1a0, %ebx
+ precalc20 %ymm5
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc53
+ calc_f3_pre 0x1a4, %ecx
+ precalc21 %ymm5
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc54
+ calc_f3_pre 0x1a8, %edx
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc55
+ calc_f3_pre 0x1ac, %eax
+ precalc23 %ymm5, 0x20, 0xc0
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc56
+ calc_f3_pre 0x1c0, %edi
+ precalc16 %ymm12, %ymm8, %ymm5, %ymm3
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc57
+ calc_f3_pre 0x1c4, %esi
+ precalc17 %ymm12, %ymm7, %ymm3
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc58
+ calc_f3_pre 0x1c8, %ebx
+ precalc18 %ymm3
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc59
+ calc_f2_pre 0x1cc, %ebx, %esi, %ecx
+ precalc19 %ymm3
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc60
+ calc_f2_pre 0x1e0, %ecx, %ebx, %edx
+ precalc20 %ymm3
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc61
+ calc_f2_pre 0x1e4, %edx, %ecx, %eax
+ precalc21 %ymm3
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc62
+ calc_f2_pre 0x1e8, %eax, %edx, %edi
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc63
+ calc_f2_pre 0x1ec, %edi, %eax, %esi
+ precalc23 %ymm3, 0x20, 0xe0
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc64
+ calc_f2_pre 0x200, %esi, %edi, %ebx
+ precalc32 %ymm5, %ymm3
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc65
+ calc_f2_pre 0x204, %ebx, %esi, %ecx
+ precalc33 %ymm14, %ymm15
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc66
+ calc_f2_pre 0x208, %ecx, %ebx, %edx
+ precalc34 %ymm8
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc67
+ calc_f2_pre 0x20c, %edx, %ecx, %eax
+ precalc35 %ymm15
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc68
+ calc_f2_pre 0x220, %eax, %edx, %edi
+ precalc36 %ymm15
+ calc_f2_post %eax, %ecx, %ebx, %edi
*** 1002 LINES SKIPPED ***