svn commit: r300819 - in vendor-sys/skein: . dist dist/Additional_Implementations dist/Optimized_32bit dist/Optimized_64bit dist/README dist/Reference_Implementation dist/Supporting_Documentation d...

Allan Jude allanjude at FreeBSD.org
Fri May 27 02:42:48 UTC 2016


Author: allanjude
Date: Fri May 27 02:42:46 2016
New Revision: 300819
URL: https://svnweb.freebsd.org/changeset/base/300819

Log:
  Import Skein 1.3
  
  Bruce Schneier's hashing algorithm
  
  Used by newer versions of ZFS

Added:
  vendor-sys/skein/
  vendor-sys/skein/dist/
  vendor-sys/skein/dist/Additional_Implementations/
  vendor-sys/skein/dist/Additional_Implementations/Atmel_AVR.c   (contents, props changed)
  vendor-sys/skein/dist/Additional_Implementations/skein_8bit_estimates.xls   (contents, props changed)
  vendor-sys/skein/dist/Additional_Implementations/skein_MSC_v9_perf.txt   (contents, props changed)
  vendor-sys/skein/dist/Additional_Implementations/skein_block_x64.asm
  vendor-sys/skein/dist/Additional_Implementations/skein_block_x64.s   (contents, props changed)
  vendor-sys/skein/dist/Additional_Implementations/skein_block_x86.asm
  vendor-sys/skein/dist/Additional_Implementations/skein_block_xmm32.asm
  vendor-sys/skein/dist/Additional_Implementations/skein_block_xmm32.s   (contents, props changed)
  vendor-sys/skein/dist/Additional_Implementations/skein_perf_core2.txt   (contents, props changed)
  vendor-sys/skein/dist/Additional_Implementations/skein_rot_search2.c   (contents, props changed)
  vendor-sys/skein/dist/Additional_Implementations/skein_test.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/
  vendor-sys/skein/dist/Optimized_32bit/SHA3api_ref.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/SHA3api_ref.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/brg_endian.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/brg_types.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/skein.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/skein.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/skein_block.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/skein_debug.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/skein_debug.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/skein_iv.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_32bit/skein_port.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/
  vendor-sys/skein/dist/Optimized_64bit/SHA3api_ref.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/SHA3api_ref.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/brg_endian.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/brg_types.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/skein.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/skein.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/skein_block.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/skein_debug.c   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/skein_debug.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/skein_iv.h   (contents, props changed)
  vendor-sys/skein/dist/Optimized_64bit/skein_port.h   (contents, props changed)
  vendor-sys/skein/dist/README/
  vendor-sys/skein/dist/README/readme.txt   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/
  vendor-sys/skein/dist/Reference_Implementation/SHA3api_ref.c   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/SHA3api_ref.h   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/brg_endian.h   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/brg_types.h   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/skein.c   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/skein.h   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/skein_block.c   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/skein_debug.c   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/skein_debug.h   (contents, props changed)
  vendor-sys/skein/dist/Reference_Implementation/skein_port.h   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/
  vendor-sys/skein/dist/Supporting_Documentation/Skein Cover Sheet.pdf   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/Skein_Implementation_Statement.pdf   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/Skein_Submitter_Statement.pdf   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/skein1.3.pdf   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/skeinround3Mods.pdf   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/
  vendor-sys/skein/dist/Supporting_Documentation/tex/key_recover.pdf   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/reverserounds256.pdf   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-21.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-22.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-23.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-24.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-25.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-31.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-32.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-33.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-41.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-42.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-51.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-52.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-53.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-61.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-71.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein-81.mps   (contents, props changed)
  vendor-sys/skein/dist/Supporting_Documentation/tex/skein1.3.tex
  vendor-sys/skein/dist/Supporting_Documentation/tex/skeinround3Mods.tex

Added: vendor-sys/skein/dist/Additional_Implementations/Atmel_AVR.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ vendor-sys/skein/dist/Additional_Implementations/Atmel_AVR.c	Fri May 27 02:42:46 2016	(r300819)
@@ -0,0 +1,77 @@
+#include <stdio.h>
+#include "skein.h"
+
+#define   SKEIN_CODE_SIZE (1)       /* instantiate the *_CodeSize() routines that main() calls */
+#define   SKEIN_LOOP    (111)       /* unroll only 8 rounds (one decimal digit per block size: 256/512/1024) */
+#define   SKEIN_USE_ASM (512+1024)  /* bit mask of block sizes to exclude here: 512 and 1024, leaving Skein-256 */
+#include "skein.c"
+#include "skein_block.c"
+
+/* for code size limitations, make "dummy" (empty-bodied) versions of the block functions selected by SKEIN_USE_ASM */
+#if SKEIN_USE_ASM & 256             /* false with the mask above */
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }
+#endif
+#if SKEIN_USE_ASM & 512             /* true with the mask above: stub is emitted */
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }
+#endif
+#if SKEIN_USE_ASM & 1024            /* true with the mask above: stub is emitted */
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) { }
+#endif
+
+const u08b_t msg[1] = /* the one-byte (zero) message that main() hashes */
+  {
+  0
+  };
+
+int main(int argc,char *argv[])    /* print code-size figures for the enabled Skein variant, hash msg[], print 5 digest bytes */
+    {
+    u08b_t hash[1024/8];            /* digest buffer, sized for the widest (1024-bit) output */
+    u08b_t i,x;                     /* i: digest byte index, x: running XOR of the bytes printed so far */
+    static size_t aBytes,bBytes,uCount;
+
+#if !(SKEIN_USE_ASM & 256)         /* section compiled only when Skein-256 is not excluded */
+    Skein_256_Ctxt_t ctx;           /* each section declares its own ctx: presumably exactly one section is enabled */
+
+    aBytes = 2*Skein_256_API_CodeSize();            /* NOTE(review): factor 2 presumably converts words to bytes -- confirm */
+    bBytes = 2*Skein_256_Process_Block_CodeSize();
+    uCount =   Skein_256_Unroll_Cnt();
+
+    Skein_256_Init  (&ctx,256);                     /* hash the one-byte msg[] into hash[] */
+    Skein_256_Update(&ctx,msg,sizeof(msg));
+    Skein_256_Final (&ctx,hash);
+
+    Skein_256_Process_Block(&ctx,msg,1,256);        /* NOTE(review): direct call after Final; msg[] is 1 byte, verify the callee's read size */
+#endif
+
+#if !(SKEIN_USE_ASM & 512)         /* section compiled only when Skein-512 is not excluded */
+    Skein_512_Ctxt_t ctx;
+
+    aBytes = 2*Skein_512_API_CodeSize();
+    bBytes = 2*Skein_512_Process_Block_CodeSize();
+    uCount =   Skein_512_Unroll_Cnt();
+
+    Skein_512_Init  (&ctx,512);
+    Skein_512_Update(&ctx,msg,sizeof(msg));
+    Skein_512_Final (&ctx,hash);
+
+    Skein_512_Process_Block(&ctx,msg,1,512);
+#endif
+
+#if !(SKEIN_USE_ASM & 1024)        /* section compiled only when Skein-1024 is not excluded */
+    Skein1024_Ctxt_t ctx;
+
+    aBytes = 2*Skein1024_API_CodeSize();
+    bBytes = 2*Skein1024_Process_Block_CodeSize();
+    uCount =   Skein1024_Unroll_Cnt();
+
+    Skein1024_Init  (&ctx,1024);
+    Skein1024_Update(&ctx,msg,sizeof(msg));
+    Skein1024_Final (&ctx,hash);
+
+    Skein1024_Process_Block(&ctx,msg,1,1024);
+#endif
+    printf("API size = %4lu bytes. Block size = %4lu bytes. Unroll=%lu\n",
+              (unsigned long)aBytes,(unsigned long)bBytes,(unsigned long)uCount);  /* size_t is cast to match the %lu conversions */
+    for (i=x=0;i<5;i++)             /* first five digest bytes, each followed by the XOR of all bytes shown so far */
+        printf("hash[%d] = %02X [%02X]\n",i,hash[i],x ^= hash[i]);
+    }

Added: vendor-sys/skein/dist/Additional_Implementations/skein_8bit_estimates.xls
==============================================================================
Binary file. No diff available.

Added: vendor-sys/skein/dist/Additional_Implementations/skein_MSC_v9_perf.txt
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ vendor-sys/skein/dist/Additional_Implementations/skein_MSC_v9_perf.txt	Fri May 27 02:42:46 2016	(r300819)
@@ -0,0 +1,129 @@
+File STDIN:
+      1_ ||  2802.00  2814.00  |  5952.00  5952.00  | 30606.00 30606.00  | //: 32-bit, MSC_v9.00 [ C =...]
+     10_ ||   278.40   278.40  |   593.40   593.40  |  3063.00  3063.00  | //: 32-bit, MSC_v9.00 [ C =...]
+    100_ ||    65.52    65.58  |    88.02    88.08  |   306.30   306.30  | //: 32-bit, MSC_v9.00 [ C =...]
+   1000_ ||    41.26    41.41  |    47.96    47.96  |   135.28   135.29  | //: 32-bit, MSC_v9.00 [ C =...]
+  10000_ ||    38.86    39.08  |    44.13    44.21  |   119.88   120.11  | //: 32-bit, MSC_v9.00 [ C =...]
+ 100000_ ||    38.85    39.09  |    43.56    43.77  |   105.79   114.18  | //: 32-bit, MSC_v9.00 [ C =...]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+  Block  ||        10192 bytes |        22960 bytes |        53072 bytes | //: 32-bit, MSC_v9.00 [ C =...]
+      1_ ||   780.00   786.00  |  1110.00  1110.00  |  3288.00  3318.00  | //: 64-bit, MSC_v9.00 [ C =...]
+     10_ ||    78.60    79.80  |   109.80   109.80  |   331.20   331.80  | //: 64-bit, MSC_v9.00 [ C =...]
+    100_ ||    16.74    16.80  |    15.54    15.54  |    33.30    33.30  | //: 64-bit, MSC_v9.00 [ C =...]
+   1000_ ||     9.88    10.67  |     7.38     7.38  |    14.16    14.17  | //: 64-bit, MSC_v9.00 [ C =...]
+  10000_ ||     9.21     9.22  |     6.60     6.60  |    12.27    12.39  | //: 64-bit, MSC_v9.00 [ C =...]
+ 100000_ ||     9.98    10.01  |     7.04     7.08  |    12.36    13.14  | //: 64-bit, MSC_v9.00 [ C =...]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+  Block  ||         2272 bytes |         4944 bytes |        15264 bytes | //: 64-bit, MSC_v9.00 [ C =...]
+      1_ ||  2484.00  2490.00  |  4830.00  4836.00  | 22182.00 22188.00  | //: 32-bit, MSC_v9.00 [asm=...]
+     10_ ||   250.20   252.00  |   485.40   488.40  |  1936.80  1959.00  | //: 32-bit, MSC_v9.00 [asm=...]
+    100_ ||    58.62    58.68  |    70.74    70.80  |   221.76   221.76  | //: 32-bit, MSC_v9.00 [asm=...]
+   1000_ ||    34.12    34.16  |    35.44    35.44  |    85.27    85.31  | //: 32-bit, MSC_v9.00 [asm=...]
+  10000_ ||    34.78    34.98  |    35.36    35.36  |    86.31    86.35  | //: 32-bit, MSC_v9.00 [asm=...]
+ 100000_ ||    32.96    33.40  |    33.29    33.60  |    75.79    76.81  | //: 32-bit, MSC_v9.00 [asm=...]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+  Block  ||         7588 bytes |        16636 bytes |        38262 bytes | //: 32-bit, MSC_v9.00 [asm=...]
+      1_ ||   672.00   672.00  |  1068.00  1068.00  |  1920.00  1926.00  | //: 64-bit, MSC_v9.00 [asm=...]
+     10_ ||    64.80    65.40  |   107.40   108.00  |   192.00   192.60  | //: 64-bit, MSC_v9.00 [asm=...]
+    100_ ||    15.54    15.60  |    16.20    16.26  |    21.06    21.06  | //: 64-bit, MSC_v9.00 [asm=...]
+   1000_ ||     8.18     8.18  |     6.97     6.97  |     7.77     7.78  | //: 64-bit, MSC_v9.00 [asm=...]
+  10000_ ||     7.59     7.59  |     6.23     6.23  |     6.69     6.69  | //: 64-bit, MSC_v9.00 [asm=...]
+ 100000_ ||     7.55     7.71  |     6.14     6.38  |     6.56     6.86  | //: 64-bit, MSC_v9.00 [asm=...]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+  Block  ||         2323 bytes |         4733 bytes |        11817 bytes | //: 64-bit, MSC_v9.00 [asm=...]
+      1_ ||  2952.00  2958.00  |  6030.00  6036.00  | 13668.00 13674.00  | //: 32-bit, MSC_v9.00 [ C =111]
+     10_ ||   295.80   295.80  |   603.00   603.60  |  1366.80  1366.80  | //: 32-bit, MSC_v9.00 [ C =111]
+    100_ ||    69.96    70.02  |    88.98    89.04  |   136.92   137.52  | //: 32-bit, MSC_v9.00 [ C =111]
+   1000_ ||    43.90    43.96  |    48.78    48.85  |    60.08    60.11  | //: 32-bit, MSC_v9.00 [ C =111]
+  10000_ ||    41.53    41.59  |    44.76    44.80  |    53.01    53.01  | //: 32-bit, MSC_v9.00 [ C =111]
+ 100000_ ||    41.32    41.60  |    44.52    44.62  |    51.75    51.92  | //: 32-bit, MSC_v9.00 [ C =111]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+  Block  ||         1712 bytes |         3664 bytes |         7200 bytes | //: 32-bit, MSC_v9.00 [ C =111]
+      1_ ||   780.00   786.00  |  1422.00  1434.00  |  3810.00  3816.00  | //: 64-bit, MSC_v9.00 [ C =111]
+     10_ ||    75.60    76.20  |   140.40   140.40  |   380.40   381.00  | //: 64-bit, MSC_v9.00 [ C =111]
+    100_ ||    17.16    17.22  |    20.52    21.00  |    38.22    38.28  | //: 64-bit, MSC_v9.00 [ C =111]
+   1000_ ||     9.69     9.69  |    10.42    10.42  |    16.51    16.51  | //: 64-bit, MSC_v9.00 [ C =111]
+  10000_ ||     8.97     8.97  |     9.38     9.38  |    14.38    14.40  | //: 64-bit, MSC_v9.00 [ C =111]
+ 100000_ ||     9.18     9.71  |     9.35     9.49  |    14.79    14.99  | //: 64-bit, MSC_v9.00 [ C =111]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+  Block  ||          704 bytes |         1456 bytes |         2976 bytes | //: 64-bit, MSC_v9.00 [ C =111]
+      1_ ||  2580.00  2598.00  |  4842.00  4848.00  | 10578.00 10602.00  | //: 32-bit, MSC_v9.00 [asm=111]
+     10_ ||   259.80   259.80  |   484.20   484.20  |  1059.60  1060.20  | //: 32-bit, MSC_v9.00 [asm=111]
+    100_ ||    57.18    57.24  |    66.42    66.48  |    98.40    98.46  | //: 32-bit, MSC_v9.00 [asm=111]
+   1000_ ||    35.56    35.59  |    35.96    35.96  |    42.79    42.80  | //: 32-bit, MSC_v9.00 [asm=111]
+  10000_ ||    33.69    36.50  |    33.29    33.42  |    37.98    41.34  | //: 32-bit, MSC_v9.00 [asm=111]
+ 100000_ ||    33.96    34.57  |    33.93    35.69  |    38.04    38.20  | //: 32-bit, MSC_v9.00 [asm=111]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+  Block  ||         1276 bytes |         2532 bytes |         4983 bytes | //: 32-bit, MSC_v9.00 [asm=111]
+      1_ ||   678.00   678.00  |  1098.00  1098.00  |  2034.00  2040.00  | //: 64-bit, MSC_v9.00 [asm=111]
+     10_ ||    66.60    66.60  |   109.80   109.80  |   204.00   204.00  | //: 64-bit, MSC_v9.00 [asm=111]
+    100_ ||    15.48    16.68  |    16.98    16.98  |    22.38    22.38  | //: 64-bit, MSC_v9.00 [asm=111]
+   1000_ ||     8.45     8.45  |     7.93     7.93  |     8.39     8.39  | //: 64-bit, MSC_v9.00 [asm=111]
+  10000_ ||     7.81     7.81  |     6.50     6.50  |     7.18     7.18  | //: 64-bit, MSC_v9.00 [asm=111]
+ 100000_ ||     8.08     8.09  |     6.40     6.71  |     6.98     7.21  | //: 64-bit, MSC_v9.00 [asm=111]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+  Block  ||          664 bytes |         1074 bytes |         2221 bytes | //: 64-bit, MSC_v9.00 [asm=111]
+      1_ ||  2988.00  2994.00  |  6240.00  6246.00  | 13794.00 13800.00  | //: 32-bit, MSC_v9.00 [ C =332]
+     10_ ||   297.60   299.40  |   623.40   624.00  |  1379.40  1380.00  | //: 32-bit, MSC_v9.00 [ C =332]
+    100_ ||    70.26    70.32  |    91.92    91.92  |   138.00   138.06  | //: 32-bit, MSC_v9.00 [ C =332]
+   1000_ ||    44.88    44.89  |    50.20    50.20  |    60.44    60.45  | //: 32-bit, MSC_v9.00 [ C =332]
+  10000_ ||    42.42    42.42  |    46.30    46.31  |    53.29    53.31  | //: 32-bit, MSC_v9.00 [ C =332]
+ 100000_ ||    42.21    42.50  |    43.60    45.77  |    49.55    50.03  | //: 32-bit, MSC_v9.00 [ C =332]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+  Block  ||         4560 bytes |         9232 bytes |        12560 bytes | //: 32-bit, MSC_v9.00 [ C =332]
+      1_ ||   780.00   798.00  |  1920.00  1920.00  |  3732.00  3732.00  | //: 64-bit, MSC_v9.00 [ C =332]
+     10_ ||    76.80    76.80  |   189.00   191.40  |   402.60   402.60  | //: 64-bit, MSC_v9.00 [ C =332]
+    100_ ||    17.10    17.16  |    27.66    27.90  |    37.62    37.62  | //: 64-bit, MSC_v9.00 [ C =332]
+   1000_ ||     9.98    10.12  |    14.23    14.25  |    16.13    16.13  | //: 64-bit, MSC_v9.00 [ C =332]
+  10000_ ||     9.27     9.28  |    12.89    12.99  |    13.98    13.98  | //: 64-bit, MSC_v9.00 [ C =332]
+ 100000_ ||     9.32     9.56  |    13.12    13.19  |    14.15    14.23  | //: 64-bit, MSC_v9.00 [ C =332]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+  Block  ||         1200 bytes |         2928 bytes |         5008 bytes | //: 64-bit, MSC_v9.00 [ C =332]
+      1_ ||  2598.00  2604.00  |  4866.00  4878.00  | 10614.00 10632.00  | //: 32-bit, MSC_v9.00 [asm=332]
+     10_ ||   260.40   261.00  |   490.20   490.20  |  1067.40  1067.40  | //: 32-bit, MSC_v9.00 [asm=332]
+    100_ ||    60.78    60.78  |    72.00    72.00  |   106.86   106.92  | //: 32-bit, MSC_v9.00 [asm=332]
+   1000_ ||    38.38    38.42  |    39.17    39.19  |    46.49    46.61  | //: 32-bit, MSC_v9.00 [asm=332]
+  10000_ ||    40.98    47.69  |    35.81    35.86  |    40.96    43.93  | //: 32-bit, MSC_v9.00 [asm=332]
+ 100000_ ||    34.46    36.34  |    34.07    37.16  |    39.60    43.18  | //: 32-bit, MSC_v9.00 [asm=332]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+  Block  ||         3060 bytes |         6300 bytes |         8835 bytes | //: 32-bit, MSC_v9.00 [asm=332]
+      1_ ||   684.00   690.00  |  1104.00  1104.00  |  2028.00  2034.00  | //: 64-bit, MSC_v9.00 [asm=332]
+     10_ ||    70.80    70.80  |   120.00   120.00  |   219.00   219.00  | //: 64-bit, MSC_v9.00 [asm=332]
+    100_ ||    15.72    15.72  |    16.74    16.74  |    22.20    22.20  | //: 64-bit, MSC_v9.00 [asm=332]
+   1000_ ||     8.42     8.42  |     7.22     7.22  |     8.30     8.30  | //: 64-bit, MSC_v9.00 [asm=332]
+  10000_ ||     7.85     8.51  |     6.58     6.58  |     7.11     7.12  | //: 64-bit, MSC_v9.00 [asm=332]
+ 100000_ ||     7.80     9.43  |     6.90     7.71  |     7.18     8.48  | //: 64-bit, MSC_v9.00 [asm=332]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+  Block  ||         1288 bytes |         2182 bytes |         3449 bytes | //: 64-bit, MSC_v9.00 [asm=332]
+      1_ ||  2994.00  2994.00  |  6240.00  6240.00  | 14598.00 14604.00  | //: 32-bit, MSC_v9.00 [ C =335]
+     10_ ||   300.60   301.20  |   624.00   624.60  |  1459.20  1461.00  | //: 32-bit, MSC_v9.00 [ C =335]
+    100_ ||    70.62    70.68  |    91.86    91.92  |   146.10   146.16  | //: 32-bit, MSC_v9.00 [ C =335]
+   1000_ ||    44.65    44.65  |    50.20    50.20  |    62.74    62.76  | //: 32-bit, MSC_v9.00 [ C =335]
+  10000_ ||    42.16    42.42  |    46.31    46.73  |    55.11    55.13  | //: 32-bit, MSC_v9.00 [ C =335]
+ 100000_ ||    40.09    40.55  |    45.76    45.97  |    51.00    53.08  | //: 32-bit, MSC_v9.00 [ C =335]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+  Block  ||         4560 bytes |         9232 bytes |        29280 bytes | //: 32-bit, MSC_v9.00 [ C =335]
+      1_ ||   780.00   798.00  |  1890.00  1920.00  |  3498.00  3498.00  | //: 64-bit, MSC_v9.00 [ C =335]
+     10_ ||    77.40    78.00  |   190.80   195.00  |   350.40   379.20  | //: 64-bit, MSC_v9.00 [ C =335]
+    100_ ||    17.10    17.10  |    27.72    28.08  |    35.28    35.28  | //: 64-bit, MSC_v9.00 [ C =335]
+   1000_ ||     9.95    10.00  |    14.23    14.24  |    15.09    15.10  | //: 64-bit, MSC_v9.00 [ C =335]
+  10000_ ||     9.30    10.06  |    12.94    14.10  |    13.07    14.36  | //: 64-bit, MSC_v9.00 [ C =335]
+ 100000_ ||     9.33     9.58  |    13.94    13.95  |    13.24    13.92  | //: 64-bit, MSC_v9.00 [ C =335]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+  Block  ||         1200 bytes |         2928 bytes |        10880 bytes | //: 64-bit, MSC_v9.00 [ C =335]
+      1_ ||  2586.00  2592.00  |  4896.00  4902.00  | 10668.00 10668.00  | //: 32-bit, MSC_v9.00 [asm=335]
+     10_ ||   263.40   263.40  |   489.60   489.60  |  1069.20  1069.80  | //: 32-bit, MSC_v9.00 [asm=335]
+    100_ ||    61.08    61.14  |    72.30    72.36  |   107.04   107.10  | //: 32-bit, MSC_v9.00 [asm=335]
+   1000_ ||    35.57    35.57  |    36.11    36.12  |    43.07    43.12  | //: 32-bit, MSC_v9.00 [asm=335]
+  10000_ ||    33.68    34.51  |    33.29    36.32  |    37.91    39.80  | //: 32-bit, MSC_v9.00 [asm=335]
+ 100000_ ||    36.32    36.43  |    35.91    35.98  |    38.02    38.19  | //: 32-bit, MSC_v9.00 [asm=335]
+    API  ||          864 bytes |          704 bytes |          720 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+  Block  ||         3060 bytes |         6300 bytes |        20391 bytes | //: 32-bit, MSC_v9.00 [asm=335]
+      1_ ||   684.00   690.00  |  1104.00  1104.00  |  2022.00  2022.00  | //: 64-bit, MSC_v9.00 [asm=335]
+     10_ ||    65.40    65.40  |   109.80   109.80  |   201.60   202.20  | //: 64-bit, MSC_v9.00 [asm=335]
+    100_ ||    15.78    15.78  |    16.80    16.80  |    22.02    22.08  | //: 64-bit, MSC_v9.00 [asm=335]
+   1000_ ||     8.41     8.42  |     7.21     7.22  |     8.24     8.26  | //: 64-bit, MSC_v9.00 [asm=335]
+  10000_ ||     7.84     7.84  |     6.45     6.50  |     7.12     7.12  | //: 64-bit, MSC_v9.00 [asm=335]
+ 100000_ ||     8.11     8.11  |     6.49     6.74  |     6.95     7.26  | //: 64-bit, MSC_v9.00 [asm=335]
+    API  ||          992 bytes |         1312 bytes |          864 bytes | //: 64-bit, MSC_v9.00 [asm=335]
+  Block  ||         1288 bytes |         2182 bytes |         7133 bytes | //: 64-bit, MSC_v9.00 [asm=335]

Added: vendor-sys/skein/dist/Additional_Implementations/skein_block_x64.asm
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ vendor-sys/skein/dist/Additional_Implementations/skein_block_x64.asm	Fri May 27 02:42:46 2016	(r300819)
@@ -0,0 +1,1335 @@
+;
+;----------------------------------------------------------------
+; 64-bit x86 assembler code (Microsoft ML64) for Skein block functions
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;----------------------------------------------------------------
+;
+    .code
+;
+_MASK_ALL_  equ (256+512+1024)      ;all three algorithm bits
+_MAX_FRAME_ equ 240                 ;upper bound applied to FRAME_OFFS in Setup_Stack (for .setframe)
+;
+;;;;;;;;;;;;;;;;; _USE_ASM_: bit mask of block sizes (256/512/1024) selected by SKEIN_USE_ASM
+ifndef SKEIN_USE_ASM
+_USE_ASM_        = _MASK_ALL_       ;not configured: all three bits set
+elseif SKEIN_USE_ASM and _MASK_ALL_ ;at least one of the three algorithm bits was given
+_USE_ASM_        = SKEIN_USE_ASM
+else                                ;none of the three bits given: fall back to all of them
+_USE_ASM_        = _MASK_ALL_
+endif
+;;;;;;;;;;;;;;;;; SKEIN_LOOP: three decimal digits, one unroll count per block size
+ifndef SKEIN_LOOP                           ;configure loop unrolling
+_SKEIN_LOOP       = 0                       ;default is all fully unrolled
+else
+_SKEIN_LOOP       = SKEIN_LOOP
+endif
+; the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) mod 10     ;hundreds digit
+SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) mod 10     ;tens     digit
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) mod 10     ;units    digit
+; SKEIN_ASM_UNROLL: bit mask of the block sizes whose unroll count is 0 (fully unrolled)
+SKEIN_ASM_UNROLL  = 0
+  irp _NN_,<256,512,1024>
+    if (SKEIN_UNROLL_&_NN_) eq 0
+SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + _NN_
+    endif
+  endm
+;;;;;;;;;;;;;;;;; SKEIN_ROUNDS: three decimal digits, one round-count selector per block size
+; a digit d selects 8*(((d+5) mod 10)+5) rounds: d=5 gives 40, d=9 gives 72, d=0 gives 80, d=4 gives 112
+ifndef SKEIN_ROUNDS
+ROUNDS_256  =   72                  ;defaults when SKEIN_ROUNDS is not defined
+ROUNDS_512  =   72
+ROUNDS_1024 =   80
+else
+ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5)
+ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) mod 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) mod 10) + 5)
+endif
+; report the round count of every selected block size at assembly time
+irp _NN_,<256,512,1024>
+  if _USE_ASM_ and _NN_
+    irp _RR_,<%(ROUNDS_&_NN_)>
+      if _NN_ eq 1024               ;the two messages differ only by one space of padding
+%out  +++ SKEIN_ROUNDS_&_NN_ = _RR_
+      else
+%out  +++ SKEIN_ROUNDS_&_NN_  = _RR_
+      endif
+    endm
+  endif
+endm
+;;;;;;;;;;;;;;;;;
+; defining SKEIN_PERF implies SKEIN_CODE_SIZE unless the latter is already defined
+ifndef SKEIN_CODE_SIZE
+ifdef  SKEIN_PERF
+SKEIN_CODE_SIZE equ (1)
+endif
+endif
+;
+;;;;;;;;;;;;;;;;;
+; _SKEIN_DEBUG: 1 when SKEIN_DEBUG is defined (any value), else 0
+ifndef SKEIN_DEBUG
+_SKEIN_DEBUG      = 0
+else
+_SKEIN_DEBUG      = 1
+endif
+;;;;;;;;;;;;;;;;;
+; context-structure layout and shared constants
+; define offsets of fields in hash context structure
+; (byte offsets; each one is the previous offset plus the previous field's size)
+HASH_BITS   =   0                   ;offset  0: # bits of hash output
+BCNT        =   8 + HASH_BITS       ;offset  8: number of bytes in BUFFER[]
+TWEAK       =   8 + BCNT            ;offset 16: tweak values[0..1]
+X_VARS      =  16 + TWEAK           ;offset 32: chaining vars
+;
+;(Note: buffer[] in context structure is NOT needed here :-)
+; two-character-number aliases for r8/r9 (presumably so register names line up in columns)
+r08     equ     <r8>
+r09     equ     <r9>
+;
+KW_PARITY   =   01BD11BDAA9FC1A22h  ;overall parity of key schedule words
+FIRST_MASK  =   NOT (1 SHL 62)      ;all ones except bit 62 (presumably the tweak "first block" flag -- confirm against skein.h)
+; rotation constants for Skein, named RC_<blkBits>_<round>_<mix>:
+;   blkBits = 256/512/1024, round = 0..7, mix = 0..(blkBits/128 - 1)
+;   (the RotL64 macro builds these names from its arguments)
+RC_256_0_0  = 14                    ;Skein-256: 2 constants per round
+RC_256_0_1  = 16
+
+RC_256_1_0  = 52
+RC_256_1_1  = 57
+
+RC_256_2_0  = 23
+RC_256_2_1  = 40
+
+RC_256_3_0  =  5
+RC_256_3_1  = 37
+
+RC_256_4_0  = 25
+RC_256_4_1  = 33
+
+RC_256_5_0  = 46
+RC_256_5_1  = 12
+
+RC_256_6_0  = 58
+RC_256_6_1  = 22
+
+RC_256_7_0  = 32
+RC_256_7_1  = 32
+
+RC_512_0_0  = 46                    ;Skein-512: 4 constants per round
+RC_512_0_1  = 36
+RC_512_0_2  = 19
+RC_512_0_3  = 37
+
+RC_512_1_0  = 33
+RC_512_1_1  = 27
+RC_512_1_2  = 14
+RC_512_1_3  = 42
+
+RC_512_2_0  = 17
+RC_512_2_1  = 49
+RC_512_2_2  = 36
+RC_512_2_3  = 39
+
+RC_512_3_0  = 44
+RC_512_3_1  =  9
+RC_512_3_2  = 54
+RC_512_3_3  = 56
+
+RC_512_4_0  = 39
+RC_512_4_1  = 30
+RC_512_4_2  = 34
+RC_512_4_3  = 24
+
+RC_512_5_0  = 13
+RC_512_5_1  = 50
+RC_512_5_2  = 10
+RC_512_5_3  = 17
+
+RC_512_6_0  = 25
+RC_512_6_1  = 29
+RC_512_6_2  = 39
+RC_512_6_3  = 43
+
+RC_512_7_0  =  8
+RC_512_7_1  = 35
+RC_512_7_2  = 56
+RC_512_7_3  = 22
+
+RC_1024_0_0 = 24                    ;Skein-1024: 8 constants per round
+RC_1024_0_1 = 13
+RC_1024_0_2 =  8
+RC_1024_0_3 = 47
+RC_1024_0_4 =  8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 =  4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 =  5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 =  9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 =  4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 =  9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+; MACRO: rotate a register left by one of the named rotation constants
+;  Input:  reg, plus the block size, round number and mix number that select the constant
+; Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
+;         (no instruction is emitted when the count, taken mod 64, is zero)
+RotL64 macro reg,BLK_SIZE,ROUND_NUM,MIX_NUM
+_RCNT_ = ( RC_&BLK_SIZE&_&ROUND_NUM&_&MIX_NUM AND 63 )     ;look up the constant by pasted name, keep the low 6 bits
+  if _RCNT_  ;is there anything to do?
+    rol     reg,_RCNT_
+  endif
+endm
+;
+;----------------------------------------------------------------
+;
+; MACROS: define local vars and configure stack
+;
+;----------------------------------------------------------------
+; declare allocated space on the stack: bind localName to the next free offset, then reserve localSize bytes
+StackVar    macro localName,localSize
+localName   =   _STK_OFFS_                  ;name = running offset before this variable
+_STK_OFFS_  =   _STK_OFFS_+(localSize)      ;advance the running offset past it
+endm ;StackVar
+;
+;----------------------------------------------------------------
+; MACRO: Configure stack frame, allocate local vars
+;   BLK_BITS = block size in bits; KS_CNT = number of 16-byte ksRot slots (looped builds only); NO_FRAME nonblank = omit .setframe; debugCnt = number of 8-byte debug slots (debug builds only)
+;   entry: rcx,rdx,r8,r9 = ctxPtr,blkPtr,blkCnt,bitAdd      exit: rbp = rsp+FRAME_OFFS, rdi = ctxPtr, nonvolatile regs pushed
+Setup_Stack macro BLK_BITS,KS_CNT,NO_FRAME,debugCnt
+    WCNT    =    (BLK_BITS)/64      ;number of 64-bit words per block
+;
+_PushCnt_   =   0                   ;save nonvolatile regs on stack
+  irp _reg_,<rbp,rsi,rdi,rbx,r12,r13,r14,r15>
+       push     _reg_
+      .pushreg  _reg_               ;pseudo-op push for exception handling
+_PushCnt_ = _PushCnt_ + 1           ;track count to keep alignment
+  endm
+;
+_STK_OFFS_  =   0                   ;starting offset from rsp
+    ;---- local  variables         ;<-- rsp
+    StackVar    X_stk  ,8*(WCNT)    ;local context vars
+    StackVar    ksTwk  ,8*3         ;key schedule: tweak words
+    StackVar    ksKey  ,8*(WCNT)+8  ;key schedule: key   words
+  if (SKEIN_ASM_UNROLL and (BLK_BITS)) eq 0     ;only when this block size is not fully unrolled
+    StackVar    ksRot ,16*(KS_CNT+0);leave space for "rotation" to happen
+  endif
+    StackVar    Wcopy  ,8*(WCNT)    ;copy of input block
+  if _SKEIN_DEBUG
+  ifnb  <debugCnt>                  ;temp location for debug X[] info
+    StackVar    xDebug_&BLK_BITS ,8*(debugCnt)
+  endif
+  endif
+  if ((8*_PushCnt_ + _STK_OFFS_) and 8) eq 0    ;pushes + locals is a multiple of 16: pad so that adding retAddr makes it one
+    StackVar    align16,8           ;keep 16-byte aligned (the 8-byte retAddr supplies the other half)
+tmpStk_&BLK_BITS = align16          ;use this (NOTE(review): defined only when the pad is needed)
+  endif
+LOCAL_SIZE  =   _STK_OFFS_          ;size of local vars
+    ;---- everything below is already on the stack when the locals are allocated
+    StackVar    savRegs,8*_PushCnt_ ;saved registers
+    StackVar    retAddr,8           ;return address
+    ;---- caller parameters (the four slots directly above the return address)
+    StackVar    ctxPtr ,8           ;context ptr
+    StackVar    blkPtr ,8           ;pointer to block data
+    StackVar    blkCnt ,8           ;number of full blocks to process
+    StackVar    bitAdd ,8           ;bit count to add to tweak (NOTE(review): the C stubs name this byteCntAdd; confirm unit)
+    ;---- caller's stack frame
+;
+; set up the stack frame pointer (rbp)
+;
+FRAME_OFFS  =   ksTwk + 128         ;allow short (negative) offset to ksTwk, ksKey
+  if FRAME_OFFS gt _STK_OFFS_       ;keep rbp in the "locals" range
+FRAME_OFFS  =      _STK_OFFS_
+  endif
+  if FRAME_OFFS gt _MAX_FRAME_      ;keep Microsoft .setframe happy
+FRAME_OFFS  =      _MAX_FRAME_
+  endif
+; optional assembly-time report; thresholds are tested in increasing order (Wcopy, savRegs, _STK_OFFS_) so every message can fire
+ifdef SKEIN_ASM_INFO
+  if     FRAME_OFFS+128 lt Wcopy
+%out +++ SKEIN_&BLK_BITS: Unable to reach all of Wcopy with short offset from rbp.
+  elseif FRAME_OFFS+128 lt savRegs
+%out +++ SKEIN_&BLK_BITS: Unable to reach end of Wcopy with short offset from rbp.
+  elseif FRAME_OFFS+128 lt _STK_OFFS_
+%out +++ SKEIN_&BLK_BITS: Unable to reach caller parms with short offset from rbp
+  endif
+endif
+  ;put some useful defines in the .lst file (for grep)
+__STK_LCL_SIZE_&BLK_BITS = LOCAL_SIZE
+__STK_TOT_SIZE_&BLK_BITS = _STK_OFFS_
+__STK_FRM_OFFS_&BLK_BITS = FRAME_OFFS
+;
+; Notes on stack frame setup:
+;   * the most frequently used variable is X_stk[], based at [rsp+0]
+;   * the next most used is the key schedule arrays, ksKey and ksTwk
+;       so rbp is "centered" there, allowing short offsets to the key 
+;       schedule even in 1024-bit Skein case
+;   * the Wcopy variables are infrequently accessed, but they have long 
+;       offsets from both rsp and rbp only in the 1024-bit case.
+;   * all other local vars and calling parameters can be accessed 
+;       with short offsets, except in the 1024-bit case
+;
+    sub     rsp,LOCAL_SIZE          ;make room for the locals
+    .allocstack LOCAL_SIZE          ;pseudo op for exception handling
+    lea     rbp,[rsp+FRAME_OFFS]    ;maximize use of short offsets
+  ifb <NO_FRAME>
+    .setframe rbp,   FRAME_OFFS     ;pseudo op for exception handling
+  endif
+    mov         [FP_+ctxPtr],rcx    ;save caller's parameters on the stack
+    mov         [FP_+blkPtr],rdx
+    mov         [FP_+blkCnt],r08
+    mov         [FP_+bitAdd],r09
+    .endprolog                      ;pseudo op to support exception handling
+
+    mov     rdi,[FP_+ctxPtr ]       ;rdi --> context
+;
+endm ;Setup_Stack
+;
+; FP_ is a text equate: [FP_+var] assembles as [rbp-FRAME_OFFS+var].
+; Setup_Stack loads rbp with rsp+FRAME_OFFS, so FP_+var addresses the
+; location "var" bytes above the rsp value established by the prologue,
+; and stays valid even while later pushes move rsp.
+FP_         equ <rbp-FRAME_OFFS>    ;keep as many short offsets as possible
+;
+;----------------------------------------------------------------
+;
+; Reset_Stack: emit the function epilogue (everything except the final ret)
+; and report the code size at assembly time.
+;   procStart = label at the start of the procedure, used for the size report
+; Emits:       add rsp,LOCAL_SIZE followed by eight register pops.
+; Assembly-time side effects:
+;   * _PushCnt_ is decremented once per pop; a non-zero result raises .err
+;   * _ProcBytes_ is set to ($+1-procStart), i.e. the bytes emitted so far
+;     plus one for the ret that the caller places after this macro
+;   * the size is echoed with %out, padded so the numbers line up
+Reset_Stack macro   procStart
+    add     rsp,LOCAL_SIZE          ;get rid of locals (wipe??)
+  irp _reg_,<r15,r14,r13,r12,rbx,rdi,rsi,rbp>
+    pop     _reg_
+_PushCnt_ = _PushCnt_ - 1
+  endm
+  if _PushCnt_
+    .err    "Mismatched push/pops?"
+  endif
+
+    ;display code size in bytes to stdout
+  irp  _BCNT_,<%($+1-procStart)>    ;account for return opcode
+_ProcBytes_ = _BCNT_
+if     _BCNT_ ge 10000
+%out procStart code size = _BCNT_ bytes  
+elseif _BCNT_ ge  1000
+%out procStart code size =  _BCNT_ bytes  
+else
+%out procStart code size =   _BCNT_ bytes  
+endif
+  endm ;irp _BCNT_
+endm ; Reset_Stack
+;
+;----------------------------------------------------------------
+; macros to help debug internals
+;
+if _SKEIN_DEBUG
+    extrn   Skein_Show_Block:proc   ;calls to C routines
+    extrn   Skein_Show_Round:proc
+;
+SKEIN_RND_SPECIAL       =   1000
+SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
+;
+; Skein_Debug_Block: call the external C routine Skein_Show_Block to dump
+; the state of the block being processed.
+;   BLK_BITS = block size in bits, passed to the C routine as "bits"
+; All seven arguments are pushed, last argument first.  The first four are
+; also loaded into rcx/rdx/r08/r09; their pushes only reserve the matching
+; stack slots, so each slot is given the same value as its register.
+; The seven volatile registers saved on entry are restored on exit.  The
+; macro pushes 14 qwords (112 bytes) before the call, so rsp keeps whatever
+; alignment modulo 16 it had on entry.
+Skein_Debug_Block macro BLK_BITS
+;
+;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+;                     const u08b_t *blkPtr, const u64b_t *wPtr, 
+;                     const u64b_t *ksPtr,const u64b_t *tsPtr);
+;
+  irp _reg_,<rax,rcx,rdx,r08,r09,r10,r11>
+    push    _reg_                   ;save all volatile regs on stack before the call
+  endm
+    ; get and push call parameters
+    lea     rax,[FP_+ksTwk]         ;tweak pointer
+    push    rax
+    lea     rax,[FP_+ksKey]         ;key pointer
+    push    rax
+    lea     rax,[FP_+Wcopy]         ;wPtr
+    push    rax
+    mov     r09,[FP_+blkPtr]        ;blkPtr
+    push    r09                     ;(push register parameters anyway to make room on stack)
+    mov     rdx,[FP_+ctxPtr]        
+    lea     r08,[rdx+X_VARS]        ;X (pointer)
+    push    r08
+    push    rdx                     ;h (pointer)
+    mov     rcx, BLK_BITS           ;bits
+    push    rcx                     ;slot for "bits" mirrors rcx, like the other register parameters
+    call    Skein_Show_Block        ;call external debug handler
+    add     rsp,7*8                 ;discard parameters on stack
+  irp _reg_,<r11,r10,r09,r08,rdx,rcx,rax>
+    pop     _reg_                   ;restore regs
+  endm
+endm ; Skein_Debug_Block
+;
+;
+; the macro to "call" to debug a round
+;
+; Skein_Debug_Round (debug build): call the local routine
+; Skein_Debug_Round_<BLK_BITS> with the round number in r08.
+;   BLK_BITS = block size; selects the routine and the round-number source
+;   R        = round number, or one of the SKEIN_RND_* special codes
+;   RDI_OFFS = optional constant added to the computed round number
+;              (blank is treated as 0 through "RDI_OFFS + 0")
+;   afterOp  = optional statement emitted after the call returns
+; r08 is saved and restored around the call.
+Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp
+    ; call the appropriate (local) debug function
+    push    r08
+  if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL)
+    ; unrolled code or a special code: R is already the value to report
+    mov     r08, R
+  else                              ;compute round number using rdi
+_rOffs_ = RDI_OFFS + 0
+   if BLK_BITS eq 1024
+    mov     r08,[rsp+8+rIdx_offs]   ;get rIdx off the stack (adjust for push r08)
+    lea     r08,[4*r08+1+(((R)-1) and 3)+_rOffs_]
+   else
+    ; looping code: 4*(loop counter in rdi) + position of R within its group of 4
+    lea     r08,[4*rdi+1+(((R)-1) and 3)+_rOffs_]
+   endif
+  endif
+    call    Skein_Debug_Round_&BLK_BITS
+    pop     r08
+;
+  afterOp
+endm  ;  Skein_Debug_Round
+else  ;------- _SKEIN_DEBUG (dummy macros if debug not enabled)
+; Non-debug build: both debug macros expand to nothing, so every invocation
+; in the code paths costs zero bytes.  Note that afterOp is NOT emitted
+; here; it is only emitted by the debug version of Skein_Debug_Round.
+Skein_Debug_Block macro BLK_BITS,afterOp
+endm
+;
+Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp
+endm
+;
+endif ; _SKEIN_DEBUG
+;
+;----------------------------------------------------------------
+;
+; addReg: dstReg += source register (+ optional constant).
+;   dstReg            = destination register
+;   srcReg_A,srcReg_B = text pieces concatenated to form the source register
+;                       name (srcReg_B may be blank)
+;   useAddOp          = non-zero forces the ADD instruction (blank counts as 0)
+;   immOffs           = optional constant; when given, a three-term LEA is used
+addReg  macro   dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
+  ifb <immOffs>
+    if ((useAddOp + 0) ne 0)
+       ;caller explicitly asked for ADD
+       add     dstReg, srcReg_A&&srcReg_B
+    else
+      ifdef ASM_NO_LEA
+       add     dstReg, srcReg_A&&srcReg_B
+      else
+       ;lea seems to be faster on Core 2 Duo CPUs!
+       lea     dstReg,[srcReg_A&&srcReg_B + dstReg]
+      endif
+    endif
+  else
+       ;register + register + constant in one instruction
+       lea     dstReg,[srcReg_A&&srcReg_B + dstReg + immOffs]
+  endif
+endm
+;
+;=================================== Skein_256 =============================================
+;
+if _USE_ASM_ and 256
+    public  Skein_256_Process_Block
+;
+; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+;;;;;;;;;;;;;;;;;
+;
+; code
+;
+; Skein_256_Process_Block: process blkCnt consecutive 256-bit blocks.
+; The four caller parameters arrive in rcx, rdx, r08, r09 and are stored by
+; Setup_Stack at ctxPtr, blkPtr, blkCnt and bitAdd; Setup_Stack also leaves
+; rdi pointing at the context.
+; For each block the code:
+;   1. adds bitAdd to tweak word T0 and builds the key/tweak schedule
+;   2. loads the input block, keeps a copy in Wcopy, injects the initial key
+;   3. runs ROUNDS_256 rounds, either fully unrolled or as a loop, depending
+;      on (SKEIN_ASM_UNROLL and 256)
+;   4. XORs the result with the saved input block and stores it to ctx X[]
+; After the last block, T1 masked with FIRST_MASK is written back.
+Skein_256_Process_Block proc frame
+    Setup_Stack 256,((ROUNDS_256/8)+1)
+    mov     r14,[rdi+TWEAK+8]       ;r14 = tweak word T1, loaded once before the block loop
+    jmp   short Skein_256_block_loop
+    align   16
+    ; main hash loop for Skein_256
+Skein_256_block_loop:
+    ;
+    ; general register usage:
+    ;   RAX..RDX        = X0..X3    
+    ;   R08..R12        = ks[0..4]
+    ;   R13..R15        = ts[0..2]
+    ;   RSP, RBP        = stack/frame pointers
+    ;   RDI             = round counter or context pointer
+    ;   RSI             = temp
+    ;
+    mov     r13,[rdi+TWEAK+0]
+    add     r13,[FP_+bitAdd]        ;computed updated tweak value T0
+    mov     r15,r14
+    xor     r15,r13                 ;r15 = T0 xor T1, so r13..r15 hold ts[0..2]
+
+    mov     r12,KW_PARITY
+    mov     r08,[rdi+X_VARS+ 0]
+    mov     r09,[rdi+X_VARS+ 8]
+    mov     r10,[rdi+X_VARS+16]
+    mov     r11,[rdi+X_VARS+24]
+    mov         [rdi+TWEAK+0],r13   ;save updated tweak value ctx->h.T[0]
+    xor     r12,r08                 ;start accumulating overall parity
+
+    mov     rsi,[FP_+blkPtr ]       ;rsi --> input block
+    xor     r12,r09
+    mov     rax,[rsi+ 0]            ;get X[0..3]
+    xor     r12,r10
+    mov     rbx,[rsi+ 8]
+    xor     r12,r11                 ;r12 = KW_PARITY xor ks[0] xor ks[1] xor ks[2] xor ks[3]
+    mov     rcx,[rsi+16]
+    mov     rdx,[rsi+24]
+
+    mov         [FP_+Wcopy+ 0],rax  ;save copy of input block
+    mov         [FP_+Wcopy+ 8],rbx
+    mov         [FP_+Wcopy+16],rcx
+    mov         [FP_+Wcopy+24],rdx
+
+    add     rax, r08                ;initial key injection
+    add     rbx, r09
+    add     rcx, r10
+    add     rdx, r11
+    add     rbx, r13                ;X1 += ts[0]
+    add     rcx, r14                ;X2 += ts[1]
+
+if _SKEIN_DEBUG
+    mov         [rdi+TWEAK+ 8],r14  ;save updated tweak T[1] (start bit cleared?)
+    mov         [FP_+ksKey+ 0],r08  ;save key schedule on stack for Skein_Debug_Block
+    mov         [FP_+ksKey+ 8],r09
+    mov         [FP_+ksKey+16],r10
+    mov         [FP_+ksKey+24],r11
+    mov         [FP_+ksKey+32],r12
+
+    mov         [FP_+ksTwk+ 0],r13
+    mov         [FP_+ksTwk+ 8],r14
+    mov         [FP_+ksTwk+16],r15
+
+    mov         [rsp+X_stk + 0],rax ;save X[] on stack for Skein_Debug_Block
+    mov         [rsp+X_stk + 8],rbx
+    mov         [rsp+X_stk +16],rcx
+    mov         [rsp+X_stk +24],rdx
+
+    Skein_Debug_Block 256           ;debug dump
+    Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
+endif
+;
+if ((SKEIN_ASM_UNROLL and 256) eq 0)
+    ; The round loop only reads key slots rdi+1..rdi+4 and tweak slots
+    ; rdi+1..rdi+2, so ks[0] goes straight to slot 5 and ts[0] to slot 3,
+    ; the positions they occupy after one "rotation" of the schedule.
+    mov         [FP_+ksKey+40],r08 ;save key schedule on stack for looping code
+    mov         [FP_+ksKey+ 8],r09
+    mov         [FP_+ksKey+16],r10
+    mov         [FP_+ksKey+24],r11
+    mov         [FP_+ksKey+32],r12
+
+    mov         [FP_+ksTwk+24],r13
+    mov         [FP_+ksTwk+ 8],r14
+    mov         [FP_+ksTwk+16],r15
+endif
+    add     rsi, WCNT*8             ;skip the block
+    mov         [FP_+blkPtr   ],rsi ;update block pointer
+;
+; opLoop: emit op1 in the looping build, op2 (usually blank) when unrolled
+opLoop macro op1,op2
+  if (SKEIN_ASM_UNROLL and 256) eq 0
+    op1
+  else
+    op2
+  endif
+endm
+;
+    ;
+    ; now the key schedule is computed. Start the rounds
+    ;
+if SKEIN_ASM_UNROLL and 256
+_UNROLL_CNT =   ROUNDS_256/8
+else
+_UNROLL_CNT =   SKEIN_UNROLL_256
+  if ((ROUNDS_256/8) mod _UNROLL_CNT)
+    .err "Invalid SKEIN_UNROLL_256"
+  endif
+    xor     rdi,rdi                   ;rdi = iteration count
+Skein_256_round_loop:
+endif
+_Rbase_ = 0
+; each repetition below is four rounds followed by one key injection
+rept _UNROLL_CNT*2
+    ; all X and ks vars in regs     ; (ops to "rotate" ks vars, via mem, if not unrolled)
+    ; round 4*_RBase_ + 0
+    addReg  rax, rbx
+    RotL64  rbx, 256,%((4*_RBase_+0) and 7),0
+    addReg  rcx, rdx
+                    opLoop  <mov r08,[FP_+ksKey+8*rdi+8*1]>
+    xor     rbx, rax
+    RotL64  rdx, 256,%((4*_RBase_+0) and 7),1
+    xor     rdx, rcx
+ if SKEIN_ASM_UNROLL and 256
+    ; unrolled build: rdi is free here, so use it as a temp (reloaded with
+    ; the context pointer after the rounds)
+    irp _r0_,<%(08+(_Rbase_+3) mod 5)>
+    irp _r1_,<%(13+(_Rbase_+2) mod 3)>
+      lea   rdi,[r&_r0_+r&_r1_]     ;precompute key injection value for rcx
+    endm
+    endm
+ endif
+                    opLoop  <mov r13,[FP_+ksTwk+8*rdi+8*1]>
+    Skein_Debug_Round 256,%(4*_RBase_+1)
+
+    ; round 4*_RBase_ + 1
+    addReg  rax, rdx
+    RotL64  rdx, 256,%((4*_RBase_+1) and 7),0
+    xor     rdx, rax
+                    opLoop  <mov r09,[FP_+ksKey+8*rdi+8*2]>
+    addReg  rcx, rbx
+    RotL64  rbx, 256,%((4*_RBase_+1) and 7),1
+    xor     rbx, rcx
+                    opLoop  <mov r11,[FP_+ksKey+8*rdi+8*4]>
+    Skein_Debug_Round 256,%(4*_RBase_+2)
+ if SKEIN_ASM_UNROLL and 256
+    irp _r0_,<%(08+(_Rbase_+2) mod 5)>
+    irp _r1_,<%(13+(_Rbase_+1) mod 3)>
+      lea   rsi,[r&_r0_+r&_r1_]     ;precompute key injection value for rbx
+    endm
+    endm
+ endif
+    ; round 4*_RBase_ + 2
+    addReg  rax, rbx
+    RotL64  rbx, 256,%((4*_RBase_+2) and 7),0
+    addReg  rcx, rdx
+                    opLoop  <mov r10,[FP_+ksKey+8*rdi+8*3]>
+    xor     rbx, rax
+    RotL64  rdx, 256,%((4*_RBase_+2) and 7),1
+    xor     rdx, rcx
+                    opLoop  <mov     [FP_+ksKey+8*rdi+8*6],r08> ;"rotate" the key
+                    opLoop  <lea r11,[r11+rdi+1]>   ;precompute key + tweak
+    Skein_Debug_Round 256,%(4*_RBase_+3)
+    ; round 4*_RBase_ + 3
+    addReg  rax, rdx
+    RotL64  rdx, 256,%((4*_RBase_+3) and 7),0
+    addReg  rcx, rbx
+                    opLoop  <add r10,[FP_+ksTwk+8*rdi+8*2]>    ;precompute key + tweak
+                    opLoop  <mov     [FP_+ksTwk+8*rdi+8*4],r13> ;"rotate" the tweak
+    xor     rdx, rax
+    RotL64  rbx, 256,%((4*_RBase_+3) and 7),1
+    xor     rbx, rcx
+    Skein_Debug_Round 256,%(4*_RBase_+4)
+                    opLoop  <addReg r09,r13>    ;precompute key+tweak
+      ;inject key schedule words
+_Rbase_ = _Rbase_+1
+  if SKEIN_ASM_UNROLL and 256
+      ; rsi and rdi hold the key+tweak sums precomputed above; the last
+      ; addReg adds the key word plus the constant _Rbase_ to rdx
+      addReg    rax,r,%(08+((_Rbase_+0) mod 5))
+      addReg    rbx,rsi
+      addReg    rcx,rdi
+      addReg    rdx,r,%(08+((_Rbase_+3) mod 5)),,_Rbase_
+  else
+      inc       rdi
+      addReg    rax,r08
+      addReg    rcx,r10
+      addReg    rbx,r09
+      addReg    rdx,r11
+  endif
+      Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
+endm ;rept _UNROLL_CNT*2
+
+;
+if (SKEIN_ASM_UNROLL and 256) eq 0
+    ; rdi was incremented once per group of four rounds
+    cmp     rdi,2*(ROUNDS_256/8)
+    jb      Skein_256_round_loop
+endif ; (SKEIN_ASM_UNROLL and 256) eq 0
+    mov     rdi,[FP_+ctxPtr ]           ;restore rdi --> context
+
+    ;----------------------------
+    ; feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
+    xor     rax,[FP_+Wcopy + 0]
+    mov     r14,FIRST_MASK
+    xor     rbx,[FP_+Wcopy + 8]
+    xor     rcx,[FP_+Wcopy +16]
+    xor     rdx,[FP_+Wcopy +24]
+    mov         [rdi+X_VARS+ 0],rax     ;store final result
+    and     r14,[rdi+TWEAK + 8]         ;r14 = T1 and FIRST_MASK, used as T1 for the next block
+    dec     qword ptr [FP_+blkCnt]      ;set zero flag
+    mov         [rdi+X_VARS+ 8],rbx     ;(mov leaves the flags from dec untouched)
+    mov         [rdi+X_VARS+16],rcx
+    mov         [rdi+X_VARS+24],rdx
+
+    ; in debug builds a call sits between the dec and the jnz, so the cmp
+    ; passed as afterOp sets the zero flag again; non-debug builds emit nothing
+    Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,,<cmp qword ptr [FP_+blkCnt],0>
+
+    ; go back for more blocks, if needed
+    jnz     Skein_256_block_loop
+    mov         [rdi+TWEAK + 8],r14     ;write the masked T1 back to the context
+    Reset_Stack Skein_256_Process_Block
+    ret
+
+  if _SKEIN_DEBUG
+    ; local helper reached by "call" from the Skein_Debug_Round macro
+Skein_Debug_Round_256:
+    mov         [FP_+X_stk+ 0],rax  ;first, save X[] state on stack so debug routines can access it
+    mov         [FP_+X_stk+ 8],rbx  ;(use FP_ since rsp has changed!)
+    mov         [FP_+X_stk+16],rcx
+    mov         [FP_+X_stk+24],rdx
+    push    rdx                     ;save two regs for BLK_BITS-specific parms
+    push    rcx
+    mov     rdx,[FP_+ctxPtr]        ;ctx_hdr_ptr
+    mov     rcx, 256
+    jmp     Skein_Debug_Round_Common
+  endif
+
+Skein_256_Process_Block endp
+;
+ifdef SKEIN_CODE_SIZE
+    public  Skein_256_Process_Block_CodeSize
+; Returns in rax the assembly-time constant _ProcBytes_, which Reset_Stack
+; set to the byte count of Skein_256_Process_Block up to and including
+; its ret.
+Skein_256_Process_Block_CodeSize proc
+    mov     rax,_ProcBytes_
+    ret
+Skein_256_Process_Block_CodeSize endp
+;
+    public  Skein_256_Unroll_Cnt
+; Returns in rax the unroll count used for the Skein_256 round loop, or 0
+; when _UNROLL_CNT equals ROUNDS_256/8.
+Skein_256_Unroll_Cnt proc
+  if _UNROLL_CNT eq ROUNDS_256/8
+    xor     rax,rax
+  else
+    mov     rax,_UNROLL_CNT
+  endif
+    ret
+Skein_256_Unroll_Cnt endp
+endif
+;
+endif ;_USE_ASM_ and 256
+;
+;=================================== Skein_512 =============================================
+;
+if _USE_ASM_ and 512
+    public  Skein_512_Process_Block
+;
+; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
+;
+; X[0..7] for the 512-bit rounds live in r08..r15.  The round macros build
+; these names by appending an index digit to the prefix rX_512_.
+rX_512_0    equ r08         ;register assignments for X[] values during rounds
+rX_512_1    equ r09
+rX_512_2    equ r10
+rX_512_3    equ r11
+rX_512_4    equ r12
+rX_512_5    equ r13
+rX_512_6    equ r14
+rX_512_7    equ r15
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: one round for 512-bit blocks
+;
+; R_512_OneRound: one round made of four add / rotate / xor steps.
+;   r0..r7   = X[] indices (0..7), taken in pairs (r0,r1) (r2,r3) (r4,r5)
+;              (r6,r7); each index is appended to rX_512_ to name a register
+;   _Rn_     = round number; (_Rn_ and 7) is passed to RotL64 together with
+;              the pair position 0..3
+;   op1..op4 = optional statements emitted after each pair, letting the
+;              caller interleave unrelated work with the round
+; Ends with a Skein_Debug_Round call for round _Rn_+1 with RDI_OFFS = -4
+; (which expands to nothing in non-debug builds).
+R_512_OneRound  macro r0,r1,r2,r3,r4,r5,r6,r7,_Rn_,op1,op2,op3,op4
+;
+    ; pair 0: X[r0] += X[r1]; rotate X[r1]; X[r1] ^= X[r0]
+    addReg      rX_512_&r0, rX_512_&r1
+    RotL64      rX_512_&r1, 512,%((_Rn_) and 7),0
+    xor         rX_512_&r1, rX_512_&r0
+            op1
+    ; pair 1
+    addReg      rX_512_&r2, rX_512_&r3
+    RotL64      rX_512_&r3, 512,%((_Rn_) and 7),1
+    xor         rX_512_&r3, rX_512_&r2
+            op2
+    ; pair 2
+    addReg      rX_512_&r4, rX_512_&r5
+    RotL64      rX_512_&r5, 512,%((_Rn_) and 7),2
+    xor         rX_512_&r5, rX_512_&r4
+            op3
+    ; pair 3
+    addReg      rX_512_&r6, rX_512_&r7
+    RotL64      rX_512_&r7, 512,%((_Rn_) and 7),3
+    xor         rX_512_&r7, rX_512_&r6
+            op4
+    Skein_Debug_Round 512,%(_Rn_+1),-4
+;
+endm ;R_512_OneRound
+;
+;;;;;;;;;;;;;;;;;
+; MACRO: four rounds for 512-bit blocks
+;
+R_512_FourRounds macro _RR_    ;RR = base round number (0 mod 8)
+  if SKEIN_ASM_UNROLL and 512
+    ; here for fully unrolled case.
+    _II_ = ((_RR_)/4) + 1       ;key injection counter
+    R_512_OneRound 0,1,2,3,4,5,6,7,%((_RR_)+0),<mov rax,[FP_+ksKey+8*(((_II_)+3) mod 9)]>,,<mov rbx,[FP_+ksKey+8*(((_II_)+4) mod 9)]>
+    R_512_OneRound 2,1,4,7,6,5,0,3,%((_RR_)+1),<mov rcx,[FP_+ksKey+8*(((_II_)+5) mod 9)]>,,<mov rdx,[FP_+ksKey+8*(((_II_)+6) mod 9)]>
+    R_512_OneRound 4,1,6,3,0,5,2,7,%((_RR_)+2),<mov rsi,[FP_+ksKey+8*(((_II_)+7) mod 9)]>,,<add rcx,[FP_+ksTwk+8*(((_II_)+0) mod 3)]>
+    R_512_OneRound 6,1,0,7,2,5,4,3,%((_RR_)+3),<add rdx,[FP_+ksTwk+8*(((_II_)+1) mod 3)]>,
+    ; inject the key schedule
+    add     r08,[FP_+ksKey+8*(((_II_)+0) mod 9)]
+    addReg  r11,rax
+    add     r09,[FP_+ksKey+8*(((_II_)+1) mod 9)]
+    addReg  r12,rbx
+    add     r10,[FP_+ksKey+8*(((_II_)+2) mod 9)]
+    addReg  r13,rcx
+    addReg  r14,rdx
+    addReg  r15,rsi,,,(_II_)
+  else
+    ; here for looping case                                                    ;"rotate" key/tweak schedule (move up on stack)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list