Text file src/crypto/internal/fips140/sha256/sha256block_amd64.s

     1  // Code generated by command: go run sha256block_amd64_asm.go -out ../sha256block_amd64.s. DO NOT EDIT.
     2  
     3  //go:build !purego
     4  
     5  #include "textflag.h"
     6  
     7  // func blockAVX2(dig *Digest, p []byte)
     8  // Requires: AVX, AVX2, BMI2
        //
        // Hashes the 64-byte blocks of p into the eight 32-bit state words stored
        // at dig, updating them in place.
        // NOTE(review): assumes len(p) is a non-zero multiple of 64 - confirm against the caller.
        //
        // Frame layout (536 bytes):
        //   0..511(SP)  K+W values spilled by the round loops (32 bytes per 4 rounds)
        //   512(SP)     address of the last 64-byte block, p_base + p_len - 64
        //   520(SP)     saved input pointer (DI is reused as scratch inside the rounds)
        //
        // Working state: AX, BX, CX, R8, DX, R9, R10, R11 hold state words 0..7.
     9  TEXT ·blockAVX2(SB), $536-32
    10  	MOVQ dig+0(FP), SI
    11  	MOVQ p_base+8(FP), DI
    12  	MOVQ p_len+16(FP), DX
    13  	LEAQ -64(DI)(DX*1), DX // DX = p_base + p_len - 64, address of the last block
    14  	MOVQ DX, 512(SP)
    15  	CMPQ DX, DI
    16  	JE   avx2_only_one_block // the first block is also the last one
    17  
    18  	// Load initial digest
    19  	MOVL (SI), AX
    20  	MOVL 4(SI), BX
    21  	MOVL 8(SI), CX
    22  	MOVL 12(SI), R8
    23  	MOVL 16(SI), DX
    24  	MOVL 20(SI), R9
    25  	MOVL 24(SI), R10
    26  	MOVL 28(SI), R11
    27  
    28  avx2_loop0:
    29  	// Each iteration loads 128 bytes, i.e. two consecutive 64-byte (512-bit) blocks
    30  	VMOVDQU (DI), Y0
    31  	VMOVDQU 32(DI), Y1
    32  	VMOVDQU 64(DI), Y2
    33  	VMOVDQU 96(DI), Y3
    34  	VMOVDQU flip_mask<>+0(SB), Y13
    35  
    36  	// Apply Byte Flip Mask: LE -> BE
    37  	VPSHUFB Y13, Y0, Y0
    38  	VPSHUFB Y13, Y1, Y1
    39  	VPSHUFB Y13, Y2, Y2
    40  	VPSHUFB Y13, Y3, Y3
    41  
    42  	// Transpose data into high/low parts: after this the low 128-bit lane of
        	// Y4..Y7 holds message words of the first block and the high lane holds
        	// the same word positions of the second block.
    43  	VPERM2I128 $0x20, Y2, Y0, Y4 // Y4 = low lane of Y0 | low lane of Y2  (words 0-3)
    44  	VPERM2I128 $0x31, Y2, Y0, Y5 // Y5 = high lane of Y0 | high lane of Y2 (words 4-7)
    45  	VPERM2I128 $0x20, Y3, Y1, Y6 // Y6 = low lane of Y1 | low lane of Y3  (words 8-11)
    46  	VPERM2I128 $0x31, Y3, Y1, Y7 // Y7 = high lane of Y1 | high lane of Y3 (words 12-15)
    47  	LEAQ       K256<>+0(SB), BP // BP = base of the round-constant table
    48  
    49  avx2_last_block_enter:
        	// Common entry to the round loops; also reached by a jump from
        	// avx2_do_last_block with the message words already in X4..X7.
    50  	ADDQ $0x40, DI // advance the input pointer by one 64-byte block
    51  	MOVQ DI, 520(SP) // save it: DI is used as a scratch register by the rounds
    52  	XORQ SI, SI // SI = byte offset into K256 and into the K+W spill area at (SP)
    53  
    54  avx2_loop1:
        	// Rounds with message scheduling. Each iteration runs 16 rounds (four
        	// groups of 4) and advances SI by 0x80; the loop exits at SI = 0x180,
        	// i.e. after 3 iterations. Scalar round code works on the low-lane K+W
        	// words read back from the stack, interleaved with vector code that
        	// computes the next schedule words for both lanes.
        	// Using a..h for the eight state words, on entry to each iteration
        	// a..h = AX, BX, CX, R8, DX, R9, R10, R11. Instead of moving values,
        	// every round renames the roles: the register that was h receives the
        	// new a and the register that was d receives the new e.
        	// Scratch: DI, R12, R13, R14, R15 (scalar); Y0-Y3, Y8, Y9, Y11 (vector).
    55  	// Do 4 rounds and scheduling (K+W taken from Y4, new schedule words written to Y4)
    56  	VPADDD   (BP)(SI*1), Y4, Y9 // Y9 = message words + round constants
    57  	VMOVDQU  Y9, (SP)(SI*1) // spill K+W for both lanes to the stack
    58  	MOVL     AX, DI
    59  	RORXL    $0x19, DX, R13 // e ror 25
    60  	RORXL    $0x0b, DX, R14 // e ror 11
    61  	ADDL     (SP)(SI*1), R11 // h += K+W (word 0, low lane)
    62  	ORL      CX, DI // a | c
    63  	VPALIGNR $0x04, Y6, Y7, Y0
    64  	MOVL     R9, R15
    65  	RORXL    $0x0d, AX, R12 // a ror 13
    66  	XORL     R14, R13
    67  	XORL     R10, R15 // f ^ g
    68  	VPADDD   Y4, Y0, Y0
    69  	RORXL    $0x06, DX, R14 // e ror 6
    70  	ANDL     DX, R15 // (f ^ g) & e
    71  	XORL     R14, R13 // R13 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
    72  	RORXL    $0x16, AX, R14 // a ror 22
    73  	ADDL     R11, R8 // d += h + K+W
    74  	ANDL     BX, DI // (a | c) & b
    75  	VPALIGNR $0x04, Y4, Y5, Y1
    76  	XORL     R12, R14
    77  	RORXL    $0x02, AX, R12 // a ror 2
    78  	XORL     R10, R15 // R15 = ((f ^ g) & e) ^ g
    79  	VPSRLD   $0x07, Y1, Y2
    80  	XORL     R12, R14 // R14 = (a ror 13) ^ (a ror 22) ^ (a ror 2)
    81  	MOVL     AX, R12
    82  	ANDL     CX, R12 // a & c
    83  	ADDL     R13, R15
    84  	VPSLLD   $0x19, Y1, Y3
    85  	ORL      R12, DI // DI = ((a | c) & b) | (a & c)
    86  	ADDL     R14, R11
    87  	ADDL     R15, R8 // R8 now holds the new e
    88  	VPOR     Y2, Y3, Y3 // shift right 7 | shift left 25 = rotate right 7
    89  	VPSRLD   $0x12, Y1, Y2
    90  	ADDL     R15, R11
    91  	ADDL     DI, R11 // R11 now holds the new a
    92  	MOVL     R11, DI
    93  	RORXL    $0x19, R8, R13
    94  	RORXL    $0x0b, R8, R14
    95  	ADDL     4(SP)(SI*1), R10 // h += K+W (word 1, low lane)
    96  	ORL      BX, DI
    97  	VPSRLD   $0x03, Y1, Y8
    98  	MOVL     DX, R15
    99  	RORXL    $0x0d, R11, R12
   100  	XORL     R14, R13
   101  	XORL     R9, R15
   102  	RORXL    $0x06, R8, R14
   103  	XORL     R14, R13
   104  	RORXL    $0x16, R11, R14
   105  	ANDL     R8, R15
   106  	ADDL     R10, CX
   107  	VPSLLD   $0x0e, Y1, Y1
   108  	ANDL     AX, DI
   109  	XORL     R12, R14
   110  	VPXOR    Y1, Y3, Y3
   111  	RORXL    $0x02, R11, R12
   112  	XORL     R9, R15
   113  	VPXOR    Y2, Y3, Y3
   114  	XORL     R12, R14
   115  	MOVL     R11, R12
   116  	ANDL     BX, R12
   117  	ADDL     R13, R15
   118  	VPXOR    Y8, Y3, Y1 // Y1 = (x ror 7) ^ (x ror 18) ^ (x >> 3)
   119  	VPSHUFD  $0xfa, Y7, Y2
   120  	ORL      R12, DI
   121  	ADDL     R14, R10
   122  	VPADDD   Y1, Y0, Y0
   123  	ADDL     R15, CX
   124  	ADDL     R15, R10
   125  	ADDL     DI, R10
   126  	VPSRLD   $0x0a, Y2, Y8
   127  	MOVL     R10, DI
   128  	RORXL    $0x19, CX, R13
   129  	ADDL     8(SP)(SI*1), R9 // h += K+W (word 2, low lane)
   130  	VPSRLQ   $0x13, Y2, Y3
   131  	RORXL    $0x0b, CX, R14
   132  	ORL      AX, DI
   133  	MOVL     R8, R15
   134  	XORL     DX, R15
   135  	RORXL    $0x0d, R10, R12
   136  	XORL     R14, R13
   137  	VPSRLQ   $0x11, Y2, Y2
   138  	ANDL     CX, R15
   139  	RORXL    $0x06, CX, R14
   140  	VPXOR    Y3, Y2, Y2
   141  	ADDL     R9, BX
   142  	ANDL     R11, DI
   143  	XORL     R14, R13
   144  	RORXL    $0x16, R10, R14
   145  	VPXOR    Y2, Y8, Y8
   146  	XORL     DX, R15
   147  	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
   148  	XORL     R12, R14
   149  	RORXL    $0x02, R10, R12
   150  	VPADDD   Y8, Y0, Y0
   151  	XORL     R12, R14
   152  	MOVL     R10, R12
   153  	ANDL     AX, R12
   154  	ADDL     R13, R15
   155  	VPSHUFD  $0x50, Y0, Y2
   156  	ORL      R12, DI
   157  	ADDL     R14, R9
   158  	ADDL     R15, BX
   159  	ADDL     R15, R9
   160  	ADDL     DI, R9
   161  	MOVL     R9, DI
   162  	RORXL    $0x19, BX, R13
   163  	RORXL    $0x0b, BX, R14
   164  	ADDL     12(SP)(SI*1), DX // h += K+W (word 3, low lane)
   165  	ORL      R11, DI
   166  	VPSRLD   $0x0a, Y2, Y11
   167  	MOVL     CX, R15
   168  	RORXL    $0x0d, R9, R12
   169  	XORL     R14, R13
   170  	XORL     R8, R15
   171  	VPSRLQ   $0x13, Y2, Y3
   172  	RORXL    $0x06, BX, R14
   173  	ANDL     BX, R15
   174  	ADDL     DX, AX
   175  	ANDL     R10, DI
   176  	VPSRLQ   $0x11, Y2, Y2
   177  	XORL     R14, R13
   178  	XORL     R8, R15
   179  	VPXOR    Y3, Y2, Y2
   180  	RORXL    $0x16, R9, R14
   181  	ADDL     R13, R15
   182  	VPXOR    Y2, Y11, Y11
   183  	XORL     R12, R14
   184  	ADDL     R15, AX
   185  	RORXL    $0x02, R9, R12
   186  	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
   187  	VPADDD   Y0, Y11, Y4 // Y4 = the next four schedule words
   188  	XORL     R12, R14
   189  	MOVL     R9, R12
   190  	ANDL     R11, R12
   191  	ORL      R12, DI
   192  	ADDL     R14, DX
   193  	ADDL     R15, DX
   194  	ADDL     DI, DX
   195  
   196  	// Do 4 rounds and scheduling (K+W taken from Y5, new schedule words written to Y5)
   197  	VPADDD   32(BP)(SI*1), Y5, Y9
   198  	VMOVDQU  Y9, 32(SP)(SI*1)
   199  	MOVL     DX, DI
   200  	RORXL    $0x19, AX, R13
   201  	RORXL    $0x0b, AX, R14
   202  	ADDL     32(SP)(SI*1), R8 // h += K+W (word 0, low lane)
   203  	ORL      R10, DI
   204  	VPALIGNR $0x04, Y7, Y4, Y0
   205  	MOVL     BX, R15
   206  	RORXL    $0x0d, DX, R12
   207  	XORL     R14, R13
   208  	XORL     CX, R15
   209  	VPADDD   Y5, Y0, Y0
   210  	RORXL    $0x06, AX, R14
   211  	ANDL     AX, R15
   212  	XORL     R14, R13
   213  	RORXL    $0x16, DX, R14
   214  	ADDL     R8, R11
   215  	ANDL     R9, DI
   216  	VPALIGNR $0x04, Y5, Y6, Y1
   217  	XORL     R12, R14
   218  	RORXL    $0x02, DX, R12
   219  	XORL     CX, R15
   220  	VPSRLD   $0x07, Y1, Y2
   221  	XORL     R12, R14
   222  	MOVL     DX, R12
   223  	ANDL     R10, R12
   224  	ADDL     R13, R15
   225  	VPSLLD   $0x19, Y1, Y3
   226  	ORL      R12, DI
   227  	ADDL     R14, R8
   228  	ADDL     R15, R11
   229  	VPOR     Y2, Y3, Y3
   230  	VPSRLD   $0x12, Y1, Y2
   231  	ADDL     R15, R8
   232  	ADDL     DI, R8
   233  	MOVL     R8, DI
   234  	RORXL    $0x19, R11, R13
   235  	RORXL    $0x0b, R11, R14
   236  	ADDL     36(SP)(SI*1), CX // h += K+W (word 1, low lane)
   237  	ORL      R9, DI
   238  	VPSRLD   $0x03, Y1, Y8
   239  	MOVL     AX, R15
   240  	RORXL    $0x0d, R8, R12
   241  	XORL     R14, R13
   242  	XORL     BX, R15
   243  	RORXL    $0x06, R11, R14
   244  	XORL     R14, R13
   245  	RORXL    $0x16, R8, R14
   246  	ANDL     R11, R15
   247  	ADDL     CX, R10
   248  	VPSLLD   $0x0e, Y1, Y1
   249  	ANDL     DX, DI
   250  	XORL     R12, R14
   251  	VPXOR    Y1, Y3, Y3
   252  	RORXL    $0x02, R8, R12
   253  	XORL     BX, R15
   254  	VPXOR    Y2, Y3, Y3
   255  	XORL     R12, R14
   256  	MOVL     R8, R12
   257  	ANDL     R9, R12
   258  	ADDL     R13, R15
   259  	VPXOR    Y8, Y3, Y1
   260  	VPSHUFD  $0xfa, Y4, Y2
   261  	ORL      R12, DI
   262  	ADDL     R14, CX
   263  	VPADDD   Y1, Y0, Y0
   264  	ADDL     R15, R10
   265  	ADDL     R15, CX
   266  	ADDL     DI, CX
   267  	VPSRLD   $0x0a, Y2, Y8
   268  	MOVL     CX, DI
   269  	RORXL    $0x19, R10, R13
   270  	ADDL     40(SP)(SI*1), BX // h += K+W (word 2, low lane)
   271  	VPSRLQ   $0x13, Y2, Y3
   272  	RORXL    $0x0b, R10, R14
   273  	ORL      DX, DI
   274  	MOVL     R11, R15
   275  	XORL     AX, R15
   276  	RORXL    $0x0d, CX, R12
   277  	XORL     R14, R13
   278  	VPSRLQ   $0x11, Y2, Y2
   279  	ANDL     R10, R15
   280  	RORXL    $0x06, R10, R14
   281  	VPXOR    Y3, Y2, Y2
   282  	ADDL     BX, R9
   283  	ANDL     R8, DI
   284  	XORL     R14, R13
   285  	RORXL    $0x16, CX, R14
   286  	VPXOR    Y2, Y8, Y8
   287  	XORL     AX, R15
   288  	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
   289  	XORL     R12, R14
   290  	RORXL    $0x02, CX, R12
   291  	VPADDD   Y8, Y0, Y0
   292  	XORL     R12, R14
   293  	MOVL     CX, R12
   294  	ANDL     DX, R12
   295  	ADDL     R13, R15
   296  	VPSHUFD  $0x50, Y0, Y2
   297  	ORL      R12, DI
   298  	ADDL     R14, BX
   299  	ADDL     R15, R9
   300  	ADDL     R15, BX
   301  	ADDL     DI, BX
   302  	MOVL     BX, DI
   303  	RORXL    $0x19, R9, R13
   304  	RORXL    $0x0b, R9, R14
   305  	ADDL     44(SP)(SI*1), AX // h += K+W (word 3, low lane)
   306  	ORL      R8, DI
   307  	VPSRLD   $0x0a, Y2, Y11
   308  	MOVL     R10, R15
   309  	RORXL    $0x0d, BX, R12
   310  	XORL     R14, R13
   311  	XORL     R11, R15
   312  	VPSRLQ   $0x13, Y2, Y3
   313  	RORXL    $0x06, R9, R14
   314  	ANDL     R9, R15
   315  	ADDL     AX, DX
   316  	ANDL     CX, DI
   317  	VPSRLQ   $0x11, Y2, Y2
   318  	XORL     R14, R13
   319  	XORL     R11, R15
   320  	VPXOR    Y3, Y2, Y2
   321  	RORXL    $0x16, BX, R14
   322  	ADDL     R13, R15
   323  	VPXOR    Y2, Y11, Y11
   324  	XORL     R12, R14
   325  	ADDL     R15, DX
   326  	RORXL    $0x02, BX, R12
   327  	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
   328  	VPADDD   Y0, Y11, Y5
   329  	XORL     R12, R14
   330  	MOVL     BX, R12
   331  	ANDL     R8, R12
   332  	ORL      R12, DI
   333  	ADDL     R14, AX
   334  	ADDL     R15, AX
   335  	ADDL     DI, AX
   336  
   337  	// Do 4 rounds and scheduling (K+W taken from Y6, new schedule words written to Y6)
   338  	VPADDD   64(BP)(SI*1), Y6, Y9
   339  	VMOVDQU  Y9, 64(SP)(SI*1)
   340  	MOVL     AX, DI
   341  	RORXL    $0x19, DX, R13
   342  	RORXL    $0x0b, DX, R14
   343  	ADDL     64(SP)(SI*1), R11 // h += K+W (word 0, low lane)
   344  	ORL      CX, DI
   345  	VPALIGNR $0x04, Y4, Y5, Y0
   346  	MOVL     R9, R15
   347  	RORXL    $0x0d, AX, R12
   348  	XORL     R14, R13
   349  	XORL     R10, R15
   350  	VPADDD   Y6, Y0, Y0
   351  	RORXL    $0x06, DX, R14
   352  	ANDL     DX, R15
   353  	XORL     R14, R13
   354  	RORXL    $0x16, AX, R14
   355  	ADDL     R11, R8
   356  	ANDL     BX, DI
   357  	VPALIGNR $0x04, Y6, Y7, Y1
   358  	XORL     R12, R14
   359  	RORXL    $0x02, AX, R12
   360  	XORL     R10, R15
   361  	VPSRLD   $0x07, Y1, Y2
   362  	XORL     R12, R14
   363  	MOVL     AX, R12
   364  	ANDL     CX, R12
   365  	ADDL     R13, R15
   366  	VPSLLD   $0x19, Y1, Y3
   367  	ORL      R12, DI
   368  	ADDL     R14, R11
   369  	ADDL     R15, R8
   370  	VPOR     Y2, Y3, Y3
   371  	VPSRLD   $0x12, Y1, Y2
   372  	ADDL     R15, R11
   373  	ADDL     DI, R11
   374  	MOVL     R11, DI
   375  	RORXL    $0x19, R8, R13
   376  	RORXL    $0x0b, R8, R14
   377  	ADDL     68(SP)(SI*1), R10 // h += K+W (word 1, low lane)
   378  	ORL      BX, DI
   379  	VPSRLD   $0x03, Y1, Y8
   380  	MOVL     DX, R15
   381  	RORXL    $0x0d, R11, R12
   382  	XORL     R14, R13
   383  	XORL     R9, R15
   384  	RORXL    $0x06, R8, R14
   385  	XORL     R14, R13
   386  	RORXL    $0x16, R11, R14
   387  	ANDL     R8, R15
   388  	ADDL     R10, CX
   389  	VPSLLD   $0x0e, Y1, Y1
   390  	ANDL     AX, DI
   391  	XORL     R12, R14
   392  	VPXOR    Y1, Y3, Y3
   393  	RORXL    $0x02, R11, R12
   394  	XORL     R9, R15
   395  	VPXOR    Y2, Y3, Y3
   396  	XORL     R12, R14
   397  	MOVL     R11, R12
   398  	ANDL     BX, R12
   399  	ADDL     R13, R15
   400  	VPXOR    Y8, Y3, Y1
   401  	VPSHUFD  $0xfa, Y5, Y2
   402  	ORL      R12, DI
   403  	ADDL     R14, R10
   404  	VPADDD   Y1, Y0, Y0
   405  	ADDL     R15, CX
   406  	ADDL     R15, R10
   407  	ADDL     DI, R10
   408  	VPSRLD   $0x0a, Y2, Y8
   409  	MOVL     R10, DI
   410  	RORXL    $0x19, CX, R13
   411  	ADDL     72(SP)(SI*1), R9 // h += K+W (word 2, low lane)
   412  	VPSRLQ   $0x13, Y2, Y3
   413  	RORXL    $0x0b, CX, R14
   414  	ORL      AX, DI
   415  	MOVL     R8, R15
   416  	XORL     DX, R15
   417  	RORXL    $0x0d, R10, R12
   418  	XORL     R14, R13
   419  	VPSRLQ   $0x11, Y2, Y2
   420  	ANDL     CX, R15
   421  	RORXL    $0x06, CX, R14
   422  	VPXOR    Y3, Y2, Y2
   423  	ADDL     R9, BX
   424  	ANDL     R11, DI
   425  	XORL     R14, R13
   426  	RORXL    $0x16, R10, R14
   427  	VPXOR    Y2, Y8, Y8
   428  	XORL     DX, R15
   429  	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
   430  	XORL     R12, R14
   431  	RORXL    $0x02, R10, R12
   432  	VPADDD   Y8, Y0, Y0
   433  	XORL     R12, R14
   434  	MOVL     R10, R12
   435  	ANDL     AX, R12
   436  	ADDL     R13, R15
   437  	VPSHUFD  $0x50, Y0, Y2
   438  	ORL      R12, DI
   439  	ADDL     R14, R9
   440  	ADDL     R15, BX
   441  	ADDL     R15, R9
   442  	ADDL     DI, R9
   443  	MOVL     R9, DI
   444  	RORXL    $0x19, BX, R13
   445  	RORXL    $0x0b, BX, R14
   446  	ADDL     76(SP)(SI*1), DX // h += K+W (word 3, low lane)
   447  	ORL      R11, DI
   448  	VPSRLD   $0x0a, Y2, Y11
   449  	MOVL     CX, R15
   450  	RORXL    $0x0d, R9, R12
   451  	XORL     R14, R13
   452  	XORL     R8, R15
   453  	VPSRLQ   $0x13, Y2, Y3
   454  	RORXL    $0x06, BX, R14
   455  	ANDL     BX, R15
   456  	ADDL     DX, AX
   457  	ANDL     R10, DI
   458  	VPSRLQ   $0x11, Y2, Y2
   459  	XORL     R14, R13
   460  	XORL     R8, R15
   461  	VPXOR    Y3, Y2, Y2
   462  	RORXL    $0x16, R9, R14
   463  	ADDL     R13, R15
   464  	VPXOR    Y2, Y11, Y11
   465  	XORL     R12, R14
   466  	ADDL     R15, AX
   467  	RORXL    $0x02, R9, R12
   468  	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
   469  	VPADDD   Y0, Y11, Y6
   470  	XORL     R12, R14
   471  	MOVL     R9, R12
   472  	ANDL     R11, R12
   473  	ORL      R12, DI
   474  	ADDL     R14, DX
   475  	ADDL     R15, DX
   476  	ADDL     DI, DX
   477  
   478  	// Do 4 rounds and scheduling (K+W taken from Y7, new schedule words written to Y7)
   479  	VPADDD   96(BP)(SI*1), Y7, Y9
   480  	VMOVDQU  Y9, 96(SP)(SI*1)
   481  	MOVL     DX, DI
   482  	RORXL    $0x19, AX, R13
   483  	RORXL    $0x0b, AX, R14
   484  	ADDL     96(SP)(SI*1), R8 // h += K+W (word 0, low lane)
   485  	ORL      R10, DI
   486  	VPALIGNR $0x04, Y5, Y6, Y0
   487  	MOVL     BX, R15
   488  	RORXL    $0x0d, DX, R12
   489  	XORL     R14, R13
   490  	XORL     CX, R15
   491  	VPADDD   Y7, Y0, Y0
   492  	RORXL    $0x06, AX, R14
   493  	ANDL     AX, R15
   494  	XORL     R14, R13
   495  	RORXL    $0x16, DX, R14
   496  	ADDL     R8, R11
   497  	ANDL     R9, DI
   498  	VPALIGNR $0x04, Y7, Y4, Y1
   499  	XORL     R12, R14
   500  	RORXL    $0x02, DX, R12
   501  	XORL     CX, R15
   502  	VPSRLD   $0x07, Y1, Y2
   503  	XORL     R12, R14
   504  	MOVL     DX, R12
   505  	ANDL     R10, R12
   506  	ADDL     R13, R15
   507  	VPSLLD   $0x19, Y1, Y3
   508  	ORL      R12, DI
   509  	ADDL     R14, R8
   510  	ADDL     R15, R11
   511  	VPOR     Y2, Y3, Y3
   512  	VPSRLD   $0x12, Y1, Y2
   513  	ADDL     R15, R8
   514  	ADDL     DI, R8
   515  	MOVL     R8, DI
   516  	RORXL    $0x19, R11, R13
   517  	RORXL    $0x0b, R11, R14
   518  	ADDL     100(SP)(SI*1), CX // h += K+W (word 1, low lane)
   519  	ORL      R9, DI
   520  	VPSRLD   $0x03, Y1, Y8
   521  	MOVL     AX, R15
   522  	RORXL    $0x0d, R8, R12
   523  	XORL     R14, R13
   524  	XORL     BX, R15
   525  	RORXL    $0x06, R11, R14
   526  	XORL     R14, R13
   527  	RORXL    $0x16, R8, R14
   528  	ANDL     R11, R15
   529  	ADDL     CX, R10
   530  	VPSLLD   $0x0e, Y1, Y1
   531  	ANDL     DX, DI
   532  	XORL     R12, R14
   533  	VPXOR    Y1, Y3, Y3
   534  	RORXL    $0x02, R8, R12
   535  	XORL     BX, R15
   536  	VPXOR    Y2, Y3, Y3
   537  	XORL     R12, R14
   538  	MOVL     R8, R12
   539  	ANDL     R9, R12
   540  	ADDL     R13, R15
   541  	VPXOR    Y8, Y3, Y1
   542  	VPSHUFD  $0xfa, Y6, Y2
   543  	ORL      R12, DI
   544  	ADDL     R14, CX
   545  	VPADDD   Y1, Y0, Y0
   546  	ADDL     R15, R10
   547  	ADDL     R15, CX
   548  	ADDL     DI, CX
   549  	VPSRLD   $0x0a, Y2, Y8
   550  	MOVL     CX, DI
   551  	RORXL    $0x19, R10, R13
   552  	ADDL     104(SP)(SI*1), BX // h += K+W (word 2, low lane)
   553  	VPSRLQ   $0x13, Y2, Y3
   554  	RORXL    $0x0b, R10, R14
   555  	ORL      DX, DI
   556  	MOVL     R11, R15
   557  	XORL     AX, R15
   558  	RORXL    $0x0d, CX, R12
   559  	XORL     R14, R13
   560  	VPSRLQ   $0x11, Y2, Y2
   561  	ANDL     R10, R15
   562  	RORXL    $0x06, R10, R14
   563  	VPXOR    Y3, Y2, Y2
   564  	ADDL     BX, R9
   565  	ANDL     R8, DI
   566  	XORL     R14, R13
   567  	RORXL    $0x16, CX, R14
   568  	VPXOR    Y2, Y8, Y8
   569  	XORL     AX, R15
   570  	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
   571  	XORL     R12, R14
   572  	RORXL    $0x02, CX, R12
   573  	VPADDD   Y8, Y0, Y0
   574  	XORL     R12, R14
   575  	MOVL     CX, R12
   576  	ANDL     DX, R12
   577  	ADDL     R13, R15
   578  	VPSHUFD  $0x50, Y0, Y2
   579  	ORL      R12, DI
   580  	ADDL     R14, BX
   581  	ADDL     R15, R9
   582  	ADDL     R15, BX
   583  	ADDL     DI, BX
   584  	MOVL     BX, DI
   585  	RORXL    $0x19, R9, R13
   586  	RORXL    $0x0b, R9, R14
   587  	ADDL     108(SP)(SI*1), AX // h += K+W (word 3, low lane)
   588  	ORL      R8, DI
   589  	VPSRLD   $0x0a, Y2, Y11
   590  	MOVL     R10, R15
   591  	RORXL    $0x0d, BX, R12
   592  	XORL     R14, R13
   593  	XORL     R11, R15
   594  	VPSRLQ   $0x13, Y2, Y3
   595  	RORXL    $0x06, R9, R14
   596  	ANDL     R9, R15
   597  	ADDL     AX, DX
   598  	ANDL     CX, DI
   599  	VPSRLQ   $0x11, Y2, Y2
   600  	XORL     R14, R13
   601  	XORL     R11, R15
   602  	VPXOR    Y3, Y2, Y2
   603  	RORXL    $0x16, BX, R14
   604  	ADDL     R13, R15
   605  	VPXOR    Y2, Y11, Y11
   606  	XORL     R12, R14
   607  	ADDL     R15, DX
   608  	RORXL    $0x02, BX, R12
   609  	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
   610  	VPADDD   Y0, Y11, Y7
   611  	XORL     R12, R14
   612  	MOVL     BX, R12
   613  	ANDL     R8, R12
   614  	ORL      R12, DI
   615  	ADDL     R14, AX
   616  	ADDL     R15, AX
   617  	ADDL     DI, AX
   618  	ADDQ     $0x80, SI // 16 rounds done: advance by four 32-byte K+W groups
   619  	CMPQ     SI, $0x00000180
   620  	JB       avx2_loop1 // repeat until 48 rounds are done
   621  
   622  avx2_loop2:
        	// Final 16 rounds, no further message scheduling. SI continues from
        	// 0x180 and advances by 0x40 per iteration (8 rounds) up to 0x200.
        	// The first iteration takes its words from Y4/Y5; Y6/Y7 are then
        	// copied into Y4/Y5 for the second iteration. K+W for both lanes is
        	// still spilled to the stack; the scalar rounds consume the low lane.
   623  	VPADDD  (BP)(SI*1), Y4, Y9
   624  	VMOVDQU Y9, (SP)(SI*1)
   625  	MOVL    R9, R15
   626  	RORXL   $0x19, DX, R13
   627  	RORXL   $0x0b, DX, R14
   628  	XORL    R10, R15
   629  	XORL    R14, R13
   630  	RORXL   $0x06, DX, R14
   631  	ANDL    DX, R15
   632  	XORL    R14, R13
   633  	RORXL   $0x0d, AX, R12
   634  	XORL    R10, R15
   635  	RORXL   $0x16, AX, R14
   636  	MOVL    AX, DI
   637  	XORL    R12, R14
   638  	RORXL   $0x02, AX, R12
   639  	ADDL    (SP)(SI*1), R11 // K+W word 0, low lane
   640  	ORL     CX, DI
   641  	XORL    R12, R14
   642  	MOVL    AX, R12
   643  	ANDL    BX, DI
   644  	ANDL    CX, R12
   645  	ADDL    R13, R15
   646  	ADDL    R11, R8
   647  	ORL     R12, DI
   648  	ADDL    R14, R11
   649  	ADDL    R15, R8
   650  	ADDL    R15, R11
   651  	MOVL    DX, R15
   652  	RORXL   $0x19, R8, R13
   653  	RORXL   $0x0b, R8, R14
   654  	XORL    R9, R15
   655  	XORL    R14, R13
   656  	RORXL   $0x06, R8, R14
   657  	ANDL    R8, R15
   658  	ADDL    DI, R11 // last term of the previous round, added after the next round has started
   659  	XORL    R14, R13
   660  	RORXL   $0x0d, R11, R12
   661  	XORL    R9, R15
   662  	RORXL   $0x16, R11, R14
   663  	MOVL    R11, DI
   664  	XORL    R12, R14
   665  	RORXL   $0x02, R11, R12
   666  	ADDL    4(SP)(SI*1), R10 // K+W word 1, low lane
   667  	ORL     BX, DI
   668  	XORL    R12, R14
   669  	MOVL    R11, R12
   670  	ANDL    AX, DI
   671  	ANDL    BX, R12
   672  	ADDL    R13, R15
   673  	ADDL    R10, CX
   674  	ORL     R12, DI
   675  	ADDL    R14, R10
   676  	ADDL    R15, CX
   677  	ADDL    R15, R10
   678  	MOVL    R8, R15
   679  	RORXL   $0x19, CX, R13
   680  	RORXL   $0x0b, CX, R14
   681  	XORL    DX, R15
   682  	XORL    R14, R13
   683  	RORXL   $0x06, CX, R14
   684  	ANDL    CX, R15
   685  	ADDL    DI, R10
   686  	XORL    R14, R13
   687  	RORXL   $0x0d, R10, R12
   688  	XORL    DX, R15
   689  	RORXL   $0x16, R10, R14
   690  	MOVL    R10, DI
   691  	XORL    R12, R14
   692  	RORXL   $0x02, R10, R12
   693  	ADDL    8(SP)(SI*1), R9 // K+W word 2, low lane
   694  	ORL     AX, DI
   695  	XORL    R12, R14
   696  	MOVL    R10, R12
   697  	ANDL    R11, DI
   698  	ANDL    AX, R12
   699  	ADDL    R13, R15
   700  	ADDL    R9, BX
   701  	ORL     R12, DI
   702  	ADDL    R14, R9
   703  	ADDL    R15, BX
   704  	ADDL    R15, R9
   705  	MOVL    CX, R15
   706  	RORXL   $0x19, BX, R13
   707  	RORXL   $0x0b, BX, R14
   708  	XORL    R8, R15
   709  	XORL    R14, R13
   710  	RORXL   $0x06, BX, R14
   711  	ANDL    BX, R15
   712  	ADDL    DI, R9
   713  	XORL    R14, R13
   714  	RORXL   $0x0d, R9, R12
   715  	XORL    R8, R15
   716  	RORXL   $0x16, R9, R14
   717  	MOVL    R9, DI
   718  	XORL    R12, R14
   719  	RORXL   $0x02, R9, R12
   720  	ADDL    12(SP)(SI*1), DX // K+W word 3, low lane
   721  	ORL     R11, DI
   722  	XORL    R12, R14
   723  	MOVL    R9, R12
   724  	ANDL    R10, DI
   725  	ANDL    R11, R12
   726  	ADDL    R13, R15
   727  	ADDL    DX, AX
   728  	ORL     R12, DI
   729  	ADDL    R14, DX
   730  	ADDL    R15, AX
   731  	ADDL    R15, DX
   732  	ADDL    DI, DX
   733  	VPADDD  32(BP)(SI*1), Y5, Y9 // second group of 4 rounds in this iteration
   734  	VMOVDQU Y9, 32(SP)(SI*1)
   735  	MOVL    BX, R15
   736  	RORXL   $0x19, AX, R13
   737  	RORXL   $0x0b, AX, R14
   738  	XORL    CX, R15
   739  	XORL    R14, R13
   740  	RORXL   $0x06, AX, R14
   741  	ANDL    AX, R15
   742  	XORL    R14, R13
   743  	RORXL   $0x0d, DX, R12
   744  	XORL    CX, R15
   745  	RORXL   $0x16, DX, R14
   746  	MOVL    DX, DI
   747  	XORL    R12, R14
   748  	RORXL   $0x02, DX, R12
   749  	ADDL    32(SP)(SI*1), R8 // K+W word 0, low lane
   750  	ORL     R10, DI
   751  	XORL    R12, R14
   752  	MOVL    DX, R12
   753  	ANDL    R9, DI
   754  	ANDL    R10, R12
   755  	ADDL    R13, R15
   756  	ADDL    R8, R11
   757  	ORL     R12, DI
   758  	ADDL    R14, R8
   759  	ADDL    R15, R11
   760  	ADDL    R15, R8
   761  	MOVL    AX, R15
   762  	RORXL   $0x19, R11, R13
   763  	RORXL   $0x0b, R11, R14
   764  	XORL    BX, R15
   765  	XORL    R14, R13
   766  	RORXL   $0x06, R11, R14
   767  	ANDL    R11, R15
   768  	ADDL    DI, R8
   769  	XORL    R14, R13
   770  	RORXL   $0x0d, R8, R12
   771  	XORL    BX, R15
   772  	RORXL   $0x16, R8, R14
   773  	MOVL    R8, DI
   774  	XORL    R12, R14
   775  	RORXL   $0x02, R8, R12
   776  	ADDL    36(SP)(SI*1), CX // K+W word 1, low lane
   777  	ORL     R9, DI
   778  	XORL    R12, R14
   779  	MOVL    R8, R12
   780  	ANDL    DX, DI
   781  	ANDL    R9, R12
   782  	ADDL    R13, R15
   783  	ADDL    CX, R10
   784  	ORL     R12, DI
   785  	ADDL    R14, CX
   786  	ADDL    R15, R10
   787  	ADDL    R15, CX
   788  	MOVL    R11, R15
   789  	RORXL   $0x19, R10, R13
   790  	RORXL   $0x0b, R10, R14
   791  	XORL    AX, R15
   792  	XORL    R14, R13
   793  	RORXL   $0x06, R10, R14
   794  	ANDL    R10, R15
   795  	ADDL    DI, CX
   796  	XORL    R14, R13
   797  	RORXL   $0x0d, CX, R12
   798  	XORL    AX, R15
   799  	RORXL   $0x16, CX, R14
   800  	MOVL    CX, DI
   801  	XORL    R12, R14
   802  	RORXL   $0x02, CX, R12
   803  	ADDL    40(SP)(SI*1), BX // K+W word 2, low lane
   804  	ORL     DX, DI
   805  	XORL    R12, R14
   806  	MOVL    CX, R12
   807  	ANDL    R8, DI
   808  	ANDL    DX, R12
   809  	ADDL    R13, R15
   810  	ADDL    BX, R9
   811  	ORL     R12, DI
   812  	ADDL    R14, BX
   813  	ADDL    R15, R9
   814  	ADDL    R15, BX
   815  	MOVL    R10, R15
   816  	RORXL   $0x19, R9, R13
   817  	RORXL   $0x0b, R9, R14
   818  	XORL    R11, R15
   819  	XORL    R14, R13
   820  	RORXL   $0x06, R9, R14
   821  	ANDL    R9, R15
   822  	ADDL    DI, BX
   823  	XORL    R14, R13
   824  	RORXL   $0x0d, BX, R12
   825  	XORL    R11, R15
   826  	RORXL   $0x16, BX, R14
   827  	MOVL    BX, DI
   828  	XORL    R12, R14
   829  	RORXL   $0x02, BX, R12
   830  	ADDL    44(SP)(SI*1), AX // K+W word 3, low lane
   831  	ORL     R8, DI
   832  	XORL    R12, R14
   833  	MOVL    BX, R12
   834  	ANDL    CX, DI
   835  	ANDL    R8, R12
   836  	ADDL    R13, R15
   837  	ADDL    AX, DX
   838  	ORL     R12, DI
   839  	ADDL    R14, AX
   840  	ADDL    R15, DX
   841  	ADDL    R15, AX
   842  	ADDL    DI, AX
   843  	ADDQ    $0x40, SI
   844  	VMOVDQU Y6, Y4 // the next iteration uses the remaining two schedule vectors
   845  	VMOVDQU Y7, Y5
   846  	CMPQ    SI, $0x00000200
   847  	JB      avx2_loop2
   848  	MOVQ    dig+0(FP), SI // SI = dig again (it was the round offset)
   849  	MOVQ    520(SP), DI // restore the saved input pointer
   850  	ADDL    AX, (SI) // add each working register into the digest word in memory ...
   851  	MOVL    (SI), AX // ... and reload the sum as the starting state for the next block
   852  	ADDL    BX, 4(SI)
   853  	MOVL    4(SI), BX
   854  	ADDL    CX, 8(SI)
   855  	MOVL    8(SI), CX
   856  	ADDL    R8, 12(SI)
   857  	MOVL    12(SI), R8
   858  	ADDL    DX, 16(SI)
   859  	MOVL    16(SI), DX
   860  	ADDL    R9, 20(SI)
   861  	MOVL    20(SI), R9
   862  	ADDL    R10, 24(SI)
   863  	MOVL    24(SI), R10
   864  	ADDL    R11, 28(SI)
   865  	MOVL    28(SI), R11
   866  	CMPQ    512(SP), DI
   867  	JB      done_hash // last-block address is below DI: no second block to process
   868  	XORQ    SI, SI // restart the offset for the second block's rounds
   869  
   870  avx2_loop3:
        	// Rounds for the second block of the pair. No vector work is done here:
        	// the K+W values were already written to the stack by the earlier
        	// loops, and this loop reads the upper 16 bytes of each 32-byte slot
        	// (offsets 16..28 and 48..60). Each iteration runs 8 rounds and
        	// advances SI by 0x40; 8 iterations cover all 64 rounds.
   871  	MOVL  R9, R15
   872  	RORXL $0x19, DX, R13
   873  	RORXL $0x0b, DX, R14
   874  	XORL  R10, R15
   875  	XORL  R14, R13
   876  	RORXL $0x06, DX, R14
   877  	ANDL  DX, R15
   878  	XORL  R14, R13
   879  	RORXL $0x0d, AX, R12
   880  	XORL  R10, R15
   881  	RORXL $0x16, AX, R14
   882  	MOVL  AX, DI
   883  	XORL  R12, R14
   884  	RORXL $0x02, AX, R12
   885  	ADDL  16(SP)(SI*1), R11 // K+W word 0, high lane
   886  	ORL   CX, DI
   887  	XORL  R12, R14
   888  	MOVL  AX, R12
   889  	ANDL  BX, DI
   890  	ANDL  CX, R12
   891  	ADDL  R13, R15
   892  	ADDL  R11, R8
   893  	ORL   R12, DI
   894  	ADDL  R14, R11
   895  	ADDL  R15, R8
   896  	ADDL  R15, R11
   897  	MOVL  DX, R15
   898  	RORXL $0x19, R8, R13
   899  	RORXL $0x0b, R8, R14
   900  	XORL  R9, R15
   901  	XORL  R14, R13
   902  	RORXL $0x06, R8, R14
   903  	ANDL  R8, R15
   904  	ADDL  DI, R11
   905  	XORL  R14, R13
   906  	RORXL $0x0d, R11, R12
   907  	XORL  R9, R15
   908  	RORXL $0x16, R11, R14
   909  	MOVL  R11, DI
   910  	XORL  R12, R14
   911  	RORXL $0x02, R11, R12
   912  	ADDL  20(SP)(SI*1), R10 // K+W word 1, high lane
   913  	ORL   BX, DI
   914  	XORL  R12, R14
   915  	MOVL  R11, R12
   916  	ANDL  AX, DI
   917  	ANDL  BX, R12
   918  	ADDL  R13, R15
   919  	ADDL  R10, CX
   920  	ORL   R12, DI
   921  	ADDL  R14, R10
   922  	ADDL  R15, CX
   923  	ADDL  R15, R10
   924  	MOVL  R8, R15
   925  	RORXL $0x19, CX, R13
   926  	RORXL $0x0b, CX, R14
   927  	XORL  DX, R15
   928  	XORL  R14, R13
   929  	RORXL $0x06, CX, R14
   930  	ANDL  CX, R15
   931  	ADDL  DI, R10
   932  	XORL  R14, R13
   933  	RORXL $0x0d, R10, R12
   934  	XORL  DX, R15
   935  	RORXL $0x16, R10, R14
   936  	MOVL  R10, DI
   937  	XORL  R12, R14
   938  	RORXL $0x02, R10, R12
   939  	ADDL  24(SP)(SI*1), R9 // K+W word 2, high lane
   940  	ORL   AX, DI
   941  	XORL  R12, R14
   942  	MOVL  R10, R12
   943  	ANDL  R11, DI
   944  	ANDL  AX, R12
   945  	ADDL  R13, R15
   946  	ADDL  R9, BX
   947  	ORL   R12, DI
   948  	ADDL  R14, R9
   949  	ADDL  R15, BX
   950  	ADDL  R15, R9
   951  	MOVL  CX, R15
   952  	RORXL $0x19, BX, R13
   953  	RORXL $0x0b, BX, R14
   954  	XORL  R8, R15
   955  	XORL  R14, R13
   956  	RORXL $0x06, BX, R14
   957  	ANDL  BX, R15
   958  	ADDL  DI, R9
   959  	XORL  R14, R13
   960  	RORXL $0x0d, R9, R12
   961  	XORL  R8, R15
   962  	RORXL $0x16, R9, R14
   963  	MOVL  R9, DI
   964  	XORL  R12, R14
   965  	RORXL $0x02, R9, R12
   966  	ADDL  28(SP)(SI*1), DX // K+W word 3, high lane
   967  	ORL   R11, DI
   968  	XORL  R12, R14
   969  	MOVL  R9, R12
   970  	ANDL  R10, DI
   971  	ANDL  R11, R12
   972  	ADDL  R13, R15
   973  	ADDL  DX, AX
   974  	ORL   R12, DI
   975  	ADDL  R14, DX
   976  	ADDL  R15, AX
   977  	ADDL  R15, DX
   978  	ADDL  DI, DX
   979  	MOVL  BX, R15
   980  	RORXL $0x19, AX, R13
   981  	RORXL $0x0b, AX, R14
   982  	XORL  CX, R15
   983  	XORL  R14, R13
   984  	RORXL $0x06, AX, R14
   985  	ANDL  AX, R15
   986  	XORL  R14, R13
   987  	RORXL $0x0d, DX, R12
   988  	XORL  CX, R15
   989  	RORXL $0x16, DX, R14
   990  	MOVL  DX, DI
   991  	XORL  R12, R14
   992  	RORXL $0x02, DX, R12
   993  	ADDL  48(SP)(SI*1), R8 // K+W word 0 of the next slot, high lane
   994  	ORL   R10, DI
   995  	XORL  R12, R14
   996  	MOVL  DX, R12
   997  	ANDL  R9, DI
   998  	ANDL  R10, R12
   999  	ADDL  R13, R15
  1000  	ADDL  R8, R11
  1001  	ORL   R12, DI
  1002  	ADDL  R14, R8
  1003  	ADDL  R15, R11
  1004  	ADDL  R15, R8
  1005  	MOVL  AX, R15
  1006  	RORXL $0x19, R11, R13
  1007  	RORXL $0x0b, R11, R14
  1008  	XORL  BX, R15
  1009  	XORL  R14, R13
  1010  	RORXL $0x06, R11, R14
  1011  	ANDL  R11, R15
  1012  	ADDL  DI, R8
  1013  	XORL  R14, R13
  1014  	RORXL $0x0d, R8, R12
  1015  	XORL  BX, R15
  1016  	RORXL $0x16, R8, R14
  1017  	MOVL  R8, DI
  1018  	XORL  R12, R14
  1019  	RORXL $0x02, R8, R12
  1020  	ADDL  52(SP)(SI*1), CX // K+W word 1, high lane
  1021  	ORL   R9, DI
  1022  	XORL  R12, R14
  1023  	MOVL  R8, R12
  1024  	ANDL  DX, DI
  1025  	ANDL  R9, R12
  1026  	ADDL  R13, R15
  1027  	ADDL  CX, R10
  1028  	ORL   R12, DI
  1029  	ADDL  R14, CX
  1030  	ADDL  R15, R10
  1031  	ADDL  R15, CX
  1032  	MOVL  R11, R15
  1033  	RORXL $0x19, R10, R13
  1034  	RORXL $0x0b, R10, R14
  1035  	XORL  AX, R15
  1036  	XORL  R14, R13
  1037  	RORXL $0x06, R10, R14
  1038  	ANDL  R10, R15
  1039  	ADDL  DI, CX
  1040  	XORL  R14, R13
  1041  	RORXL $0x0d, CX, R12
  1042  	XORL  AX, R15
  1043  	RORXL $0x16, CX, R14
  1044  	MOVL  CX, DI
  1045  	XORL  R12, R14
  1046  	RORXL $0x02, CX, R12
  1047  	ADDL  56(SP)(SI*1), BX // K+W word 2, high lane
  1048  	ORL   DX, DI
  1049  	XORL  R12, R14
  1050  	MOVL  CX, R12
  1051  	ANDL  R8, DI
  1052  	ANDL  DX, R12
  1053  	ADDL  R13, R15
  1054  	ADDL  BX, R9
  1055  	ORL   R12, DI
  1056  	ADDL  R14, BX
  1057  	ADDL  R15, R9
  1058  	ADDL  R15, BX
  1059  	MOVL  R10, R15
  1060  	RORXL $0x19, R9, R13
  1061  	RORXL $0x0b, R9, R14
  1062  	XORL  R11, R15
  1063  	XORL  R14, R13
  1064  	RORXL $0x06, R9, R14
  1065  	ANDL  R9, R15
  1066  	ADDL  DI, BX
  1067  	XORL  R14, R13
  1068  	RORXL $0x0d, BX, R12
  1069  	XORL  R11, R15
  1070  	RORXL $0x16, BX, R14
  1071  	MOVL  BX, DI
  1072  	XORL  R12, R14
  1073  	RORXL $0x02, BX, R12
  1074  	ADDL  60(SP)(SI*1), AX // K+W word 3, high lane
  1075  	ORL   R8, DI
  1076  	XORL  R12, R14
  1077  	MOVL  BX, R12
  1078  	ANDL  CX, DI
  1079  	ANDL  R8, R12
  1080  	ADDL  R13, R15
  1081  	ADDL  AX, DX
  1082  	ORL   R12, DI
  1083  	ADDL  R14, AX
  1084  	ADDL  R15, DX
  1085  	ADDL  R15, AX
  1086  	ADDL  DI, AX
  1087  	ADDQ  $0x40, SI
  1088  	CMPQ  SI, $0x00000200
  1089  	JB    avx2_loop3
  1090  	MOVQ  dig+0(FP), SI // SI = dig again (it was the round offset)
  1091  	MOVQ  520(SP), DI // restore the saved input pointer ...
  1092  	ADDQ  $0x40, DI // ... and step past the second block
  1093  	ADDL  AX, (SI) // add each working register into the digest word in memory ...
  1094  	MOVL  (SI), AX // ... and reload the sum as the starting state for the next block
  1095  	ADDL  BX, 4(SI)
  1096  	MOVL  4(SI), BX
  1097  	ADDL  CX, 8(SI)
  1098  	MOVL  8(SI), CX
  1099  	ADDL  R8, 12(SI)
  1100  	MOVL  12(SI), R8
  1101  	ADDL  DX, 16(SI)
  1102  	MOVL  16(SI), DX
  1103  	ADDL  R9, 20(SI)
  1104  	MOVL  20(SI), R9
  1105  	ADDL  R10, 24(SI)
  1106  	MOVL  24(SI), R10
  1107  	ADDL  R11, 28(SI)
  1108  	MOVL  28(SI), R11
  1109  	CMPQ  512(SP), DI
  1110  	JA    avx2_loop0 // last-block address is above DI: at least two blocks remain
  1111  	JB    done_hash // last-block address is below DI: nothing remains
        	// Equal: exactly one block remains; fall through to avx2_do_last_block.
  1112  
  1113  avx2_do_last_block:
        	// Single-block path: load one 64-byte block as four 16-byte vectors
        	// into X4..X7 (the 128-bit views of Y4..Y7), byte-swap each 32-bit
        	// word, then join the common round code at avx2_last_block_enter.
  1114  	VMOVDQU (DI), X4
  1115  	VMOVDQU 16(DI), X5
  1116  	VMOVDQU 32(DI), X6
  1117  	VMOVDQU 48(DI), X7
  1118  	VMOVDQU flip_mask<>+0(SB), Y13
  1119  	VPSHUFB X13, X4, X4
  1120  	VPSHUFB X13, X5, X5
  1121  	VPSHUFB X13, X6, X6
  1122  	VPSHUFB X13, X7, X7
  1123  	LEAQ    K256<>+0(SB), BP // BP = base of the round-constant table
  1124  	JMP     avx2_last_block_enter
  1125  
  1126  avx2_only_one_block:
        	// Reached from the prologue when the last-block address equals p_base.
        	// Loads the eight digest words from dig (SI) into the working
        	// registers, then processes the block via the single-block path.
  1127  	MOVL (SI), AX
  1128  	MOVL 4(SI), BX
  1129  	MOVL 8(SI), CX
  1130  	MOVL 12(SI), R8
  1131  	MOVL 16(SI), DX
  1132  	MOVL 20(SI), R9
  1133  	MOVL 24(SI), R10
  1134  	MOVL 28(SI), R11
  1135  	JMP  avx2_do_last_block
  1136  
  1137  done_hash:
        	// Common exit. The digest words have already been written back through dig.
  1138  	VZEROUPPER // clear the upper halves of the YMM registers before returning
  1139  	RET
  1140  
  1141  DATA flip_mask<>+0(SB)/8, $0x0405060700010203 // VPSHUFB control bytes 3,2,1,0,7,6,5,4: reverse the bytes of each 32-bit word
  1142  DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b // control bytes 11,10,9,8,15,14,13,12
  1143  DATA flip_mask<>+16(SB)/8, $0x0405060700010203 // same 16-byte pattern repeated for the upper 128-bit lane
  1144  DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b
  1145  GLOBL flip_mask<>(SB), RODATA, $32 // 32-byte read-only table, loaded into Y13 by blockAVX2
  1146  
  1147  DATA K256<>+0(SB)/4, $0x428a2f98 // K256: the 64 SHA-256 round constants; every group of four is stored twice back to back, so groups start 32 bytes apart. K[0..3]
  1148  DATA K256<>+4(SB)/4, $0x71374491
  1149  DATA K256<>+8(SB)/4, $0xb5c0fbcf
  1150  DATA K256<>+12(SB)/4, $0xe9b5dba5
  1151  DATA K256<>+16(SB)/4, $0x428a2f98 // K[0..3], second copy
  1152  DATA K256<>+20(SB)/4, $0x71374491
  1153  DATA K256<>+24(SB)/4, $0xb5c0fbcf
  1154  DATA K256<>+28(SB)/4, $0xe9b5dba5
  1155  DATA K256<>+32(SB)/4, $0x3956c25b // K[4..7]
  1156  DATA K256<>+36(SB)/4, $0x59f111f1
  1157  DATA K256<>+40(SB)/4, $0x923f82a4
  1158  DATA K256<>+44(SB)/4, $0xab1c5ed5
  1159  DATA K256<>+48(SB)/4, $0x3956c25b // K[4..7], second copy
  1160  DATA K256<>+52(SB)/4, $0x59f111f1
  1161  DATA K256<>+56(SB)/4, $0x923f82a4
  1162  DATA K256<>+60(SB)/4, $0xab1c5ed5
  1163  DATA K256<>+64(SB)/4, $0xd807aa98 // K[8..11]
  1164  DATA K256<>+68(SB)/4, $0x12835b01
  1165  DATA K256<>+72(SB)/4, $0x243185be
  1166  DATA K256<>+76(SB)/4, $0x550c7dc3
  1167  DATA K256<>+80(SB)/4, $0xd807aa98 // K[8..11], second copy
  1168  DATA K256<>+84(SB)/4, $0x12835b01
  1169  DATA K256<>+88(SB)/4, $0x243185be
  1170  DATA K256<>+92(SB)/4, $0x550c7dc3
  1171  DATA K256<>+96(SB)/4, $0x72be5d74 // K[12..15]
  1172  DATA K256<>+100(SB)/4, $0x80deb1fe
  1173  DATA K256<>+104(SB)/4, $0x9bdc06a7
  1174  DATA K256<>+108(SB)/4, $0xc19bf174
  1175  DATA K256<>+112(SB)/4, $0x72be5d74 // K[12..15], second copy
  1176  DATA K256<>+116(SB)/4, $0x80deb1fe
  1177  DATA K256<>+120(SB)/4, $0x9bdc06a7
  1178  DATA K256<>+124(SB)/4, $0xc19bf174
  1179  DATA K256<>+128(SB)/4, $0xe49b69c1 // K[16..19]
  1180  DATA K256<>+132(SB)/4, $0xefbe4786
  1181  DATA K256<>+136(SB)/4, $0x0fc19dc6
  1182  DATA K256<>+140(SB)/4, $0x240ca1cc
  1183  DATA K256<>+144(SB)/4, $0xe49b69c1 // K[16..19], second copy
  1184  DATA K256<>+148(SB)/4, $0xefbe4786
  1185  DATA K256<>+152(SB)/4, $0x0fc19dc6
  1186  DATA K256<>+156(SB)/4, $0x240ca1cc
  1187  DATA K256<>+160(SB)/4, $0x2de92c6f // K[20..23]
  1188  DATA K256<>+164(SB)/4, $0x4a7484aa
  1189  DATA K256<>+168(SB)/4, $0x5cb0a9dc
  1190  DATA K256<>+172(SB)/4, $0x76f988da
  1191  DATA K256<>+176(SB)/4, $0x2de92c6f // K[20..23], second copy
  1192  DATA K256<>+180(SB)/4, $0x4a7484aa
  1193  DATA K256<>+184(SB)/4, $0x5cb0a9dc
  1194  DATA K256<>+188(SB)/4, $0x76f988da
  1195  DATA K256<>+192(SB)/4, $0x983e5152 // K[24..27]
  1196  DATA K256<>+196(SB)/4, $0xa831c66d
  1197  DATA K256<>+200(SB)/4, $0xb00327c8
  1198  DATA K256<>+204(SB)/4, $0xbf597fc7
  1199  DATA K256<>+208(SB)/4, $0x983e5152 // K[24..27], second copy
  1200  DATA K256<>+212(SB)/4, $0xa831c66d
  1201  DATA K256<>+216(SB)/4, $0xb00327c8
  1202  DATA K256<>+220(SB)/4, $0xbf597fc7
  1203  DATA K256<>+224(SB)/4, $0xc6e00bf3 // K[28..31]
  1204  DATA K256<>+228(SB)/4, $0xd5a79147
  1205  DATA K256<>+232(SB)/4, $0x06ca6351
  1206  DATA K256<>+236(SB)/4, $0x14292967
  1207  DATA K256<>+240(SB)/4, $0xc6e00bf3 // K[28..31], second copy
  1208  DATA K256<>+244(SB)/4, $0xd5a79147
  1209  DATA K256<>+248(SB)/4, $0x06ca6351
  1210  DATA K256<>+252(SB)/4, $0x14292967
  1211  DATA K256<>+256(SB)/4, $0x27b70a85 // K[32..35]
  1212  DATA K256<>+260(SB)/4, $0x2e1b2138
  1213  DATA K256<>+264(SB)/4, $0x4d2c6dfc
  1214  DATA K256<>+268(SB)/4, $0x53380d13
  1215  DATA K256<>+272(SB)/4, $0x27b70a85 // K[32..35], second copy
  1216  DATA K256<>+276(SB)/4, $0x2e1b2138
  1217  DATA K256<>+280(SB)/4, $0x4d2c6dfc
  1218  DATA K256<>+284(SB)/4, $0x53380d13
  1219  DATA K256<>+288(SB)/4, $0x650a7354 // K[36..39]
  1220  DATA K256<>+292(SB)/4, $0x766a0abb
  1221  DATA K256<>+296(SB)/4, $0x81c2c92e
  1222  DATA K256<>+300(SB)/4, $0x92722c85
  1223  DATA K256<>+304(SB)/4, $0x650a7354 // K[36..39], second copy
  1224  DATA K256<>+308(SB)/4, $0x766a0abb
  1225  DATA K256<>+312(SB)/4, $0x81c2c92e
  1226  DATA K256<>+316(SB)/4, $0x92722c85
  1227  DATA K256<>+320(SB)/4, $0xa2bfe8a1 // K[40..43]
  1228  DATA K256<>+324(SB)/4, $0xa81a664b
  1229  DATA K256<>+328(SB)/4, $0xc24b8b70
  1230  DATA K256<>+332(SB)/4, $0xc76c51a3
  1231  DATA K256<>+336(SB)/4, $0xa2bfe8a1 // K[40..43], second copy
  1232  DATA K256<>+340(SB)/4, $0xa81a664b
  1233  DATA K256<>+344(SB)/4, $0xc24b8b70
  1234  DATA K256<>+348(SB)/4, $0xc76c51a3
  1235  DATA K256<>+352(SB)/4, $0xd192e819 // K[44..47]
  1236  DATA K256<>+356(SB)/4, $0xd6990624
  1237  DATA K256<>+360(SB)/4, $0xf40e3585
  1238  DATA K256<>+364(SB)/4, $0x106aa070
  1239  DATA K256<>+368(SB)/4, $0xd192e819 // K[44..47], second copy
  1240  DATA K256<>+372(SB)/4, $0xd6990624
  1241  DATA K256<>+376(SB)/4, $0xf40e3585
  1242  DATA K256<>+380(SB)/4, $0x106aa070
  1243  DATA K256<>+384(SB)/4, $0x19a4c116 // K[48..51]
  1244  DATA K256<>+388(SB)/4, $0x1e376c08
  1245  DATA K256<>+392(SB)/4, $0x2748774c
  1246  DATA K256<>+396(SB)/4, $0x34b0bcb5
  1247  DATA K256<>+400(SB)/4, $0x19a4c116 // K[48..51], second copy
  1248  DATA K256<>+404(SB)/4, $0x1e376c08
  1249  DATA K256<>+408(SB)/4, $0x2748774c
  1250  DATA K256<>+412(SB)/4, $0x34b0bcb5
  1251  DATA K256<>+416(SB)/4, $0x391c0cb3 // K[52..55]
  1252  DATA K256<>+420(SB)/4, $0x4ed8aa4a
  1253  DATA K256<>+424(SB)/4, $0x5b9cca4f
  1254  DATA K256<>+428(SB)/4, $0x682e6ff3
  1255  DATA K256<>+432(SB)/4, $0x391c0cb3 // K[52..55], second copy
  1256  DATA K256<>+436(SB)/4, $0x4ed8aa4a
  1257  DATA K256<>+440(SB)/4, $0x5b9cca4f
  1258  DATA K256<>+444(SB)/4, $0x682e6ff3
  1259  DATA K256<>+448(SB)/4, $0x748f82ee // K[56..59]
  1260  DATA K256<>+452(SB)/4, $0x78a5636f
  1261  DATA K256<>+456(SB)/4, $0x84c87814
  1262  DATA K256<>+460(SB)/4, $0x8cc70208
  1263  DATA K256<>+464(SB)/4, $0x748f82ee // K[56..59], second copy
  1264  DATA K256<>+468(SB)/4, $0x78a5636f
  1265  DATA K256<>+472(SB)/4, $0x84c87814
  1266  DATA K256<>+476(SB)/4, $0x8cc70208
  1267  DATA K256<>+480(SB)/4, $0x90befffa // K[60..63]
  1268  DATA K256<>+484(SB)/4, $0xa4506ceb
  1269  DATA K256<>+488(SB)/4, $0xbef9a3f7
  1270  DATA K256<>+492(SB)/4, $0xc67178f2
  1271  DATA K256<>+496(SB)/4, $0x90befffa // K[60..63], second copy
  1272  DATA K256<>+500(SB)/4, $0xa4506ceb
  1273  DATA K256<>+504(SB)/4, $0xbef9a3f7
  1274  DATA K256<>+508(SB)/4, $0xc67178f2
  1275  GLOBL K256<>(SB), RODATA|NOPTR, $512 // 16 groups * (4 constants * 4 bytes * 2 copies) = 512 bytes; blockAVX2 adds a 32-byte slice per step, blockSHANI reads the first 16 bytes of each slice
  1276  
  1277  DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100 // shuff_00BA: byte selectors 0-3 and 8-11, i.e. source dwords 0 and 2 packed into the low 8 bytes
  1278  DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff // selector bytes with the top bit set; a byte shuffle writes zero for these
  1279  DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100 // same pattern repeated for the upper 16 bytes
  1280  DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff
  1281  GLOBL shuff_00BA<>(SB), RODATA, $32 // NOTE(review): the use of this table is outside this part of the file; presumably a VPSHUFB control in blockAVX2 - confirm there
  1282  
  1283  DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff // shuff_DC00: selector bytes with the top bit set; a byte shuffle writes zero for the low 8 bytes
  1284  DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100 // byte selectors 0-3 and 8-11, i.e. source dwords 0 and 2 packed into the high 8 bytes
  1285  DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff // same pattern repeated for the upper 16 bytes
  1286  DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100
  1287  GLOBL shuff_DC00<>(SB), RODATA, $32 // NOTE(review): the use of this table is outside this part of the file; presumably a VPSHUFB control in blockAVX2 - confirm there
  1288  
  1289  // func blockSHANI(dig *Digest, p []byte)
  1290  // Requires: AVX, SHA, SSE2, SSE4.1, SSSE3
        //
        // blockSHANI runs the SHA-256 compression function over every complete
        // 64-byte block of p using the SHA256RNDS2/SHA256MSG1/SHA256MSG2
        // instructions, updating the first 32 bytes of dig (eight 32-bit state
        // words, called a..h below) in place. len(p) is rounded down to a multiple
        // of 64, so trailing bytes are ignored; if no complete block is present the
        // function returns without reading or writing dig.
        //
        // Register use:
        //   DI      dig
        //   SI      current block; DX = address just past the last complete block
        //   AX      &K256 (constant groups are 32 bytes apart, see the PADDD offsets)
        //   X1, X2  working state; at every 4-round boundary X1 = ABEF, X2 = CDGH
        //   X9, X10 copy of X1, X2 taken at the top of each block
        //   X8      byte-swap mask (low 16 bytes of flip_mask)
        //   X3-X6   sliding window of 16 message-schedule words
        //   X0      W+K input to SHA256RNDS2; X7 scratch
        //
        // Frame: no locals, 32 bytes of arguments (pointer plus slice header).
  1291  TEXT ·blockSHANI(SB), $0-32
  1292  	MOVQ    dig+0(FP), DI
  1293  	MOVQ    p_base+8(FP), SI
  1294  	MOVQ    p_len+16(FP), DX
  1295  	SHRQ    $0x06, DX // shift right then left by 6: DX = len(p) rounded down to a multiple of 64
  1296  	SHLQ    $0x06, DX
  1297  	CMPQ    DX, $0x00
  1298  	JEQ     done // no complete block
  1299  	ADDQ    SI, DX // DX = end pointer of the data to hash
  1300  	VMOVDQU (DI), X1 // X1 = a,b,c,d (dword 0 listed first)
  1301  	VMOVDQU 16(DI), X2 // X2 = e,f,g,h
  1302  	PSHUFD  $0xb1, X1, X1 // swap the dwords of each pair: X1 = b,a,d,c
  1303  	PSHUFD  $0x1b, X2, X2 // reverse dword order: X2 = h,g,f,e
  1304  	VMOVDQA X1, X7 // keep b,a,d,c for the blend
  1305  	PALIGNR $0x08, X2, X1 // X1 = f,e,b,a, i.e. ABEF when read from the high dword down
  1306  	PBLENDW $0xf0, X7, X2 // high half from X7: X2 = h,g,d,c, i.e. CDGH
  1307  	VMOVDQA flip_mask<>+0(SB), X8 // X8 = mask that reverses the bytes of each dword
  1308  	LEAQ    K256<>+0(SB), AX
  1309  
  1310  roundLoop:
  1311  	// save hash values for addition after rounds
  1312  	VMOVDQA X1, X9
  1313  	VMOVDQA X2, X10
  1314  
  1315  	// do rounds 0-59
  1316  	VMOVDQU     (SI), X0 // message bytes 0-15
  1317  	PSHUFB      X8, X0 // byte-swap each dword: X0 = W[0..3]
  1318  	VMOVDQA     X0, X3 // X3 = W[0..3], kept for the schedule
  1319  	PADDD       (AX), X0 // X0 = W[0..3] + K[0..3]
  1320  	SHA256RNDS2 X0, X1, X2 // rounds 0-1 from the low two dwords of X0; result written to X2
  1321  	PSHUFD      $0x0e, X0, X0 // move the high two dwords of X0 down to the low half
  1322  	SHA256RNDS2 X0, X2, X1 // rounds 2-3; result written to X1
  1323  	VMOVDQU     16(SI), X0 // message bytes 16-31
  1324  	PSHUFB      X8, X0
  1325  	VMOVDQA     X0, X4 // X4 = W[4..7]
  1326  	PADDD       32(AX), X0 // + K[4..7] (rounds 4-7)
  1327  	SHA256RNDS2 X0, X1, X2
  1328  	PSHUFD      $0x0e, X0, X0
  1329  	SHA256RNDS2 X0, X2, X1
  1330  	SHA256MSG1  X4, X3 // first schedule step towards W[16..19], from W[0..3] and W[4..7]
  1331  	VMOVDQU     32(SI), X0 // message bytes 32-47
  1332  	PSHUFB      X8, X0
  1333  	VMOVDQA     X0, X5 // X5 = W[8..11]
  1334  	PADDD       64(AX), X0 // + K[8..11] (rounds 8-11)
  1335  	SHA256RNDS2 X0, X1, X2
  1336  	PSHUFD      $0x0e, X0, X0
  1337  	SHA256RNDS2 X0, X2, X1
  1338  	SHA256MSG1  X5, X4 // first schedule step towards W[20..23]
  1339  	VMOVDQU     48(SI), X0 // message bytes 48-63
  1340  	PSHUFB      X8, X0
  1341  	VMOVDQA     X0, X6 // X6 = W[12..15]
  1342  	PADDD       96(AX), X0 // + K[12..15] (rounds 12-15)
  1343  	SHA256RNDS2 X0, X1, X2
  1344  	VMOVDQA     X6, X7
  1345  	PALIGNR     $0x04, X5, X7 // X7 = W[9..12] (X6:X5 shifted right by one dword)
  1346  	PADDD       X7, X3 // add the W[t-7] terms
  1347  	SHA256MSG2  X6, X3 // final schedule step: X3 = W[16..19]
  1348  	PSHUFD      $0x0e, X0, X0
  1349  	SHA256RNDS2 X0, X2, X1
  1350  	SHA256MSG1  X6, X5 // first schedule step towards W[24..27]
  1351  	VMOVDQA     X3, X0
  1352  	PADDD       128(AX), X0 // + K[16..19] (rounds 16-19)
  1353  	SHA256RNDS2 X0, X1, X2
  1354  	VMOVDQA     X3, X7
  1355  	PALIGNR     $0x04, X6, X7
  1356  	PADDD       X7, X4
  1357  	SHA256MSG2  X3, X4 // X4 = W[20..23]
  1358  	PSHUFD      $0x0e, X0, X0
  1359  	SHA256RNDS2 X0, X2, X1
  1360  	SHA256MSG1  X3, X6
  1361  	VMOVDQA     X4, X0
  1362  	PADDD       160(AX), X0 // + K[20..23] (rounds 20-23)
  1363  	SHA256RNDS2 X0, X1, X2
  1364  	VMOVDQA     X4, X7
  1365  	PALIGNR     $0x04, X3, X7
  1366  	PADDD       X7, X5
  1367  	SHA256MSG2  X4, X5 // X5 = W[24..27]
  1368  	PSHUFD      $0x0e, X0, X0
  1369  	SHA256RNDS2 X0, X2, X1
  1370  	SHA256MSG1  X4, X3
  1371  	VMOVDQA     X5, X0
  1372  	PADDD       192(AX), X0 // + K[24..27] (rounds 24-27)
  1373  	SHA256RNDS2 X0, X1, X2
  1374  	VMOVDQA     X5, X7
  1375  	PALIGNR     $0x04, X4, X7
  1376  	PADDD       X7, X6
  1377  	SHA256MSG2  X5, X6 // X6 = W[28..31]
  1378  	PSHUFD      $0x0e, X0, X0
  1379  	SHA256RNDS2 X0, X2, X1
  1380  	SHA256MSG1  X5, X4
  1381  	VMOVDQA     X6, X0
  1382  	PADDD       224(AX), X0 // + K[28..31] (rounds 28-31)
  1383  	SHA256RNDS2 X0, X1, X2
  1384  	VMOVDQA     X6, X7
  1385  	PALIGNR     $0x04, X5, X7
  1386  	PADDD       X7, X3
  1387  	SHA256MSG2  X6, X3 // X3 = W[32..35]
  1388  	PSHUFD      $0x0e, X0, X0
  1389  	SHA256RNDS2 X0, X2, X1
  1390  	SHA256MSG1  X6, X5
  1391  	VMOVDQA     X3, X0
  1392  	PADDD       256(AX), X0 // + K[32..35] (rounds 32-35)
  1393  	SHA256RNDS2 X0, X1, X2
  1394  	VMOVDQA     X3, X7
  1395  	PALIGNR     $0x04, X6, X7
  1396  	PADDD       X7, X4
  1397  	SHA256MSG2  X3, X4 // X4 = W[36..39]
  1398  	PSHUFD      $0x0e, X0, X0
  1399  	SHA256RNDS2 X0, X2, X1
  1400  	SHA256MSG1  X3, X6
  1401  	VMOVDQA     X4, X0
  1402  	PADDD       288(AX), X0 // + K[36..39] (rounds 36-39)
  1403  	SHA256RNDS2 X0, X1, X2
  1404  	VMOVDQA     X4, X7
  1405  	PALIGNR     $0x04, X3, X7
  1406  	PADDD       X7, X5
  1407  	SHA256MSG2  X4, X5 // X5 = W[40..43]
  1408  	PSHUFD      $0x0e, X0, X0
  1409  	SHA256RNDS2 X0, X2, X1
  1410  	SHA256MSG1  X4, X3
  1411  	VMOVDQA     X5, X0
  1412  	PADDD       320(AX), X0 // + K[40..43] (rounds 40-43)
  1413  	SHA256RNDS2 X0, X1, X2
  1414  	VMOVDQA     X5, X7
  1415  	PALIGNR     $0x04, X4, X7
  1416  	PADDD       X7, X6
  1417  	SHA256MSG2  X5, X6 // X6 = W[44..47]
  1418  	PSHUFD      $0x0e, X0, X0
  1419  	SHA256RNDS2 X0, X2, X1
  1420  	SHA256MSG1  X5, X4
  1421  	VMOVDQA     X6, X0
  1422  	PADDD       352(AX), X0 // + K[44..47] (rounds 44-47)
  1423  	SHA256RNDS2 X0, X1, X2
  1424  	VMOVDQA     X6, X7
  1425  	PALIGNR     $0x04, X5, X7
  1426  	PADDD       X7, X3
  1427  	SHA256MSG2  X6, X3 // X3 = W[48..51]
  1428  	PSHUFD      $0x0e, X0, X0
  1429  	SHA256RNDS2 X0, X2, X1
  1430  	SHA256MSG1  X6, X5
  1431  	VMOVDQA     X3, X0
  1432  	PADDD       384(AX), X0 // + K[48..51] (rounds 48-51)
  1433  	SHA256RNDS2 X0, X1, X2
  1434  	VMOVDQA     X3, X7
  1435  	PALIGNR     $0x04, X6, X7
  1436  	PADDD       X7, X4
  1437  	SHA256MSG2  X3, X4 // X4 = W[52..55]
  1438  	PSHUFD      $0x0e, X0, X0
  1439  	SHA256RNDS2 X0, X2, X1
  1440  	SHA256MSG1  X3, X6 // last SHA256MSG1: feeds W[60..63], the final schedule group
  1441  	VMOVDQA     X4, X0
  1442  	PADDD       416(AX), X0 // + K[52..55] (rounds 52-55)
  1443  	SHA256RNDS2 X0, X1, X2
  1444  	VMOVDQA     X4, X7
  1445  	PALIGNR     $0x04, X3, X7
  1446  	PADDD       X7, X5
  1447  	SHA256MSG2  X4, X5 // X5 = W[56..59]
  1448  	PSHUFD      $0x0e, X0, X0
  1449  	SHA256RNDS2 X0, X2, X1
  1450  	VMOVDQA     X5, X0
  1451  	PADDD       448(AX), X0 // + K[56..59] (rounds 56-59)
  1452  	SHA256RNDS2 X0, X1, X2
  1453  	VMOVDQA     X5, X7
  1454  	PALIGNR     $0x04, X4, X7
  1455  	PADDD       X7, X6
  1456  	SHA256MSG2  X5, X6 // X6 = W[60..63]
  1457  	PSHUFD      $0x0e, X0, X0
  1458  	SHA256RNDS2 X0, X2, X1
  1459  
  1460  	// do rounds 60-63
  1461  	VMOVDQA     X6, X0
  1462  	PADDD       480(AX), X0 // + K[60..63]
  1463  	SHA256RNDS2 X0, X1, X2
  1464  	PSHUFD      $0x0e, X0, X0
  1465  	SHA256RNDS2 X0, X2, X1
  1466  
  1467  	// add current hash values with previously saved
  1468  	PADDD X9, X1
  1469  	PADDD X10, X2
  1470  
  1471  	// advance data pointer; loop until buffer empty
  1472  	ADDQ $0x40, SI
  1473  	CMPQ DX, SI
  1474  	JNE  roundLoop // DX - SI is a multiple of 64, so SI lands exactly on DX
  1475  
  1476  	// write hash values back in the correct order
  1477  	PSHUFD  $0x1b, X1, X1 // reverse f,e,b,a: X1 = a,b,e,f
  1478  	PSHUFD  $0xb1, X2, X2 // pair-swap h,g,d,c: X2 = g,h,c,d
  1479  	VMOVDQA X1, X7 // keep a,b,e,f for the PALIGNR below
  1480  	PBLENDW $0xf0, X2, X1 // high half from X2: X1 = a,b,c,d
  1481  	PALIGNR $0x08, X7, X2 // X2 = e,f,g,h
  1482  	VMOVDQU X1, (DI)
  1483  	VMOVDQU X2, 16(DI)
  1484  
  1485  done:
  1486  	RET
  1487  

View as plain text