// Code generated by command: go run sha256block_amd64_asm.go -out ../sha256block_amd64.s. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func blockAVX2(dig *Digest, p []byte)
// Requires: AVX, AVX2, BMI2
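//
// blockAVX2 processes two 512-bit blocks per pass: the message words of
// both blocks travel in the low and high 128-bit lanes of the Y registers,
// so one stream of vector instructions computes the message schedule for
// both blocks at once. The 536-byte frame holds the precomputed W+K values
// for all 64 rounds of both blocks (offsets 0-511), the address of the
// final block at 512(SP), and the current input pointer at 520(SP).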
TEXT ·blockAVX2(SB), $536-32
	MOVQ dig+0(FP), SI
	MOVQ p_base+8(FP), DI
	MOVQ p_len+16(FP), DX
	LEAQ -64(DI)(DX*1), DX
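	// DX = &p[len-64], the address of the final 64-byte block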
	MOVQ DX, 512(SP)
	CMPQ DX, DI
	JE avx2_only_one_block

	// Load initial digest
	MOVL (SI), AX
	MOVL 4(SI), BX
	MOVL 8(SI), CX
	MOVL 12(SI), R8
	MOVL 16(SI), DX
	MOVL 20(SI), R9
	MOVL 24(SI), R10
	MOVL 28(SI), R11

avx2_loop0:
	// each iteration works with two blocks (2 x 512 bits), one per 128-bit lane
	VMOVDQU (DI), Y0
	VMOVDQU 32(DI), Y1
	VMOVDQU 64(DI), Y2
	VMOVDQU 96(DI), Y3
	VMOVDQU flip_mask<>+0(SB), Y13

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB Y13, Y0, Y0
	VPSHUFB Y13, Y1, Y1
	VPSHUFB Y13, Y2, Y2
	VPSHUFB Y13, Y3, Y3

	// Transpose so the low 128-bit lanes hold the first block's words and the high lanes the second's
	VPERM2I128 $0x20, Y2, Y0, Y4
	VPERM2I128 $0x31, Y2, Y0, Y5
	VPERM2I128 $0x20, Y3, Y1, Y6
	VPERM2I128 $0x31, Y3, Y1, Y7
	LEAQ K256<>+0(SB), BP

avx2_last_block_enter:
	ADDQ $0x40, DI
	MOVQ DI, 520(SP)
	XORQ SI, SI

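	// avx2_loop1 covers rounds 0-47, four at a time: each group adds the
	// round constants to four message words (for both blocks), spills W+K
	// to the stack, and interleaves the scalar rounds with the vector
	// message schedule for four future words. The scalar code uses
	// flag-free RORX to form S1(e) = ROR6^ROR11^ROR25 and
	// S0(a) = ROR2^ROR13^ROR22, and computes Ch(e,f,g) as ((f^g)&e)^g and
	// Maj(a,b,c) as ((a|c)&b)|(a&c). The vector code forms
	// sigma0 = ROR7^ROR18^SHR3 from 32-bit shifts and ORs, and
	// sigma1 = ROR17^ROR19^SHR10 via 64-bit shifts, using shuff_00BA and
	// shuff_DC00 to position the results.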
avx2_loop1:
	// Do 4 rounds and scheduling
	VPADDD (BP)(SI*1), Y4, Y9
	VMOVDQU Y9, (SP)(SI*1)
	MOVL AX, DI
	RORXL $0x19, DX, R13
	RORXL $0x0b, DX, R14
	ADDL (SP)(SI*1), R11
	ORL CX, DI
	VPALIGNR $0x04, Y6, Y7, Y0
	MOVL R9, R15
	RORXL $0x0d, AX, R12
	XORL R14, R13
	XORL R10, R15
	VPADDD Y4, Y0, Y0
	RORXL $0x06, DX, R14
	ANDL DX, R15
	XORL R14, R13
	RORXL $0x16, AX, R14
	ADDL R11, R8
	ANDL BX, DI
	VPALIGNR $0x04, Y4, Y5, Y1
	XORL R12, R14
	RORXL $0x02, AX, R12
	XORL R10, R15
	VPSRLD $0x07, Y1, Y2
	XORL R12, R14
	MOVL AX, R12
	ANDL CX, R12
	ADDL R13, R15
	VPSLLD $0x19, Y1, Y3
	ORL R12, DI
	ADDL R14, R11
	ADDL R15, R8
	VPOR Y2, Y3, Y3
	VPSRLD $0x12, Y1, Y2
	ADDL R15, R11
	ADDL DI, R11
	MOVL R11, DI
	RORXL $0x19, R8, R13
	RORXL $0x0b, R8, R14
	ADDL 4(SP)(SI*1), R10
	ORL BX, DI
	VPSRLD $0x03, Y1, Y8
	MOVL DX, R15
	RORXL $0x0d, R11, R12
	XORL R14, R13
	XORL R9, R15
	RORXL $0x06, R8, R14
	XORL R14, R13
	RORXL $0x16, R11, R14
	ANDL R8, R15
	ADDL R10, CX
	VPSLLD $0x0e, Y1, Y1
	ANDL AX, DI
	XORL R12, R14
	VPXOR Y1, Y3, Y3
	RORXL $0x02, R11, R12
	XORL R9, R15
	VPXOR Y2, Y3, Y3
	XORL R12, R14
	MOVL R11, R12
	ANDL BX, R12
	ADDL R13, R15
	VPXOR Y8, Y3, Y1
	VPSHUFD $0xfa, Y7, Y2
	ORL R12, DI
	ADDL R14, R10
	VPADDD Y1, Y0, Y0
	ADDL R15, CX
	ADDL R15, R10
	ADDL DI, R10
	VPSRLD $0x0a, Y2, Y8
	MOVL R10, DI
	RORXL $0x19, CX, R13
	ADDL 8(SP)(SI*1), R9
	VPSRLQ $0x13, Y2, Y3
	RORXL $0x0b, CX, R14
	ORL AX, DI
	MOVL R8, R15
	XORL DX, R15
	RORXL $0x0d, R10, R12
	XORL R14, R13
	VPSRLQ $0x11, Y2, Y2
	ANDL CX, R15
	RORXL $0x06, CX, R14
	VPXOR Y3, Y2, Y2
	ADDL R9, BX
	ANDL R11, DI
	XORL R14, R13
	RORXL $0x16, R10, R14
	VPXOR Y2, Y8, Y8
	XORL DX, R15
	VPSHUFB shuff_00BA<>+0(SB), Y8, Y8
	XORL R12, R14
	RORXL $0x02, R10, R12
	VPADDD Y8, Y0, Y0
	XORL R12, R14
	MOVL R10, R12
	ANDL AX, R12
	ADDL R13, R15
	VPSHUFD $0x50, Y0, Y2
	ORL R12, DI
	ADDL R14, R9
	ADDL R15, BX
	ADDL R15, R9
	ADDL DI, R9
	MOVL R9, DI
	RORXL $0x19, BX, R13
	RORXL $0x0b, BX, R14
	ADDL 12(SP)(SI*1), DX
	ORL R11, DI
	VPSRLD $0x0a, Y2, Y11
	MOVL CX, R15
	RORXL $0x0d, R9, R12
	XORL R14, R13
	XORL R8, R15
	VPSRLQ $0x13, Y2, Y3
	RORXL $0x06, BX, R14
	ANDL BX, R15
	ADDL DX, AX
	ANDL R10, DI
	VPSRLQ $0x11, Y2, Y2
	XORL R14, R13
	XORL R8, R15
	VPXOR Y3, Y2, Y2
	RORXL $0x16, R9, R14
	ADDL R13, R15
	VPXOR Y2, Y11, Y11
	XORL R12, R14
	ADDL R15, AX
	RORXL $0x02, R9, R12
	VPSHUFB shuff_DC00<>+0(SB), Y11, Y11
	VPADDD Y0, Y11, Y4
	XORL R12, R14
	MOVL R9, R12
	ANDL R11, R12
	ORL R12, DI
	ADDL R14, DX
	ADDL R15, DX
	ADDL DI, DX

	// Do 4 rounds and scheduling
	VPADDD 32(BP)(SI*1), Y5, Y9
	VMOVDQU Y9, 32(SP)(SI*1)
	MOVL DX, DI
	RORXL $0x19, AX, R13
	RORXL $0x0b, AX, R14
	ADDL 32(SP)(SI*1), R8
	ORL R10, DI
	VPALIGNR $0x04, Y7, Y4, Y0
	MOVL BX, R15
	RORXL $0x0d, DX, R12
	XORL R14, R13
	XORL CX, R15
	VPADDD Y5, Y0, Y0
	RORXL $0x06, AX, R14
	ANDL AX, R15
	XORL R14, R13
	RORXL $0x16, DX, R14
	ADDL R8, R11
	ANDL R9, DI
	VPALIGNR $0x04, Y5, Y6, Y1
	XORL R12, R14
	RORXL $0x02, DX, R12
	XORL CX, R15
	VPSRLD $0x07, Y1, Y2
	XORL R12, R14
	MOVL DX, R12
	ANDL R10, R12
	ADDL R13, R15
	VPSLLD $0x19, Y1, Y3
	ORL R12, DI
	ADDL R14, R8
	ADDL R15, R11
	VPOR Y2, Y3, Y3
	VPSRLD $0x12, Y1, Y2
	ADDL R15, R8
	ADDL DI, R8
	MOVL R8, DI
	RORXL $0x19, R11, R13
	RORXL $0x0b, R11, R14
	ADDL 36(SP)(SI*1), CX
	ORL R9, DI
	VPSRLD $0x03, Y1, Y8
	MOVL AX, R15
	RORXL $0x0d, R8, R12
	XORL R14, R13
	XORL BX, R15
	RORXL $0x06, R11, R14
	XORL R14, R13
	RORXL $0x16, R8, R14
	ANDL R11, R15
	ADDL CX, R10
	VPSLLD $0x0e, Y1, Y1
	ANDL DX, DI
	XORL R12, R14
	VPXOR Y1, Y3, Y3
	RORXL $0x02, R8, R12
	XORL BX, R15
	VPXOR Y2, Y3, Y3
	XORL R12, R14
	MOVL R8, R12
	ANDL R9, R12
	ADDL R13, R15
	VPXOR Y8, Y3, Y1
	VPSHUFD $0xfa, Y4, Y2
	ORL R12, DI
	ADDL R14, CX
	VPADDD Y1, Y0, Y0
	ADDL R15, R10
	ADDL R15, CX
	ADDL DI, CX
	VPSRLD $0x0a, Y2, Y8
	MOVL CX, DI
	RORXL $0x19, R10, R13
	ADDL 40(SP)(SI*1), BX
	VPSRLQ $0x13, Y2, Y3
	RORXL $0x0b, R10, R14
	ORL DX, DI
	MOVL R11, R15
	XORL AX, R15
	RORXL $0x0d, CX, R12
	XORL R14, R13
	VPSRLQ $0x11, Y2, Y2
	ANDL R10, R15
	RORXL $0x06, R10, R14
	VPXOR Y3, Y2, Y2
	ADDL BX, R9
	ANDL R8, DI
	XORL R14, R13
	RORXL $0x16, CX, R14
	VPXOR Y2, Y8, Y8
	XORL AX, R15
	VPSHUFB shuff_00BA<>+0(SB), Y8, Y8
	XORL R12, R14
	RORXL $0x02, CX, R12
	VPADDD Y8, Y0, Y0
	XORL R12, R14
	MOVL CX, R12
	ANDL DX, R12
	ADDL R13, R15
	VPSHUFD $0x50, Y0, Y2
	ORL R12, DI
	ADDL R14, BX
	ADDL R15, R9
	ADDL R15, BX
	ADDL DI, BX
	MOVL BX, DI
	RORXL $0x19, R9, R13
	RORXL $0x0b, R9, R14
	ADDL 44(SP)(SI*1), AX
	ORL R8, DI
	VPSRLD $0x0a, Y2, Y11
	MOVL R10, R15
	RORXL $0x0d, BX, R12
	XORL R14, R13
	XORL R11, R15
	VPSRLQ $0x13, Y2, Y3
	RORXL $0x06, R9, R14
	ANDL R9, R15
	ADDL AX, DX
	ANDL CX, DI
	VPSRLQ $0x11, Y2, Y2
	XORL R14, R13
	XORL R11, R15
	VPXOR Y3, Y2, Y2
	RORXL $0x16, BX, R14
	ADDL R13, R15
	VPXOR Y2, Y11, Y11
	XORL R12, R14
	ADDL R15, DX
	RORXL $0x02, BX, R12
	VPSHUFB shuff_DC00<>+0(SB), Y11, Y11
	VPADDD Y0, Y11, Y5
	XORL R12, R14
	MOVL BX, R12
	ANDL R8, R12
	ORL R12, DI
	ADDL R14, AX
	ADDL R15, AX
	ADDL DI, AX

	// Do 4 rounds and scheduling
	VPADDD 64(BP)(SI*1), Y6, Y9
	VMOVDQU Y9, 64(SP)(SI*1)
	MOVL AX, DI
	RORXL $0x19, DX, R13
	RORXL $0x0b, DX, R14
	ADDL 64(SP)(SI*1), R11
	ORL CX, DI
	VPALIGNR $0x04, Y4, Y5, Y0
	MOVL R9, R15
	RORXL $0x0d, AX, R12
	XORL R14, R13
	XORL R10, R15
	VPADDD Y6, Y0, Y0
	RORXL $0x06, DX, R14
	ANDL DX, R15
	XORL R14, R13
	RORXL $0x16, AX, R14
	ADDL R11, R8
	ANDL BX, DI
	VPALIGNR $0x04, Y6, Y7, Y1
	XORL R12, R14
	RORXL $0x02, AX, R12
	XORL R10, R15
	VPSRLD $0x07, Y1, Y2
	XORL R12, R14
	MOVL AX, R12
	ANDL CX, R12
	ADDL R13, R15
	VPSLLD $0x19, Y1, Y3
	ORL R12, DI
	ADDL R14, R11
	ADDL R15, R8
	VPOR Y2, Y3, Y3
	VPSRLD $0x12, Y1, Y2
	ADDL R15, R11
	ADDL DI, R11
	MOVL R11, DI
	RORXL $0x19, R8, R13
	RORXL $0x0b, R8, R14
	ADDL 68(SP)(SI*1), R10
	ORL BX, DI
	VPSRLD $0x03, Y1, Y8
	MOVL DX, R15
	RORXL $0x0d, R11, R12
	XORL R14, R13
	XORL R9, R15
	RORXL $0x06, R8, R14
	XORL R14, R13
	RORXL $0x16, R11, R14
	ANDL R8, R15
	ADDL R10, CX
	VPSLLD $0x0e, Y1, Y1
	ANDL AX, DI
	XORL R12, R14
	VPXOR Y1, Y3, Y3
	RORXL $0x02, R11, R12
	XORL R9, R15
	VPXOR Y2, Y3, Y3
	XORL R12, R14
	MOVL R11, R12
	ANDL BX, R12
	ADDL R13, R15
	VPXOR Y8, Y3, Y1
	VPSHUFD $0xfa, Y5, Y2
	ORL R12, DI
	ADDL R14, R10
	VPADDD Y1, Y0, Y0
	ADDL R15, CX
	ADDL R15, R10
	ADDL DI, R10
	VPSRLD $0x0a, Y2, Y8
	MOVL R10, DI
	RORXL $0x19, CX, R13
	ADDL 72(SP)(SI*1), R9
	VPSRLQ $0x13, Y2, Y3
	RORXL $0x0b, CX, R14
	ORL AX, DI
	MOVL R8, R15
	XORL DX, R15
	RORXL $0x0d, R10, R12
	XORL R14, R13
	VPSRLQ $0x11, Y2, Y2
	ANDL CX, R15
	RORXL $0x06, CX, R14
	VPXOR Y3, Y2, Y2
	ADDL R9, BX
	ANDL R11, DI
	XORL R14, R13
	RORXL $0x16, R10, R14
	VPXOR Y2, Y8, Y8
	XORL DX, R15
	VPSHUFB shuff_00BA<>+0(SB), Y8, Y8
	XORL R12, R14
	RORXL $0x02, R10, R12
	VPADDD Y8, Y0, Y0
	XORL R12, R14
	MOVL R10, R12
	ANDL AX, R12
	ADDL R13, R15
	VPSHUFD $0x50, Y0, Y2
	ORL R12, DI
	ADDL R14, R9
	ADDL R15, BX
	ADDL R15, R9
	ADDL DI, R9
	MOVL R9, DI
	RORXL $0x19, BX, R13
	RORXL $0x0b, BX, R14
	ADDL 76(SP)(SI*1), DX
	ORL R11, DI
	VPSRLD $0x0a, Y2, Y11
	MOVL CX, R15
	RORXL $0x0d, R9, R12
	XORL R14, R13
	XORL R8, R15
	VPSRLQ $0x13, Y2, Y3
	RORXL $0x06, BX, R14
	ANDL BX, R15
	ADDL DX, AX
	ANDL R10, DI
	VPSRLQ $0x11, Y2, Y2
	XORL R14, R13
	XORL R8, R15
	VPXOR Y3, Y2, Y2
	RORXL $0x16, R9, R14
	ADDL R13, R15
	VPXOR Y2, Y11, Y11
	XORL R12, R14
	ADDL R15, AX
	RORXL $0x02, R9, R12
	VPSHUFB shuff_DC00<>+0(SB), Y11, Y11
	VPADDD Y0, Y11, Y6
	XORL R12, R14
	MOVL R9, R12
	ANDL R11, R12
	ORL R12, DI
	ADDL R14, DX
	ADDL R15, DX
	ADDL DI, DX

	// Do 4 rounds and scheduling
	VPADDD 96(BP)(SI*1), Y7, Y9
	VMOVDQU Y9, 96(SP)(SI*1)
	MOVL DX, DI
	RORXL $0x19, AX, R13
	RORXL $0x0b, AX, R14
	ADDL 96(SP)(SI*1), R8
	ORL R10, DI
	VPALIGNR $0x04, Y5, Y6, Y0
	MOVL BX, R15
	RORXL $0x0d, DX, R12
	XORL R14, R13
	XORL CX, R15
	VPADDD Y7, Y0, Y0
	RORXL $0x06, AX, R14
	ANDL AX, R15
	XORL R14, R13
	RORXL $0x16, DX, R14
	ADDL R8, R11
	ANDL R9, DI
	VPALIGNR $0x04, Y7, Y4, Y1
	XORL R12, R14
	RORXL $0x02, DX, R12
	XORL CX, R15
	VPSRLD $0x07, Y1, Y2
	XORL R12, R14
	MOVL DX, R12
	ANDL R10, R12
	ADDL R13, R15
	VPSLLD $0x19, Y1, Y3
	ORL R12, DI
	ADDL R14, R8
	ADDL R15, R11
	VPOR Y2, Y3, Y3
	VPSRLD $0x12, Y1, Y2
	ADDL R15, R8
	ADDL DI, R8
	MOVL R8, DI
	RORXL $0x19, R11, R13
	RORXL $0x0b, R11, R14
	ADDL 100(SP)(SI*1), CX
	ORL R9, DI
	VPSRLD $0x03, Y1, Y8
	MOVL AX, R15
	RORXL $0x0d, R8, R12
	XORL R14, R13
	XORL BX, R15
	RORXL $0x06, R11, R14
	XORL R14, R13
	RORXL $0x16, R8, R14
	ANDL R11, R15
	ADDL CX, R10
	VPSLLD $0x0e, Y1, Y1
	ANDL DX, DI
	XORL R12, R14
	VPXOR Y1, Y3, Y3
	RORXL $0x02, R8, R12
	XORL BX, R15
	VPXOR Y2, Y3, Y3
	XORL R12, R14
	MOVL R8, R12
	ANDL R9, R12
	ADDL R13, R15
	VPXOR Y8, Y3, Y1
	VPSHUFD $0xfa, Y6, Y2
	ORL R12, DI
	ADDL R14, CX
	VPADDD Y1, Y0, Y0
	ADDL R15, R10
	ADDL R15, CX
	ADDL DI, CX
	VPSRLD $0x0a, Y2, Y8
	MOVL CX, DI
	RORXL $0x19, R10, R13
	ADDL 104(SP)(SI*1), BX
	VPSRLQ $0x13, Y2, Y3
	RORXL $0x0b, R10, R14
	ORL DX, DI
	MOVL R11, R15
	XORL AX, R15
	RORXL $0x0d, CX, R12
	XORL R14, R13
	VPSRLQ $0x11, Y2, Y2
	ANDL R10, R15
	RORXL $0x06, R10, R14
	VPXOR Y3, Y2, Y2
	ADDL BX, R9
	ANDL R8, DI
	XORL R14, R13
	RORXL $0x16, CX, R14
	VPXOR Y2, Y8, Y8
	XORL AX, R15
	VPSHUFB shuff_00BA<>+0(SB), Y8, Y8
	XORL R12, R14
	RORXL $0x02, CX, R12
	VPADDD Y8, Y0, Y0
	XORL R12, R14
	MOVL CX, R12
	ANDL DX, R12
	ADDL R13, R15
	VPSHUFD $0x50, Y0, Y2
	ORL R12, DI
	ADDL R14, BX
	ADDL R15, R9
	ADDL R15, BX
	ADDL DI, BX
	MOVL BX, DI
	RORXL $0x19, R9, R13
	RORXL $0x0b, R9, R14
	ADDL 108(SP)(SI*1), AX
	ORL R8, DI
	VPSRLD $0x0a, Y2, Y11
	MOVL R10, R15
	RORXL $0x0d, BX, R12
	XORL R14, R13
	XORL R11, R15
	VPSRLQ $0x13, Y2, Y3
	RORXL $0x06, R9, R14
	ANDL R9, R15
	ADDL AX, DX
	ANDL CX, DI
	VPSRLQ $0x11, Y2, Y2
	XORL R14, R13
	XORL R11, R15
	VPXOR Y3, Y2, Y2
	RORXL $0x16, BX, R14
	ADDL R13, R15
	VPXOR Y2, Y11, Y11
	XORL R12, R14
	ADDL R15, DX
	RORXL $0x02, BX, R12
	VPSHUFB shuff_DC00<>+0(SB), Y11, Y11
	VPADDD Y0, Y11, Y7
	XORL R12, R14
	MOVL BX, R12
	ANDL R8, R12
	ORL R12, DI
	ADDL R14, AX
	ADDL R15, AX
	ADDL DI, AX
	ADDQ $0x80, SI
	CMPQ SI, $0x00000180
	JB avx2_loop1

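	// avx2_loop2 finishes rounds 48-63 of the first block, eight per
	// iteration; the message schedule is complete, so only the W+K
	// additions, stack spills, and scalar rounds remain. The spilled
	// values also feed the second block's rounds in avx2_loop3.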
avx2_loop2:
	VPADDD (BP)(SI*1), Y4, Y9
	VMOVDQU Y9, (SP)(SI*1)
	MOVL R9, R15
	RORXL $0x19, DX, R13
	RORXL $0x0b, DX, R14
	XORL R10, R15
	XORL R14, R13
	RORXL $0x06, DX, R14
	ANDL DX, R15
	XORL R14, R13
	RORXL $0x0d, AX, R12
	XORL R10, R15
	RORXL $0x16, AX, R14
	MOVL AX, DI
	XORL R12, R14
	RORXL $0x02, AX, R12
	ADDL (SP)(SI*1), R11
	ORL CX, DI
	XORL R12, R14
	MOVL AX, R12
	ANDL BX, DI
	ANDL CX, R12
	ADDL R13, R15
	ADDL R11, R8
	ORL R12, DI
	ADDL R14, R11
	ADDL R15, R8
	ADDL R15, R11
	MOVL DX, R15
	RORXL $0x19, R8, R13
	RORXL $0x0b, R8, R14
	XORL R9, R15
	XORL R14, R13
	RORXL $0x06, R8, R14
	ANDL R8, R15
	ADDL DI, R11
	XORL R14, R13
	RORXL $0x0d, R11, R12
	XORL R9, R15
	RORXL $0x16, R11, R14
	MOVL R11, DI
	XORL R12, R14
	RORXL $0x02, R11, R12
	ADDL 4(SP)(SI*1), R10
	ORL BX, DI
	XORL R12, R14
	MOVL R11, R12
	ANDL AX, DI
	ANDL BX, R12
	ADDL R13, R15
	ADDL R10, CX
	ORL R12, DI
	ADDL R14, R10
	ADDL R15, CX
	ADDL R15, R10
	MOVL R8, R15
	RORXL $0x19, CX, R13
	RORXL $0x0b, CX, R14
	XORL DX, R15
	XORL R14, R13
	RORXL $0x06, CX, R14
	ANDL CX, R15
	ADDL DI, R10
	XORL R14, R13
	RORXL $0x0d, R10, R12
	XORL DX, R15
	RORXL $0x16, R10, R14
	MOVL R10, DI
	XORL R12, R14
	RORXL $0x02, R10, R12
	ADDL 8(SP)(SI*1), R9
	ORL AX, DI
	XORL R12, R14
	MOVL R10, R12
	ANDL R11, DI
	ANDL AX, R12
	ADDL R13, R15
	ADDL R9, BX
	ORL R12, DI
	ADDL R14, R9
	ADDL R15, BX
	ADDL R15, R9
	MOVL CX, R15
	RORXL $0x19, BX, R13
	RORXL $0x0b, BX, R14
	XORL R8, R15
	XORL R14, R13
	RORXL $0x06, BX, R14
	ANDL BX, R15
	ADDL DI, R9
	XORL R14, R13
	RORXL $0x0d, R9, R12
	XORL R8, R15
	RORXL $0x16, R9, R14
	MOVL R9, DI
	XORL R12, R14
	RORXL $0x02, R9, R12
	ADDL 12(SP)(SI*1), DX
	ORL R11, DI
	XORL R12, R14
	MOVL R9, R12
	ANDL R10, DI
	ANDL R11, R12
	ADDL R13, R15
	ADDL DX, AX
	ORL R12, DI
	ADDL R14, DX
	ADDL R15, AX
	ADDL R15, DX
	ADDL DI, DX
	VPADDD 32(BP)(SI*1), Y5, Y9
	VMOVDQU Y9, 32(SP)(SI*1)
	MOVL BX, R15
	RORXL $0x19, AX, R13
	RORXL $0x0b, AX, R14
	XORL CX, R15
	XORL R14, R13
	RORXL $0x06, AX, R14
	ANDL AX, R15
	XORL R14, R13
	RORXL $0x0d, DX, R12
	XORL CX, R15
	RORXL $0x16, DX, R14
	MOVL DX, DI
	XORL R12, R14
	RORXL $0x02, DX, R12
	ADDL 32(SP)(SI*1), R8
	ORL R10, DI
	XORL R12, R14
	MOVL DX, R12
	ANDL R9, DI
	ANDL R10, R12
	ADDL R13, R15
	ADDL R8, R11
	ORL R12, DI
	ADDL R14, R8
	ADDL R15, R11
	ADDL R15, R8
	MOVL AX, R15
	RORXL $0x19, R11, R13
	RORXL $0x0b, R11, R14
	XORL BX, R15
	XORL R14, R13
	RORXL $0x06, R11, R14
	ANDL R11, R15
	ADDL DI, R8
	XORL R14, R13
	RORXL $0x0d, R8, R12
	XORL BX, R15
	RORXL $0x16, R8, R14
	MOVL R8, DI
	XORL R12, R14
	RORXL $0x02, R8, R12
	ADDL 36(SP)(SI*1), CX
	ORL R9, DI
	XORL R12, R14
	MOVL R8, R12
	ANDL DX, DI
	ANDL R9, R12
	ADDL R13, R15
	ADDL CX, R10
	ORL R12, DI
	ADDL R14, CX
	ADDL R15, R10
	ADDL R15, CX
	MOVL R11, R15
	RORXL $0x19, R10, R13
	RORXL $0x0b, R10, R14
	XORL AX, R15
	XORL R14, R13
	RORXL $0x06, R10, R14
	ANDL R10, R15
	ADDL DI, CX
	XORL R14, R13
	RORXL $0x0d, CX, R12
	XORL AX, R15
	RORXL $0x16, CX, R14
	MOVL CX, DI
	XORL R12, R14
	RORXL $0x02, CX, R12
	ADDL 40(SP)(SI*1), BX
	ORL DX, DI
	XORL R12, R14
	MOVL CX, R12
	ANDL R8, DI
	ANDL DX, R12
	ADDL R13, R15
	ADDL BX, R9
	ORL R12, DI
	ADDL R14, BX
	ADDL R15, R9
	ADDL R15, BX
	MOVL R10, R15
	RORXL $0x19, R9, R13
	RORXL $0x0b, R9, R14
	XORL R11, R15
	XORL R14, R13
	RORXL $0x06, R9, R14
	ANDL R9, R15
	ADDL DI, BX
	XORL R14, R13
	RORXL $0x0d, BX, R12
	XORL R11, R15
	RORXL $0x16, BX, R14
	MOVL BX, DI
	XORL R12, R14
	RORXL $0x02, BX, R12
	ADDL 44(SP)(SI*1), AX
	ORL R8, DI
	XORL R12, R14
	MOVL BX, R12
	ANDL CX, DI
	ANDL R8, R12
	ADDL R13, R15
	ADDL AX, DX
	ORL R12, DI
	ADDL R14, AX
	ADDL R15, DX
	ADDL R15, AX
	ADDL DI, AX
	ADDQ $0x40, SI
	VMOVDQU Y6, Y4
	VMOVDQU Y7, Y5
	CMPQ SI, $0x00000200
	JB avx2_loop2
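	// First block done: fold the working registers back into the digest
	// (feed-forward) and reload the updated state for the second block's
	// rounds.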
	MOVQ dig+0(FP), SI
	MOVQ 520(SP), DI
	ADDL AX, (SI)
	MOVL (SI), AX
	ADDL BX, 4(SI)
	MOVL 4(SI), BX
	ADDL CX, 8(SI)
	MOVL 8(SI), CX
	ADDL R8, 12(SI)
	MOVL 12(SI), R8
	ADDL DX, 16(SI)
	MOVL 16(SI), DX
	ADDL R9, 20(SI)
	MOVL 20(SI), R9
	ADDL R10, 24(SI)
	MOVL 24(SI), R10
	ADDL R11, 28(SI)
	MOVL 28(SI), R11
	CMPQ 512(SP), DI
	JB done_hash
	XORQ SI, SI

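	// avx2_loop3 runs all 64 rounds of the second block, reusing the W+K
	// values already spilled to the stack: each 32-byte row holds the
	// first block's values in its low 16 bytes and the second block's in
	// the high 16 bytes, hence the +16 and +48 offsets below.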
avx2_loop3:
	MOVL R9, R15
	RORXL $0x19, DX, R13
	RORXL $0x0b, DX, R14
	XORL R10, R15
	XORL R14, R13
	RORXL $0x06, DX, R14
	ANDL DX, R15
	XORL R14, R13
	RORXL $0x0d, AX, R12
	XORL R10, R15
	RORXL $0x16, AX, R14
	MOVL AX, DI
	XORL R12, R14
	RORXL $0x02, AX, R12
	ADDL 16(SP)(SI*1), R11
	ORL CX, DI
	XORL R12, R14
	MOVL AX, R12
	ANDL BX, DI
	ANDL CX, R12
	ADDL R13, R15
	ADDL R11, R8
	ORL R12, DI
	ADDL R14, R11
	ADDL R15, R8
	ADDL R15, R11
	MOVL DX, R15
	RORXL $0x19, R8, R13
	RORXL $0x0b, R8, R14
	XORL R9, R15
	XORL R14, R13
	RORXL $0x06, R8, R14
	ANDL R8, R15
	ADDL DI, R11
	XORL R14, R13
	RORXL $0x0d, R11, R12
	XORL R9, R15
	RORXL $0x16, R11, R14
	MOVL R11, DI
	XORL R12, R14
	RORXL $0x02, R11, R12
	ADDL 20(SP)(SI*1), R10
	ORL BX, DI
	XORL R12, R14
	MOVL R11, R12
	ANDL AX, DI
	ANDL BX, R12
	ADDL R13, R15
	ADDL R10, CX
	ORL R12, DI
	ADDL R14, R10
	ADDL R15, CX
	ADDL R15, R10
	MOVL R8, R15
	RORXL $0x19, CX, R13
	RORXL $0x0b, CX, R14
	XORL DX, R15
	XORL R14, R13
	RORXL $0x06, CX, R14
	ANDL CX, R15
	ADDL DI, R10
	XORL R14, R13
	RORXL $0x0d, R10, R12
	XORL DX, R15
	RORXL $0x16, R10, R14
	MOVL R10, DI
	XORL R12, R14
	RORXL $0x02, R10, R12
	ADDL 24(SP)(SI*1), R9
	ORL AX, DI
	XORL R12, R14
	MOVL R10, R12
	ANDL R11, DI
	ANDL AX, R12
	ADDL R13, R15
	ADDL R9, BX
	ORL R12, DI
	ADDL R14, R9
	ADDL R15, BX
	ADDL R15, R9
	MOVL CX, R15
	RORXL $0x19, BX, R13
	RORXL $0x0b, BX, R14
	XORL R8, R15
	XORL R14, R13
	RORXL $0x06, BX, R14
	ANDL BX, R15
	ADDL DI, R9
	XORL R14, R13
	RORXL $0x0d, R9, R12
	XORL R8, R15
	RORXL $0x16, R9, R14
	MOVL R9, DI
	XORL R12, R14
	RORXL $0x02, R9, R12
	ADDL 28(SP)(SI*1), DX
	ORL R11, DI
	XORL R12, R14
	MOVL R9, R12
	ANDL R10, DI
	ANDL R11, R12
	ADDL R13, R15
	ADDL DX, AX
	ORL R12, DI
	ADDL R14, DX
	ADDL R15, AX
	ADDL R15, DX
	ADDL DI, DX
	MOVL BX, R15
	RORXL $0x19, AX, R13
	RORXL $0x0b, AX, R14
	XORL CX, R15
	XORL R14, R13
	RORXL $0x06, AX, R14
	ANDL AX, R15
	XORL R14, R13
	RORXL $0x0d, DX, R12
	XORL CX, R15
	RORXL $0x16, DX, R14
	MOVL DX, DI
	XORL R12, R14
	RORXL $0x02, DX, R12
	ADDL 48(SP)(SI*1), R8
	ORL R10, DI
	XORL R12, R14
	MOVL DX, R12
	ANDL R9, DI
	ANDL R10, R12
	ADDL R13, R15
	ADDL R8, R11
	ORL R12, DI
	ADDL R14, R8
	ADDL R15, R11
	ADDL R15, R8
	MOVL AX, R15
	RORXL $0x19, R11, R13
	RORXL $0x0b, R11, R14
	XORL BX, R15
	XORL R14, R13
	RORXL $0x06, R11, R14
	ANDL R11, R15
	ADDL DI, R8
	XORL R14, R13
	RORXL $0x0d, R8, R12
	XORL BX, R15
	RORXL $0x16, R8, R14
	MOVL R8, DI
	XORL R12, R14
	RORXL $0x02, R8, R12
	ADDL 52(SP)(SI*1), CX
	ORL R9, DI
	XORL R12, R14
	MOVL R8, R12
	ANDL DX, DI
	ANDL R9, R12
	ADDL R13, R15
	ADDL CX, R10
	ORL R12, DI
	ADDL R14, CX
	ADDL R15, R10
	ADDL R15, CX
	MOVL R11, R15
	RORXL $0x19, R10, R13
	RORXL $0x0b, R10, R14
	XORL AX, R15
	XORL R14, R13
	RORXL $0x06, R10, R14
	ANDL R10, R15
	ADDL DI, CX
	XORL R14, R13
	RORXL $0x0d, CX, R12
	XORL AX, R15
	RORXL $0x16, CX, R14
	MOVL CX, DI
	XORL R12, R14
	RORXL $0x02, CX, R12
	ADDL 56(SP)(SI*1), BX
	ORL DX, DI
	XORL R12, R14
	MOVL CX, R12
	ANDL R8, DI
	ANDL DX, R12
	ADDL R13, R15
	ADDL BX, R9
	ORL R12, DI
	ADDL R14, BX
	ADDL R15, R9
	ADDL R15, BX
	MOVL R10, R15
	RORXL $0x19, R9, R13
	RORXL $0x0b, R9, R14
	XORL R11, R15
	XORL R14, R13
	RORXL $0x06, R9, R14
	ANDL R9, R15
	ADDL DI, BX
	XORL R14, R13
	RORXL $0x0d, BX, R12
	XORL R11, R15
	RORXL $0x16, BX, R14
	MOVL BX, DI
	XORL R12, R14
	RORXL $0x02, BX, R12
	ADDL 60(SP)(SI*1), AX
	ORL R8, DI
	XORL R12, R14
	MOVL BX, R12
	ANDL CX, DI
	ANDL R8, R12
	ADDL R13, R15
	ADDL AX, DX
	ORL R12, DI
	ADDL R14, AX
	ADDL R15, DX
	ADDL R15, AX
	ADDL DI, AX
	ADDQ $0x40, SI
	CMPQ SI, $0x00000200
	JB avx2_loop3
	MOVQ dig+0(FP), SI
	MOVQ 520(SP), DI
	ADDQ $0x40, DI
	ADDL AX, (SI)
	MOVL (SI), AX
	ADDL BX, 4(SI)
	MOVL 4(SI), BX
	ADDL CX, 8(SI)
	MOVL 8(SI), CX
	ADDL R8, 12(SI)
	MOVL 12(SI), R8
	ADDL DX, 16(SI)
	MOVL 16(SI), DX
	ADDL R9, 20(SI)
	MOVL 20(SI), R9
	ADDL R10, 24(SI)
	MOVL 24(SI), R10
	ADDL R11, 28(SI)
	MOVL 28(SI), R11
	CMPQ 512(SP), DI
	JA avx2_loop0
	JB done_hash

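	// Reached when exactly one 64-byte block remains: load it into the
	// low 128-bit lanes only and reuse the avx2_loop1/avx2_loop2 path.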
avx2_do_last_block:
	VMOVDQU (DI), X4
	VMOVDQU 16(DI), X5
	VMOVDQU 32(DI), X6
	VMOVDQU 48(DI), X7
	VMOVDQU flip_mask<>+0(SB), Y13
	VPSHUFB X13, X4, X4
	VPSHUFB X13, X5, X5
	VPSHUFB X13, X6, X6
	VPSHUFB X13, X7, X7
	LEAQ K256<>+0(SB), BP
	JMP avx2_last_block_enter

avx2_only_one_block:
	MOVL (SI), AX
	MOVL 4(SI), BX
	MOVL 8(SI), CX
	MOVL 12(SI), R8
	MOVL 16(SI), DX
	MOVL 20(SI), R9
	MOVL 24(SI), R10
	MOVL 28(SI), R11
	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

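// flip_mask byte-swaps each 32-bit word, converting little-endian input
// bytes to the big-endian word order SHA-256 operates on.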
DATA flip_mask<>+0(SB)/8, $0x0405060700010203
DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+16(SB)/8, $0x0405060700010203
DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $32

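// K256 holds the 64 SHA-256 round constants with each group of four
// duplicated, so one 32-byte load feeds the same constants to both
// 128-bit lanes (one lane per block).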
DATA K256<>+0(SB)/4, $0x428a2f98
DATA K256<>+4(SB)/4, $0x71374491
DATA K256<>+8(SB)/4, $0xb5c0fbcf
DATA K256<>+12(SB)/4, $0xe9b5dba5
DATA K256<>+16(SB)/4, $0x428a2f98
DATA K256<>+20(SB)/4, $0x71374491
DATA K256<>+24(SB)/4, $0xb5c0fbcf
DATA K256<>+28(SB)/4, $0xe9b5dba5
DATA K256<>+32(SB)/4, $0x3956c25b
DATA K256<>+36(SB)/4, $0x59f111f1
DATA K256<>+40(SB)/4, $0x923f82a4
DATA K256<>+44(SB)/4, $0xab1c5ed5
DATA K256<>+48(SB)/4, $0x3956c25b
DATA K256<>+52(SB)/4, $0x59f111f1
DATA K256<>+56(SB)/4, $0x923f82a4
DATA K256<>+60(SB)/4, $0xab1c5ed5
DATA K256<>+64(SB)/4, $0xd807aa98
DATA K256<>+68(SB)/4, $0x12835b01
DATA K256<>+72(SB)/4, $0x243185be
DATA K256<>+76(SB)/4, $0x550c7dc3
DATA K256<>+80(SB)/4, $0xd807aa98
DATA K256<>+84(SB)/4, $0x12835b01
DATA K256<>+88(SB)/4, $0x243185be
DATA K256<>+92(SB)/4, $0x550c7dc3
DATA K256<>+96(SB)/4, $0x72be5d74
DATA K256<>+100(SB)/4, $0x80deb1fe
DATA K256<>+104(SB)/4, $0x9bdc06a7
DATA K256<>+108(SB)/4, $0xc19bf174
DATA K256<>+112(SB)/4, $0x72be5d74
DATA K256<>+116(SB)/4, $0x80deb1fe
DATA K256<>+120(SB)/4, $0x9bdc06a7
DATA K256<>+124(SB)/4, $0xc19bf174
DATA K256<>+128(SB)/4, $0xe49b69c1
DATA K256<>+132(SB)/4, $0xefbe4786
DATA K256<>+136(SB)/4, $0x0fc19dc6
DATA K256<>+140(SB)/4, $0x240ca1cc
DATA K256<>+144(SB)/4, $0xe49b69c1
DATA K256<>+148(SB)/4, $0xefbe4786
DATA K256<>+152(SB)/4, $0x0fc19dc6
DATA K256<>+156(SB)/4, $0x240ca1cc
DATA K256<>+160(SB)/4, $0x2de92c6f
DATA K256<>+164(SB)/4, $0x4a7484aa
DATA K256<>+168(SB)/4, $0x5cb0a9dc
DATA K256<>+172(SB)/4, $0x76f988da
DATA K256<>+176(SB)/4, $0x2de92c6f
DATA K256<>+180(SB)/4, $0x4a7484aa
DATA K256<>+184(SB)/4, $0x5cb0a9dc
DATA K256<>+188(SB)/4, $0x76f988da
DATA K256<>+192(SB)/4, $0x983e5152
DATA K256<>+196(SB)/4, $0xa831c66d
DATA K256<>+200(SB)/4, $0xb00327c8
DATA K256<>+204(SB)/4, $0xbf597fc7
DATA K256<>+208(SB)/4, $0x983e5152
DATA K256<>+212(SB)/4, $0xa831c66d
DATA K256<>+216(SB)/4, $0xb00327c8
DATA K256<>+220(SB)/4, $0xbf597fc7
DATA K256<>+224(SB)/4, $0xc6e00bf3
DATA K256<>+228(SB)/4, $0xd5a79147
DATA K256<>+232(SB)/4, $0x06ca6351
DATA K256<>+236(SB)/4, $0x14292967
DATA K256<>+240(SB)/4, $0xc6e00bf3
DATA K256<>+244(SB)/4, $0xd5a79147
DATA K256<>+248(SB)/4, $0x06ca6351
DATA K256<>+252(SB)/4, $0x14292967
DATA K256<>+256(SB)/4, $0x27b70a85
DATA K256<>+260(SB)/4, $0x2e1b2138
DATA K256<>+264(SB)/4, $0x4d2c6dfc
DATA K256<>+268(SB)/4, $0x53380d13
DATA K256<>+272(SB)/4, $0x27b70a85
DATA K256<>+276(SB)/4, $0x2e1b2138
DATA K256<>+280(SB)/4, $0x4d2c6dfc
DATA K256<>+284(SB)/4, $0x53380d13
DATA K256<>+288(SB)/4, $0x650a7354
DATA K256<>+292(SB)/4, $0x766a0abb
DATA K256<>+296(SB)/4, $0x81c2c92e
DATA K256<>+300(SB)/4, $0x92722c85
DATA K256<>+304(SB)/4, $0x650a7354
DATA K256<>+308(SB)/4, $0x766a0abb
DATA K256<>+312(SB)/4, $0x81c2c92e
DATA K256<>+316(SB)/4, $0x92722c85
DATA K256<>+320(SB)/4, $0xa2bfe8a1
DATA K256<>+324(SB)/4, $0xa81a664b
DATA K256<>+328(SB)/4, $0xc24b8b70
DATA K256<>+332(SB)/4, $0xc76c51a3
DATA K256<>+336(SB)/4, $0xa2bfe8a1
DATA K256<>+340(SB)/4, $0xa81a664b
DATA K256<>+344(SB)/4, $0xc24b8b70
DATA K256<>+348(SB)/4, $0xc76c51a3
DATA K256<>+352(SB)/4, $0xd192e819
DATA K256<>+356(SB)/4, $0xd6990624
DATA K256<>+360(SB)/4, $0xf40e3585
DATA K256<>+364(SB)/4, $0x106aa070
DATA K256<>+368(SB)/4, $0xd192e819
DATA K256<>+372(SB)/4, $0xd6990624
DATA K256<>+376(SB)/4, $0xf40e3585
DATA K256<>+380(SB)/4, $0x106aa070
DATA K256<>+384(SB)/4, $0x19a4c116
DATA K256<>+388(SB)/4, $0x1e376c08
DATA K256<>+392(SB)/4, $0x2748774c
DATA K256<>+396(SB)/4, $0x34b0bcb5
DATA K256<>+400(SB)/4, $0x19a4c116
DATA K256<>+404(SB)/4, $0x1e376c08
DATA K256<>+408(SB)/4, $0x2748774c
DATA K256<>+412(SB)/4, $0x34b0bcb5
DATA K256<>+416(SB)/4, $0x391c0cb3
DATA K256<>+420(SB)/4, $0x4ed8aa4a
DATA K256<>+424(SB)/4, $0x5b9cca4f
DATA K256<>+428(SB)/4, $0x682e6ff3
DATA K256<>+432(SB)/4, $0x391c0cb3
DATA K256<>+436(SB)/4, $0x4ed8aa4a
DATA K256<>+440(SB)/4, $0x5b9cca4f
DATA K256<>+444(SB)/4, $0x682e6ff3
DATA K256<>+448(SB)/4, $0x748f82ee
DATA K256<>+452(SB)/4, $0x78a5636f
DATA K256<>+456(SB)/4, $0x84c87814
DATA K256<>+460(SB)/4, $0x8cc70208
DATA K256<>+464(SB)/4, $0x748f82ee
DATA K256<>+468(SB)/4, $0x78a5636f
DATA K256<>+472(SB)/4, $0x84c87814
DATA K256<>+476(SB)/4, $0x8cc70208
DATA K256<>+480(SB)/4, $0x90befffa
DATA K256<>+484(SB)/4, $0xa4506ceb
DATA K256<>+488(SB)/4, $0xbef9a3f7
DATA K256<>+492(SB)/4, $0xc67178f2
DATA K256<>+496(SB)/4, $0x90befffa
DATA K256<>+500(SB)/4, $0xa4506ceb
DATA K256<>+504(SB)/4, $0xbef9a3f7
DATA K256<>+508(SB)/4, $0xc67178f2
GLOBL K256<>(SB), RODATA|NOPTR, $512

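// shuff_00BA and shuff_DC00 place the four freshly scheduled words into
// the low and high halves of each lane, respectively; bytes of 0xff make
// PSHUFB write zeros into the unused positions.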
DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff
DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff
GLOBL shuff_00BA<>(SB), RODATA, $32

DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff
DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff
DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), RODATA, $32

// func blockSHANI(dig *Digest, p []byte)
// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3
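//
// The SHA-NI path keeps the state in two registers in the ABEF/CDGH
// arrangement required by SHA256RNDS2, which performs two rounds per
// invocation. SHA256MSG1 and SHA256MSG2 compute most of the message
// schedule; the PALIGNR/PADDD pair in each group supplies the W[t-7]
// term those instructions do not cover.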
TEXT ·blockSHANI(SB), $0-32
	MOVQ dig+0(FP), DI
	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $0x06, DX
	SHLQ $0x06, DX
	CMPQ DX, $0x00
	JEQ done
	ADDQ SI, DX
	VMOVDQU (DI), X1
	VMOVDQU 16(DI), X2
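	// rearrange the eight state words from their natural H0..H7 order
	// into the ABEF/CDGH register layout required by SHA256RNDS2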
	PSHUFD $0xb1, X1, X1
	PSHUFD $0x1b, X2, X2
	VMOVDQA X1, X7
	PALIGNR $0x08, X2, X1
	PBLENDW $0xf0, X7, X2
	VMOVDQA flip_mask<>+0(SB), X8
	LEAQ K256<>+0(SB), AX

roundLoop:
	// save hash values for addition after rounds
	VMOVDQA X1, X9
	VMOVDQA X2, X10

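	// Each 16-byte message chunk is byte-swapped, added to its round
	// constants, and run through SHA256RNDS2 twice: PSHUFD $0x0e moves
	// the upper qword of W+K down for the second pair of rounds, giving
	// four rounds per chunk.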
	// do rounds 0-59
	VMOVDQU (SI), X0
	PSHUFB X8, X0
	VMOVDQA X0, X3
	PADDD (AX), X0
	SHA256RNDS2 X0, X1, X2
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	VMOVDQU 16(SI), X0
	PSHUFB X8, X0
	VMOVDQA X0, X4
	PADDD 32(AX), X0
	SHA256RNDS2 X0, X1, X2
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X4, X3
	VMOVDQU 32(SI), X0
	PSHUFB X8, X0
	VMOVDQA X0, X5
	PADDD 64(AX), X0
	SHA256RNDS2 X0, X1, X2
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X5, X4
	VMOVDQU 48(SI), X0
	PSHUFB X8, X0
	VMOVDQA X0, X6
	PADDD 96(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X6, X7
	PALIGNR $0x04, X5, X7
	PADDD X7, X3
	SHA256MSG2 X6, X3
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X6, X5
	VMOVDQA X3, X0
	PADDD 128(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X3, X7
	PALIGNR $0x04, X6, X7
	PADDD X7, X4
	SHA256MSG2 X3, X4
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X3, X6
	VMOVDQA X4, X0
	PADDD 160(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X4, X7
	PALIGNR $0x04, X3, X7
	PADDD X7, X5
	SHA256MSG2 X4, X5
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X4, X3
	VMOVDQA X5, X0
	PADDD 192(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X5, X7
	PALIGNR $0x04, X4, X7
	PADDD X7, X6
	SHA256MSG2 X5, X6
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X5, X4
	VMOVDQA X6, X0
	PADDD 224(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X6, X7
	PALIGNR $0x04, X5, X7
	PADDD X7, X3
	SHA256MSG2 X6, X3
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X6, X5
	VMOVDQA X3, X0
	PADDD 256(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X3, X7
	PALIGNR $0x04, X6, X7
	PADDD X7, X4
	SHA256MSG2 X3, X4
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X3, X6
	VMOVDQA X4, X0
	PADDD 288(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X4, X7
	PALIGNR $0x04, X3, X7
	PADDD X7, X5
	SHA256MSG2 X4, X5
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X4, X3
	VMOVDQA X5, X0
	PADDD 320(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X5, X7
	PALIGNR $0x04, X4, X7
	PADDD X7, X6
	SHA256MSG2 X5, X6
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X5, X4
	VMOVDQA X6, X0
	PADDD 352(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X6, X7
	PALIGNR $0x04, X5, X7
	PADDD X7, X3
	SHA256MSG2 X6, X3
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X6, X5
	VMOVDQA X3, X0
	PADDD 384(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X3, X7
	PALIGNR $0x04, X6, X7
	PADDD X7, X4
	SHA256MSG2 X3, X4
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1 X3, X6
	VMOVDQA X4, X0
	PADDD 416(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X4, X7
	PALIGNR $0x04, X3, X7
	PADDD X7, X5
	SHA256MSG2 X4, X5
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	VMOVDQA X5, X0
	PADDD 448(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA X5, X7
	PALIGNR $0x04, X4, X7
	PADDD X7, X6
	SHA256MSG2 X5, X6
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1

	// do rounds 60-63
	VMOVDQA X6, X0
	PADDD 480(AX), X0
	SHA256RNDS2 X0, X1, X2
	PSHUFD $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1

	// add the previously saved hash values back into the current state (feed-forward)
	PADDD X9, X1
	PADDD X10, X2

	// advance data pointer; loop until buffer empty
	ADDQ $0x40, SI
	CMPQ DX, SI
	JNE roundLoop

	// write hash values back in the correct order
	PSHUFD $0x1b, X1, X1
	PSHUFD $0xb1, X2, X2
	VMOVDQA X1, X7
	PBLENDW $0xf0, X2, X1
	PALIGNR $0x08, X7, X2
	VMOVDQU X1, (DI)
	VMOVDQU X2, 16(DI)

done:
	RET