1
0
mirror of https://github.com/fumiama/blake2b-simd.git synced 2026-06-12 22:40:54 +08:00

Restyled AVX and SSE version using asm2plan9s and asmfmt (#20)

This commit is contained in:
Frank
2016-07-08 16:47:17 +02:00
committed by Harshavardhana
parent 25efc542f2
commit 959f72aaa5
2 changed files with 1326 additions and 1548 deletions

View File

@@ -44,43 +44,43 @@
#define G1 \ #define G1 \
\ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */ LONG $0xd479c1c4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */ LONG $0xd471c1c4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */ LONG $0xf670f9c5; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xff; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */ LONG $0xff70f9c5; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
BYTE $0xc4; BYTE $0xc2; BYTE $0x69; BYTE $0x00; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */ LONG $0x0069c2c4; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
BYTE $0xc4; BYTE $0xc2; BYTE $0x61; BYTE $0x00; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */ LONG $0x0061c2c4; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
#define G2 \ #define G2 \
\ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */ LONG $0xd479c1c4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */ LONG $0xd471c1c4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */ LONG $0xf670fbc5; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */ LONG $0xf670fac5; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */ LONG $0xff70fbc5; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */ LONG $0xff70fac5; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
BYTE $0xc5; BYTE $0x69; BYTE $0xd4; BYTE $0xfa \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */ LONG $0xfad469c5 \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */
BYTE $0xc5; BYTE $0xe9; BYTE $0x73; BYTE $0xd2; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */ LONG $0xd273e9c5; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */
BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0xef; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ LONG $0xef69c1c4; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
BYTE $0xc5; BYTE $0x61; BYTE $0xd4; BYTE $0xfb \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */ LONG $0xfbd461c5 \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */
BYTE $0xc5; BYTE $0xe1; BYTE $0x73; BYTE $0xd3; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */ LONG $0xd373e1c5; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0xef; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */ LONG $0xef61c1c4; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
#define DIAGONALIZE \ #define DIAGONALIZE \
\ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
@@ -89,14 +89,14 @@
MOVOU X4, X6 \ /* row4l = row3l;\ */ MOVOU X4, X6 \ /* row4l = row3l;\ */
MOVOU X5, X4 \ /* row3l = row3h;\ */ MOVOU X5, X4 \ /* row3l = row3h;\ */
MOVOU X6, X5 \ /* row3h = row4l;\ */ MOVOU X6, X5 \ /* row3h = row4l;\ */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */ LONG $0x6c1141c4; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */ LONG $0x6d41c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */ LONG $0x6d11c1c4; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */ LONG $0x6d69c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */ LONG $0x6d61c1c4; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
#define UNDIAGONALIZE \ #define UNDIAGONALIZE \
\ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
@@ -105,14 +105,14 @@
MOVOU X13, X5 \ /* row3h = t0;\ */ MOVOU X13, X5 \ /* row3h = t0;\ */
MOVOU X2, X13 \ /* t0 = row2l;\ */ MOVOU X2, X13 \ /* t0 = row2l;\ */
MOVOU X6, X14 \ /* t1 = row4l;\ */ MOVOU X6, X14 \ /* t1 = row4l;\ */
BYTE $0xc5; BYTE $0x69; BYTE $0x6c; BYTE $0xfa \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */ LONG $0xfa6c69c5 \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */ LONG $0x6d61c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */ LONG $0x6d11c1c4; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
BYTE $0xc4; BYTE $0xc1; BYTE $0x49; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */ LONG $0x6d49c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */ LONG $0x6d41c1c4; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
#define LOAD_SHUFFLE \ #define LOAD_SHUFFLE \
\ // Load shuffle value \ // Load shuffle value
@@ -154,14 +154,14 @@ TEXT ·blockAVXLoop(SB), 7, $0
loop: loop:
// Increment counter // Increment counter
MOVQ t+72(FP), SI // SI: &t MOVQ t+72(FP), SI // SI: &t
MOVQ 0(SI), R9 // MOVQ 0(SI), R9
ADDQ $128, R9 // /* d.t[0] += BlockSize */ ADDQ $128, R9 // /* d.t[0] += BlockSize */
MOVQ R9, 0(SI) // MOVQ R9, 0(SI)
CMPQ R9, $128 // /* if d.t[0] < BlockSize { */ CMPQ R9, $128 // /* if d.t[0] < BlockSize { */
JGE noincr // JGE noincr
MOVQ 8(SI), R9 // MOVQ 8(SI), R9
ADDQ $1, R9 // /* d.t[1]++ */ ADDQ $1, R9 // /* d.t[1]++ */
MOVQ R9, 8(SI) // MOVQ R9, 8(SI)
noincr: // /* } */ noincr: // /* } */
// Load initialization vector // Load initialization vector
@@ -181,582 +181,472 @@ noincr: // /* } */
// R O U N D 1 // R O U N D 1
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+m[1] MOVOU 0(DX), X12 // X12 = m[0]+m[1]
MOVOU 16(DX), X13 // X13 = m[2]+m[3] MOVOU 16(DX), X13 // X13 = m[2]+m[3]
MOVOU 32(DX), X14 // X14 = m[4]+m[5] MOVOU 32(DX), X14 // X14 = m[4]+m[5]
MOVOU 48(DX), X15 // X15 = m[6]+m[7] MOVOU 48(DX), X15 // X15 = m[6]+m[7]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */ LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */ LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */ LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */ LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 2 // R O U N D 2
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 112(DX), X12 // X12 = m[14]+m[15] MOVOU 112(DX), X12 // X12 = m[14]+m[15]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 96(DX), X15 // X15 = m[12]+m[13] MOVOU 96(DX), X15 // X15 = m[12]+m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */ LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */ LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 48(DX), X15 // X15 = m[6]+ m[7] MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */ LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */ LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */ LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
BYTE $0x08 LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */ LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */ LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 3 // R O U N D 3
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 32(DX), X12 // X12 = m[4]+ m[5] MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xc5 // VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */ LONG $0x0f0943c4; WORD $0x08c5 // VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */
BYTE $0x08 LONG $0x6d1941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 64(DX), X15 // X15 = m[8]+ m[9] MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */ LONG $0x6c0141c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */ LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */ LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */ LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */ LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */ LONG $0x6d1141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X14 // X14 = m[4]+ m[5] MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */ LONG $0x6c0141c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */ LONG $0x0f0943c4; WORD $0x08dc // VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */
BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 4 // R O U N D 4
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 96(DX), X15 // X15 = m[12]+m[13] MOVOU 96(DX), X15 // X15 = m[12]+m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */ LONG $0x6d1141c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */ LONG $0x6d0141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
MOVOU 112(DX), X14 // X14 = m[14]+m[15] MOVOU 112(DX), X14 // X14 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */ LONG $0x6d1141c4; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */ LONG $0x6c0141c4; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */ LONG $0x6d1141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */ LONG $0x6c1941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */ LONG $0x6d0141c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */ LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 64(DX), X15 // X15 = m[8]+ m[9] MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */ LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */ LONG $0x6c1941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 5 // R O U N D 5
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */ LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */ LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */ LONG $0x6d0941c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */ LONG $0x6c1941c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */ LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */ LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */ LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */ LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */ LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */ LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xd4 // VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */ LONG $0x0f0943c4; WORD $0x08d4 // VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */
BYTE $0x08 LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */ LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 6 // R O U N D 6
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
MOVOU 64(DX), X15 // X15 = m[8]+ m[9] MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */ LONG $0x6c1141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */ LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */
MOVOU 80(DX), X12 // X12 = m[10]+m[11] MOVOU 80(DX), X12 // X12 = m[10]+m[11]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */ LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */ LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */ LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */ LONG $0x6c1141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */ LONG $0x6d0141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */ LONG $0x6d0941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */ LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */ LONG $0x6c0141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 7 // R O U N D 7
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */ LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */ LONG $0x6c0941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */ LONG $0x6c0141c4; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */
MOVOU 80(DX), X12 // X12 = m[10]+m[11] MOVOU 80(DX), X12 // X12 = m[10]+m[11]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */ LONG $0x6d1141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xde // VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */ LONG $0x0f1943c4; WORD $0x08de // VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */
BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */ LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xce // VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */ LONG $0x0f0943c4; WORD $0x08ce // VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */
BYTE $0x08
MOVOU 16(DX), X14 // X14 = m[2]+ m[3] MOVOU 16(DX), X14 // X14 = m[2]+ m[3]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */ LONG $0x6d1141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */ LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */ LONG $0x6c0941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 8 // R O U N D 8
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */ LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */ LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */ LONG $0x6c0941c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd6 // VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */ LONG $0x0f0143c4; WORD $0x08d6 // VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */
BYTE $0x08 LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */ LONG $0x6d1141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */ LONG $0x6c0941c4; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */ LONG $0x6c1941c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */ LONG $0x6c0941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 9 // R O U N D 9
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */ LONG $0x6c1141c4; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xce // VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */ LONG $0x0f1943c4; WORD $0x08ce // VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */
BYTE $0x08
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */ LONG $0x6d0141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdd // VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */ LONG $0x0f0943c4; WORD $0x08dd // VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */
BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 96(DX), X15 // X15 = m[12]+m[13] MOVOU 96(DX), X15 // X15 = m[12]+m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */ LONG $0x6d0141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */ LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xcc // VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */ LONG $0x0f0943c4; WORD $0x08cc // VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */
BYTE $0x08
MOVOU 32(DX), X12 // X12 = m[4]+ m[5] MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
MOVOU 48(DX), X15 // X15 = m[6]+ m[7] MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */ LONG $0x6d0141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */ LONG $0x6c1141c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */ LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */ LONG $0x6c1941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 1 0 // R O U N D 1 0
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */ LONG $0x6c0141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */ LONG $0x6d1141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 32(DX), X14 // X14 = m[4]+ m[5] MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */ LONG $0x6c1941c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */ LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */ LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */ LONG $0x6d0141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */ LONG $0x6d1941c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd5 // VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */ LONG $0x0f0143c4; WORD $0x08d5 // VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */
BYTE $0x08 LONG $0x6c0941c4; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 1 1 // R O U N D 1 1
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+m[1] MOVOU 0(DX), X12 // X12 = m[0]+m[1]
MOVOU 16(DX), X13 // X13 = m[2]+m[3] MOVOU 16(DX), X13 // X13 = m[2]+m[3]
MOVOU 32(DX), X14 // X14 = m[4]+m[5] MOVOU 32(DX), X14 // X14 = m[4]+m[5]
MOVOU 48(DX), X15 // X15 = m[6]+m[7] MOVOU 48(DX), X15 // X15 = m[6]+m[7]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */ LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */ LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */ LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */ LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 1 2 // R O U N D 1 2
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 112(DX), X12 // X12 = m[14]+m[15] MOVOU 112(DX), X12 // X12 = m[14]+m[15]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 96(DX), X15 // X15 = m[12]+m[13] MOVOU 96(DX), X15 // X15 = m[12]+m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */ LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */ LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 48(DX), X15 // X15 = m[6]+ m[7] MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */ LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */ LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */ LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
BYTE $0x08 LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */ LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */ LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
// Reload digest (most current value store in &out) // Reload digest (most current value store in &out)

View File

@@ -44,48 +44,45 @@
#define G1 \ #define G1 \
\ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc0 \ // PADDQ XMM0,XMM8 /* v0 += m[0], v1 += m[2] */ LONG $0xd40f4166; BYTE $0xc0 \ // PADDQ XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc9 \ // PADDQ XMM1,XMM9 /* v2 += m[4], v3 += m[6] */ LONG $0xd40f4166; BYTE $0xc9 \ // PADDQ XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */ LONG $0xc2d40f66 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */ LONG $0xcbd40f66 \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf0 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ LONG $0xf0ef0f66 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf9 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ LONG $0xf9ef0f66 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 \ // PSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */ LONG $0xf6700f66; BYTE $0xb1 \ // PSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0xb1 \ // PSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */ LONG $0xff700f66; BYTE $0xb1 \ // PSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xe6 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */ LONG $0xe6d40f66 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xef \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */ LONG $0xefd40f66 \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xd4 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ LONG $0xd4ef0f66 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xdd \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ LONG $0xddef0f66 \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x38; BYTE $0x00 \ // PSHUFB XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */ LONG $0x380f4166; WORD $0xd400 \ // PSHUFB XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
BYTE $0xd4 \ LONG $0x380f4166; WORD $0xdc00 // PSHUFB XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x38; BYTE $0x00 \ // PSHUFB XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
BYTE $0xdc \
// DO NOT DELETE -- macro delimiter (previous line extended)
#define G2 \ #define G2 \
\ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM10 /* v0 += m[1], v1 += m[3] */ LONG $0xd40f4166; BYTE $0xc2 \ // PADDQ XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM11 /* v2 += m[5], v3 += m[7] */ LONG $0xd40f4166; BYTE $0xcb \ // PADDQ XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */ LONG $0xc2d40f66 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */ LONG $0xcbd40f66 \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf0 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ LONG $0xf0ef0f66 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf9 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ LONG $0xf9ef0f66 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
BYTE $0xf2; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // PSHUFLW XMM6,XMM6,0x39 /* combined with next ... */ LONG $0xf6700ff2; BYTE $0x39 \ // PSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
BYTE $0xf3; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // PSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */ LONG $0xf6700ff3; BYTE $0x39 \ // PSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
BYTE $0xf2; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // PSHUFLW XMM7,XMM7,0x39 /* combined with next ... */ LONG $0xff700ff2; BYTE $0x39 \ // PSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
BYTE $0xf3; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // PSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */ LONG $0xff700ff3; BYTE $0x39 \ // PSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xe6 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */ LONG $0xe6d40f66 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xef \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */ LONG $0xefd40f66 \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xd4 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ LONG $0xd4ef0f66 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xdd \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ LONG $0xddef0f66 \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
MOVOU X2, X15 \ MOVOU X2, X15 \
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0xd4; BYTE $0xfa \ // PADDQ XMM15,XMM2 /* temp reg = reg*2 */ LONG $0xd40f4466; BYTE $0xfa \ // PADDQ XMM15,XMM2 /* temp reg = reg*2 */
BYTE $0x66; BYTE $0x0f; BYTE $0x73; BYTE $0xd2; BYTE $0x3f \ // PSRLQ XMM2,0x3f /* reg = reg>>63 */ LONG $0xd2730f66; BYTE $0x3f \ // PSRLQ XMM2,0x3f /* reg = reg>>63 */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xef; BYTE $0xd7 \ // PXOR XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ LONG $0xef0f4166; BYTE $0xd7 \ // PXOR XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
MOVOU X3, X15 \ MOVOU X3, X15 \
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0xd4; BYTE $0xfb \ // PADDQ XMM15,XMM3 /* temp reg = reg*2 */ LONG $0xd40f4466; BYTE $0xfb \ // PADDQ XMM15,XMM3 /* temp reg = reg*2 */
BYTE $0x66; BYTE $0x0f; BYTE $0x73; BYTE $0xd3; BYTE $0x3f \ // PSRLQ XMM3,0x3f /* reg = reg>>63 */ LONG $0xd3730f66; BYTE $0x3f \ // PSRLQ XMM3,0x3f /* reg = reg>>63 */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xef; BYTE $0xdf // PXOR XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */ LONG $0xef0f4166; BYTE $0xdf // PXOR XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
#define DIAGONALIZE \ #define DIAGONALIZE \
\ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
@@ -94,16 +91,16 @@
MOVOU X4, X6 \ /* row4l = row3l;\ */ MOVOU X4, X6 \ /* row4l = row3l;\ */
MOVOU X5, X4 \ /* row3l = row3h;\ */ MOVOU X5, X4 \ /* row3l = row3h;\ */
MOVOU X6, X5 \ /* row3h = row4l;\ */ MOVOU X6, X5 \ /* row3h = row4l;\ */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfd \ // PUNPCKLQDQ XMM15, XMM13 /* _mm_unpacklo_epi64(t0, t0) */ LONG $0x6c0f4566; BYTE $0xfd \ // PUNPCKLQDQ XMM15, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
MOVOU X7, X6 \ MOVOU X7, X6 \
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */ LONG $0x6d0f4166; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ LONG $0x6c0f4466; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
MOVOU X13, X7 \ MOVOU X13, X7 \
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xff \ // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */ LONG $0x6d0f4166; BYTE $0xff \ // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ LONG $0x6c0f4466; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */ LONG $0x6d0f4166; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ LONG $0x6c0f4566; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */ LONG $0x6d0f4166; BYTE $0xdf // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
#define UNDIAGONALIZE \ #define UNDIAGONALIZE \
\ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
@@ -112,16 +109,16 @@
MOVOU X13, X5 \ /* row3h = t0;\ */ MOVOU X13, X5 \ /* row3h = t0;\ */
MOVOU X2, X13 \ /* t0 = row2l;\ */ MOVOU X2, X13 \ /* t0 = row2l;\ */
MOVOU X6, X14 \ /* t1 = row4l;\ */ MOVOU X6, X14 \ /* t1 = row4l;\ */
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfa \ // PUNPCKLQDQ XMM15, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */ LONG $0x6c0f4466; BYTE $0xfa \ // PUNPCKLQDQ XMM15, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
MOVOU X3, X2 \ MOVOU X3, X2 \
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */ LONG $0x6d0f4166; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ LONG $0x6c0f4466; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
MOVOU X13, X3 \ MOVOU X13, X3 \
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf \ // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */ LONG $0x6d0f4166; BYTE $0xdf \ // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ LONG $0x6c0f4466; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */ LONG $0x6d0f4166; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ LONG $0x6c0f4566; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */ LONG $0x6d0f4166; BYTE $0xff // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
#define LOAD_SHUFFLE \ #define LOAD_SHUFFLE \
\ // Load shuffle value \ // Load shuffle value
@@ -163,14 +160,15 @@ TEXT ·blockSSELoop(SB), 7, $0
loop: loop:
// Increment counter // Increment counter
MOVQ t+72(FP), SI // SI: &t MOVQ t+72(FP), SI // SI: &t
MOVQ 0(SI), R9 // MOVQ 0(SI), R9
ADDQ $128, R9 // /* d.t[0] += BlockSize */ ADDQ $128, R9 // /* d.t[0] += BlockSize */
MOVQ R9, 0(SI) // MOVQ R9, 0(SI)
CMPQ R9, $128 // /* if d.t[0] < BlockSize { */ CMPQ R9, $128 // /* if d.t[0] < BlockSize { */
JGE noincr // JGE noincr
MOVQ 8(SI), R9 // MOVQ 8(SI), R9
ADDQ $1, R9 // /* d.t[1]++ */ ADDQ $1, R9 // /* d.t[1]++ */
MOVQ R9, 8(SI) // MOVQ R9, 8(SI)
noincr: // /* } */ noincr: // /* } */
// Load initialization vector // Load initialization vector
@@ -190,663 +188,553 @@ noincr: // /* } */
// R O U N D 1 // R O U N D 1
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+m[1] MOVOU 0(DX), X12 // X12 = m[0]+m[1]
MOVOU 16(DX), X13 // X13 = m[2]+m[3] MOVOU 16(DX), X13 // X13 = m[2]+m[3]
MOVOU 32(DX), X14 // X14 = m[4]+m[5] MOVOU 32(DX), X14 // X14 = m[4]+m[5]
MOVOU 48(DX), X15 // X15 = m[6]+m[7] MOVOU 48(DX), X15 // X15 = m[6]+m[7]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */ LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */ LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */
MOVOU X12, X10 MOVOU X12, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */ LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */
MOVOU X14, X11 MOVOU X14, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */ LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */ LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */ LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */
MOVOU X12, X10 MOVOU X12, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */ LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */
MOVOU X14, X11 MOVOU X14, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */ LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 2 // R O U N D 2
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 112(DX), X12 // X12 = m[14]+m[15] MOVOU 112(DX), X12 // X12 = m[14]+m[15]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 96(DX), X15 // X15 = m[12]+m[13] MOVOU 96(DX), X15 // X15 = m[12]+m[13]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */ LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */ LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */
MOVOU 80(DX), X10 // X10 = m[10]+m[11] MOVOU 80(DX), X10 // X10 = m[10]+m[11]
MOVOU 48(DX), X11 // X11 = m[6]+ m[7] MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */ LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ; LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ;
BYTE $0xdc; BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */ LONG $0x3a0f4566; WORD $0xc40f; BYTE $0x08 // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */
BYTE $0xc4; BYTE $0x08
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */ LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X11 // X11 = m[6]+ m[7] MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
MOVOU 96(DX), X10 // X10 = m[12]+m[13] MOVOU 96(DX), X10 // X10 = m[12]+m[13]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */ LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */ LONG $0x6d0f4566; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 3 // R O U N D 3
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 32(DX), X12 // X12 = m[4]+ m[5] MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X14, X8 MOVOU X14, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM13, 0x8 /* m[11], m[12] */ LONG $0x3a0f4566; WORD $0xc50f; BYTE $0x08 // PALIGNR XMM8, XMM13, 0x8 /* m[11], m[12] */
BYTE $0xc5; BYTE $0x08
MOVOU X12, X9 MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[5], m[15] */ LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[5], m[15] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 64(DX), X10 // X10 = m[8]+ m[9] MOVOU 64(DX), X10 // X10 = m[8]+ m[9]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[8], m[0] */ LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[8], m[0] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */ LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */
MOVOU X13, X11 MOVOU X13, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[2], ___ */ LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[2], ___ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
MOVOU X12, X9 MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[3] */ LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[3] */
MOVOU X15, X8 MOVOU X15, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[10], ___ */ LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[10], ___ */
MOVOU X13, X9 MOVOU X13, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[7], m[9] */ LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[7], m[9] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X11 // X11 = m[4]+ m[5] MOVOU 32(DX), X11 // X11 = m[4]+ m[5]
MOVOU 112(DX), X10 // X10 = m[14]+m[15] MOVOU 112(DX), X10 // X10 = m[14]+m[15]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[14], m[6] */ LONG $0x6c0f4566; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[14], m[6] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[1], m[4] */ LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[1], m[4] */
BYTE $0xdc; BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 4 // R O U N D 4
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 96(DX), X15 // X15 = m[12]+m[13] MOVOU 96(DX), X15 // X15 = m[12]+m[13]
MOVOU X13, X8 MOVOU X13, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc4 // PUNPCKHQDQ XMM8, XMM12 /* m[7], m[3] */ LONG $0x6d0f4566; BYTE $0xc4 // PUNPCKHQDQ XMM8, XMM12 /* m[7], m[3] */
MOVOU X15, X9 MOVOU X15, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[13], m[11] */ LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[13], m[11] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 64(DX), X10 // X10 = m[8]+ m[9] MOVOU 64(DX), X10 // X10 = m[8]+ m[9]
MOVOU 112(DX), X14 // X14 = m[14]+m[15] MOVOU 112(DX), X14 // X14 = m[14]+m[15]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* m[9], m[1] */ LONG $0x6d0f4566; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* m[9], m[1] */
MOVOU X15, X11 MOVOU X15, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[12], m[14] */ LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[12], m[14] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X13, X9 MOVOU X13, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* ___, m[5] */ LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* ___, m[5] */
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[2], ____ */ LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[2], ____ */
MOVOU X15, X10 MOVOU X15, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* ___, m[15] */ LONG $0x6d0f4566; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* ___, m[15] */
MOVOU X13, X9 MOVOU X13, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[4], ____ */ LONG $0x6c0f4566; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[4], ____ */
MOVOU 0(DX), X11 // X11 = m[0]+ m[1] MOVOU 0(DX), X11 // X11 = m[0]+ m[1]
MOVOU 48(DX), X10 // X10 = m[6]+ m[7] MOVOU 48(DX), X10 // X10 = m[6]+ m[7]
MOVOU 64(DX), X15 // X15 = m[8]+ m[9] MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[6], m[10] */ LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[6], m[10] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[0], m[8] */ LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[0], m[8] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 5 // R O U N D 5
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
MOVOU X14, X8 MOVOU X14, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[9], m[5] */ LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[9], m[5] */
MOVOU X12, X9 MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[2], m[10] */ LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[2], m[10] */
MOVOU 0(DX), X10 // X10 = m[0]+ m[1] MOVOU 0(DX), X10 // X10 = m[0]+ m[1]
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[7] */ LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[7] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[0], ____ */ LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[0], ____ */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[15] */ LONG $0x6d0f4566; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[15] */
MOVOU X13, X11 MOVOU X13, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */ LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[11] */ LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[11] */
MOVOU X15, X8 MOVOU X15, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[14], ____ */ LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[14], ____ */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[3] */ LONG $0x6d0f4566; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[3] */
MOVOU X13, X9 MOVOU X13, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[6], ____ */ LONG $0x6c0f4566; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[6], ____ */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 64(DX), X11 // X11 = m[8]+ m[9] MOVOU 64(DX), X11 // X11 = m[8]+ m[9]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU X14, X10 MOVOU X14, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM12, 0x8 /* m[1], m[12] */ LONG $0x3a0f4566; WORD $0xd40f; BYTE $0x08 // PALIGNR XMM10, XMM12, 0x8 /* m[1], m[12] */
BYTE $0xd4; BYTE $0x08 LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */ LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[8], ____ */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[8], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 6 // R O U N D 6
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
MOVOU 64(DX), X15 // X15 = m[8]+ m[9] MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
MOVOU X13, X8 MOVOU X13, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[2], m[6] */ LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[2], m[6] */
MOVOU X12, X9 MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[0], m[8] */ LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[0], m[8] */
MOVOU 80(DX), X12 // X12 = m[10]+m[11] MOVOU 80(DX), X12 // X12 = m[10]+m[11]
MOVOU 96(DX), X10 // X10 = m[12]+m[13] MOVOU 96(DX), X10 // X10 = m[12]+m[13]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[10] */ LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[10] */
MOVOU X12, X11 MOVOU X12, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[11], m[3] */ LONG $0x6d0f4566; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[11], m[3] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* ___, m[7] */ LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* ___, m[7] */
MOVOU X13, X8 MOVOU X13, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[4], ____ */ LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[4], ____ */
MOVOU X15, X9 MOVOU X15, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[15], m[1] */ LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[15], m[1] */
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
MOVOU 96(DX), X10 // X10 = m[12]+m[13] MOVOU 96(DX), X10 // X10 = m[12]+m[13]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[13], m[5] */ LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[13], m[5] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[9] */ LONG $0x6d0f4566; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[9] */
MOVOU X15, X11 MOVOU X15, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[14], ____ */ LONG $0x6c0f4566; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[14], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 7 // R O U N D 7
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X12, X9 MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[1] */ LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[1] */
MOVOU X14, X8 MOVOU X14, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */ LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */
MOVOU X15, X9 MOVOU X15, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcd // PUNPCKLQDQ XMM9, XMM13 /* m[14], m[4] */ LONG $0x6c0f4566; BYTE $0xcd // PUNPCKLQDQ XMM9, XMM13 /* m[14], m[4] */
MOVOU 80(DX), X11 // X11 = m[10]+m[11] MOVOU 80(DX), X11 // X11 = m[10]+m[11]
MOVOU X13, X10 MOVOU X13, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* m[5], m[15] */ LONG $0x6d0f4566; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* m[5], m[15] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM14, 0x8 /* m[13], m[10] */ LONG $0x3a0f4566; WORD $0xde0f; BYTE $0x08 // PALIGNR XMM11, XMM14, 0x8 /* m[13], m[10] */
BYTE $0xde; BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[6] */ LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[6] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM14, 0x8 /* m[9], m[8] */ LONG $0x3a0f4566; WORD $0xce0f; BYTE $0x08 // PALIGNR XMM9, XMM14, 0x8 /* m[9], m[8] */
BYTE $0xce; BYTE $0x08
MOVOU 16(DX), X11 // X14 = m[2]+ m[3] MOVOU 16(DX), X11 // X14 = m[2]+ m[3]
MOVOU X13, X10 MOVOU X13, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[7], m[3] */ LONG $0x6d0f4566; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[7], m[3] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[11] */ LONG $0x6d0f4566; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[11] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[2], ____ */ LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[2], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 8 // R O U N D 8
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X14, X8 MOVOU X14, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[13], m[7] */ LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[13], m[7] */
MOVOU X12, X10 MOVOU X12, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* ___, m[3] */ LONG $0x6d0f4566; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* ___, m[3] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[12], ____ */ LONG $0x6c0f4566; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[12], ____ */
MOVOU 0(DX), X11 // X11 = m[0]+ m[1] MOVOU 0(DX), X11 // X11 = m[0]+ m[1]
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU X15, X10 MOVOU X15, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM14, 0x8 /* m[11], m[14] */ LONG $0x3a0f4566; WORD $0xd60f; BYTE $0x08 // PALIGNR XMM10, XMM14, 0x8 /* m[11], m[14] */
BYTE $0xd6; BYTE $0x08 LONG $0x6d0f4566; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[1], m[9] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[1], m[9] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X13, X8 MOVOU X13, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc7 // PUNPCKHQDQ XMM8, XMM15 /* m[5], m[15] */ LONG $0x6d0f4566; BYTE $0xc7 // PUNPCKHQDQ XMM8, XMM15 /* m[5], m[15] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[8], m[2] */ LONG $0x6c0f4566; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[8], m[2] */
MOVOU 0(DX), X10 // X10 = m[0]+ m[1] MOVOU 0(DX), X10 // X10 = m[0]+ m[1]
MOVOU 48(DX), X11 // X11 = m[6]+ m[7] MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[0], m[4] */ LONG $0x6c0f4566; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[0], m[4] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], m[10] */ LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], m[10] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 9 // R O U N D 9
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X13, X8 MOVOU X13, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc7 // PUNPCKLQDQ XMM8, XMM15 /* m[6], m[14] */ LONG $0x6c0f4566; BYTE $0xc7 // PUNPCKLQDQ XMM8, XMM15 /* m[6], m[14] */
MOVOU X12, X9 MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM14, 0x8 /* m[11], m[0] */ LONG $0x3a0f4566; WORD $0xce0f; BYTE $0x08 // PALIGNR XMM9, XMM14, 0x8 /* m[11], m[0] */
BYTE $0xce; BYTE $0x08
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 64(DX), X11 // X11 = m[8]+ m[9] MOVOU 64(DX), X11 // X11 = m[8]+ m[9]
MOVOU X15, X10 MOVOU X15, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[15], m[9] */ LONG $0x6d0f4566; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[15], m[9] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM13, 0x8 /* m[3], m[8] */ LONG $0x3a0f4566; WORD $0xdd0f; BYTE $0x08 // PALIGNR XMM11, XMM13, 0x8 /* m[3], m[8] */
BYTE $0xdd; BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 96(DX), X15 // X15 = m[12]+m[13] MOVOU 96(DX), X15 // X15 = m[12]+m[13]
MOVOU X15, X9 MOVOU X15, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* ___, m[13] */ LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* ___, m[13] */
MOVOU X15, X8 MOVOU X15, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */ LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM12, 0x8 /* m[1], m[10] */ LONG $0x3a0f4566; WORD $0xcc0f; BYTE $0x08 // PALIGNR XMM9, XMM12, 0x8 /* m[1], m[10] */
BYTE $0xcc; BYTE $0x08
MOVOU 32(DX), X12 // X12 = m[4]+ m[5] MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
MOVOU 48(DX), X15 // X15 = m[6]+ m[7] MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
MOVOU X15, X11 MOVOU X15, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* ___, m[7] */ LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* ___, m[7] */
MOVOU X13, X10 MOVOU X13, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd3 // PUNPCKLQDQ XMM10, XMM11 /* m[2], ____ */ LONG $0x6c0f4566; BYTE $0xd3 // PUNPCKLQDQ XMM10, XMM11 /* m[2], ____ */
MOVOU X12, X15 MOVOU X12, X15
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xfc // PUNPCKHQDQ XMM15, XMM12 /* ___, m[5] */ LONG $0x6d0f4566; BYTE $0xfc // PUNPCKHQDQ XMM15, XMM12 /* ___, m[5] */
MOVOU X12, X11 MOVOU X12, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */ LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 1 0 // R O U N D 1 0
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11] MOVOU 80(DX), X15 // X15 = m[10]+m[11]
MOVOU X15, X8 MOVOU X15, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[10], m[8] */ LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[10], m[8] */
MOVOU X13, X9 MOVOU X13, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[7], m[1] */ LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[7], m[1] */
MOVOU 16(DX), X10 // X10 = m[2]+ m[3] MOVOU 16(DX), X10 // X10 = m[2]+ m[3]
MOVOU 32(DX), X14 // X14 = m[4]+ m[5] MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[2], m[4] */ LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[2], m[4] */
MOVOU X14, X15 MOVOU X14, X15
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xfe // PUNPCKHQDQ XMM15, XMM14 /* ___, m[5] */ LONG $0x6d0f4566; BYTE $0xfe // PUNPCKHQDQ XMM15, XMM14 /* ___, m[5] */
MOVOU X13, X11 MOVOU X13, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], ____ */ LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], ____ */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X15, X8 MOVOU X15, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[15], m[9] */ LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[15], m[9] */
MOVOU X12, X9 MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[3], m[13] */ LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[3], m[13] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU X15, X10 MOVOU X15, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM13, 0x8 /* m[11], m[14] */ LONG $0x3a0f4566; WORD $0xd50f; BYTE $0x08 // PALIGNR XMM10, XMM13, 0x8 /* m[11], m[14] */
BYTE $0xd5; BYTE $0x08
MOVOU X14, X11 MOVOU X14, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[12], m[0] */ LONG $0x6c0f4566; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[12], m[0] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 1 1 // R O U N D 1 1
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+m[1] MOVOU 0(DX), X12 // X12 = m[0]+m[1]
MOVOU 16(DX), X13 // X13 = m[2]+m[3] MOVOU 16(DX), X13 // X13 = m[2]+m[3]
MOVOU 32(DX), X14 // X14 = m[4]+m[5] MOVOU 32(DX), X14 // X14 = m[4]+m[5]
MOVOU 48(DX), X15 // X15 = m[6]+m[7] MOVOU 48(DX), X15 // X15 = m[6]+m[7]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */ LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */ LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */
MOVOU X12, X10 MOVOU X12, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */ LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */
MOVOU X14, X11 MOVOU X14, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */ LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 96(DX), X14 // X14 = m[12]+m[13] MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15] MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */ LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */ LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */
MOVOU X12, X10 MOVOU X12, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */ LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */
MOVOU X14, X11 MOVOU X14, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */ LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// R O U N D 1 2 // R O U N D 1 2
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
MOVOU 112(DX), X12 // X12 = m[14]+m[15] MOVOU 112(DX), X12 // X12 = m[14]+m[15]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 96(DX), X15 // X15 = m[12]+m[13] MOVOU 96(DX), X15 // X15 = m[12]+m[13]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */ LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */ LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */
MOVOU 80(DX), X10 // X10 = m[10]+m[11] MOVOU 80(DX), X10 // X10 = m[10]+m[11]
MOVOU 48(DX), X11 // X11 = m[6]+ m[7] MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */ LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ; LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ;
BYTE $0xdc; BYTE $0x08
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
DIAGONALIZE DIAGONALIZE
// LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 80(DX), X14 // X14 = m[10]+m[11] MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU X12, X8 MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */ LONG $0x3a0f4566; WORD $0xc40f; BYTE $0x08 // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */
BYTE $0xc4; BYTE $0x08
MOVOU X14, X9 MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */ LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X11 // X11 = m[6]+ m[7] MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
MOVOU 96(DX), X10 // X10 = m[12]+m[13] MOVOU 96(DX), X10 // X10 = m[12]+m[13]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */ LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */ LONG $0x6d0f4566; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */
LOAD_SHUFFLE LOAD_SHUFFLE
G1 G1
G2 G2
UNDIAGONALIZE UNDIAGONALIZE
// Reload digest (most current value store in &out) // Reload digest (most current value store in &out)