diff --git a/compressAvx2_amd64.go b/compressAvx2_amd64.go new file mode 100644 index 0000000..1b3ebae --- /dev/null +++ b/compressAvx2_amd64.go @@ -0,0 +1,46 @@ +//+build !noasm +//+build !appengine + +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package blake2b + +//go:noescape +func compressAVX2Loop(p []uint8, in, iv, t, f, shffle, out []uint64) + +func compressAVX2(d *digest, p []uint8) { + + in := make([]uint64, 8, 8) + out := make([]uint64, 8, 8) + + shffle := make([]uint64, 8, 8) + // vector for PSHUFB instruction + shffle[0] = 0x0201000706050403 + shffle[1] = 0x0a09080f0e0d0c0b + shffle[2] = 0x0201000706050403 + shffle[3] = 0x0a09080f0e0d0c0b + shffle[4] = 0x0100070605040302 + shffle[5] = 0x09080f0e0d0c0b0a + shffle[6] = 0x0100070605040302 + shffle[7] = 0x09080f0e0d0c0b0a + + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] + + compressAVX2Loop(p, in, iv[:], d.t[:], d.f[:], shffle, out) + + d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] +} diff --git a/compressAvx2_amd64.s b/compressAvx2_amd64.s new file mode 100644 index 0000000..24df234 --- /dev/null +++ b/compressAvx2_amd64.s @@ -0,0 +1,671 @@ +//+build !noasm !appengine + +// +// Minio Cloud Storage, (C) 2016 Minio, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// +// Based on AVX2 implementation from https://github.com/sneves/blake2-avx2/blob/master/blake2b-common.h +// +// Use github.com/fwessels/asm2plan9s on this file to assemble instructions to their Plan9 equivalent +// +// Assembly code below essentially follows the ROUND macro (see blake2b-round.h) which is defined as: +// #define ROUND(r) \ +// LOAD_MSG_ ##r ##_1(b0, b1); \ +// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +// LOAD_MSG_ ##r ##_2(b0, b1); \ +// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +// DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ +// LOAD_MSG_ ##r ##_3(b0, b1); \ +// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +// LOAD_MSG_ ##r ##_4(b0, b1); \ +// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +// UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); +// +// as well as the go equivalent in https://github.com/dchest/blake2b/blob/master/block.go +// +// As in the macro, G1/G2 in the 1st and 2nd half are identical (so literal copy of assembly) +// +// Rounds are also the same, except for the loading of the message (and rounds 1 & 11 and +// rounds 2 & 12 are identical) +// + +#define G1 \ + \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); + BYTE $0xc5; BYTE $0xfd; BYTE $0xd4; BYTE $0xc4 \ // VPADDQ YMM0,YMM0,YMM4 /* v0 += m[0], v1 += m[2], v2 += m[4], v3 += m[6] */ + BYTE $0xc5; BYTE $0xfd; BYTE $0xd4; BYTE $0xc1 \ // VPADDQ YMM0,YMM0,YMM1 /* v0 += v4, v1 += v5, v2 += v6, v3 += v7 */ + BYTE $0xc5; BYTE $0xe5; BYTE $0xef; BYTE $0xd8 \ // VPXOR YMM3,YMM3,YMM0 /* v12 ^= v0, v13 ^= v1, v14 ^= v2, v15 ^= v3 */ + BYTE $0xc5; BYTE $0xfd; BYTE $0x70; BYTE $0xdb; BYTE $0xb1 \ // VPSHUFD YMM3,YMM3,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = */ + BYTE $0xc5; BYTE $0xed; BYTE $0xd4; BYTE $0xd3 \ // VPADDQ YMM2,YMM2,YMM3 /* v8 += v12, v9 += v13, v10 += v14, v11 += v15 */ + BYTE $0xc5; BYTE $0xf5; BYTE $0xef; BYTE $0xca \ // VPXOR YMM1,YMM1,YMM2 /* v4 ^= v8, v5 ^= v9, v6 ^= v10, v7 ^= v11 */ + BYTE $0xc4; BYTE $0xe2; BYTE $0x75; BYTE $0x00; BYTE $0xce // VPSHUFB YMM1,YMM1,YMM6 /* v4 = v4<<(64-24) | v4>>24, ..., ..., v7 = v7<<(64-24) | v7>>24 */ + +#define G2 \ + BYTE $0xc5; BYTE $0xfd; BYTE $0xd4; BYTE $0xc5 \ // VPADDQ YMM0,YMM0,YMM5 /* v0 += m[1], v1 += m[3], v2 += m[5], v3 += m[7] */ + BYTE $0xc5; BYTE $0xfd; BYTE $0xd4; BYTE $0xc1 \ // VPADDQ YMM0,YMM0,YMM1 /* v0 += v4, v1 += v5, v2 += v6, v3 += v7 */ + BYTE $0xc5; BYTE $0xe5; BYTE $0xef; BYTE $0xd8 \ // VPXOR YMM3,YMM3,YMM0 /* v12 ^= v0, v13 ^= v1, v14 ^= v2, v15 ^= v3 */ + BYTE $0xc4; BYTE $0xe2; BYTE $0x65; BYTE $0x00; BYTE $0xdf \ // VPSHUFB YMM3,YMM3,YMM7 /* v12 = v12<<(64-16) | v12>>16, ..., ..., v15 = v15<<(64-16) | v15>>16 */ + BYTE $0xc5; BYTE $0xed; BYTE $0xd4; BYTE $0xd3 \ // VPADDQ YMM2,YMM2,YMM3 /* v8 += v12, v9 += v13, v10 += v14, v11 += v15 */ + BYTE $0xc5; BYTE $0xf5; BYTE $0xef; BYTE $0xca \ // VPXOR YMM1,YMM1,YMM2 /* v4 ^= v8, v5 ^= v9, v6 ^= v10, v7 ^= v11 */ + BYTE $0xc5; BYTE $0x75; BYTE $0xd4; BYTE $0xf9 \ // VPADDQ YMM15,YMM1,YMM1 /* temp reg = reg*2 */ + BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd1; BYTE $0x3f \ // VPSRLQ YMM1,YMM1,0x3f /* reg = reg>>63 */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xef; BYTE $0xcf // VPXOR YMM1,YMM1,YMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ + +#define DIAGONALIZE \ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb \ // VPERMQ YMM3, YMM3, 0x93 + BYTE $0x93 \ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2 \ // VPERMQ YMM2, YMM2, 0x4e + BYTE $0x4e \ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 \ // VPERMQ YMM1, YMM1, 0x39 + BYTE $0x39 \ + // DO NOT DELETE -- macro delimiter (previous line extended) + +#define UNDIAGONALIZE \ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb \ // VPERMQ YMM3, YMM3, 0x39 + BYTE $0x39 \ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2 \ // VPERMQ YMM2, YMM2, 0x4e + BYTE $0x4e \ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 \ // VPERMQ YMM1, YMM1, 0x93 + BYTE $0x93 \ + // DO NOT DELETE -- macro delimiter (previous line extended) + +#define LOAD_SHUFFLE \ + MOVQ shffle+120(FP), SI \ // SI: &shuffle + BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x36 \ // VMOVDQU YMM6, [rsi] + BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x7e; BYTE $0x20 // VMOVDQU YMM7, 32[rsi] + +// func compressAVX2Loop(compressSSE(p []uint8, in, iv, t, f, shffle, out []uint64) +TEXT ·compressAVX2Loop(SB), 7, $0 + + // REGISTER USE + // Y0 - Y3: v0 - v15 + // Y4 - Y5: m[0] - m[7] + // Y6 - Y7: shuffle value + // Y8 - Y9: temp registers + // Y10 -Y13: copy of full message + // Y15: temp register + + // Load digest + MOVQ in+24(FP), SI // SI: &in + BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06 // VMOVDQU YMM0, [rsi] + BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x4e; BYTE $0x20 // VMOVDQU YMM1, 32[rsi] + + // Already store digest into &out (so we can reload it later generically) + MOVQ out+144(FP), SI // SI: &out + BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x06 // VMOVDQU [rsi], YMM0 + BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x4e; BYTE $0x20 // VMOVDQU 32[rsi], YMM1 + + // Initialize message pointer and loop counter + MOVQ message+0(FP), DX // DX: &p (message) + MOVQ message_len+8(FP), R8 // R8: len(message) + SHRQ $7, R8 // len(message) / 128 + CMPQ R8, $0 + JEQ complete + +loop: + // Increment counter + MOVQ t+72(FP), SI // SI: &t + MOVQ 0(SI), R9 // + ADDQ $128, R9 // /* d.t[0] += BlockSize */ + MOVQ R9, 0(SI) // + CMPQ R9, $128 // /* if d.t[0] < BlockSize { */ + JGE noincr // + MOVQ 8(SI), R9 // + ADDQ $1, R9 // /* d.t[1]++ */ + MOVQ R9, 8(SI) // +noincr: // /* } */ + + // Load initialization vector + MOVQ iv+48(FP), SI // SI: &iv + BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x16 // VMOVDQU YMM2, [rsi] + BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x5e; BYTE $0x20 // VMOVDQU YMM3, 32[rsi] + MOVQ t+72(FP), SI // SI: &t + BYTE $0xc4; BYTE $0x63; BYTE $0x3d; BYTE $0x38; BYTE $0x06 // VINSERTI128 YMM8, YMM8, [rsi], 0 /* Y8 = t[0]+t[1] */ + BYTE $0x00 + MOVQ t+96(FP), SI // SI: &f + BYTE $0xc4; BYTE $0x63; BYTE $0x3d; BYTE $0x38; BYTE $0x06 // VINSERTI128 YMM8, YMM8, [rsi], 1 /* Y8 = t[0]+t[1]+f[0]+f[1] */ + BYTE $0x01 + BYTE $0xc4; BYTE $0xc1; BYTE $0x65; BYTE $0xef; BYTE $0xd8 // VPXOR YMM3,YMM3,YMM8 /* Y3 = Y3 ^ Y8 */ + + BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x12 // VMOVDQU YMM10, [rdx] /* Y10 = m[0]+ m[1]+ m[2]+ m[3] */ + BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x5a; BYTE $0x20 // VMOVDQU YMM11, 32[rdx] /* Y11 = m[4]+ m[5]+ m[6]+ m[7] */ + BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x62; BYTE $0x40 // VMOVDQU YMM12, 64[rdx] /* Y12 = m[8]+ m[9]+m[10]+m[11] */ + BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x6a; BYTE $0x60 // VMOVDQU YMM13, 96[rdx] /* Y13 = m[12]+m[13]+m[14]+m[15] */ + + LOAD_SHUFFLE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0xc1; BYTE $0x2d; BYTE $0x6c; BYTE $0xe3 // VPUNPCKLQDQ YMM4, YMM10, YMM11 /* m[0], m[4], m[2], m[6] */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x2d; BYTE $0x6d; BYTE $0xeb // VPUNPCKHQDQ YMM5, YMM10, YMM11 /* m[1], m[5], m[3], m[7] */ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4 // VPERMQ YMM4, YMM4, 0xd8 /* 0x1101 1000 = 0xd8 */ + BYTE $0xd8 + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed // VPERMQ YMM5, YMM5, 0xd8 /* 0x1101 1000 = 0xd8 */ + BYTE $0xd8 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0xc1; BYTE $0x1d; BYTE $0x6c; BYTE $0xe5 // VPUNPCKLQDQ YMM4, YMM12, YMM13 /* m[8], m[12], m[10], m[14] */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x1d; BYTE $0x6d; BYTE $0xed // VPUNPCKHQDQ YMM5, YMM12, YMM13 /* m[9], m[13], m[11], m[15] */ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4 // VPERMQ YMM4, YMM4, 0xd8 /* 0x1101 1000 = 0xd8 */ + BYTE $0xd8 + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed // VPERMQ YMM5, YMM5, 0xd8 /* 0x1101 1000 = 0xd8 */ + BYTE $0xd8 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 2 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM11, YMM13 /* m[4], ____, ____, m[14] */ + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x03 /* m[14], m[4], ____, ____ */ /* xxxx 0011 = 0x03 */ + BYTE $0x03 + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ YMM9, YMM12, YMM13 /* m[9], m[13], ____, ____ */ + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 /* m[9], m[13], ____, ____ */ /* 0010 0000 = 0x20 */ + BYTE $0x20 + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc4 // VPERMQ YMM8, YMM12, 0x02 /* m[10], m[8], ____, ____ */ /* xxxx 0010 = 0x02 */ + BYTE $0x02 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x30 /* ____, ____, m[15], ____ */ /* xx11 xxxx = 0x30 */ + BYTE $0x30 + BYTE $0xc4; BYTE $0x41; BYTE $0x35; BYTE $0x6c; BYTE $0xcb // VPUNPCKLQDQ YMM9, YMM9, YMM11 /* ____, ____, m[15], m[6] */ + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 /* m[9], m[13], m[15], m[6] */ /* 0011 0000 = 0x30 */ + BYTE $0x30 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc2 // VPERMQ YMM8, YMM10, 0x01 /* m[1], m[0], ____, ____ */ /* xxxx 0001 = 0x01 */ + BYTE $0x01 + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM11, YMM12 /* m[5], ____, ____, m[11] */ + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x03 /* m[11], m[5], ____, ____ */ /* xxxx 0011 = 0x03 */ + BYTE $0x03 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 /* m[1], m[0], m[11], m[5] */ /* 0010 0000 = 0x20 */ + BYTE $0x20 + + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM10, YMM13 /* ___, m[12], m[2], ____ */ + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x09 /* m[12], m[2], ____, ____ */ /* xxxx 1001 = 0x09 */ + BYTE $0x09 + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM11, YMM10 /* ____, ____, m[7], m[3] */ + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 /* m[9], m[13], m[15], m[6] */ /* 0011 0000 = 0x30 */ + BYTE $0x30 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 3 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc5 // VPERMQ YMM8, YMM13, 0x00 + BYTE $0x00 + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc0 // VPUNPCKHQDQ YMM8, YMM12, YMM8 + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ YMM9, YMM11, YMM13 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x21 + BYTE $0x21 + + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6c; BYTE $0xc2 // VPUNPCKLQDQ YMM8, YMM12, YMM10 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x55 + BYTE $0x55 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM10, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 + BYTE $0x30 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc2 // VPERMQ YMM8, YMM10, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM12, YMM8 + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM11, YMM12 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31 + BYTE $0x31 + + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM13, YMM11 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcb // VPERMQ YMM9, YMM11, 0x00 + BYTE $0x00 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM10, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21 + BYTE $0x21 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 4 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xc2 // VPUNPCKHQDQ YMM8, YMM11, YMM10 + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM13, YMM12 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x21 + BYTE $0x21 + + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc2 // VPUNPCKHQDQ YMM8, YMM12, YMM10 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x08 + BYTE $0x08 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x20 + BYTE $0x20 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc3 // VPERMQ YMM8, YMM11, 0x55 + BYTE $0x55 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM10, YMM8 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM11, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x21 + BYTE $0x21 + + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc4 // VPUNPCKLQDQ YMM8, YMM11, YMM12 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ YMM9, YMM10, YMM12 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21 + BYTE $0x21 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 5 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc3 // VPUNPCKHQDQ YMM8, YMM12, YMM11 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ YMM9, YMM10, YMM12 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x30 + BYTE $0x30 + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc3 // VPERMQ YMM8, YMM11, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM10, YMM8 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM11, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x20 + BYTE $0x20 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc4 // VPERMQ YMM8, YMM12, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM13, YMM8 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xca // VPERMQ YMM9, YMM10, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM11, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31 + BYTE $0x31 + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc5 // VPERMQ YMM8, YMM13, 0x00 + BYTE $0x00 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xc0 // VPUNPCKHQDQ YMM8, YMM10, YMM8 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x55 + BYTE $0x55 + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM12, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x20 + BYTE $0x20 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 6 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM10, YMM11 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ YMM9, YMM10, YMM12 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x21 + BYTE $0x21 + + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc4 // VPUNPCKLQDQ YMM8, YMM13, YMM12 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM12, YMM10 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 + BYTE $0x30 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc3 // VPERMQ YMM8, YMM11, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM13, YMM10 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x30 + BYTE $0x30 + + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc3 // VPUNPCKHQDQ YMM8, YMM13, YMM11 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0x55 + BYTE $0x55 + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM13, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 + BYTE $0x30 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 7 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc2 // VPERMQ YMM8, YMM10, 0x55 + BYTE $0x55 + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM13, YMM8 + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xcb // VPUNPCKLQDQ YMM9, YMM13, YMM11 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x30 + BYTE $0x30 + + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ YMM8, YMM11, YMM13 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0xaa + BYTE $0xaa + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM13, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x20 + BYTE $0x20 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM10, YMM11 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0x01 + BYTE $0x01 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 + BYTE $0x20 + + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xc2 // VPUNPCKHQDQ YMM8, YMM11, YMM10 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM10, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x31 + BYTE $0x31 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 8 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc3 // VPUNPCKHQDQ YMM8, YMM13, YMM11 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xca // VPERMQ YMM9, YMM10, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM13, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 + BYTE $0x20 + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc5 // VPERMQ YMM8, YMM13, 0xaa + BYTE $0xaa + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc0 // VPUNPCKHQDQ YMM8, YMM12, YMM8 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM10, YMM12 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21 + BYTE $0x21 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ YMM8, YMM11, YMM13 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6c; BYTE $0xca // VPUNPCKLQDQ YMM9, YMM12, YMM10 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x0c + BYTE $0x0c + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 + BYTE $0x20 + + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM10, YMM11 + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ YMM9, YMM11, YMM12 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 + BYTE $0x30 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 9 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM11, YMM13 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xca // VPERMQ YMM9, YMM10, 0x00 + BYTE $0x00 + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM12, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31 + BYTE $0x31 + + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ YMM8, YMM13, YMM12 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0x00 + BYTE $0x00 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM10, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x31 + BYTE $0x31 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0xaa + BYTE $0xaa + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM10, YMM9 + BYTE $0xc4; BYTE $0xc3; BYTE $0x15; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM13, YMM9, 0x20 + BYTE $0x20 + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc3 // VPERMQ YMM8, YMM11, 0xff + BYTE $0xff + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM10, YMM8 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcb // VPERMQ YMM9, YMM11, 0x04 + BYTE $0x04 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21 + BYTE $0x21 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 10 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc4 // VPERMQ YMM8, YMM12, 0x20 + BYTE $0x20 + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM11, YMM10 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31 + BYTE $0x31 + + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM10, YMM11 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcb // VPERMQ YMM9, YMM11, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x31 + BYTE $0x31 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ YMM8, YMM13, YMM12 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ YMM9, YMM10, YMM13 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60 + BYTE $0x60 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31 + BYTE $0x31 + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc5 // VPERMQ YMM8, YMM13, 0xaa + BYTE $0xaa + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc0 // VPUNPCKHQDQ YMM8, YMM12, YMM8 + BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xca // VPUNPCKLQDQ YMM9, YMM13, YMM10 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21 + BYTE $0x21 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 1 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0xc1; BYTE $0x2d; BYTE $0x6c; BYTE $0xe3 // VPUNPCKLQDQ YMM4, YMM10, YMM11 /* m[0], m[4], m[2], m[6] */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x2d; BYTE $0x6d; BYTE $0xeb // VPUNPCKHQDQ YMM5, YMM10, YMM11 /* m[1], m[5], m[3], m[7] */ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4 // VPERMQ YMM4, YMM4, 0xd8 /* 0x1101 1000 = 0xd8 */ + BYTE $0xd8 + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed // VPERMQ YMM5, YMM5, 0xd8 /* 0x1101 1000 = 0xd8 */ + BYTE $0xd8 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0xc1; BYTE $0x1d; BYTE $0x6c; BYTE $0xe5 // VPUNPCKLQDQ YMM4, YMM12, YMM13 /* m[8], m[12], m[10], m[14] */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x1d; BYTE $0x6d; BYTE $0xed // VPUNPCKHQDQ YMM5, YMM12, YMM13 /* m[9], m[13], m[11], m[15] */ + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4 // VPERMQ YMM4, YMM4, 0xd8 /* 0x1101 1000 = 0xd8 */ + BYTE $0xd8 + BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed // VPERMQ YMM5, YMM5, 0xd8 /* 0x1101 1000 = 0xd8 */ + BYTE $0xd8 + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 2 + /////////////////////////////////////////////////////////////////////////// + + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM11, YMM13 /* m[4], ____, ____, m[14] */ + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x03 /* m[14], m[4], ____, ____ */ /* xxxx 0011 = 0x03 */ + BYTE $0x03 + BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ YMM9, YMM12, YMM13 /* m[9], m[13], ____, ____ */ + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 /* m[9], m[13], ____, ____ */ /* 0010 0000 = 0x20 */ + BYTE $0x20 + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc4 // VPERMQ YMM8, YMM12, 0x02 /* m[10], m[8], ____, ____ */ /* xxxx 0010 = 0x02 */ + BYTE $0x02 + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x30 /* ____, ____, m[15], ____ */ /* xx11 xxxx = 0x30 */ + BYTE $0x30 + BYTE $0xc4; BYTE $0x41; BYTE $0x35; BYTE $0x6c; BYTE $0xcb // VPUNPCKLQDQ YMM9, YMM9, YMM11 /* ____, ____, m[15], m[6] */ + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 /* m[9], m[13], m[15], m[6] */ /* 0011 0000 = 0x30 */ + BYTE $0x30 + + G1 + G2 + + DIAGONALIZE + + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc2 // VPERMQ YMM8, YMM10, 0x01 /* m[1], m[0], ____, ____ */ /* xxxx 0001 = 0x01 */ + BYTE $0x01 + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM11, YMM12 /* m[5], ____, ____, m[11] */ + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x03 /* m[11], m[5], ____, ____ */ /* xxxx 0011 = 0x03 */ + BYTE $0x03 + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 /* m[1], m[0], m[11], m[5] */ /* 0010 0000 = 0x20 */ + BYTE $0x20 + + BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM10, YMM13 /* ___, m[12], m[2], ____ */ + BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x09 /* m[12], m[2], ____, ____ */ /* xxxx 1001 = 0x09 */ + BYTE $0x09 + BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM11, YMM10 /* ____, ____, m[7], m[3] */ + BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 /* m[9], m[13], m[15], m[6] */ /* 0011 0000 = 0x30 */ + BYTE $0x30 + + G1 + G2 + + UNDIAGONALIZE + + // Reload digest (most current value store in &out) + MOVQ out+144(FP), SI // SI: &in + BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x26 // VMOVDQU YMM12, [rsi] + BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x6e; BYTE $0x20 // VMOVDQU YMM13, 32[rsi] + + BYTE $0xc5; BYTE $0xfd; BYTE $0xef; BYTE $0xc2 // VPXOR YMM0,YMM0,YMM2 /* X0 = X0 ^ X4, X1 = X1 ^ X5 */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xef; BYTE $0xc4 // VPXOR YMM0,YMM0,YMM12 /* X0 = X0 ^ X12, X1 = X1 ^ X13 */ + BYTE $0xc5; BYTE $0xf5; BYTE $0xef; BYTE $0xcb // VPXOR YMM1,YMM1,YMM3 /* X2 = X2 ^ X6, X3 = X3 ^ X7 */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xef; BYTE $0xcd // VPXOR YMM1,YMM1,YMM13 /* X2 = X2 ^ X14, X3 = X3 ^ X15 */ + + // Store digest into &out + MOVQ out+144(FP), SI // SI: &out + BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x06 // VMOVDQU [rsi], YMM0 + BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x4e; BYTE $0x20 // VMOVDQU 32[rsi], YMM1 + + // Increment message pointer and check if there's more to do + ADDQ $128, DX // message += 128 + SUBQ $1, R8 + JNZ loop + +complete: + BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER /* Prevent further context switches */ + RET + diff --git a/compressAvx2_noasm.go b/compressAvx2_noasm.go new file mode 100644 index 0000000..85de8fb --- /dev/null +++ b/compressAvx2_noasm.go @@ -0,0 +1,23 @@ +//+build !amd64 noasm appengine + +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package blake2b + +func compressAvx2(d *digest, p []uint8) { + compressGeneric(d, p) +} diff --git a/compress_amd64.go b/compress_amd64.go index 96d9793..d09d1e0 100644 --- a/compress_amd64.go +++ b/compress_amd64.go @@ -40,8 +40,11 @@ func compressAVX(d *digest, p []uint8) { } func compress(d *digest, p []uint8) { - // Verifies if AVX is available, use optimized code path. - if avx { + // Verifies if AVX2 or AVX is available, use optimized code path. + if avx2 { + compressAVX2(d, p) + return + } else if avx { compressAVX(d, p) return } // else { fallback to generic approach. diff --git a/compress_noasm.go b/compress_noasm.go index 7c198dd..d3c6758 100644 --- a/compress_noasm.go +++ b/compress_noasm.go @@ -1,11 +1,20 @@ //+build !amd64 noasm appengine -// Written in 2012 by Dmitry Chestnykh. -// -// To the extent possible under law, the author have dedicated all copyright -// and related and neighboring rights to this software to the public domain -// worldwide. This software is distributed without any warranty. -// http://creativecommons.org/publicdomain/zero/1.0/ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package blake2b diff --git a/cpuid.go b/cpuid.go index 696e064..e849867 100644 --- a/cpuid.go +++ b/cpuid.go @@ -18,12 +18,14 @@ package blake2b func cpuid(op uint32) (eax, ebx, ecx, edx uint32) +func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) func xgetbv(index uint32) (eax, edx uint32) // True when SIMD instructions are available. +var avx2 = haveAVX2() var avx = haveAVX() -// haveSSE returns true if we have streaming SIMD instructions. +// haveAVX returns true if when there is AVX support func haveAVX() bool { _, _, c, _ := cpuid(1) @@ -35,3 +37,15 @@ func haveAVX() bool { } return false } + +// haveAVX2 returns true if when there is AVX2 support +func haveAVX2() bool { + + mfi, _, _, _ := cpuid(0) + // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. + if mfi >= 7 && haveAVX() { + _, ebx, _, _ := cpuidex(7, 0) + return (ebx & 0x00000020) != 0 + } + return false +} diff --git a/cpuid_386.s b/cpuid_386.s index 9ae606e..fa38814 100644 --- a/cpuid_386.s +++ b/cpuid_386.s @@ -13,10 +13,21 @@ TEXT ·cpuid(SB), 7, $0 MOVL DX, edx+16(FP) RET +// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·cpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + // func xgetbv(index uint32) (eax, edx uint32) TEXT ·xgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+4(FP) - MOVL DX, edx+8(FP) - RET + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+4(FP) + MOVL DX, edx+8(FP) + RET diff --git a/cpuid_amd64.s b/cpuid_amd64.s index 27aad2a..fb45a65 100644 --- a/cpuid_amd64.s +++ b/cpuid_amd64.s @@ -13,10 +13,22 @@ TEXT ·cpuid(SB), 7, $0 MOVL DX, edx+20(FP) RET + +// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·cpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + // func xgetbv(index uint32) (eax, edx uint32) TEXT ·xgetbv(SB), 7, $0 - MOVL index+0(FP), CX - BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV - MOVL AX, eax+8(FP) - MOVL DX, edx+12(FP) - RET + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+8(FP) + MOVL DX, edx+12(FP) + RET