1
0
mirror of https://github.com/fumiama/blake2b-simd.git synced 2026-06-05 02:00:26 +08:00

First half of G2 macro completed

This commit is contained in:
frankw
2016-06-24 23:30:25 +02:00
parent 2fe67e0d78
commit 07dd36a646
2 changed files with 27 additions and 14 deletions

View File

@@ -80,20 +80,20 @@ func blocks(d *digest, p []uint8) {
//v11 += v15
//v7 ^= v11
//v7 = v7<<(64-63) | v7>>63
//v1 += m[3]
//v1 += v5
//v13 ^= v1
//v13 = v13<<(64-16) | v13>>16
//v9 += v13
//v5 ^= v9
//v5 = v5<<(64-63) | v5>>63
//v0 += m[1]
//v0 += v4
//v12 ^= v0
//v12 = v12<<(64-16) | v12>>16
//v8 += v12
//v4 ^= v8
//v4 = v4<<(64-63) | v4>>63
v1 += m[3]
v1 += v5
v13 ^= v1
v13 = v13<<(64-16) | v13>>16
v9 += v13
v5 ^= v9
v5 = v5<<(64-63) | v5>>63
v0 += m[1]
v0 += v4
v12 ^= v0
v12 = v12<<(64-16) | v12>>16
v8 += v12
v4 ^= v8
v4 = v4<<(64-63) | v4>>63
//v0 += m[8]
//v0 += v5
//v15 ^= v0

View File

@@ -58,6 +58,7 @@ TEXT ·compressSSE(SB), 7, $0
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
// Load shuffle value
MOVQ shffle+120(FP), SI // SI: &shuffle
@@ -79,6 +80,18 @@ TEXT ·compressSSE(SB), 7, $0
BYTE $0xc4; BYTE $0xc2; BYTE $0x69; BYTE $0x00; BYTE $0xd4 // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
BYTE $0xc4; BYTE $0xc2; BYTE $0x61; BYTE $0x00; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc2 // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xf6; BYTE $0x39 // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xf6; BYTE $0x39 // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
BYTE $0xc5; BYTE $0x69; BYTE $0xd4; BYTE $0xfa // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */
BYTE $0xc5; BYTE $0xe9; BYTE $0x73; BYTE $0xd2; BYTE $0x3f // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */
BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0xef; BYTE $0xd7 // VPXOR XMM2,XMM2,XMM15 /* XOR merges the two halves (their set bits are disjoint, so this equals OR): v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
// Reload digest
MOVQ in+24(FP), SI // SI: &in
MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */