From c5dac2387d8c3e3de36bc9d69aaacbf748e8d395 Mon Sep 17 00:00:00 2001 From: frankw Date: Sat, 25 Jun 2016 08:42:30 +0200 Subject: [PATCH] G2 macro completed --- block.go | 28 ++++++++++++++-------------- compress_amd64.s | 12 ++++++++++++ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/block.go b/block.go index b23cccf..a2d0f86 100644 --- a/block.go +++ b/block.go @@ -66,20 +66,20 @@ func blocks(d *digest, p []uint8) { v11 += v15 v7 ^= v11 v7 = v7<<(64-24) | v7>>24 - //v2 += m[5] - //v2 += v6 - //v14 ^= v2 - //v14 = v14<<(64-16) | v14>>16 - //v10 += v14 - //v6 ^= v10 - //v6 = v6<<(64-63) | v6>>63 - //v3 += m[7] - //v3 += v7 - //v15 ^= v3 - //v15 = v15<<(64-16) | v15>>16 - //v11 += v15 - //v7 ^= v11 - //v7 = v7<<(64-63) | v7>>63 + v2 += m[5] + v2 += v6 + v14 ^= v2 + v14 = v14<<(64-16) | v14>>16 + v10 += v14 + v6 ^= v10 + v6 = v6<<(64-63) | v6>>63 + v3 += m[7] + v3 += v7 + v15 ^= v3 + v15 = v15<<(64-16) | v15>>16 + v11 += v15 + v7 ^= v11 + v7 = v7<<(64-63) | v7>>63 v1 += m[3] v1 += v5 v13 ^= v1 diff --git a/compress_amd64.s b/compress_amd64.s index 6815511..8aae6ef 100644 --- a/compress_amd64.s +++ b/compress_amd64.s @@ -77,6 +77,7 @@ TEXT ·compressSSE(SB), 7, $0 /////////////////////////////////////////////////////////////////////////// // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+m[1] @@ -86,6 +87,7 @@ TEXT ·compressSSE(SB), 7, $0 BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ // Load shuffle value MOVQ shffle+120(FP), SI // SI: &shuffle @@ -109,15 +111,25 @@ TEXT ·compressSSE(SB), 7, $0 // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc2 // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xcb // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */ BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ + BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ + BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xf6; BYTE $0x39 // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */ BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xf6; BYTE $0x39 // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */ + BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xff; BYTE $0x39 // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */ + BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xff; BYTE $0x39 // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */ BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ + BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ + BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ BYTE $0xc5; BYTE $0x69; BYTE $0xd4; BYTE $0xfa // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */ BYTE $0xc5; BYTE $0xe9; BYTE $0x73; BYTE $0xd2; BYTE $0x3f // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */ BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0xef; BYTE $0xd7 // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ + BYTE $0xc5; BYTE $0x61; BYTE $0xd4; BYTE $0xfb // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */ + BYTE $0xc5; BYTE $0xe1; BYTE $0x73; BYTE $0xd3; BYTE $0x3f // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0xef; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */ // Reload digest MOVQ in+24(FP), SI // SI: &in