1
0
mirror of https://github.com/fumiama/blake2b-simd.git synced 2026-06-11 21:50:30 +08:00

First half of G1 macro working for Round 1

This commit is contained in:
frankw
2016-06-24 22:14:06 +02:00
parent 8bc3dcee2c
commit 207d895ad9
3 changed files with 50 additions and 19 deletions

View File

@@ -37,21 +37,21 @@ func blocks(d *digest, p []uint8) {
j += 8 j += 8
} }
//// Round 1. // Round 1.
//v0 += m[0] v0 += m[0]
//v0 += v4 v0 += v4
//v12 ^= v0 v12 ^= v0
//v12 = v12<<(64-32) | v12>>32 v12 = v12<<(64-32) | v12>>32
//v8 += v12 v8 += v12
//v4 ^= v8 v4 ^= v8
//v4 = v4<<(64-24) | v4>>24 v4 = v4<<(64-24) | v4>>24
//v1 += m[2] v1 += m[2]
//v1 += v5 v1 += v5
//v13 ^= v1 v13 ^= v1
//v13 = v13<<(64-32) | v13>>32 v13 = v13<<(64-32) | v13>>32
//v9 += v13 v9 += v13
//v5 ^= v9 v5 ^= v9
//v5 = v5<<(64-24) | v5>>24 v5 = v5<<(64-24) | v5>>24
//v2 += m[4] //v2 += m[4]
//v2 += v6 //v2 += v6
//v14 ^= v2 //v14 ^= v2

View File

@@ -20,7 +20,7 @@
package blake2b package blake2b
//go:noescape //go:noescape
func compressSSE(p []uint8, in, iv, t, f, out []uint64) func compressSSE(p []uint8, in, iv , t, f, shffle, out []uint64)
func compress(d *digest, p []uint8) { func compress(d *digest, p []uint8) {
h0, h1, h2, h3, h4, h5, h6, h7 := d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] h0, h1, h2, h3, h4, h5, h6, h7 := d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7]
@@ -28,6 +28,11 @@ func compress(d *digest, p []uint8) {
in := make([]uint64, 8, 8) in := make([]uint64, 8, 8)
out := make([]uint64, 8, 8) out := make([]uint64, 8, 8)
shffle := make([]uint64, 2, 2)
// vector for PSHUFB instruction
shffle[0] = 0x0201000706050403
shffle[1] = 0x0a09080f0e0d0c0b
for len(p) >= BlockSize { for len(p) >= BlockSize {
// Increment counter. // Increment counter.
d.t[0] += BlockSize d.t[0] += BlockSize
@@ -37,7 +42,7 @@ func compress(d *digest, p []uint8) {
in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = h0, h1, h2, h3, h4, h5, h6, h7 in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = h0, h1, h2, h3, h4, h5, h6, h7
compressSSE(p, in, iv[:], d.t[:], d.f[:], out) compressSSE(p, in, iv[:], d.t[:], d.f[:], shffle, out)
h0, h1, h2, h3, h4, h5, h6, h7 = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] h0, h1, h2, h3, h4, h5, h6, h7 = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]

View File

@@ -16,7 +16,7 @@
// limitations under the License. // limitations under the License.
// //
// func compressSSE(compressSSE(p []uint8, in, iv, t, f, out []uint64) // func compressSSE(compressSSE(p []uint8, in, iv, t, f, shffle, out []uint64)
TEXT ·compressSSE(SB), 7, $0 TEXT ·compressSSE(SB), 7, $0
// Load digest // Load digest
@@ -39,6 +39,32 @@ TEXT ·compressSSE(SB), 7, $0
MOVOU 0(SI), X8 // X8 = f[0]+f[0] /* row4h = _mm_xor_si128( LOAD( &S->f[0] ) */ MOVOU 0(SI), X8 // X8 = f[0]+f[0] /* row4h = _mm_xor_si128( LOAD( &S->f[0] ) */
PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */ PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */
///////////////////////////////////////////////////////////////////////////
// R O U N D 1
///////////////////////////////////////////////////////////////////////////
// LOAD_MSG_ ##r ##_1(b0, b1);
// (X12 used as additional temp register)
MOVQ message+0(FP), DX // DX: &p (message)
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
// Load shuffle value
MOVQ shffle+120(FP), SI // SI: &shuffle
MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc0 // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
BYTE $0xc4; BYTE $0xc2; BYTE $0x69; BYTE $0x00; BYTE $0xd4 // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
// Reload digest // Reload digest
MOVQ in+24(FP), SI // SI: &in MOVQ in+24(FP), SI // SI: &in
MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */
@@ -57,7 +83,7 @@ TEXT ·compressSSE(SB), 7, $0
PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */ PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */
// Store digest // Store digest
MOVQ out+120(FP), DX // DX: &out MOVQ out+144(FP), DX // DX: &out
MOVOU X0, 0(DX) // out[0]+out[1] = X0 MOVOU X0, 0(DX) // out[0]+out[1] = X0
MOVOU X1, 16(DX) // out[2]+out[3] = X1 MOVOU X1, 16(DX) // out[2]+out[3] = X1
MOVOU X2, 32(DX) // out[4]+out[5] = X2 MOVOU X2, 32(DX) // out[4]+out[5] = X2