diff --git a/compress_amd64.go b/compress_amd64.go index 0a5299e..96d9793 100644 --- a/compress_amd64.go +++ b/compress_amd64.go @@ -20,10 +20,9 @@ package blake2b //go:noescape -func blockAVX(p []uint8, in, iv, t, f, shffle, out []uint64) +func blockAVXLoop(p []uint8, in, iv, t, f, shffle, out []uint64) func compressAVX(d *digest, p []uint8) { - h0, h1, h2, h3, h4, h5, h6, h7 := d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] in := make([]uint64, 8, 8) out := make([]uint64, 8, 8) @@ -33,23 +32,11 @@ func compressAVX(d *digest, p []uint8) { shffle[0] = 0x0201000706050403 shffle[1] = 0x0a09080f0e0d0c0b - for len(p) >= BlockSize { - // Increment counter. - d.t[0] += BlockSize - if d.t[0] < BlockSize { - d.t[1]++ - } + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] - in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = h0, h1, h2, h3, h4, h5, h6, h7 + blockAVXLoop(p, in, iv[:], d.t[:], d.f[:], shffle, out) - blockAVX(p, in, iv[:], d.t[:], d.f[:], shffle, out) - - h0, h1, h2, h3, h4, h5, h6, h7 = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] - - p = p[BlockSize:] - } - - d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = h0, h1, h2, h3, h4, h5, h6, h7 + d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] } func compress(d *digest, p []uint8) { diff --git a/compress_amd64.s b/compress_amd64.s index beb1429..9a0cd14 100644 --- a/compress_amd64.s +++ b/compress_amd64.s @@ -119,10 +119,12 @@ MOVQ shffle+120(FP), SI \ // SI: &shuffle MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a - -// func blockAVX(p []uint8, in, iv, t, f, shffle, out []uint64) -TEXT ·blockAVX(SB), 7, $0 +// func blockAVXLoop(p []uint8, in, iv, t, f, shffle, out []uint64) +TEXT ·blockAVXLoop(SB), 7, $0 // REGISTER USE + // R8: loop counter + // DX: message pointer + // SI: temp pointer for loading // X0 - X7: v0 - v15 // X8 - X11: m[0] - m[7] // X12: shuffle value @@ -135,16 +137,43 @@ TEXT ·blockAVX(SB), 7, $0 MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ - // Load initialization vector - MOVQ iv+48(FP), DX // DX: &iv - MOVOU 0(DX), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */ - MOVOU 16(DX), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */ + // Already store digest into &out (so we can reload it later generically) + MOVQ out+144(FP), SI // SI: &out + MOVOU X0, 0(SI) // out[0]+out[1] = X0 + MOVOU X1, 16(SI) // out[2]+out[3] = X1 + MOVOU X2, 32(SI) // out[4]+out[5] = X2 + MOVOU X3, 48(SI) // out[6]+out[7] = X3 + + // Initialize message pointer and loop counter + MOVQ message+0(FP), DX // DX: &p (message) + MOVQ message_len+8(FP), R8 // R8: len(message) + SHRQ $7, R8 // len(message) / 128 + CMPQ R8, $0 + JEQ complete + +loop: + // Increment counter MOVQ t+72(FP), SI // SI: &t - MOVOU 32(DX), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */ - MOVOU 0(SI), X7 // X7 = t[0]+t[1] /* LOAD( &S->t[0] ) */ - PXOR X7, X6 // X6 = X6 ^ X7 /* row4l = _mm_xor_si128( , ); */ + MOVQ 0(SI), R9 // + ADDQ $128, R9 // /* d.t[0] += BlockSize */ + MOVQ R9, 0(SI) // + CMPQ R9, $128 // /* if d.t[0] < BlockSize { */ + JGE noincr // + MOVQ 8(SI), R9 // + ADDQ $1, R9 // /* d.t[1]++ */ + MOVQ R9, 8(SI) // +noincr: // /* } */ + + // Load initialization vector + MOVQ iv+48(FP), SI // SI: &iv + MOVOU 0(SI), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */ + MOVOU 16(SI), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */ + MOVOU 32(SI), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */ + MOVOU 48(SI), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */ + MOVQ t+72(FP), SI // SI: &t + MOVOU 0(SI), X8 // X8 = t[0]+t[1] /* LOAD( &S->t[0] ) */ + PXOR X8, X6 // X6 = X6 ^ X8 /* row4l = _mm_xor_si128( , ); */ MOVQ t+96(FP), SI // SI: &f - MOVOU 48(DX), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */ MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */ PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */ @@ -155,7 +184,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+m[1] MOVOU 16(DX), X13 // X13 = m[2]+m[3] MOVOU 32(DX), X14 // X14 = m[4]+m[5] @@ -175,7 +203,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 64(DX), X12 // X12 = m[8]+ m[9] MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 96(DX), X14 // X14 = m[12]+m[13] @@ -199,7 +226,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 112(DX), X12 // X12 = m[14]+m[15] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 64(DX), X14 // X14 = m[8]+ m[9] @@ -221,7 +247,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 80(DX), X14 // X14 = m[10]+m[11] @@ -247,7 +272,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 32(DX), X12 // X12 = m[4]+ m[5] MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 96(DX), X14 // X14 = m[12]+m[13] @@ -271,7 +295,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 64(DX), X14 // X14 = m[8]+ m[9] @@ -299,7 +322,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 80(DX), X14 // X14 = m[10]+m[11] @@ -322,7 +344,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 80(DX), X14 // X14 = m[10]+m[11] @@ -351,7 +372,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 64(DX), X14 // X14 = m[8]+ m[9] @@ -376,7 +396,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 80(DX), X14 // X14 = m[10]+m[11] @@ -406,7 +425,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 48(DX), X14 // X14 = m[6]+ m[7] @@ -428,7 +446,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 48(DX), X14 // X14 = m[6]+ m[7] @@ -456,7 +473,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 96(DX), X14 // X14 = m[12]+m[13] @@ -478,7 +494,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 64(DX), X14 // X14 = m[8]+ m[9] @@ -504,7 +519,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 96(DX), X14 // X14 = m[12]+m[13] @@ -528,7 +542,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 64(DX), X14 // X14 = m[8]+ m[9] @@ -555,7 +568,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 80(DX), X14 // X14 = m[10]+m[11] @@ -577,7 +589,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 16(DX), X13 // X13 = m[2]+ m[3] MOVOU 80(DX), X14 // X14 = m[10]+m[11] @@ -606,7 +617,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 48(DX), X13 // X13 = m[6]+ m[7] MOVOU 64(DX), X14 // X14 = m[8]+ m[9] @@ -629,7 +639,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 16(DX), X12 // X12 = m[2]+ m[3] MOVOU 64(DX), X13 // X13 = m[8]+ m[9] MOVOU 96(DX), X14 // X14 = m[12]+m[13] @@ -655,7 +664,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+m[1] MOVOU 16(DX), X13 // X13 = m[2]+m[3] MOVOU 32(DX), X14 // X14 = m[4]+m[5] @@ -675,7 +683,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 64(DX), X12 // X12 = m[8]+ m[9] MOVOU 80(DX), X13 // X13 = m[10]+m[11] MOVOU 96(DX), X14 // X14 = m[12]+m[13] @@ -699,7 +706,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_1(b0, b1); // LOAD_MSG_ ##r ##_2(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 112(DX), X12 // X12 = m[14]+m[15] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 64(DX), X14 // X14 = m[8]+ m[9] @@ -721,7 +727,6 @@ TEXT ·blockAVX(SB), 7, $0 // LOAD_MSG_ ##r ##_3(b0, b1); // LOAD_MSG_ ##r ##_4(b0, b1); // (X12 used as additional temp register) - MOVQ message+0(FP), DX // DX: &p (message) MOVOU 0(DX), X12 // X12 = m[0]+ m[1] MOVOU 32(DX), X13 // X13 = m[4]+ m[5] MOVOU 80(DX), X14 // X14 = m[10]+m[11] @@ -740,8 +745,8 @@ TEXT ·blockAVX(SB), 7, $0 UNDIAGONALIZE - // Reload digest - MOVQ in+24(FP), SI // SI: &in + // Reload digest (most current value store in &out) + MOVQ out+144(FP), SI // SI: &in MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ @@ -757,12 +762,17 @@ TEXT ·blockAVX(SB), 7, $0 PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */ PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */ - // Store digest - MOVQ out+144(FP), DX // DX: &out - MOVOU X0, 0(DX) // out[0]+out[1] = X0 - MOVOU X1, 16(DX) // out[2]+out[3] = X1 - MOVOU X2, 32(DX) // out[4]+out[5] = X2 - MOVOU X3, 48(DX) // out[6]+out[7] = X3 + // Store digest into &out + MOVQ out+144(FP), SI // SI: &out + MOVOU X0, 0(SI) // out[0]+out[1] = X0 + MOVOU X1, 16(SI) // out[2]+out[3] = X1 + MOVOU X2, 32(SI) // out[4]+out[5] = X2 + MOVOU X3, 48(SI) // out[6]+out[7] = X3 + // Increment message pointer and check if there's more to do + ADDQ $128, DX // message += 128 + SUBQ $1, R8 + JNZ loop + +complete: RET -