From 53bb6668a36d31a56b5c3beb2d117cee7d792132 Mon Sep 17 00:00:00 2001 From: frankw Date: Sat, 25 Jun 2016 09:04:36 +0200 Subject: [PATCH] UNDIAGONALIZE macro --- blake2b_test.go | 2 -- compress_amd64.s | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/blake2b_test.go b/blake2b_test.go index 3af89a1..661ffa8 100644 --- a/blake2b_test.go +++ b/blake2b_test.go @@ -33,8 +33,6 @@ func TestCompress(t *testing.T) { hGo.Write(in) sumGo := fmt.Sprintf("%x", hGo.Sum(nil)) - // Digest for testing generated with modified codahale/blake2 with ROUND macro that stops after DIAGONALIZE - sumGo = "2306b43fd384cba9820ad5a79c6a0f19775f205e9e13f5956b8c271cf6d5b165de31323244522c59eca5c96d943d76df4b1770b86e26dae7839042fa1875bc60" hSSE.Write(in) sumSSE := fmt.Sprintf("%x", hSSE.Sum(nil)) diff --git a/compress_amd64.s b/compress_amd64.s index f11d6eb..e5319a0 100644 --- a/compress_amd64.s +++ b/compress_amd64.s @@ -146,6 +146,21 @@ TEXT ·compressSSE(SB), 7, $0 BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */ + // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + MOVOU X4, X13 /* t0 = row3l;\ */ + MOVOU X5, X4 /* row3l = row3h;\ */ + MOVOU X13, X5 /* row3h = t0;\ */ + MOVOU X2, X13 /* t0 = row2l;\ */ + MOVOU X6, X14 /* t1 = row4l;\ */ + BYTE $0xc5; BYTE $0x69; BYTE $0x6c; BYTE $0xfa // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */ + BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */ + BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x49; BYTE $0x6d; BYTE $0xf7 // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */ + // Reload digest MOVQ in+24(FP), SI // SI: &in MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */