From d78949ac03434aaa1690f82d5a9fe94fc303cf98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=BA=90=E6=96=87=E9=9B=A8?= <41315874+fumiama@users.noreply.github.com>
Date: Mon, 3 Oct 2022 17:14:10 +0800
Subject: [PATCH] add asm dec128blk1

---
 cpuid.go        | 12 +++++++++++
 cpuid_amd64.s   | 15 +++++++++++++
 decode.go       | 35 -------------------------------
 decode_amd64.go |  8 +++++++
 decode_amd64.s  | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
 decode_noasm.go | 39 ++++++++++++++++++++++++++++++++++
 uint128be.go    |  6 ++++++
 7 files changed, 136 insertions(+), 35 deletions(-)
 create mode 100644 cpuid.go
 create mode 100644 cpuid_amd64.s
 create mode 100644 decode_amd64.go
 create mode 100644 decode_amd64.s
 create mode 100644 decode_noasm.go

diff --git a/cpuid.go b/cpuid.go
new file mode 100644
index 0000000..fbd118f
--- /dev/null
+++ b/cpuid.go
@@ -0,0 +1,12 @@
+//go:build amd64
+// +build amd64
+
+package unibase2n
+
+func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
+
+// canusesse2 is true when SSE2 is available: CPUID leaf 1 reports it in EDX bit 26 (ECX bit 26 is XSAVE).
+var canusesse2 = func() bool {
+	_, _, _, d := cpuid(1)
+	return d&(1<<26) != 0
+}()
diff --git a/cpuid_amd64.s b/cpuid_amd64.s
new file mode 100644
index 0000000..98f50a2
--- /dev/null
+++ b/cpuid_amd64.s
@@ -0,0 +1,15 @@
+//go:build amd64
+// +build amd64
+
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid(SB), 7, $0-24
+	XORQ CX, CX // clear ECX, which CPUID reads as the sub-leaf input
+	MOVL op+0(FP), AX // EAX selects the CPUID leaf
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
diff --git a/decode.go b/decode.go
index 58e2023..71b4d4d 100644
--- a/decode.go
+++ b/decode.go
@@ -51,41 +51,6 @@ func (bs Base) Decode(data []byte) (out []byte) {
 	return
 }
 
-// dec128blk1 for bit 1
-// len(in)>0, len(in)%16==0, len(out)==len(in)/16
-//go:nosplit
-func dec128blk1(mask uint128be, in, out []byte) {
-	for i := range out {
-		c := i * 16
-		n := readuint128be(in[c : c+16])
-		one := u128one
-		n.subeq(mask)
-		sum := n.and(one)
-		n.shreq(16 - 1)
-		one.shleq(1)
-		sum.oreq(n.and(one))
-		n.shreq(16 - 1)
-		one.shleq(1)
-		sum.oreq(n.and(one))
-		n.shreq(16 - 1)
-		one.shleq(1)
-		sum.oreq(n.and(one))
-		n.shreq(16 - 1)
-		one.shleq(1)
-		sum.oreq(n.and(one))
-		n.shreq(16 - 1)
-		one.shleq(1)
-		sum.oreq(n.and(one))
-		n.shreq(16 - 1)
-		one.shleq(1)
-		sum.oreq(n.and(one))
-		n.shreq(16 - 1)
-		one.shleq(1)
-		sum.oreq(n.and(one))
-		out[i] = uint8(sum.b)
-	}
-}
-
 // dec64blk2 for bit 2
 // len(in)>0, len(in)%8==0, len(out)==len(in)/8
 //go:nosplit
diff --git a/decode_amd64.go b/decode_amd64.go
new file mode 100644
index 0000000..4c23ea5
--- /dev/null
+++ b/decode_amd64.go
@@ -0,0 +1,8 @@
+//go:build amd64
+// +build amd64
+
+package unibase2n
+
+// dec128blk1 for bit 1
+// len(in)>0, len(in)%16==0, len(out)==len(in)/16
+func dec128blk1(mask uint128be, in, out []byte)
diff --git a/decode_amd64.s b/decode_amd64.s
new file mode 100644
index 0000000..aadc85d
--- /dev/null
+++ b/decode_amd64.s
@@ -0,0 +1,56 @@
+//go:build amd64
+// +build amd64
+
+#include "textflag.h"
+
+// dec128blk1(mask uint128be, in, out []byte)
+// len(in)>0, len(in)%16==0, len(out)==len(in)/16
+TEXT ·dec128blk1(SB), NOSPLIT, $0-64
+	MOVQ mask+0(FP), DX // DX = first 8 bytes of mask
+	BSWAPQ DX // reverse its byte order
+	MOVQ in_base+16(FP), SI // SI = &in[0], read pointer for LODSQ
+	MOVQ in_len+24(FP), CX // CX = len(in)
+	SHRQ $4, CX // CX = len(in)/16 = LOOP count; zero is not handled (see len(in)>0 above)
+	MOVQ out_base+40(FP), DI // DI = &out[0], write pointer for STOSB
+	// go forward
+	CLD
+lop:
+	LODSQ // AX = next 8 bytes of in; SI += 8
+	BSWAPQ AX // reverse byte order of the loaded word
+	SUBQ DX, AX // AX -= DX
+	MOVQ AX, BX // BX = copy consumed by the shifts below
+	ANDB $1, AX // AL = bit 0 of the difference
+	RORQ $1, AX // rotate the collected bit into the top of AX
+	SHRQ $17, BX // CF = bit 16 of the difference
+	SETCS AX // AL = CF
+	RORQ $1, AX
+	SHRQ $16, BX // CF = bit 32 of the difference
+	SETCS AX
+	RORQ $1, AX
+	SHRQ $16, BX // CF = bit 48 of the difference
+	SETCS AX
+
+	ROLQ $7, AX // bring the four collected bits to AL bits 4-7
+	MOVQ AX, R8 // keep them while the second 8 bytes are processed
+
+	LODSQ // AX = second 8 bytes of this 16-byte group; SI += 8
+	BSWAPQ AX
+	SUBQ DX, AX
+	MOVQ AX, BX
+	ANDB $1, AX
+	RORQ $1, AX
+	SHRQ $17, BX
+	SETCS AX
+	RORQ $1, AX
+	SHRQ $16, BX
+	SETCS AX
+	RORQ $1, AX
+	SHRQ $16, BX
+	SETCS AX
+
+	ROLQ $3, AX // bring the four collected bits to AL bits 0-3
+	ORQ R8, AX // merge with the bits kept in R8
+
+	STOSB // store AL to out; DI += 1
+	LOOP lop // CX--; repeat while CX != 0
+	RET
diff --git a/decode_noasm.go b/decode_noasm.go
new file mode 100644
index 0000000..14ea887
--- /dev/null
+++ b/decode_noasm.go
@@ -0,0 +1,39 @@
+//go:build !amd64
+// +build !amd64
+
+package unibase2n
+
+// dec128blk1 for bit 1
+// len(in)>0, len(in)%16==0, len(out)==len(in)/16
+//go:nosplit
+func dec128blk1(mask uint128be, in, out []byte) {
+	for i := range out {
+		c := i * 16
+		n := readuint128be(in[c : c+16])
+		one := u128one
+		n.subeq(mask)
+		sum := n.and(one)
+		n.shreq(16 - 1)
+		one.shleq(1)
+		sum.oreq(n.and(one))
+		n.shreq(16 - 1)
+		one.shleq(1)
+		sum.oreq(n.and(one))
+		n.shreq(16 - 1)
+		one.shleq(1)
+		sum.oreq(n.and(one))
+		n.shreq(16 - 1)
+		one.shleq(1)
+		sum.oreq(n.and(one))
+		n.shreq(16 - 1)
+		one.shleq(1)
+		sum.oreq(n.and(one))
+		n.shreq(16 - 1)
+		one.shleq(1)
+		sum.oreq(n.and(one))
+		n.shreq(16 - 1)
+		one.shleq(1)
+		sum.oreq(n.and(one))
+		out[i] = uint8(sum.b)
+	}
+}
diff --git a/uint128be.go b/uint128be.go
index 0e837f7..73329b7 100644
--- a/uint128be.go
+++ b/uint128be.go
@@ -86,6 +86,12 @@ func (num uint128be) or(n uint128be) (r uint128be) {
 	return
 }
 
+func (num uint128be) bswap() (r uint128be) {
+	r.a = bits.ReverseBytes64(num.b)
+	r.b = bits.ReverseBytes64(num.a)
+	return
+}
+
 func (num *uint128be) write(b []byte) {
 	binary.BigEndian.PutUint64(b[:8], num.a)
 	binary.BigEndian.PutUint64(b[8:16], num.b)