1
0
mirror of https://github.com/fumiama/unibase2n.git synced 2026-06-05 00:32:47 +08:00

add asm dec128blk1

This commit is contained in:
源文雨
2022-10-03 17:14:10 +08:00
parent f084ae5128
commit d78949ac03
7 changed files with 136 additions and 35 deletions

12
cpuid.go Normal file
View File

@@ -0,0 +1,12 @@
//go:build amd64
// +build amd64
package unibase2n
// cpuid is implemented in cpuid_amd64.s. It executes the CPUID instruction
// with EAX set to op and ECX cleared, and returns the values the instruction
// leaves in EAX, EBX, ECX and EDX.
func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
// canusesse2 is true when SSE2 instructions are available.
//
// SSE2 support is reported by CPUID leaf 1 in bit 26 of EDX. Bit 26 of ECX,
// which was tested previously, is the XSAVE feature flag instead.
var canusesse2 = func() bool {
	_, _, _, d := cpuid(1)
	return d&(1<<26) != 0
}()

15
cpuid_amd64.s Normal file
View File

@@ -0,0 +1,15 @@
//go:build amd64
// +build amd64
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
//
// Runs CPUID with EAX=op and ECX=0 and stores the four result registers
// into the return slots that follow the argument on the frame.
// NOTE(review): the literal flag 7 is presumably NOPROF|DUPOK|NOSPLIT
// (1|2|4 in textflag.h) - confirm, since textflag.h is not included here.
TEXT ·cpuid(SB), 7, $0
XORQ CX, CX // clear ECX before CPUID (selects sub-leaf 0 where a leaf has sub-leaves)
MOVL op+0(FP), AX // requested leaf
CPUID
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET

View File

@@ -51,41 +51,6 @@ func (bs Base) Decode(data []byte) (out []byte) {
return
}
// dec128blk1 decodes 1-bit blocks: every 16 bytes of in produce one byte of out.
// Preconditions: len(in)>0, len(in)%16==0, len(out)==len(in)/16.
//
// Each block is read with readuint128be, reduced by mask, and then eight bits
// are gathered into the output byte: the lowest bit first, followed by seven
// rounds that shift the value right by 15 while the selector bit moves left by
// one (assuming subeq/shreq/shleq/and/oreq are the 128-bit operations their
// names suggest).
//
//go:nosplit
func dec128blk1(mask uint128be, in, out []byte) {
	for idx := range out {
		off := idx * 16
		val := readuint128be(in[off : off+16])
		val.subeq(mask)

		sel := u128one
		acc := val.and(sel)
		for round := 0; round < 7; round++ {
			val.shreq(15)
			sel.shleq(1)
			acc.oreq(val.and(sel))
		}
		out[idx] = uint8(acc.b)
	}
}
// dec64blk2 for bit 2
// len(in)>0, len(in)%8==0, len(out)==len(in)/8
//go:nosplit

8
decode_amd64.go Normal file
View File

@@ -0,0 +1,8 @@
//go:build amd64
// +build amd64
package unibase2n
// dec128blk1 decodes 1-bit blocks: every 16 bytes of in produce one byte of
// out. It is implemented in decode_amd64.s.
//
// Preconditions: len(in)>0, len(in)%16==0, len(out)==len(in)/16. The assembly
// derives its iteration count from len(in) and does not check it, so an empty
// input must never be passed.
//
// The routine only reads through in and writes through out and does not keep
// either pointer, so it is marked noescape to let callers keep their buffers
// off the heap.
//
//go:noescape
func dec128blk1(mask uint128be, in, out []byte)

56
decode_amd64.s Normal file
View File

@@ -0,0 +1,56 @@
//go:build amd64
// +build amd64
#include "textflag.h"
// func dec128blk1(mask uint128be, in, out []byte)
// len(in)>0, len(in)%16==0, len(out)==len(in)/16
//
// Decodes 1-bit blocks: every 16 input bytes produce one output byte.
//
// Each 16-byte block is processed as two 8-byte halves. A half is loaded,
// byte-swapped, reduced by the byte-swapped first 8 bytes of mask, and then
// bits 0, 16, 32 and 48 of the difference are gathered into a nibble. The
// first half supplies the high nibble of the output byte, the second half the
// low nibble.
//
// Argument frame ($0-64): mask at 0..15, in {base 16, len 24, cap 32},
// out {base 40, len 48, cap 56}.
//
// NOTE(review): only the first 8 bytes of mask are read, and the two halves
// are subtracted independently (no borrow between them), whereas the portable
// version in decode_noasm.go performs one 128-bit subtraction of the whole
// mask. Confirm that callers only pass masks and inputs for which both agree.
//
// The iteration count is len(in)/16 and is not checked; LOOP decrements CX
// before testing it, so a zero count would not terminate as intended.
TEXT ·dec128blk1(SB), NOSPLIT, $0-64
	MOVQ mask+0(FP), DX
	BSWAPQ DX
	MOVQ in_base+16(FP), SI
	MOVQ in_len+24(FP), CX
	SHRQ $4, CX              // CX = number of 16-byte blocks
	MOVQ out_base+40(FP), DI // destination pointer for STOSB
	// go forward
	CLD
lop:
	// first 8 bytes of the block -> high nibble
	LODSQ                    // AX = 8 input bytes, SI += 8
	BSWAPQ AX
	SUBQ DX, AX
	MOVQ AX, BX              // BX keeps the difference for bit extraction
	// Keep bit 0 only. A byte-sized AND would leave bits 8..63 of the
	// difference in AX, and after the rotates below its top four bits would
	// end up in the low nibble of the result.
	ANDQ $1, AX
	RORQ $1, AX              // park the collected bit at the top of AX
	SHRQ $17, BX             // CF = bit 16 of the difference
	SETCS AX                 // AL = CF
	RORQ $1, AX
	SHRQ $16, BX             // CF = bit 32
	SETCS AX
	RORQ $1, AX
	SHRQ $16, BX             // CF = bit 48
	SETCS AX
	ROLQ $7, AX              // AL = b48 b32 b16 b0 0 0 0 0
	MOVQ AX, R8
	// second 8 bytes of the block -> low nibble
	LODSQ
	BSWAPQ AX
	SUBQ DX, AX
	MOVQ AX, BX
	ANDQ $1, AX
	RORQ $1, AX
	SHRQ $17, BX             // CF = bit 16
	SETCS AX
	RORQ $1, AX
	SHRQ $16, BX             // CF = bit 32
	SETCS AX
	RORQ $1, AX
	SHRQ $16, BX             // CF = bit 48
	SETCS AX
	ROLQ $3, AX              // AL = 0 0 0 0 b48 b32 b16 b0
	ORQ R8, AX               // merge both nibbles
	STOSB                    // store AL to the output, DI += 1
	LOOP lop                 // CX--, repeat while CX != 0
	RET

39
decode_noasm.go Normal file
View File

@@ -0,0 +1,39 @@
//go:build !amd64
// +build !amd64
package unibase2n
// dec128blk1 decodes 1-bit blocks: every 16 bytes of in produce one byte of out.
// Preconditions: len(in)>0, len(in)%16==0, len(out)==len(in)/16.
//
// Each block is read with readuint128be, reduced by mask, and then eight bits
// are gathered into the output byte: the lowest bit first, followed by seven
// rounds that shift the value right by 15 while the selector bit moves left by
// one (assuming subeq/shreq/shleq/and/oreq are the 128-bit operations their
// names suggest).
//
//go:nosplit
func dec128blk1(mask uint128be, in, out []byte) {
	for idx := range out {
		off := idx * 16
		val := readuint128be(in[off : off+16])
		val.subeq(mask)

		sel := u128one
		acc := val.and(sel)
		for round := 0; round < 7; round++ {
			val.shreq(15)
			sel.shleq(1)
			acc.oreq(val.and(sel))
		}
		out[idx] = uint8(acc.b)
	}
}

View File

@@ -86,6 +86,12 @@ func (num uint128be) or(n uint128be) (r uint128be) {
return
}
// bswap returns num with the order of its 16 bytes reversed: the two 64-bit
// halves trade places and each half has its own bytes reversed.
func (num uint128be) bswap() (r uint128be) {
	return uint128be{
		a: bits.ReverseBytes64(num.b),
		b: bits.ReverseBytes64(num.a),
	}
}
func (num *uint128be) write(b []byte) {
binary.BigEndian.PutUint64(b[:8], num.a)
binary.BigEndian.PutUint64(b[8:16], num.b)