1
0
mirror of https://github.com/fumiama/go-base16384.git synced 2026-06-05 00:32:52 +08:00
Files
go-base16384/base14_amd64.s
源文雨 75ee4a090e 优化 amd64 调用与内存
goos: darwin
goarch: amd64
pkg: github.com/fumiama/go-base16384
cpu: Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz

name            old time/op    new time/op    delta
EncodeTo/16-8     16.9ns ± 3%    16.7ns ± 1%    -1.62%  (p=0.048 n=5+5)
EncodeTo/256-8    78.0ns ± 1%    77.6ns ± 0%      ~     (p=0.286 n=5+4)
EncodeTo/4K-8      942ns ± 0%     943ns ± 0%      ~     (p=0.841 n=5+5)
EncodeTo/32K-8    7.59µs ± 1%    7.53µs ± 1%      ~     (p=0.222 n=5+5)
DecodeTo/16-8     43.1ns ± 1%    12.2ns ± 0%   -71.70%  (p=0.008 n=5+5)
DecodeTo/256-8     179ns ± 1%      74ns ± 1%   -58.93%  (p=0.008 n=5+5)
DecodeTo/4K-8     1.67µs ± 1%    0.94µs ± 0%   -43.89%  (p=0.008 n=5+5)
DecodeTo/32K-8    13.2µs ± 0%     7.5µs ± 1%   -43.48%  (p=0.008 n=5+5)
Encoder/16-8       118ns ± 4%     112ns ± 0%    -5.01%  (p=0.008 n=5+5)
Encoder/256-8      350ns ± 0%     341ns ± 0%    -2.48%  (p=0.008 n=5+5)
Encoder/4K-8      3.86µs ± 2%    3.83µs ± 0%      ~     (p=0.238 n=5+5)
Encoder/32K-8     29.6µs ± 0%    29.4µs ± 1%      ~     (p=0.095 n=5+5)
Decoder/16-8       417ns ± 6%     406ns ± 1%      ~     (p=0.056 n=5+5)
Decoder/256-8      471ns ± 1%     467ns ± 1%      ~     (p=0.222 n=5+5)
Decoder/4K-8      1.65µs ± 1%    1.65µs ± 2%      ~     (p=0.500 n=5+5)
Decoder/32K-8     14.3µs ±21%    12.7µs ± 1%      ~     (p=0.151 n=5+5)

name            old speed      new speed      delta
EncodeTo/16-8    946MB/s ± 3%   961MB/s ± 1%      ~     (p=0.056 n=5+5)
EncodeTo/256-8  3.28GB/s ± 1%  3.30GB/s ± 0%      ~     (p=0.286 n=5+4)
EncodeTo/4K-8   4.35GB/s ± 0%  4.34GB/s ± 0%      ~     (p=0.841 n=5+5)
EncodeTo/32K-8  4.32GB/s ± 1%  4.35GB/s ± 1%      ~     (p=0.222 n=5+5)
DecodeTo/16-8    510MB/s ± 1%  1803MB/s ± 0%  +253.37%  (p=0.008 n=5+5)
DecodeTo/256-8  1.65GB/s ± 1%  4.02GB/s ± 1%  +143.45%  (p=0.008 n=5+5)
DecodeTo/4K-8   2.80GB/s ± 1%  4.99GB/s ± 0%   +78.22%  (p=0.008 n=5+5)
DecodeTo/32K-8  2.83GB/s ± 0%  5.00GB/s ± 1%   +76.93%  (p=0.008 n=5+5)
Encoder/16-8     135MB/s ± 4%   142MB/s ± 0%    +5.22%  (p=0.008 n=5+5)
Encoder/256-8    731MB/s ± 0%   750MB/s ± 0%    +2.55%  (p=0.008 n=5+5)
Encoder/4K-8    1.06GB/s ± 2%  1.07GB/s ± 0%      ~     (p=0.310 n=5+5)
Encoder/32K-8   1.11GB/s ± 0%  1.12GB/s ± 1%      ~     (p=0.095 n=5+5)
Decoder/16-8    38.4MB/s ± 6%  39.4MB/s ± 1%      ~     (p=0.056 n=5+5)
Decoder/256-8    544MB/s ± 1%   548MB/s ± 1%      ~     (p=0.222 n=5+5)
Decoder/4K-8    2.49GB/s ± 1%  2.48GB/s ± 2%      ~     (p=0.548 n=5+5)
Decoder/32K-8   2.32GB/s ±18%  2.59GB/s ± 1%      ~     (p=0.151 n=5+5)

name            old alloc/op   new alloc/op   delta
EncodeTo/16-8      0.00B          0.00B           ~     (all equal)
EncodeTo/256-8     0.00B          0.00B           ~     (all equal)
EncodeTo/4K-8      0.00B          0.00B           ~     (all equal)
EncodeTo/32K-8     0.00B          0.00B           ~     (all equal)
DecodeTo/16-8      48.0B ± 0%      0.0B       -100.00%  (p=0.008 n=5+5)
DecodeTo/256-8      576B ± 0%        0B       -100.00%  (p=0.008 n=5+5)
DecodeTo/4K-8     6.14kB ± 0%    0.00kB       -100.00%  (p=0.008 n=5+5)
DecodeTo/32K-8    49.2kB ± 0%     0.0kB       -100.00%  (p=0.008 n=5+5)
Encoder/16-8       24.0B ± 0%     24.0B ± 0%      ~     (all equal)
Encoder/256-8      24.0B ± 0%     24.0B ± 0%      ~     (all equal)
Encoder/4K-8       24.0B ± 0%     24.0B ± 0%      ~     (all equal)
Encoder/32K-8      26.0B ± 0%     26.0B ± 0%      ~     (all equal)
Decoder/16-8      1.39kB ± 0%    1.39kB ± 0%      ~     (all equal)
Decoder/256-8     1.39kB ± 0%    1.39kB ± 0%      ~     (all equal)
Decoder/4K-8      4.98kB ± 0%    4.98kB ± 0%      ~     (all equal)
Decoder/32K-8     41.1kB ± 0%    41.1kB ± 0%      ~     (all equal)

name            old allocs/op  new allocs/op  delta
EncodeTo/16-8       0.00           0.00           ~     (all equal)
EncodeTo/256-8      0.00           0.00           ~     (all equal)
EncodeTo/4K-8       0.00           0.00           ~     (all equal)
EncodeTo/32K-8      0.00           0.00           ~     (all equal)
DecodeTo/16-8       1.00 ± 0%      0.00       -100.00%  (p=0.008 n=5+5)
DecodeTo/256-8      1.00 ± 0%      0.00       -100.00%  (p=0.008 n=5+5)
DecodeTo/4K-8       1.00 ± 0%      0.00       -100.00%  (p=0.008 n=5+5)
DecodeTo/32K-8      1.00 ± 0%      0.00       -100.00%  (p=0.008 n=5+5)
Encoder/16-8        1.00 ± 0%      1.00 ± 0%      ~     (all equal)
Encoder/256-8       1.00 ± 0%      1.00 ± 0%      ~     (all equal)
Encoder/4K-8        1.00 ± 0%      1.00 ± 0%      ~     (all equal)
Encoder/32K-8       1.00 ± 0%      1.00 ± 0%      ~     (all equal)
Decoder/16-8        3.00 ± 0%      3.00 ± 0%      ~     (all equal)
Decoder/256-8       3.00 ± 0%      3.00 ± 0%      ~     (all equal)
Decoder/4K-8        3.00 ± 0%      3.00 ± 0%      ~     (all equal)
Decoder/32K-8       3.00 ± 0%      3.00 ± 0%      ~     (all equal)
2022-12-14 10:38:19 +08:00

235 lines
4.0 KiB
ArmAsm

//go:build amd64
// +build amd64
#include "textflag.h"
// func _encode(offset int, b, encd []byte) (sum uint64, n uint64)
//
// Packs b into encd: every 7 input bytes become four 14-bit values, each
// placed in the low 14 bits of a 16-bit lane, with 0x4E00 added to every
// lane, and the resulting 8 bytes are stored big-endian.
//
// Arguments (Go ABI0 stack layout, 72 bytes of arguments + results):
//   offset+0(FP)     number of trailing bytes of b packed into sum:
//                    0 = none, 1..6 = that many (other values are not validated)
//   b_base+8(FP)     input base pointer
//   b_len+16(FP)     input length in bytes
//   encd_base+32(FP) output base pointer (length and capacity are not read)
// Results:
//   sum+56(FP)       packed trailing bytes plus 0x004E004E004E004E,
//                    or 0 when offset == 0
//   n+64(FP)         bytes written to encd by the main loop (8 per iteration)
//
// Register roles: DI = input base, R9 = output base, CX = input byte index,
// SI = output block count, R8 = b_len-6 (loop bound), R10 = offset,
// AX/BX/R11 = loop constants, DX/R12/R13 = scratch.
// Clobbers: AX, BX, CX, DX, SI, DI, R8-R13, flags.
// BP is deliberately left untouched: it is callee-save and this function has
// a zero-size frame, so the assembler emits no save/restore for it.
// Requires the MOVBE instruction.
//
// NOTE(review): the loop loads 8 bytes per step but only consumes 7, so the
// last iteration may read one byte beyond b_len; confirm the caller
// guarantees that byte is readable.
TEXT ·_encode(SB), NOSPLIT, $0-72
	MOVQ offset+0(FP), R10
	MOVQ b_base+8(FP), DI
	MOVQ b_len+16(FP), R8
	MOVQ encd_base+32(FP), R9
	XORQ CX, CX
	XORQ SI, SI
	SUBQ $6, R8                     // loop runs while CX < b_len-6
	JLE  encrem
	MOVQ $4611404543450677248, AX   // 0x3FFF000000000000: lane 3 mask
	MOVQ $70364449210368, BX        // 0x00003FFF00000000: lane 2 mask
	MOVQ $5620578098173988352, R11  // 0x4E004E004E004E00: per-lane bias

enclop:
	MOVBEQ (DI)(CX*1), DX           // big-endian load; top 56 bits are the 7 payload bytes
	INCQ   SI
	ADDQ   $7, CX
	MOVQ   DX, R13
	MOVQ   DX, R12
	SHRQ   $2, R13
	SHRQ   $4, R12
	ANDQ   BX, R12                  // lane 2 (bits 32..45)
	ANDQ   AX, R13                  // lane 3 (bits 48..61)
	ORQ    R12, R13
	MOVQ   DX, R12
	SHRQ   $8, DX
	SHRQ   $6, R12
	ANDL   $16383, DX               // 0x3FFF: lane 0 (bits 0..13)
	ANDL   $1073676288, R12         // 0x3FFF0000: lane 1 (bits 16..29)
	ORQ    R13, R12
	ORQ    R12, DX
	ADDQ   R11, DX
	MOVBEQ DX, -8(R9)(SI*8)         // SI was already incremented, hence the -8
	CMPQ   CX, R8
	JL     enclop

encrem:
	XORL  DX, DX                    // sum defaults to 0 so the result is defined when offset == 0
	TESTQ R10, R10
	JE    encend

	// Tail byte 0.
	MOVBLZX (DI)(CX*1), DX
	MOVL    DX, R8
	SALQ    $14, DX
	SHRB    $2, R8
	ANDL    $49152, DX              // 0xC000
	MOVBLZX R8, R8
	ORQ     R8, DX
	CMPL    R10, $1
	JE      encsav

	// Tail byte 1. The sign extension is harmless: the masks discard the upper bits.
	MOVBQSX 1(DI)(CX*1), R8
	MOVQ    R8, R11
	SALQ    $20, R8
	SALQ    $6, R11
	ANDL    $3145728, R8            // 0x300000
	ANDL    $16128, R11             // 0x3F00
	ORQ     R11, DX
	ORQ     R8, DX
	CMPL    R10, $2
	JE      encsav

	// Tail byte 2.
	MOVBQSX 2(DI)(CX*1), R8
	MOVQ    R8, R11
	SALQ    $28, R8
	SALQ    $12, R11
	MOVL    R8, R8                  // keep only the low 32 bits
	ANDL    $983040, R11            // 0xF0000
	ORQ     R11, R8
	ORQ     R8, DX
	CMPL    R10, $3
	JE      encsav

	// Tail byte 3.
	MOVQ    $257698037760, BX       // 0x3C00000000
	MOVBQSX 3(DI)(CX*1), R8
	MOVQ    R8, R11
	SALQ    $34, R8
	SALQ    $20, R11
	ANDQ    BX, R8
	ANDL    $251658240, R11         // 0x0F000000
	ORQ     R11, R8
	ORQ     R8, DX
	CMPL    R10, $4
	JE      encsav

	// Tail byte 4.
	MOVQ    $12884901888, BX        // 0x300000000
	MOVBQSX 4(DI)(CX*1), R8
	MOVQ    R8, R11
	SALQ    $42, R8
	SALQ    $26, R11
	ANDQ    BX, R11
	MOVQ    $277076930199552, BX    // 0xFC0000000000
	ANDQ    BX, R8
	ORQ     R11, R8
	ORQ     R8, DX
	CMPL    R10, $5
	JE      encsav

	// Tail byte 5. CX and DI are reused as scratch; neither is needed afterwards.
	MOVQ    $3298534883328, R8      // 0x30000000000
	MOVBQSX 5(DI)(CX*1), CX
	MOVQ    CX, DI
	SALQ    $48, CX
	SALQ    $34, DI
	ANDQ    R8, DI
	MOVQ    $17732923532771328, R8  // 0x3F000000000000
	ANDQ    R8, CX
	ORQ     DI, CX
	ORQ     CX, DX

encsav:
	MOVQ $21955383195992142, CX     // 0x004E004E004E004E: per-lane bias for the tail
	ADDQ CX, DX

encend:
	SHLQ $3, SI                     // blocks -> bytes written
	MOVQ DX, sum+56(FP)
	MOVQ SI, n+64(FP)
	RET
// func _decode(offset, outlen int, b, decd []byte)
//
// Unpacks b into decd: every 8 input bytes are read big-endian as four
// 16-bit lanes, 0x4E00 is subtracted from each lane, and the four 14-bit
// payloads are concatenated into 7 output bytes.
//
// Arguments (Go ABI0 stack layout, 64 bytes of arguments, no results):
//   offset+0(FP)     number of trailing output bytes to produce after the
//                    main loop: 0 = none, 1..6 = that many (other values
//                    are not validated)
//   outlen+8(FP)     output length that bounds the main loop
//   b_base+16(FP)    input base pointer (length and capacity are not read)
//   decd_base+40(FP) output base pointer (length and capacity are not read)
//
// Register roles: DI = input base, R9 = output base, SI = input block index,
// CX = output byte index, R8 = outlen-6 (loop bound), BX = offset,
// AX/R10/R11/R12 = loop constants, DX/R13/R14 = scratch.
// Clobbers: AX, BX, CX, DX, SI, DI, R8-R14, flags.
// BP is deliberately left untouched: it is callee-save and this function has
// a zero-size frame, so the assembler emits no save/restore for it.
// Requires the MOVBE instruction.
//
// NOTE(review): the loop stores 8 bytes per step but only advances by 7 (the
// eighth byte is always zero), so the last iteration may write one byte at
// decd[outlen]; confirm the caller reserves room for it.
// NOTE(review): the tail path always loads 8 input bytes even when fewer are
// needed; confirm the caller guarantees they are readable.
TEXT ·_decode(SB), NOSPLIT, $0-64
	MOVQ offset+0(FP), BX
	MOVQ outlen+8(FP), R8
	MOVQ b_base+16(FP), DI
	MOVQ decd_base+40(FP), R9
	XORQ CX, CX
	XORQ SI, SI
	SUBQ $6, R8                     // loop runs while CX < outlen-6
	JLE  decrem
	MOVQ $-5620578098173988352, R12 // -0x4E004E004E004E00: removes the per-lane bias
	MOVQ $-1125899906842624, AX     // 0xFFFC000000000000: lane 3 payload -> bits 50..63
	MOVQ $1125831187365888, R11     // 0x0003FFF000000000: lane 2 payload -> bits 36..49
	MOVQ $68715282432, R10          // 0x0000000FFFC00000: lane 1 payload -> bits 22..35

declop:
	MOVBEQ (DI)(SI*8), DX           // big-endian load of one 8-byte block
	INCQ   SI
	ADDQ   R12, DX
	MOVQ   DX, R13
	LEAQ   0(DX*4), R14
	SALQ   $4, R13
	ANDQ   AX, R14
	ANDQ   R11, R13
	ORQ    R13, R14
	MOVQ   DX, R13
	SALQ   $8, DX
	SALQ   $6, R13
	ANDL   $4194048, DX             // 0x3FFF00: lane 0 payload -> bits 8..21
	ANDQ   R10, R13
	ORQ    R14, R13
	ORQ    R13, DX
	MOVBEQ DX, (R9)(CX*1)           // 7 payload bytes followed by one zero byte
	ADDQ   $7, CX
	CMPQ   CX, R8
	JL     declop

decrem:
	TESTQ BX, BX
	JE    decend
	MOVQ  (DI)(SI*8), DI            // remaining input, loaded without byte swap

	// Output byte 0.
	LEAQ -78(DI), SI                // subtract 0x4E
	MOVQ SI, DX
	SALL $2, SI
	SHRQ $14, DX
	ANDL $3, DX
	ORL  SI, DX
	MOVB DX, 0(R9)(CX*1)
	CMPL BX, $1
	JE   decend

	// Output byte 1.
	LEAQ -5111886(DI), DX           // subtract 0x4E004E
	MOVQ DX, SI
	MOVQ DX, R8
	SHRQ $6, SI
	SHRQ $20, R8
	ANDL $-4, SI
	ANDL $3, R8
	ORL  R8, SI
	MOVB SI, 1(R9)(CX*1)
	CMPL BX, $2
	JE   decend

	// Output byte 2 (DX still holds input - 0x4E004E).
	MOVQ DX, SI
	SHRQ $28, DX
	SHRQ $12, SI
	ANDL $15, DX
	ANDL $-16, SI
	ORL  SI, DX
	MOVB DX, 2(R9)(CX*1)
	CMPL BX, $3
	JE   decend

	// Output byte 3.
	MOVQ $-335012560974, DX         // -0x4E004E004E
	ADDQ DI, DX
	MOVQ DX, SI
	MOVQ DX, R8
	SHRQ $20, SI
	SHRQ $34, R8
	ANDL $-16, SI
	ANDL $15, R8
	ORL  R8, SI
	MOVB SI, 3(R9)(CX*1)
	CMPL BX, $4
	JE   decend

	// Output byte 4 (DX still holds input - 0x4E004E004E).
	MOVQ DX, SI
	SHRQ $42, DX
	SHRQ $26, SI
	ANDL $63, DX
	ANDL $-64, SI
	ORL  SI, DX
	MOVB DX, 4(R9)(CX*1)
	CMPL BX, $5
	JE   decend

	// Output byte 5.
	MOVQ $-21955383195992142, DX    // -0x004E004E004E004E
	ADDQ DX, DI
	MOVQ DI, DX
	SHRQ $48, DI
	SHRQ $34, DX
	ANDL $63, DI
	ANDL $-64, DX
	ORL  DI, DX
	MOVB DX, 5(R9)(CX*1)

decend:
	RET